ChatterjeeLab
/

moPPIt-v2

Model card Files Files and versions

moPPIt-v2 / scripts /train_qm9_pplm_classifier.sh

AlienChen's picture

Upload 106 files

c4b1fea verified over 1 year ago

history blame contribute delete

3.26 kB

	#!/bin/bash
	#SBATCH -o ../watch_folder/%x_%j.out # output file (%j expands to jobID)
	#SBATCH -N 1 # Total number of nodes requested
	#SBATCH --get-user-env # retrieve the users login environment
	#SBATCH --mem=64000 # server memory requested (per node)
	#SBATCH -t 960:00:00 # Time limit (hh:mm:ss)
	#SBATCH --constraint="[a100\|a6000\|a5000\|3090]"
	#SBATCH --ntasks-per-node=4
	#SBATCH --gres=gpu:4 # Type/number of GPUs needed
	#SBATCH --open-mode=append # Do not overwrite logs
	#SBATCH --requeue # Requeue upon preemption

	<<comment
	# Usage:
	cd scripts/
	MODEL=<ar\|mdlm\|udlm>
	PROP=<qed\|ring_count>
	sbatch \
	--export=ALL,MODEL=${MODEL},PROP=${PROP} \
	--job-name=train_qm9_pplm_classifier_${PROP}_${MODEL} \
	train_qm9_pplm_classifier.sh
	comment

	# Setup environment
	cd ../ \|\| exit # Go to the root directory of the repo
	source setup_env.sh
	export HYDRA_FULL_ERROR=1
	export NCCL_P2P_LEVEL=NVL

	# Expecting:
	# - MODEL (ar, mdlm, or udlm)
	# - PROP (qed or ring_count)
	if [ -z "${MODEL}" ]; then
	echo "MODEL is not set"
	exit 1
	fi
	if [ -z "${PROP}" ]; then
	echo "PROP is not set"
	exit 1
	fi
	LABEL_SMOOTHING=FALSE
	RUN_NAME="${PROP}_${MODEL}"

	if [ "${MODEL}" = "ar" ]; then
	# AR
	PARAMETERIZATION="ar"
	PRETRAINED_PATH="${PWD}/outputs/qm9/${MODEL}_no-guidance/checkpoints/best.ckpt"
	POOLING="attention_mean"
	# dummy properties
	DIFFUSION="absorbing_state"
	T=0
	TIME_COND=False
	elif [ "${MODEL}" = "mdlm" ]; then
	# MDLM
	DIFFUSION="absorbing_state"
	PARAMETERIZATION="subs"
	T=0
	TIME_COND=False
	PRETRAINED_PATH="${PWD}/outputs/qm9/${MODEL}_no-guidance/checkpoints/best.ckpt"
	POOLING="mean"
	elif [ "${MODEL}" = "udlm" ]; then
	# UDLM
	DIFFUSION="uniform"
	PARAMETERIZATION="d3pm"
	T=0
	TIME_COND=True
	PRETRAINED_PATH="${PWD}/outputs/qm9/${MODEL}_no-guidance/checkpoints/best.ckpt"
	POOLING="mean"
	else
	echo "MODEL must be one of ar, mdlm, udlm"
	exit 1
	fi

	# To enable preemption re-loading, set `hydra.run.dir` or
	srun python -u -m main \
	mode=train_classifier \
	+is_pplm_classifier=True \
	+use_label_smoothing=${LABEL_SMOOTHING} \
	eval.checkpoint_path="${PRETRAINED_PATH}" \
	parameterization=${PARAMETERIZATION} \
	time_conditioning=${TIME_COND} \
	diffusion=${DIFFUSION} \
	T=${T} \
	data=qm9 \
	data.label_col="${PROP}" \
	data.label_col_pctile=90 \
	data.num_classes=2 \
	loader.global_batch_size=2048 \
	loader.eval_global_batch_size=4096 \
	classifier_model=small-classifier \
	classifier_backbone=dit \
	classifier_model.pooling=${POOLING} \
	model.length=32 \
	+classifier_model.freeze_encoder=True \
	+classifier_model.use_encoder_ema=True \
	optim.lr=3e-5 \
	lr_scheduler=cosine_decay_warmup \
	lr_scheduler.warmup_t=1000 \
	lr_scheduler.lr_min=3e-7 \
	training.guidance=null \
	+training.use_label_smoothing=${LABEL_SMOOTHING} \
	callbacks.checkpoint_every_n_steps.every_n_train_steps=5_000 \
	callbacks.checkpoint_monitor.monitor=val/cross_entropy \
	trainer.val_check_interval=1.0 \
	trainer.max_steps=25_000 \
	wandb.group=train_classifier \
	wandb.name="qm9-pplm_classifier_${RUN_NAME}" \
	hydra.run.dir="./outputs/qm9/pplm_classifier/${RUN_NAME}"