Upload folder using huggingface_hub

714cf46 verified 3 days ago

9.75 kB

	import entrypoint_setup

	import random
	import tempfile
	import os
	import torch

	from esm2.modeling_fastesm import FastEsmForMaskedLM
	from esm_plusplus.modeling_esm_plusplus import ESMplusplusForMaskedLM
	from e1_fastplms.modeling_e1 import E1ForMaskedLM
	from dplm_fastplms.modeling_dplm import DPLMForMaskedLM
	from dplm2_fastplms.modeling_dplm2 import (
	DPLM2ForMaskedLM,
	_has_packed_multimodal_layout,
	_normalize_dplm2_input_ids,
	)
	from embedding_mixin import parse_fasta


	# One-letter codes of the 20 canonical amino acids; the alphabet the random
	# test sequences are drawn from.
	CANONICAL_AAS = "ACDEFGHIKLMNPQRSTVWY"
	# Seed handed to random.seed() by the script entry point so the generated
	# sequences are reproducible between runs.
	SEED = 42
	# Default value of the --batch_size command-line option.
	DEFAULT_BATCH_SIZE = 4
	MAX_EMBED_LEN = 128 # fixed pad length used to keep max_seqlen identical across runs


	# (display_name, model_class, hf_path, use_model_tokenizer)
	# display_name is the key accepted by the --models command-line option.
	# use_model_tokenizer=True: test_model wraps model.tokenizer in
	# FixedLengthTokenizer and runs both the NaN check and the batch-vs-single
	# match check. False: test_model passes tokenizer=None, uses fixed-length
	# sequences and runs the NaN check only.
	MODEL_CONFIGS = [
	("ESM2", FastEsmForMaskedLM, "Synthyra/ESM2-8M", True),
	("ESM++", ESMplusplusForMaskedLM, "Synthyra/ESMplusplus_small", True),
	("E1", E1ForMaskedLM, "Synthyra/Profluent-E1-150M", False),
	("DPLM", DPLMForMaskedLM, "Synthyra/DPLM-150M", True),
	("DPLM2", DPLM2ForMaskedLM, "Synthyra/DPLM2-150M", True),
	]


	def test_parse_fasta() -> None:
	    """Check parse_fasta on single-line and multi-line FASTA records.

	    Writes a three-record FASTA file to a temporary path, parses it with
	    parse_fasta and compares the result with the expected list of sequences.
	    The second record is split over two lines and must come back joined.

	    The temporary file is always removed, even when parse_fasta raises.

	    Raises:
	        AssertionError: If the parsed sequences differ from the expected ones.
	    """
	    fasta_content = (
	        ">seq1 a simple protein\n"
	        "MKTLLLTLVVVTIVCLDLGYT\n"
	        ">seq2 multi-line sequence\n"
	        "ACDEFGHIKL\n"
	        "MNPQRSTVWY\n"
	        ">seq3 another entry\n"
	        "MALWMRLLPLLALL\n"
	    )
	    expected = [
	        "MKTLLLTLVVVTIVCLDLGYT",
	        "ACDEFGHIKLMNPQRSTVWY",
	        "MALWMRLLPLLALL",
	    ]
	    # delete=False: the file is closed (and flushed) when the with-block ends
	    # and is then reopened by name inside parse_fasta, so it must outlive the
	    # context manager; removal is done by hand below.
	    with tempfile.NamedTemporaryFile(mode='w', suffix='.fasta', delete=False) as f:
	        f.write(fasta_content)
	        tmp_path = f.name
	    try:
	        parsed = parse_fasta(tmp_path)
	    finally:
	        # Clean up regardless of whether parsing succeeded.
	        os.unlink(tmp_path)
	    assert parsed == expected, f"parse_fasta mismatch:\n got: {parsed}\n expected: {expected}"
	    print("test_parse_fasta: OK")


	class FixedLengthTokenizer:
	    """Adapter that makes a tokenizer always emit ``max_length`` tokens.

	    Each call asks the wrapped tokenizer for PyTorch tensors padded to
	    ``max_length`` with truncation enabled.  A batch of one and a batch of N
	    therefore get tensors of identical shape, which keeps max_seqlen_in_batch
	    constant and removes the floating-point variability that comes from
	    different softmax vector lengths / flash-attention tile sizes.
	    """

	    def __init__(self, tokenizer, max_length: int = MAX_EMBED_LEN):
	        # Wrapped tokenizer; called as tokenizer(sequences, **options).
	        self._tok = tokenizer
	        # Token count every encoded sequence is padded/truncated to.
	        self.max_length = max_length

	    def __call__(self, sequences, **kwargs):
	        # Keyword arguments supplied by the caller are accepted but not
	        # forwarded: the options below are always the ones used.
	        fixed_options = {
	            "return_tensors": "pt",
	            "padding": "max_length",
	            "max_length": self.max_length,
	            "truncation": True,
	        }
	        return self._tok(sequences, **fixed_options)


	def random_sequences(n: int, min_len: int = 8, max_len: int = 64) -> list[str]:
	    """Generate ``n`` random protein sequences of varying length.

	    Used for the NaN test.  Every sequence starts with "M" followed by
	    residues drawn uniformly, with replacement, from CANONICAL_AAS.  The
	    total length, counting the leading "M", is drawn uniformly from
	    ``[min_len, max_len]``; this is the same convention
	    random_sequences_fixed_len applies to its ``length`` argument.

	    Args:
	        n: Number of sequences to generate.
	        min_len: Smallest total sequence length (inclusive), at least 1.
	        max_len: Largest total sequence length (inclusive).

	    Returns:
	        A list of ``n`` sequences.

	    Raises:
	        ValueError: If ``min_len`` is below 1 or ``max_len`` is below ``min_len``.
	    """
	    if min_len < 1:
	        raise ValueError(f"min_len must be >= 1, got {min_len}")
	    if max_len < min_len:
	        raise ValueError(f"max_len ({max_len}) must be >= min_len ({min_len})")
	    return [
	        # The leading "M" counts towards the drawn length, hence the -1.
	        "M" + "".join(random.choices(CANONICAL_AAS, k=random.randint(min_len, max_len) - 1))
	        for _ in range(n)
	    ]


	def random_sequences_fixed_len(n: int, length: int = 64) -> list[str]:
	    """Generate ``n`` random sequences that all share one length.

	    Used for the match test with E1 (sequence mode).  Each sequence is an
	    "M" followed by ``length - 1`` residues drawn with replacement from
	    CANONICAL_AAS.
	    """
	    # The leading "M" supplies the first residue of every sequence.
	    tail_len = length - 1
	    generated: list[str] = []
	    for _ in range(n):
	        tail = random.choices(CANONICAL_AAS, k=tail_len)
	        generated.append("M" + "".join(tail))
	    return generated


	def assert_no_nan(embeddings: dict[str, torch.Tensor], label: str) -> None:
	    """Assert that none of the embedding tensors contains a NaN.

	    Args:
	        embeddings: Mapping from sequence string to its embedding tensor.
	        label: Prefix placed in square brackets in the failure message.

	    Raises:
	        AssertionError: On the first tensor that contains a NaN; the message
	            quotes the first 20 characters of the offending sequence.
	    """
	    for sequence, tensor in embeddings.items():
	        nan_mask = torch.isnan(tensor)
	        assert not nan_mask.any(), (
	            f"[{label}] NaN found in embedding for sequence '{sequence[:20]}...'"
	        )


	def assert_embeddings_match(
	    a: dict[str, torch.Tensor],
	    b: dict[str, torch.Tensor],
	    label: str,
	    atol: float = 5e-3,
	) -> None:
	    """Compare real-token embeddings from two runs.

	    full_embeddings=True already strips padding via emb[mask.bool()], so both
	    dicts contain only non-pad token rows and the comparison is over those rows.

	    Both tensors are cast to float32 before comparing.  A pair of zero-element
	    embeddings with equal shapes counts as matching.  A NaN difference fails
	    the check because ``nan <= atol`` is false.

	    Args:
	        a: Mapping from sequence string to embedding tensor (first run).
	        b: Mapping from sequence string to embedding tensor (second run).
	        label: Prefix placed in square brackets in failure messages.
	        atol: Largest allowed absolute element-wise difference.

	    Raises:
	        AssertionError: If the key sets differ, if the shapes for a sequence
	            differ, or if any element differs by more than ``atol``.
	    """
	    assert set(a) == set(b), f"[{label}] Key sets differ between batch and single runs"
	    for seq in a:
	        ea, eb = a[seq].float(), b[seq].float()
	        assert ea.shape == eb.shape, (
	            f"[{label}] Shape mismatch for '{seq[:20]}': {ea.shape} vs {eb.shape}"
	        )
	        if ea.numel() == 0:
	            # Nothing to compare; .max() over an empty tensor would raise
	            # RuntimeError instead of reporting a match.
	            continue
	        max_diff = (ea - eb).abs().max().item()
	        assert max_diff <= atol, (
	            f"[{label}] Max abs diff {max_diff:.5f} > {atol} for '{seq[:20]}'"
	        )


	def test_dplm2_multimodal_layout_guard() -> None:
	    """Check _has_packed_multimodal_layout on three hand-built type-id batches.

	    Type ids follow the keyword arguments passed to the helper: 1 = amino
	    acid, 0 = structure, 2 = padding.  The helper must return a falsy value
	    for the "plain sequence" batch and the "mismatched" batch, and a truthy
	    value for the "packed multimodal" batch.

	    Raises:
	        AssertionError: If any of the three expectations is violated.
	    """
	    type_codes = {"aa_type": 1, "struct_type": 0, "pad_type": 2}

	    plain_batch = torch.tensor([
	        [1, 1, 1, 1, 1, 1, 0, 2],
	        [1, 1, 1, 1, 1, 0, 2, 2],
	    ])
	    packed_batch = torch.tensor([
	        [1, 1, 1, 2, 0, 0, 0, 2],
	        [1, 1, 2, 2, 0, 0, 2, 2],
	    ])
	    # Single row with three amino-acid ids but only two structure ids.
	    mismatched_batch = torch.tensor([
	        [1, 1, 1, 2, 0, 0, 2, 2],
	    ])

	    assert not _has_packed_multimodal_layout(plain_batch, **type_codes)
	    assert _has_packed_multimodal_layout(packed_batch, **type_codes)
	    assert not _has_packed_multimodal_layout(mismatched_batch, **type_codes)
	    print("test_dplm2_multimodal_layout_guard: OK")


	def test_dplm2_special_token_normalization() -> None:
	    """Check _normalize_dplm2_input_ids against a fixed input/output pair.

	    With vocab_size=8229 the ids 8231, 8229 and 8232 are expected to become
	    0, 2 and 32 respectively, while the remaining ids (including -100) are
	    expected to pass through unchanged.

	    Raises:
	        AssertionError: If the normalized ids differ from the expected tensor.
	    """
	    raw_ids = torch.tensor([[8231, 5, 23, 13, 8229, 1, 8232, -100]])
	    got = _normalize_dplm2_input_ids(raw_ids, vocab_size=8229)
	    want = torch.tensor([[0, 5, 23, 13, 2, 1, 32, -100]])
	    ids_agree = torch.equal(got, want)
	    assert ids_agree, (
	        f"DPLM2 special-token normalization mismatch:\n"
	        f" got: {got.tolist()}\n"
	        f" expected: {want.tolist()}"
	    )
	    print("test_dplm2_special_token_normalization: OK")


	def test_model(name: str, model_cls, model_path: str, use_model_tokenizer: bool, batch_size: int) -> None:
	    """Run the embedding checks for one model.

	    Loads the model in bfloat16 on CUDA when available (otherwise CPU), then:

	    1. NaN check: embeds eight random sequences with ``batch_size`` and
	       asserts that no returned embedding contains a NaN.
	    2. Match check (only when ``use_model_tokenizer`` is true): casts the
	       model to float32, embeds the same sequences with ``batch_size`` and
	       with batch size 1, and asserts that the two results agree.

	    Args:
	        name: Display name used in log lines and failure messages.
	        model_cls: Model class exposing ``from_pretrained``.
	        model_path: Identifier handed to ``from_pretrained``.
	        use_model_tokenizer: True wraps ``model.tokenizer`` in
	            FixedLengthTokenizer and enables the match check; False passes
	            ``tokenizer=None`` and uses fixed-length sequences.
	        batch_size: Batch size for the NaN check and the batched match run.

	    Raises:
	        AssertionError: If a NaN is found or the batched and single-sequence
	            embeddings disagree.
	    """
	    print(f"\n--- {name} ({model_path}) ---")
	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	    loaded = model_cls.from_pretrained(
	        model_path,
	        dtype=torch.bfloat16,
	        device_map=device,
	        trust_remote_code=True,
	    )
	    model = loaded.eval()

	    if not use_model_tokenizer:
	        # E1 (sequence mode): there is no tokenizer to wrap, so the padding
	        # length is controlled by giving every sequence the same length,
	        # keeping max_seqlen_in_batch the same in every forward call.
	        tokenizer = None
	        sequences = random_sequences_fixed_len(n=8)
	    else:
	        # Variable-length sequences, all padded to MAX_EMBED_LEN by the
	        # wrapper, so batch=1 and batch=N see tensors of the same shape.
	        tokenizer = FixedLengthTokenizer(model.tokenizer)
	        sequences = random_sequences(n=8)

	    # Options shared by every embed_dataset call below.  full_embeddings=True
	    # returns only real (non-pad) token rows via emb[mask.bool()].
	    shared_kwargs = {
	        "sequences": sequences,
	        "tokenizer": tokenizer,
	        "full_embeddings": True,
	        "save": False,
	    }

	    # --- NaN check ----------------------------------------------------------
	    # Run in bfloat16 to mirror the real-world user scenario: batch_size > 1
	    # with padding present must not produce NaN in real-token rows.
	    nan_embs = model.embed_dataset(
	        batch_size=batch_size,
	        embed_dtype=torch.bfloat16,
	        **shared_kwargs,
	    )
	    assert_no_nan(nan_embs, f"{name} NaN check batch_size={batch_size}")
	    first_few = list(nan_embs.values())[:3]
	    shapes = [tuple(e.shape) for e in first_few]
	    print(f" NaN check batch_size={batch_size}: OK sample shapes={shapes}")

	    # --- Match check (tokenizer / SDPA models only) -------------------------
	    # Rationale kept from the original notes: the NaN fix only touches SDPA
	    # backends; E1 uses flash varlen, which unpads by construction and is not
	    # bit-deterministic across batch sizes (different numbers of packed query
	    # blocks -> different online-softmax accumulation order), so a tight
	    # match test for E1 is not meaningful.
	    if not use_model_tokenizer:
	        return

	    # SDPA models are compared in float32: bfloat16 CUBLAS picks different
	    # mat-mul algorithms for batch=1 vs batch=N (simple vs batched GEMM),
	    # giving 1-ULP differences, whereas float32 differences are < 1e-3.
	    model.to(torch.float32)

	    def _embed_fp32(bs: int):
	        return model.embed_dataset(
	            batch_size=bs,
	            embed_dtype=torch.float32,
	            **shared_kwargs,
	        )

	    batch_embs = _embed_fp32(batch_size)
	    single_embs = _embed_fp32(1)
	    assert_no_nan(batch_embs, f"{name} match test batch_size={batch_size}")
	    assert_no_nan(single_embs, f"{name} match test batch_size=1")
	    assert_embeddings_match(batch_embs, single_embs, name)
	    print(f" Match test batch_size={batch_size} vs 1: OK (non-pad tokens only)")


	if __name__ == "__main__":
	    # Script entry point: parse options, run the model-independent unit
	    # tests, then run test_model for every requested model.
	    import argparse

	    # Index the configs by display name once; the index supplies the default
	    # model list, validates user input and drives the lookup loop below.
	    configs_by_name = {cfg[0]: cfg for cfg in MODEL_CONFIGS}

	    parser = argparse.ArgumentParser(description="Test embed_dataset produces no NaN with batch_size > 1.")
	    parser.add_argument("--models", nargs="+", default=list(configs_by_name))
	    parser.add_argument("--batch_size", type=int, default=DEFAULT_BATCH_SIZE)
	    args = parser.parse_args()

	    # Validate before any test runs.  parser.error prints the usage message
	    # and exits with status 2; unlike an assert it is not stripped by -O.
	    unknown = [model_name for model_name in args.models if model_name not in configs_by_name]
	    if unknown:
	        parser.error(f"Unknown model(s) {unknown}. Choose from {sorted(configs_by_name)}")
	    if args.batch_size < 1:
	        parser.error(f"--batch_size must be >= 1, got {args.batch_size}")

	    # Seed Python's RNG so the generated sequences are reproducible.
	    random.seed(SEED)
	    test_parse_fasta()
	    test_dplm2_multimodal_layout_guard()
	    test_dplm2_special_token_normalization()

	    for model_name in args.models:
	        name, model_cls, model_path, use_model_tokenizer = configs_by_name[model_name]
	        test_model(name, model_cls, model_path, use_model_tokenizer, args.batch_size)

	    print("\nAll tests passed!")