nikraf
/

directionality_probe

Feature Extraction

Model card Files Files and versions

directionality_probe / protify /base_models /esm2.py

nikraf's picture

Upload folder using huggingface_hub

714cf46 verified 17 days ago

history blame contribute delete

3.69 kB

	"""
	We use the FastESM2 implementation of ESM2.
	"""
	import sys
	import os
	import torch
	import torch.nn as nn
	from typing import Optional, Union, List, Dict

	_FASTPLMS = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'FastPLMs')
	if _FASTPLMS not in sys.path:
	sys.path.insert(0, _FASTPLMS)

	from esm2.modeling_fastesm import (
	FastEsmModel,
	FastEsmForMaskedLM,
	FastEsmForSequenceClassification,
	FastEsmForTokenClassification,
	)
	from transformers import EsmTokenizer
	from .base_tokenizer import BaseSequenceTokenizer


	presets = {
	'ESM2-8': 'Synthyra/ESM2-8M',
	'ESM2-35': 'Synthyra/ESM2-35M',
	'ESM2-150': 'Synthyra/ESM2-150M',
	'ESM2-650': 'Synthyra/ESM2-650M',
	'ESM2-3B': 'Synthyra/ESM2-3B',
	'DSM-150': 'GleghornLab/DSM_150',
	'DSM-650': 'GleghornLab/DSM_650',
	'DSM-PPI': 'Synthyra/DSM_ppi_full',
	}


	class ESM2TokenizerWrapper(BaseSequenceTokenizer):
	def __init__(self, tokenizer: EsmTokenizer):
	super().__init__(tokenizer)

	def __call__(self, sequences: Union[str, List[str]], **kwargs) -> Dict[str, torch.Tensor]:
	if isinstance(sequences, str):
	sequences = [sequences]
	kwargs.setdefault('return_tensors', 'pt')
	kwargs.setdefault('padding', 'longest')
	kwargs.setdefault('add_special_tokens', True)
	tokenized = self.tokenizer(sequences, **kwargs)
	return tokenized


	class FastEsmForEmbedding(nn.Module):
	def __init__(self, model_path: str, dtype: torch.dtype = None):
	super().__init__()
	self.esm = FastEsmModel.from_pretrained(model_path, dtype=dtype)

	def forward(
	self,
	input_ids: torch.Tensor,
	attention_mask: Optional[torch.Tensor] = None,
	output_attentions: Optional[bool] = None,
	output_hidden_states: Optional[bool] = False,
	**kwargs,
	) -> torch.Tensor:
	if output_attentions:
	out = self.esm(input_ids=input_ids, attention_mask=attention_mask, output_attentions=output_attentions)
	return out.last_hidden_state, out.attentions
	else:
	return self.esm(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state


	def get_esm2_tokenizer(preset: str, model_path: str = None):
	return ESM2TokenizerWrapper(EsmTokenizer.from_pretrained('facebook/esm2_t6_8M_UR50D'))


	def build_esm2_model(preset: str, masked_lm: bool = False, dtype: torch.dtype = None, model_path: str = None, **kwargs):
	path = model_path or presets[preset]
	if masked_lm:
	model = FastEsmForMaskedLM.from_pretrained(path, dtype=dtype).eval()
	else:
	model = FastEsmForEmbedding(path, dtype=dtype).eval()
	tokenizer = get_esm2_tokenizer(preset)
	return model, tokenizer


	def get_esm2_for_training(preset: str, tokenwise: bool = False, num_labels: int = None, hybrid: bool = False, dtype: torch.dtype = None, model_path: str = None):
	model_path = model_path or presets[preset]
	if hybrid:
	model = FastEsmModel.from_pretrained(model_path, dtype=dtype).eval()
	else:
	if tokenwise:
	model = FastEsmForTokenClassification.from_pretrained(model_path, num_labels=num_labels, dtype=dtype).eval()
	else:
	model = FastEsmForSequenceClassification.from_pretrained(model_path, num_labels=num_labels, dtype=dtype).eval()
	tokenizer = get_esm2_tokenizer(preset)
	return model, tokenizer


	if __name__ == '__main__':
	# py -m src.protify.base_models.esm2
	model, tokenizer = build_esm2_model('ESM2-8')
	print(model)
	print(tokenizer)
	print(tokenizer('MEKVQYLTRSAIRRASTIEMPQQARQKLQNLFINFCLILICBBOLLICIIVMLL'))