nikraf
/

directionality_probe

Feature Extraction

Model card Files Files and versions

directionality_probe / protify /FastPLMs /boltz /scripts /process /cluster.py

nikraf's picture

Upload folder using huggingface_hub

714cf46 verified 4 days ago

history blame contribute delete

3.17 kB

	"""Cluster sequences with mmseqs and map every sequence and ligand to a cluster id."""

	import argparse
	import hashlib
	import json
	import pickle
	import subprocess
	from pathlib import Path

	import pandas as pd
	from Bio import SeqIO


	def hash_sequence(seq: str) -> str:
	    """Return the hexadecimal SHA-256 digest of a sequence string."""
	    hasher = hashlib.sha256()
	    hasher.update(seq.encode())
	    return hasher.hexdigest()


	def main(args: argparse.Namespace) -> None:
	    """Cluster the input sequences and write the result to ``clustering.json``.

	    Sequences read from ``args.sequences`` are split into nucleotides, short
	    sequences and proteins. Proteins are clustered with ``mmseqs easy-cluster``
	    (``--min-seq-id 0.4``); every short sequence, nucleotide sequence and
	    ligand code is mapped to itself, i.e. forms its own cluster. Sequence ids
	    are the SHA-256 hash of the sequence, ligand ids are the codes themselves.

	    Parameters
	    ----------
	    args : argparse.Namespace
	        Parsed command line arguments. Reads ``sequences`` (FASTA path),
	        ``ccd`` (path to a pickle whose top-level object iterates over
	        ligand codes), ``outdir`` (output directory, created if missing)
	        and ``mmseqs`` (path to the mmseqs executable).

	    Raises
	    ------
	    subprocess.CalledProcessError
	        If mmseqs exits with a non-zero status.

	    """
	    # Set output directory
	    outdir = Path(args.outdir)
	    outdir.mkdir(parents=True, exist_ok=True)

	    nucleotide_alphabet = frozenset("ACGTUN")
	    min_protein_length = 10

	    proteins = set()
	    shorts = set()
	    nucleotides = set()

	    # Separate the sequences into proteins, nucleotides and short sequences
	    # Short sequences cause a bug in the clustering, so they are separated
	    with Path(args.sequences).open("r") as f:
	        for record in SeqIO.parse(f, "fasta"):
	            # Strip once so classification and storage see the same string
	            sequence = str(record.seq).strip()
	            if set(sequence) <= nucleotide_alphabet:
	                nucleotides.add(sequence)
	            elif len(sequence) < min_protein_length:
	                shorts.add(sequence)
	            else:
	                proteins.add(sequence)

	    # Write the protein data; sorted so the file does not depend on the
	    # (per-run randomized) iteration order of a set of strings
	    fasta_path = outdir / "proteins.fasta"
	    with fasta_path.open("w") as f:
	        f.writelines(
	            f">{hash_sequence(seq)}\n{seq}\n" for seq in sorted(proteins)
	        )

	    clustering = {}
	    if proteins:
	        # Run mmseqs on the protein data. The command is passed as an
	        # argument list without a shell, so paths containing spaces or
	        # shell metacharacters are handed over verbatim.
	        subprocess.run(  # noqa: S603
	            [
	                args.mmseqs,
	                "easy-cluster",
	                str(fasta_path),
	                str(outdir / "clust_prot"),
	                str(outdir / "tmp"),
	                "--min-seq-id",
	                "0.4",
	            ],
	            check=True,
	        )

	        # Load protein clusters: column 0 is the cluster id, column 1 the
	        # member id. dtype=str keeps hash ids from being parsed as numbers.
	        clustering_path = outdir / "clust_prot_cluster.tsv"
	        protein_data = pd.read_csv(
	            clustering_path,
	            sep="\t",
	            header=None,
	            dtype=str,
	        )
	        clustering = dict(zip(protein_data[1], protein_data[0]))

	    # Each short sequence and each unique nucleotide sequence is given
	    # its own id
	    for seq in (*sorted(shorts), *sorted(nucleotides)):
	        seq_id = hash_sequence(seq)
	        clustering[seq_id] = seq_id

	    # Load ligand data
	    # NOTE(review): pickle.load executes arbitrary code from the file; only
	    # pass a --ccd file that comes from a trusted source.
	    with Path(args.ccd).open("rb") as handle:
	        ligand_data = pickle.load(handle)  # noqa: S301

	    # Each unique ligand CCD is given an id
	    for ccd_code in ligand_data:
	        clustering[ccd_code] = ccd_code

	    # Save clustering
	    with (outdir / "clustering.json").open("w") as handle:
	        json.dump(clustering, handle)


	# Script entry point: parse the command line and run the clustering.
	if __name__ == "__main__":
	    parser = argparse.ArgumentParser()
	    parser.add_argument(
	        "--sequences",
	        type=str,
	        help="Path to the input FASTA file with the sequences to cluster.",
	        required=True,
	    )
	    # Opened in binary mode and unpickled by main(); it is not a FASTA file.
	    parser.add_argument(
	        "--ccd",
	        type=str,
	        help="Path to the pickled CCD ligand data.",
	        required=True,
	    )
	    parser.add_argument(
	        "--outdir",
	        type=str,
	        help="Output directory.",
	        required=True,
	    )
	    parser.add_argument(
	        "--mmseqs",
	        type=str,
	        help="Path to mmseqs program.",
	        default="mmseqs",
	    )
	    args = parser.parse_args()
	    main(args)