nikraf's picture
Upload folder using huggingface_hub
714cf46 verified
"""Create a mapping from structure and chain ID to MSA indices."""
import argparse
import hashlib
import json
import pickle
import subprocess
from pathlib import Path
import pandas as pd
from Bio import SeqIO
def hash_sequence(seq: str) -> str:
    """Return the hexadecimal SHA-256 digest of a sequence string."""
    # Feed the encoded sequence to an incremental hasher.
    hasher = hashlib.sha256()
    hasher.update(seq.encode())
    return hasher.hexdigest()
def main(args: argparse.Namespace) -> None:
    """Cluster the input sequences and save the clustering as JSON.

    The fasta file at ``args.sequences`` is split into nucleotide, short and
    protein sequences. Proteins are clustered with ``mmseqs easy-cluster``
    (``--min-seq-id 0.4``); every short sequence, every nucleotide sequence
    and every ligand code read from the pickle at ``args.ccd`` is assigned
    to a cluster of its own. The mapping from sequence hash (or ligand code)
    to cluster id is written to ``<outdir>/clustering.json``.

    Args:
        args: Parsed command-line arguments providing ``sequences``, ``ccd``,
            ``outdir`` and ``mmseqs``.

    Raises:
        subprocess.CalledProcessError: If mmseqs exits with a non-zero status.

    """
    # Set output directory
    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    # Load all fasta records
    with Path(args.sequences).open("r") as f:
        data = list(SeqIO.parse(f, "fasta"))

    proteins = set()
    shorts = set()
    nucleotides = set()

    # Separate the sequences into proteins, nucleotides and short sequences
    # Short sequences cause a bug in the clustering, so they are separated
    for record in data:
        # Normalize once so the classification sees exactly what is stored
        sequence = str(record.seq).strip()
        if set(sequence).issubset({"A", "C", "G", "T", "U", "N"}):
            nucleotides.add(sequence)
        elif len(sequence) < 10:  # noqa: PLR2004
            shorts.add(sequence)
        else:
            proteins.add(sequence)

    # Write the unique proteins to a fasta file, using the hash as record id.
    # Sorted so the file content does not depend on set iteration order.
    fasta_path = outdir / "proteins.fasta"
    fasta_entries = [f">{hash_sequence(seq)}\n{seq}" for seq in sorted(proteins)]
    with fasta_path.open("w") as f:
        f.write("\n".join(fasta_entries))

    # Run mmseqs on the protein data. The command is passed as an argument
    # list (no shell) so paths containing spaces or shell metacharacters
    # are handed to mmseqs verbatim.
    subprocess.run(  # noqa: S603
        [
            args.mmseqs,
            "easy-cluster",
            str(fasta_path),
            str(outdir / "clust_prot"),
            str(outdir / "tmp"),
            "--min-seq-id",
            "0.4",
        ],
        check=True,
    )

    # Load protein clusters: column 0 is the cluster id, column 1 the member.
    # Read everything as strings so the hash ids are never type-inferred.
    clustering_path = outdir / "clust_prot_cluster.tsv"
    protein_data = pd.read_csv(clustering_path, sep="\t", header=None, dtype=str)
    clusters = protein_data[0]
    items = protein_data[1]
    clustering = dict(zip(items, clusters))

    # Each short sequence is given an id
    for short in shorts:
        short_id = hash_sequence(short)
        clustering[short_id] = short_id

    # Each unique nucleotide sequence is given an id
    for nucl in nucleotides:
        nucl_id = hash_sequence(nucl)
        clustering[nucl_id] = nucl_id

    # Load ligand data
    # NOTE: unpickling can execute arbitrary code; only pass trusted files.
    with Path(args.ccd).open("rb") as handle:
        ligand_data = pickle.load(handle)  # noqa: S301

    # Each unique ligand CCD is given an id
    for ccd_code in ligand_data:
        clustering[ccd_code] = ccd_code

    # Save clustering
    with (outdir / "clustering.json").open("w") as handle:
        json.dump(clustering, handle)
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and run the clustering.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--sequences",
        type=str,
        help="Path to the input fasta file with the sequences to cluster.",
        required=True,
    )
    # The file is read with pickle and iterated as ligand CCD codes,
    # so it is not an RNA fasta file.
    parser.add_argument(
        "--ccd",
        type=str,
        help="Path to the pickled ligand (CCD) data.",
        required=True,
    )
    parser.add_argument(
        "--outdir",
        type=str,
        help="Output directory.",
        required=True,
    )
    parser.add_argument(
        "--mmseqs",
        type=str,
        help="Path to mmseqs program.",
        default="mmseqs",
    )
    args = parser.parse_args()
    main(args)