"""Cluster sequences and write a mapping from sequence hash or CCD code to cluster ID."""
| |
|
| | import argparse |
| | import hashlib |
| | import json |
| | import pickle |
| | import subprocess |
| | from pathlib import Path |
| |
|
| | import pandas as pd |
| | from Bio import SeqIO |
| |
|
| |
|
def hash_sequence(seq: str) -> str:
    """Return the hexadecimal SHA-256 digest of a sequence string.

    Parameters
    ----------
    seq : str
        The sequence to hash; it is encoded with the default ``str.encode``
        codec (UTF-8) before hashing.

    Returns
    -------
    str
        The 64-character lowercase hex digest.

    """
    encoded = seq.encode()
    digest = hashlib.sha256(encoded)
    return digest.hexdigest()
| |
|
| |
|
def main(args: argparse.Namespace) -> None:
    """Cluster the input sequences and write the result to ``clustering.json``.

    Unique sequences from the FASTA file are split into three groups:

    * nucleotides: sequences made only of the letters A, C, G, T, U, N;
    * shorts: remaining sequences shorter than 10 characters;
    * proteins: everything else.

    Proteins are written to ``proteins.fasta`` (keyed by their SHA-256 hash)
    and clustered with ``mmseqs easy-cluster --min-seq-id 0.4``. Shorts,
    nucleotides and every entry of the pickled ligand data are each assigned
    to a cluster of their own. The combined ``{id: cluster_id}`` mapping is
    dumped as JSON into the output directory.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``sequences`` (FASTA path), ``ccd`` (pickle path),
        ``outdir`` (output directory, created if missing) and ``mmseqs``
        (mmseqs executable).

    Raises
    ------
    subprocess.CalledProcessError
        If mmseqs exits with a non-zero status.

    """
    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    # Split the unique sequences by type.
    proteins = set()
    shorts = set()
    nucleotides = set()
    nucleotide_alphabet = {"A", "C", "G", "T", "U", "N"}

    with Path(args.sequences).open("r") as f:
        for record in SeqIO.parse(f, "fasta"):
            # Strip once so that classification and storage see the same text.
            sequence = str(record.seq).strip()
            if set(sequence).issubset(nucleotide_alphabet):
                nucleotides.add(sequence)
            elif len(sequence) < 10:
                shorts.add(sequence)
            else:
                proteins.add(sequence)

    # Sets have no stable iteration order; sort so that the FASTA file (and
    # therefore the clustering input) is identical from run to run.
    fasta_path = outdir / "proteins.fasta"
    entries = [f">{hash_sequence(seq)}\n{seq}" for seq in sorted(proteins)]
    with fasta_path.open("w") as f:
        f.write("\n".join(entries))

    clustering = {}
    if entries:
        # Pass the arguments as a list (no shell) so that paths containing
        # spaces or shell metacharacters are handed to mmseqs verbatim.
        subprocess.run(
            [
                str(args.mmseqs),
                "easy-cluster",
                str(fasta_path),
                str(outdir / "clust_prot"),
                str(outdir / "tmp"),
                "--min-seq-id",
                "0.4",
            ],
            check=True,
        )

        # Two-column TSV: column 0 is used as the cluster ID and column 1 as
        # the member ID. Read both as strings so the hex hashes are never
        # reinterpreted by dtype inference.
        protein_data = pd.read_csv(
            outdir / "clust_prot_cluster.tsv",
            sep="\t",
            header=None,
            dtype=str,
        )
        clustering = dict(zip(protein_data[1], protein_data[0]))
    # With no protein sequences there is nothing to cluster, so mmseqs is
    # not invoked and the mapping starts out empty.

    # Each short sequence forms its own cluster.
    for short in sorted(shorts):
        short_id = hash_sequence(short)
        clustering[short_id] = short_id

    # Each nucleotide sequence forms its own cluster.
    for nucl in sorted(nucleotides):
        nucl_id = hash_sequence(nucl)
        clustering[nucl_id] = nucl_id

    # NOTE: pickle.load can execute arbitrary code while unpickling; only
    # point --ccd at a file from a trusted source.
    with Path(args.ccd).open("rb") as handle:
        ligand_data = pickle.load(handle)

    # Each ligand code forms its own cluster. Iterating the unpickled object
    # is assumed to yield the codes (e.g. dict keys) -- TODO confirm.
    for ccd_code in ligand_data:
        clustering[ccd_code] = ccd_code

    with (outdir / "clustering.json").open("w") as handle:
        json.dump(clustering, handle)
| |
|
| |
|
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and run the clustering.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--sequences",
        type=str,
        help="Path to the input FASTA file of sequences.",
        required=True,
    )
    parser.add_argument(
        "--ccd",
        type=str,
        # The file is opened in binary mode and unpickled by main(), so it is
        # a pickle file, not a FASTA file.
        help="Path to the pickled CCD ligand data.",
        required=True,
    )
    parser.add_argument(
        "--outdir",
        type=str,
        help="Output directory.",
        required=True,
    )
    parser.add_argument(
        "--mmseqs",
        type=str,
        help="Path to mmseqs program.",
        default="mmseqs",
    )
    args = parser.parse_args()
    main(args)
| |
|