File size: 3,168 Bytes
714cf46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
"""Create a mapping from structure and chain ID to MSA indices."""

import argparse
import hashlib
import json
import pickle
import subprocess
from pathlib import Path

import pandas as pd
from Bio import SeqIO


def hash_sequence(seq: str) -> str:
    """Return the hexadecimal SHA-256 digest of ``seq``.

    The string is encoded with Python's default ``str.encode()`` encoding
    (UTF-8) before hashing, so equal sequences always yield equal IDs.
    """
    hasher = hashlib.sha256()
    hasher.update(seq.encode())
    return hasher.hexdigest()


def main(args: argparse.Namespace) -> None:
    """Cluster the input sequences and ligands and write ``clustering.json``.

    Sequences read from ``args.sequences`` are deduplicated and split into
    three groups: nucleotide sequences (only ``ACGTUN`` characters), short
    sequences (fewer than 10 residues) and proteins (everything else).
    Proteins are clustered with ``mmseqs easy-cluster`` using
    ``--min-seq-id 0.4``; every short sequence, nucleotide sequence and
    ligand entry becomes its own singleton cluster. The resulting JSON maps
    each entity ID (SHA-256 hash of the sequence, or the ligand key) to its
    cluster ID.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``sequences`` (path to a FASTA file), ``ccd`` (path to
        a pickle file whose keys are used as ligand IDs), ``outdir`` (output
        directory, created if missing) and ``mmseqs`` (path to the MMseqs2
        executable).

    Raises
    ------
    subprocess.CalledProcessError
        If the mmseqs run exits with a non-zero status.

    """
    # Set output directory
    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    nucleotide_alphabet = frozenset("ACGTUN")
    min_protein_length = 10

    proteins = set()
    shorts = set()
    nucleotides = set()

    # Separate the sequences into proteins, nucleotides and short sequences.
    # Short sequences cause a bug in the clustering, so they are separated.
    with Path(args.sequences).open("r") as f:
        for record in SeqIO.parse(f, "fasta"):
            # Strip once so classification and storage see the same string.
            sequence = str(record.seq).strip()
            if nucleotide_alphabet.issuperset(sequence):
                nucleotides.add(sequence)
            elif len(sequence) < min_protein_length:
                shorts.add(sequence)
            else:
                proteins.add(sequence)

    # Write the protein FASTA. Sets iterate in a hash-randomised order, so
    # sort to keep the mmseqs input (and thus the output) reproducible.
    fasta_path = outdir / "proteins.fasta"
    fasta_entries = [f">{hash_sequence(seq)}\n{seq}" for seq in sorted(proteins)]
    with fasta_path.open("w") as f:
        f.write("\n".join(fasta_entries))

    clustering = {}
    if fasta_entries:
        # Run mmseqs on the protein data. An argument list (no shell) keeps
        # paths containing spaces or shell metacharacters intact.
        subprocess.run(  # noqa: S603
            [
                str(args.mmseqs),
                "easy-cluster",
                str(fasta_path),
                str(outdir / "clust_prot"),
                str(outdir / "tmp"),
                "--min-seq-id",
                "0.4",
            ],
            check=True,
        )

        # Load protein clusters: column 0 holds the cluster ID, column 1 the
        # member ID. Read as strings so hash IDs are never type-coerced.
        protein_data = pd.read_csv(
            outdir / "clust_prot_cluster.tsv",
            sep="\t",
            header=None,
            dtype=str,
        )
        clustering = dict(zip(protein_data[1], protein_data[0]))

    # Each short sequence is given its own cluster id
    for short in sorted(shorts):
        short_id = hash_sequence(short)
        clustering[short_id] = short_id

    # Each unique nucleotide sequence is given its own cluster id
    for nucl in sorted(nucleotides):
        nucl_id = hash_sequence(nucl)
        clustering[nucl_id] = nucl_id

    # Load ligand data.
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use this with a trusted CCD pickle.
    with Path(args.ccd).open("rb") as handle:
        ligand_data = pickle.load(handle)  # noqa: S301

    # Each unique ligand CCD is given an id.
    # Presumably ligand_data is a mapping keyed by CCD code; iterating it
    # yields those keys. Verify against the producer of the pickle.
    for ccd_code in ligand_data:
        clustering[ccd_code] = ccd_code

    # Save clustering
    with (outdir / "clustering.json").open("w") as handle:
        json.dump(clustering, handle)


if __name__ == "__main__":
    # Command-line entry point: collect the input/output paths and the
    # mmseqs executable location, then run the clustering.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--sequences",
        type=str,
        help="Path to the input FASTA file with all sequences to cluster.",
        required=True,
    )
    parser.add_argument(
        "--ccd",
        type=str,
        help="Path to the pickled CCD ligand data.",
        required=True,
    )
    parser.add_argument(
        "--outdir",
        type=str,
        help="Output directory.",
        required=True,
    )
    parser.add_argument(
        "--mmseqs",
        type=str,
        help="Path to mmseqs program.",
        default="mmseqs",
    )
    args = parser.parse_args()
    main(args)