|
|
import os |
|
|
import re |
|
|
import gzip |
|
|
import time |
|
|
import shutil |
|
|
import argparse |
|
|
from copy import deepcopy |
|
|
from tempfile import NamedTemporaryFile |
|
|
import multiprocessing as mp |
|
|
|
|
|
import numpy as np |
|
|
from Bio.PDB import PDBParser,Chain,Model,Structure, PDBIO |
|
|
from Bio.PDB.DSSP import dssp_dict_from_pdb_file |
|
|
from Bio.SeqUtils.ProtParam import ProteinAnalysis |
|
|
from rdkit.Chem.rdMolDescriptors import CalcTPSA |
|
|
from freesasa import calcBioPDB |
|
|
from rdkit.Chem import MolFromSmiles |
|
|
|
|
|
from globals import CACHE_DIR, CONTACT_DIST |
|
|
from utils.logger import print_log |
|
|
from utils.file_utils import cnt_num_files, get_filename |
|
|
from data.mmap_dataset import create_mmap |
|
|
from data.converter.pdb_to_list_blocks import pdb_to_list_blocks |
|
|
from data.converter.blocks_interface import blocks_cb_interface, blocks_interface |
|
|
|
|
|
from .pepbind import clustering |
|
|
|
|
|
|
|
|
def parse():
    """Build and parse the command-line arguments for this script."""
    arg_parser = argparse.ArgumentParser(description='Filter peptide-like loop from monomers')
    arg_parser.add_argument('--database_dir', type=str, required=True,
                            help='Directory of pdb database processed in monomers')
    arg_parser.add_argument('--pdb_dir', type=str, required=True, help='Directory to PDB database')
    arg_parser.add_argument('--out_dir', type=str, required=True, help='Output directory')
    arg_parser.add_argument('--pocket_th', type=float, default=10.0, help='Threshold for determining pocket')
    arg_parser.add_argument('--n_cpu', type=int, default=4, help='Number of CPU to use')
    return arg_parser.parse_args()
|
|
|
|
|
|
|
|
|
|
|
# Three-letter -> one-letter codes for the 20 standard amino acids.
# Residues absent from this table are treated as non-standard by
# Filter.filter_chain and cause the whole chain to be rejected.
AA3TO1 = {
    'ALA':'A', 'VAL':'V', 'PHE':'F', 'PRO':'P', 'MET':'M',
    'ILE':'I', 'LEU':'L', 'ASP':'D', 'GLU':'E', 'LYS':'K',
    'ARG':'R', 'SER':'S', 'THR':'T', 'TYR':'Y', 'HIS':'H',
    'CYS':'C', 'ASN':'N', 'GLN':'Q', 'TRP':'W', 'GLY':'G',}

# One-letter codes counted when computing the hydrophobic / charged
# composition ratios of a candidate loop in Filter.filter_chain.
hydrophobic_residues=['V','I','L','M','F','W','C']
charged_residues=['H','R','K','D','E']
|
|
|
|
|
def add_cb(input_array):
    """Reconstruct an idealized virtual CB coordinate from backbone atoms.

    Args:
        input_array: four 3D coordinates ordered (N, CA, C, O). The O
            coordinate is accepted for caller convenience but not used.

    Returns:
        np.ndarray: the virtual CB position, rounded to 3 decimals.
    """
    n_coord, ca_coord, c_coord, _o_coord = input_array
    ca_minus_n = ca_coord - n_coord
    c_minus_ca = c_coord - ca_coord
    normal = np.cross(ca_minus_n, c_minus_ca)
    # Fixed linear combination of the local backbone frame — standard
    # virtual-CB reconstruction coefficients.
    cb = -0.58273431 * normal + 0.56802827 * ca_minus_n - 0.54067466 * c_minus_ca + ca_coord
    return np.around(cb, 3)
|
|
|
|
|
# SMILES strings for the 20 standard amino acids (free amino-acid form),
# keyed by one-letter code. NOTE(review): this table is not referenced in
# this file's visible code — presumably consumed elsewhere (e.g. with the
# rdkit imports above); confirm before removing.
aaSMILES = {'G': 'NCC(=O)O',
            'A': 'N[C@@]([H])(C)C(=O)O',
            'R': 'N[C@@]([H])(CCCNC(=N)N)C(=O)O',
            'N': 'N[C@@]([H])(CC(=O)N)C(=O)O',
            'D': 'N[C@@]([H])(CC(=O)O)C(=O)O',
            'C': 'N[C@@]([H])(CS)C(=O)O',
            'E': 'N[C@@]([H])(CCC(=O)O)C(=O)O',
            'Q': 'N[C@@]([H])(CCC(=O)N)C(=O)O',
            'H': 'N[C@@]([H])(CC1=CN=C-N1)C(=O)O',
            'I': 'N[C@@]([H])(C(CC)C)C(=O)O',
            'L': 'N[C@@]([H])(CC(C)C)C(=O)O',
            'K': 'N[C@@]([H])(CCCCN)C(=O)O',
            'M': 'N[C@@]([H])(CCSC)C(=O)O',
            'F': 'N[C@@]([H])(Cc1ccccc1)C(=O)O',
            'P': 'N1[C@@]([H])(CCC1)C(=O)O',
            'S': 'N[C@@]([H])(CO)C(=O)O',
            'T': 'N[C@@]([H])(C(O)C)C(=O)O',
            'W': 'N[C@@]([H])(CC(=CN2)C1=C2C=CC=C1)C(=O)O',
            'Y': 'N[C@@]([H])(Cc1ccc(O)cc1)C(=O)O',
            'V': 'N[C@@]([H])(C(C)C)C(=O)O'}
|
|
|
|
|
|
|
|
class Filter:
    """Screen monomer chains for peptide-like loop spans.

    Every candidate span (i, j) of a chain is screened with geometric,
    sequence-composition, stability and buried-surface-area (BSA) criteria.
    Spans that pass are returned as candidate peptide "ligands" together
    with their surrounding "receptor" residues.
    """

    def __init__(
        self,
        min_loop_len = 4,          # minimal span length (residues)
        max_loop_len = 25,         # maximal span length (residues)
        min_BSA = 400,             # minimal absolute BSA to accept a span
        min_relBSA = 0.2,          # minimal BSA relative to unbound SASA
        max_relncBSA = 0.3,        # max relative terminal BSA (cyclic mode only)
        saved_maxlen = 25,         # the saved_* thresholds control which
        saved_BSA = 400,           #   accepted spans get a deep-copied
        saved_relBSA = 0.2,        #   structure attached to the result
        saved_helix_ratio = 1.0,
        saved_strand_ratio = 1.0,
        cyclic=False               # if True, require CB(i)-CB(j) in [3.5, 5] A
    ) -> None:
        # Reject spans containing D-[G/P/S] dipeptides, poly-proline runs,
        # or any cysteine.
        self.re_filter = re.compile(r'D[GPS]|[P]{2,}|C')
        self.cache_dir = CACHE_DIR

        self.min_loop_len = min_loop_len
        self.max_loop_len = max_loop_len
        self.min_BSA = min_BSA
        self.min_relBSA = min_relBSA
        self.max_relncBSA = max_relncBSA
        self.saved_maxlen = saved_maxlen
        self.saved_BSA = saved_BSA
        self.saved_relBSA = saved_relBSA
        self.saved_helix_ratio = saved_helix_ratio
        self.saved_strand_ratio = saved_strand_ratio
        self.cyclic = cyclic

    @classmethod
    def get_ss_info(cls, pdb_path: str):
        """Run DSSP (``mkdssp``) on *pdb_path* and coarse-grain the result.

        Returns:
            dict: chain id -> list of per-residue labels in DSSP order,
            'a' for helix (H/G/I), 'b' for strand/turn/bend (B/E/T/S),
            'c' for coil ('-').

        Raises:
            ValueError: if DSSP emits an unrecognized SS code.
        """
        dssp, keys = dssp_dict_from_pdb_file(pdb_path, DSSP='mkdssp')
        ss_info = {}
        for key in keys:
            chain_id, value = key[0], dssp[key]
            if chain_id not in ss_info:
                ss_info[chain_id] = []
            ss_type = value[1]
            if ss_type in ['H', 'G', 'I']:
                ss_info[chain_id].append('a')
            elif ss_type in ['B', 'E', 'T', 'S']:
                ss_info[chain_id].append('b')
            elif ss_type == '-':
                ss_info[chain_id].append('c')
            else:
                raise ValueError(f'SS type {ss_type} cannot be recognized!')
        return ss_info

    @classmethod
    def get_bsa(self, receptor_chain: Chain.Chain, ligand_chain: Chain.Chain):
        """Compute the ligand chain's surface area buried upon binding.

        NOTE(review): decorated ``@classmethod`` but the first parameter is
        named ``self`` — it actually receives the class object; harmless but
        inconsistent with ``get_ss_info``.

        Returns:
            tuple: (abs_bsa, rel_bsa, rel_nc_bsa, tmp_structure). abs_bsa and
            rel_bsa exclude the two terminal residues; rel_nc_bsa is the
            relative BSA of the termini only. tmp_structure is the assembled
            receptor+ligand complex (note: both input chains are attached to
            it as a side effect).
        """
        lig_chain_id = ligand_chain.get_id()
        tmp_structure = Structure.Structure('tmp')
        tmp_model = Model.Model(0)
        tmp_structure.add(tmp_model)
        tmp_model.add(ligand_chain)
        # Per-residue SASA of the ligand alone (unbound state).
        unbounded_SASA = calcBioPDB(tmp_structure)[0].residueAreas()[lig_chain_id]
        unbounded_SASA = [k.total for k in unbounded_SASA.values()]

        # Add the receptor and recompute: bound-state SASA of the ligand.
        tmp_model.add(receptor_chain)
        bounded_SASA = calcBioPDB(tmp_structure)[0].residueAreas()[lig_chain_id]
        bounded_SASA = [k.total for k in bounded_SASA.values()]

        # Terminal residues are excluded from the main BSA statistics and
        # reported separately through rel_nc_bsa.
        abs_bsa = sum(unbounded_SASA[1:-1]) - sum(bounded_SASA[1:-1])
        rel_bsa = abs_bsa / sum(unbounded_SASA[1:-1])
        rel_nc_bsa = (unbounded_SASA[0] + unbounded_SASA[-1] - bounded_SASA[0] - bounded_SASA[-1]) / (unbounded_SASA[0] + unbounded_SASA[-1])

        return abs_bsa, rel_bsa, rel_nc_bsa, tmp_structure

    def filter_pdb(self, pdb_path, selected_chains=None):
        """Filter every (selected) chain of a PDB file for peptide-like loops.

        Args:
            pdb_path: path to a decompressed PDB file.
            selected_chains: optional container of chain ids; other chains
                are skipped.

        Returns:
            list: concatenated ``filter_chain`` results over the chains of
            the first model.
        """
        parser = PDBParser(QUIET=True)
        ss_info = self.get_ss_info(pdb_path)
        structure = parser.get_structure('anonym', pdb_path)

        # Only the first model is considered.
        for model in structure.get_models():
            structure = model
            break

        results = []
        for chain in structure.get_chains():
            if selected_chains is not None and chain.get_id() not in selected_chains:
                continue
            chain_ss_info = None if ss_info is None else ss_info[chain.get_id()]
            results.extend(self.filter_chain(chain, chain_ss_info))

        return results

    def filter_chain(self, chain, ss_info=None):
        """Scan one chain and return all loop spans passing every filter.

        Args:
            chain: a Bio.PDB chain of standard residues.
            ss_info: per-residue 'a'/'b'/'c' labels from ``get_ss_info``.

        Returns:
            list of tuples (i, j, length, abs_bsa, rel_bsa, helix_ratio,
            strand_ratio, coil_ratio, output_structure); output_structure is
            a deep copy of the receptor/ligand complex for spans meeting the
            ``saved_*`` thresholds, else None.
        """
        # Reject chains containing any non-standard residue.
        non_standard = False
        for res in chain:
            if res.get_resname() not in AA3TO1:
                non_standard = True
                break
        if non_standard:
            return []

        # The DSSP assignment must cover the chain one-to-one.
        # NOTE(review): despite the ``ss_info=None`` default, this line raises
        # TypeError when None is actually passed; the only visible caller
        # (filter_pdb) always provides a list — confirm before relying on the
        # default.
        if len(ss_info) != len(chain):
            return []

        # Collect CB coordinates (reconstructing virtual CBs when the atom is
        # missing, e.g. glycine) and the one-letter sequence.
        cb_coord = []
        seq = ''
        for res in chain:
            seq += AA3TO1[res.get_resname()]
            try:
                cb_coord.append(res['CB'].get_coord())
            except:
                tmp_coord = np.array([
                    res['N'].get_coord(),
                    res['CA'].get_coord(),
                    res['C'].get_coord(),
                    res['O'].get_coord()
                ])
                cb_coord.append(add_cb(tmp_coord))
        cb_coord = np.array(cb_coord)
        # Pairwise CB-CB distance matrix, shape (L, L).
        cb_contact = np.linalg.norm(cb_coord[None,:,:,] - cb_coord[:,None,:],axis=-1)
        if self.cyclic:
            # Span endpoints must be close enough for a disulfide-like closure.
            possible_ss = (cb_contact >= 3.5) & (cb_contact <= 5)
        else:
            possible_ss = np.ones(cb_contact.shape, dtype=bool)
        # Keep only pairs (i, j) with min_loop_len <= j - i + 1 <= max_loop_len.
        possible_ss = np.triu(np.tril(possible_ss, self.max_loop_len - 1), self.min_loop_len - 1)
        ss_pair = np.where(possible_ss)
        accepted, saved_spans = [], []
        for i, j in zip(ss_pair[0],ss_pair[1]):
            # Drop spans overlapping more than 40% with any accepted span.
            redundant = False
            for exist_i, exist_j in saved_spans:
                overlap = min(j, exist_j) - max(i, exist_i) + 1
                if overlap / (j - i + 1) > 0.4 or overlap / (exist_j - exist_i + 1) > 0.4:
                    redundant = True
                    break
            if redundant:
                continue

            # Receptor context: residues within 20 A of the span, excluding a
            # 5-residue sequence buffer on both sides of the span itself
            # (masked out by assigning a distance above the threshold).
            min_dist = np.min(cb_contact[i : j + 1], axis=0)
            min_dist[max(i - 5, 0):min(j + 6, len(seq))] = 21
            neighbors_20A = np.where(min_dist < 20)[0]
            if len(neighbors_20A) < 16:
                continue

            pep_seq = seq[i:j+1]

            # Sequence liabilities: D-[G/P/S], poly-Pro, or cysteine.
            if self.re_filter.search(pep_seq) is not None:
                continue
            prot_param=ProteinAnalysis(pep_seq)
            aa_percent = prot_param.get_amino_acids_percent()
            max_ratio = max(aa_percent.values())

            # No single residue type may make up more than 25% of the span.
            if max_ratio > 0.25:
                continue
            hydrophobic_ratio = sum([aa_percent[k] for k in hydrophobic_residues])

            if hydrophobic_ratio > 0.45:
                continue

            # Charged content must be moderate: (0.25, 0.45].
            charged_ratio = sum([aa_percent[k] for k in charged_residues])
            if charged_ratio > 0.45 or charged_ratio < 0.25:
                continue

            # Predicted instability (index >= 40 is conventionally unstable).
            if prot_param.instability_index() >= 40:
                continue

            # Assemble receptor ('R') and ligand ('L') chains from copies of
            # the original residues.
            receptor_chain = Chain.Chain('R')
            ligand_chain = Chain.Chain('L')
            for k,res in enumerate(chain):
                if k >= i and k <= j:
                    ligand_chain.add(res.copy())
                elif k in neighbors_20A:
                    receptor_chain.add(res.copy())

            abs_bsa, rel_bsa, rel_nc_bsa, tmp_structure = self.get_bsa(receptor_chain, ligand_chain)
            if abs_bsa <= self.min_BSA or rel_bsa <= self.min_relBSA or (self.cyclic and rel_nc_bsa >= self.max_relncBSA):
                continue

            # Secondary-structure composition of the span (-1 when no DSSP
            # information is available).
            length = j - i + 1
            if ss_info is None:
                helix_ratio = -1
                strand_ratio = -1
                coil_ratio = -1
            else:
                ssa = ss_info[i:j+1]
                helix_ratio = ssa.count('a') / length
                strand_ratio = ssa.count('b') / length
                coil_ratio = ssa.count('c') / length

            # Only spans meeting the saved_* thresholds carry a structure copy.
            if length <= self.saved_maxlen and abs_bsa >= self.saved_BSA and rel_bsa >= self.saved_relBSA and helix_ratio <= self.saved_helix_ratio and strand_ratio <= self.saved_strand_ratio:
                output_structure = deepcopy(tmp_structure)
            else:
                output_structure = None
            accepted.append((
                i , j, length, abs_bsa, rel_bsa, helix_ratio, strand_ratio, coil_ratio, output_structure
            ))
            saved_spans.append((i, j))

        return accepted
|
|
|
|
|
|
|
|
def get_non_redundant(mmap_dir):
    """Select one representative chain per sequence cluster.

    Reads ``index.txt`` under *mmap_dir* (tab-separated; id first, sequence
    last, id formatted as ``<chain>_<pdb_file>``), clusters the sequences at
    90% identity, samples one entry per cluster (fixed seed, reproducible),
    and returns a mapping from pdb file name to selected chain ids.

    Raises:
        ValueError: if the scratch directory ``<mmap_dir>/tmp`` already exists.
    """
    np.random.seed(12)
    index_path = os.path.join(mmap_dir, 'index.txt')
    parent_dir = mmap_dir

    # entry id -> (sequence, chain id, pdb file name)
    items = {}
    with open(index_path, 'r') as fin:
        for line in fin:
            fields = line.strip().split('\t')
            entry_id, seq = fields[0], fields[-1]
            chain, pdb_file = entry_id.split('_')
            items[entry_id] = (seq, chain, pdb_file)

    # Fresh scratch directory for the clustering run.
    tmp_dir = os.path.join(parent_dir, 'tmp')
    if os.path.exists(tmp_dir):
        raise ValueError(f'Working directory {tmp_dir} exists!')
    os.makedirs(tmp_dir)

    # Dump all sequences to FASTA and cluster at 90% identity.
    fasta = os.path.join(tmp_dir, 'seq.fasta')
    with open(fasta, 'w') as fout:
        for entry_id, record in items.items():
            fout.write(f'>{entry_id}\n{record[0]}\n')
    id2clu, clu2id = clustering(fasta, tmp_dir, 0.9)
    non_redundant = [np.random.choice(clu2id[clu]) for clu in clu2id]
    print_log(f'Non-redundant entries: {len(non_redundant)}')
    shutil.rmtree(tmp_dir)

    # Group the selected chains by their source pdb file.
    indexes = {}
    for entry_id in non_redundant:
        _, chain, pdb_file = items[entry_id]
        indexes.setdefault(pdb_file, []).append(chain)

    return indexes
|
|
|
|
|
|
|
|
def mp_worker(data_dir, tmp_dir, pdb_file, selected_chains, pep_filter, pdb_out_dir, queue):
    """Worker process: decompress one PDB, filter loops, save hit structures.

    Args:
        data_dir: root of the pdb database (two-letter category subdirs).
        tmp_dir: scratch directory for decompressed files.
        pdb_file: gzipped pdb file name (e.g. ``pdb1abc.ent.gz``).
        selected_chains: chain ids forwarded to ``Filter.filter_pdb``.
        pep_filter: a ``Filter`` instance.
        pdb_out_dir: directory where hit structures are written.
        queue: multiprocessing queue; always receives ``(pdb_file, files)``
            (even on failure) so the parent process can join this task.
    """
    # The database shards files by the middle two characters of the name.
    category = pdb_file[4:6]
    category_dir = os.path.join(data_dir, category)
    path = os.path.join(category_dir, pdb_file)
    tmp_file = os.path.join(tmp_dir, f'{pdb_file}.decompressed')
    pdb_id = get_filename(pdb_file.split('.')[0])

    # Decompress to a scratch file for DSSP / the PDB parser.
    with gzip.open(path, 'rb') as fin:
        with open(tmp_file, 'wb') as fout:
            shutil.copyfileobj(fin, fout)

    files = []
    try:
        results = pep_filter.filter_pdb(tmp_file, selected_chains=selected_chains)
        for item in results:
            i, j, struct = item[0], item[1], item[-1]
            if struct is None:  # accepted but below the "saved" thresholds
                continue
            io = PDBIO()
            io.set_structure(struct)
            _id = pdb_id + f'_{i}_{j}'
            save_path = os.path.join(pdb_out_dir, _id + '.pdb')
            io.save(save_path)
            files.append(save_path)
    except Exception as e:
        # Best-effort: one broken entry must not kill the worker, but do not
        # fail silently either (the original swallowed all exceptions).
        print_log(f'Failed to process {pdb_file}: {e}')
    finally:
        # Always report back so the parent never blocks waiting for us,
        # and always clean up the scratch file.
        queue.put((pdb_file, files))
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
|
|
|
|
|
|
|
|
def process_iterator(indexes, data_dir, tmp_dir, out_dir, pocket_th, n_cpu):
    """Yield mmap entries for peptide-loop complexes extracted in parallel.

    Spawns up to *n_cpu* ``mp_worker`` processes over the pdb files in
    *indexes* (pdb file name -> selected chain ids), keeps the pool full as
    results come back through a queue, and converts each saved complex into
    block data.

    Yields:
        (_id, (rec_data, lig_data), properties, processed_file_count)
    """
    pdb_out_dir = os.path.join(out_dir, 'pdbs')
    if not os.path.exists(pdb_out_dir):
        os.makedirs(pdb_out_dir)
    pep_filter = Filter()

    file_cnt, pointer, filenames = 0, 0, list(indexes.keys())
    id2task = {}
    queue = mp.Queue()

    def _spawn(task_id):
        # One process per pdb file; results come back through the queue.
        id2task[task_id] = mp.Process(
            target=mp_worker,
            args=(data_dir, tmp_dir, task_id, indexes[task_id], pep_filter, pdb_out_dir, queue)
        )
        id2task[task_id].start()

    # Fill the pool. Guard against having fewer files than workers
    # (the original `range(n_cpu)` would raise IndexError in that case).
    for _ in range(min(n_cpu, len(filenames))):
        _spawn(filenames[pointer])
        pointer += 1

    while len(id2task) > 0:
        # Poll for finished workers. BUG FIX: the original tested
        # `not queue.qsize` — `qsize` is a bound method and always truthy,
        # so the sleep branch was dead code and the loop blocked in get().
        if queue.empty():
            time.sleep(1)
            continue

        pdb_file, paths = queue.get()
        file_cnt += 1
        id2task[pdb_file].join()
        del id2task[pdb_file]

        # Refill the pool with the next pending file, if any.
        if pointer < len(filenames):
            _spawn(filenames[pointer])
            pointer += 1

        for save_path in paths:
            _id = get_filename(save_path)

            list_blocks, chains = pdb_to_list_blocks(save_path, return_chain_ids=True)
            # Ensure the receptor ('R') comes before the ligand ('L').
            if chains[0] == 'L':
                list_blocks, chains = (list_blocks[1], list_blocks[0]), (chains[1], chains[0])

            rec_blocks, lig_blocks = list_blocks
            rec_chain, lig_chain = chains
            try:
                _, (pocket_idx, _) = blocks_cb_interface(rec_blocks, lig_blocks, pocket_th)
            except KeyError:
                # Missing atoms prevent CB-based interface extraction.
                continue
            rec_num_units = sum([len(block) for block in rec_blocks])
            lig_num_units = sum([len(block) for block in lig_blocks])
            rec_data = [block.to_tuple() for block in rec_blocks]
            lig_data = [block.to_tuple() for block in lig_blocks]
            rec_seq = ''.join([AA3TO1[block.abrv] for block in rec_blocks])
            lig_seq = ''.join([AA3TO1[block.abrv] for block in lig_blocks])

            yield _id, (rec_data, lig_data), [
                len(rec_blocks), len(lig_blocks), rec_num_units, lig_num_units,
                rec_chain, lig_chain, rec_seq, lig_seq,
                ','.join([str(idx) for idx in pocket_idx]),
            ], file_cnt
|
|
|
|
|
|
|
|
def main(args):
    """Entry point: cluster the index, filter in parallel, write the dataset."""
    indexes = get_non_redundant(args.database_dir)
    cnt = len(indexes)

    # Scratch space for decompressed pdb files.
    tmp_dir = './tmp'
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    print_log(f'Processing data from directory: {args.pdb_dir}.')
    print_log(f'Number of entries: {cnt}')
    entry_iterator = process_iterator(
        indexes, args.pdb_dir, tmp_dir, args.out_dir, args.pocket_th, args.n_cpu
    )
    create_mmap(entry_iterator, args.out_dir, cnt)

    print_log('Finished!')

    shutil.rmtree(tmp_dir)
|
|
|
|
|
|
|
|
# Script entry point.
if __name__ == '__main__':
    main(parse())
|
|
|