import os
import re
import gzip
import time
import shutil
import argparse
from copy import deepcopy
from tempfile import NamedTemporaryFile
import multiprocessing as mp

import numpy as np
from Bio.PDB import PDBParser, Chain, Model, Structure, PDBIO
from Bio.PDB.DSSP import dssp_dict_from_pdb_file
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from rdkit.Chem import MolFromSmiles
from rdkit.Chem.rdMolDescriptors import CalcTPSA
from freesasa import calcBioPDB

from globals import CACHE_DIR, CONTACT_DIST
from utils.logger import print_log
from utils.file_utils import cnt_num_files, get_filename
from data.mmap_dataset import create_mmap
from data.converter.pdb_to_list_blocks import pdb_to_list_blocks
from data.converter.blocks_interface import blocks_cb_interface, blocks_interface
from .pepbind import clustering


def parse():
    parser = argparse.ArgumentParser(description='Filter peptide-like loops from monomers')
    parser.add_argument('--database_dir', type=str, required=True, help='Directory of the PDB database processed into monomers')
    parser.add_argument('--pdb_dir', type=str, required=True, help='Directory of the PDB database')
    parser.add_argument('--out_dir', type=str, required=True, help='Output directory')
    parser.add_argument('--pocket_th', type=float, default=10.0, help='Distance threshold for determining the pocket')
    parser.add_argument('--n_cpu', type=int, default=4, help='Number of CPUs to use')
    return parser.parse_args()


# Constants
AA3TO1 = {
    'ALA': 'A', 'VAL': 'V', 'PHE': 'F', 'PRO': 'P', 'MET': 'M',
    'ILE': 'I', 'LEU': 'L', 'ASP': 'D', 'GLU': 'E', 'LYS': 'K',
    'ARG': 'R', 'SER': 'S', 'THR': 'T', 'TYR': 'Y', 'HIS': 'H',
    'CYS': 'C', 'ASN': 'N', 'GLN': 'Q', 'TRP': 'W', 'GLY': 'G',
}
hydrophobic_residues = ['V', 'I', 'L', 'M', 'F', 'W', 'C']
charged_residues = ['H', 'R', 'K', 'D', 'E']


def add_cb(input_array):
    # From ProteinMPNN: the virtual Cβ coordinates are calculated using ideal angle and bond length definitions:
    # b = Cα - N, c = C - Cα, a = cross(b, c), Cβ = -0.58273431*a + 0.56802827*b - 0.54067466*c + Cα
    N, CA, C, O = input_array
    b = CA - N
    c = C - CA
    a = np.cross(b, c)
    CB = np.around(-0.58273431 * a + 0.56802827 * b - 0.54067466 * c + CA, 3)
    return CB  # np.array([N, CA, C, CB, O])


aaSMILES = {
    'G': 'NCC(=O)O',
    'A': 'N[C@@]([H])(C)C(=O)O',
    'R': 'N[C@@]([H])(CCCNC(=N)N)C(=O)O',
    'N': 'N[C@@]([H])(CC(=O)N)C(=O)O',
    'D': 'N[C@@]([H])(CC(=O)O)C(=O)O',
    'C': 'N[C@@]([H])(CS)C(=O)O',
    'E': 'N[C@@]([H])(CCC(=O)O)C(=O)O',
    'Q': 'N[C@@]([H])(CCC(=O)N)C(=O)O',
    'H': 'N[C@@]([H])(CC1=CN=C-N1)C(=O)O',
    'I': 'N[C@@]([H])(C(CC)C)C(=O)O',
    'L': 'N[C@@]([H])(CC(C)C)C(=O)O',
    'K': 'N[C@@]([H])(CCCCN)C(=O)O',
    'M': 'N[C@@]([H])(CCSC)C(=O)O',
    'F': 'N[C@@]([H])(Cc1ccccc1)C(=O)O',
    'P': 'N1[C@@]([H])(CCC1)C(=O)O',
    'S': 'N[C@@]([H])(CO)C(=O)O',
    'T': 'N[C@@]([H])(C(O)C)C(=O)O',
    'W': 'N[C@@]([H])(CC(=CN2)C1=C2C=CC=C1)C(=O)O',
    'Y': 'N[C@@]([H])(Cc1ccc(O)cc1)C(=O)O',
    'V': 'N[C@@]([H])(C(C)C)C(=O)O',
}


class Filter:
    def __init__(
            self,
            min_loop_len=4,
            max_loop_len=25,
            min_BSA=400,
            min_relBSA=0.2,
            max_relncBSA=0.3,
            saved_maxlen=25,
            saved_BSA=400,
            saved_relBSA=0.2,
            saved_helix_ratio=1.0,
            saved_strand_ratio=1.0,
            cyclic=False
        ) -> None:
        # discard sequences containing Cys, consecutive Pro, or DG/DP/DS motifs
        # https://www.thermofisher.cn/cn/zh/home/life-science/protein-biology/protein-biology-learning-center/protein-biology-resource-library/pierce-protein-methods/peptide-design.html
        self.re_filter = re.compile(r'D[GPS]|[P]{2,}|C')
        self.cache_dir = CACHE_DIR
        self.min_loop_len = min_loop_len
        self.max_loop_len = max_loop_len
        self.min_BSA = min_BSA
        self.min_relBSA = min_relBSA
        self.max_relncBSA = max_relncBSA
        self.saved_maxlen = saved_maxlen
        self.saved_BSA = saved_BSA
        self.saved_relBSA = saved_relBSA
        self.saved_helix_ratio = saved_helix_ratio
        self.saved_strand_ratio = saved_strand_ratio
        self.cyclic = cyclic

    @classmethod
    def get_ss_info(cls, pdb_path: str):
        dssp, keys = dssp_dict_from_pdb_file(pdb_path, DSSP='mkdssp')
        ss_info = {}
        for key in keys:
            chain_id, value = key[0], dssp[key]
            if chain_id not in ss_info:
                ss_info[chain_id] = []
            ss_type = value[1]
            if ss_type in ['H', 'G', 'I']:
                ss_info[chain_id].append('a')  # helix
            elif ss_type in ['B', 'E', 'T', 'S']:
                ss_info[chain_id].append('b')  # strand / turn / bend
            elif ss_type == '-':
                ss_info[chain_id].append('c')  # coil
            else:
                raise ValueError(f'SS type {ss_type} cannot be recognized!')
        return ss_info

    @classmethod
    def get_bsa(cls, receptor_chain: Chain.Chain, ligand_chain: Chain.Chain):
        lig_chain_id = ligand_chain.get_id()
        tmp_structure = Structure.Structure('tmp')
        tmp_model = Model.Model(0)
        tmp_structure.add(tmp_model)
        tmp_model.add(ligand_chain)
        unbounded_SASA = calcBioPDB(tmp_structure)[0].residueAreas()[lig_chain_id]
        unbounded_SASA = [k.total for k in unbounded_SASA.values()]
        tmp_model.add(receptor_chain)
        bounded_SASA = calcBioPDB(tmp_structure)[0].residueAreas()[lig_chain_id]
        bounded_SASA = [k.total for k in bounded_SASA.values()]
        # buried surface area of the loop, excluding its terminal residues
        abs_bsa = sum(unbounded_SASA[1:-1]) - sum(bounded_SASA[1:-1])
        rel_bsa = abs_bsa / sum(unbounded_SASA[1:-1])
        # relative buried surface area of the N/C-terminal residues
        rel_nc_bsa = (unbounded_SASA[0] + unbounded_SASA[-1] - bounded_SASA[0] - bounded_SASA[-1]) / (unbounded_SASA[0] + unbounded_SASA[-1])
        return abs_bsa, rel_bsa, rel_nc_bsa, tmp_structure

    def filter_pdb(self, pdb_path, selected_chains=None):
        parser = PDBParser(QUIET=True)
        ss_info = self.get_ss_info(pdb_path)
        structure = parser.get_structure('anonym', pdb_path)
        for model in structure.get_models():  # use model 1 only
            structure = model
            break
        results = []
        for chain in structure.get_chains():
            if selected_chains is not None and chain.get_id() not in selected_chains:
                continue
            chain_ss_info = None if ss_info is None else ss_info[chain.get_id()]
            results.extend(self.filter_chain(chain, chain_ss_info))
        return results

    def filter_chain(self, chain, ss_info=None):
        # discard chains containing non-standard residues
        non_standard = False
        for res in chain:
            if res.get_resname() not in AA3TO1:
                non_standard = True
                break
        if non_standard:
            return []
        if ss_info is not None and len(ss_info) != len(chain):
            return []

        cb_coord = []
        seq = ''
        for res in chain:
            seq += AA3TO1[res.get_resname()]
            try:
                cb_coord.append(res['CB'].get_coord())
            except KeyError:  # e.g. glycine has no CB atom, so reconstruct a virtual one
                tmp_coord = np.array([
                    res['N'].get_coord(),
                    res['CA'].get_coord(),
                    res['C'].get_coord(),
                    res['O'].get_coord()
                ])
                cb_coord.append(add_cb(tmp_coord))
        cb_coord = np.array(cb_coord)
        cb_contact = np.linalg.norm(cb_coord[None, :, :] - cb_coord[:, None, :], axis=-1)

        if self.cyclic:
            # Cβ-Cβ distance compatible with a disulfide bond
            # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4987930/
            possible_ss = (cb_contact >= 3.5) & (cb_contact <= 5)
        else:
            possible_ss = np.ones(cb_contact.shape, dtype=bool)
        # keep spans whose length lies within [min_loop_len, max_loop_len]
        possible_ss = np.triu(np.tril(possible_ss, self.max_loop_len - 1), self.min_loop_len - 1)
        ss_pair = np.where(possible_ss)

        accepted, saved_spans = [], []
        for i, j in zip(ss_pair[0], ss_pair[1]):
            # skip spans overlapping an already accepted span by more than 40%
            redundant = False
            for exist_i, exist_j in saved_spans:
                overlap = min(j, exist_j) - max(i, exist_i) + 1
                if overlap / (j - i + 1) > 0.4 or overlap / (exist_j - exist_i + 1) > 0.4:
                    redundant = True
                    break
            if redundant:
                continue

            # 20A neighbors: require at least 16 receptor residues within 20A of the loop
            min_dist = np.min(cb_contact[i : j + 1], axis=0)
            min_dist[max(i - 5, 0):min(j + 6, len(seq))] = 21  # mask the loop plus 5 flanking residues
            neighbors_20A = np.where(min_dist < 20)[0]
            if len(neighbors_20A) < 16:
                continue

            # sequence filters
            pep_seq = seq[i:j + 1]
            # discard sequences containing Cys, consecutive Pro, or DG/DP/DS motifs
            if self.re_filter.search(pep_seq) is not None:
                continue
            prot_param = ProteinAnalysis(pep_seq)
            aa_percent = prot_param.get_amino_acids_percent()
            max_ratio = max(aa_percent.values())
            # discard if any amino acid represents more than 25% of the total sequence
            if max_ratio > 0.25:
                continue
            # discard if hydrophobic amino acids exceed 45%
            hydrophobic_ratio = sum([aa_percent[k] for k in hydrophobic_residues])
            if hydrophobic_ratio > 0.45:
                continue
            # discard if charged amino acids exceed 45% or fall below 25%
            charged_ratio = sum([aa_percent[k] for k in charged_residues])
            if charged_ratio > 0.45 or charged_ratio < 0.25:
                continue
            # discard if instability index >= 40
            if prot_param.instability_index() >= 40:
                continue
            # # TPSA filter (for cell penetration)
            # mol_weight = prot_param.molecular_weight()
            # pepsmile = 'O'
            # for k in pep_seq:
            #     pepsmile = pepsmile[:-1] + aaSMILES[k]
            # pepsmile = MolFromSmiles(pepsmile)
            # tpsa = CalcTPSA(pepsmile)
            # if tpsa <= mol_weight * 0.2:
            #     continue

            # build structure and get BSA
            receptor_chain = Chain.Chain('R')
            ligand_chain = Chain.Chain('L')
            for k, res in enumerate(chain):
                if k >= i and k <= j:
                    ligand_chain.add(res.copy())
                elif k in neighbors_20A:
                    receptor_chain.add(res.copy())
            abs_bsa, rel_bsa, rel_nc_bsa, tmp_structure = self.get_bsa(receptor_chain, ligand_chain)
            if abs_bsa <= self.min_BSA or rel_bsa <= self.min_relBSA or (self.cyclic and rel_nc_bsa >= self.max_relncBSA):
                continue

            # prepare for output
            length = j - i + 1
            if ss_info is None:
                helix_ratio = -1
                strand_ratio = -1
                coil_ratio = -1
            else:
                ssa = ss_info[i:j + 1]
                helix_ratio = ssa.count('a') / length
                strand_ratio = ssa.count('b') / length
                coil_ratio = ssa.count('c') / length
                # helix_ratio = (ssa.count("G") + ssa.count("H") + ssa.count("I") + ssa.count("T")) / length
                # strand_ratio = (ssa.count("E") + ssa.count("B")) / length
                # coil_ratio = (ssa.count("S") + ssa.count("C")) / length
            if length <= self.saved_maxlen and abs_bsa >= self.saved_BSA and rel_bsa >= self.saved_relBSA \
                    and helix_ratio <= self.saved_helix_ratio and strand_ratio <= self.saved_strand_ratio:
                output_structure = deepcopy(tmp_structure)
            else:
                output_structure = None
            accepted.append((
                i, j, length, abs_bsa, rel_bsa, helix_ratio, strand_ratio, coil_ratio, output_structure
            ))
            saved_spans.append((i, j))

        return accepted


def get_non_redundant(mmap_dir):
    np.random.seed(12)
    index_path = os.path.join(mmap_dir, 'index.txt')
    parent_dir = mmap_dir

    # load index file
    items = {}
    with open(index_path, 'r') as fin:
        lines = fin.readlines()
    for line in lines:
        values = line.strip().split('\t')
        _id, seq = values[0], values[-1]
        chain, pdb_file = _id.split('_')
        items[_id] = (seq, chain, pdb_file)

    # make temporary directory
    tmp_dir = os.path.join(parent_dir, 'tmp')
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
    else:
        raise ValueError(f'Working directory {tmp_dir} exists!')

    # 1. get non-redundant chains by 90% sequence identity
    fasta = os.path.join(tmp_dir, 'seq.fasta')
    with open(fasta, 'w') as fout:
        for _id in items:
            fout.write(f'>{_id}\n{items[_id][0]}\n')
    id2clu, clu2id = clustering(fasta, tmp_dir, 0.9)
    non_redundant = []
    for clu in clu2id:
        ids = clu2id[clu]
        non_redundant.append(np.random.choice(ids))
    print_log(f'Non-redundant entries: {len(non_redundant)}')
    shutil.rmtree(tmp_dir)

    # 2. construct non-redundant items: pdb file -> selected chains
    indexes = {}
    for _id in non_redundant:
        _, chain, pdb_file = items[_id]
        if pdb_file not in indexes:
            indexes[pdb_file] = []
        indexes[pdb_file].append(chain)
    return indexes


def mp_worker(data_dir, tmp_dir, pdb_file, selected_chains, pep_filter, pdb_out_dir, queue):
    category = pdb_file[4:6]
    category_dir = os.path.join(data_dir, category)
    path = os.path.join(category_dir, pdb_file)
    tmp_file = os.path.join(tmp_dir, f'{pdb_file}.decompressed')
    pdb_id = get_filename(pdb_file.split('.')[0])

    # uncompress the file to the tmp file
    with gzip.open(path, 'rb') as fin:
        with open(tmp_file, 'wb') as fout:
            shutil.copyfileobj(fin, fout)

    files = []
    try:
        # biotite_pdb_file = biotite_pdb.PDBFile.read(tmp_file)
        # biotite_struct = biotite_pdb_file.get_structure(model=1)
        # ss_info = { chain: annotate_sse(biotite_struct, chain_id=chain) for chain in selected_chains }
        results = pep_filter.filter_pdb(tmp_file, selected_chains=selected_chains)
        for item in results:
            i, j, struct = item[0], item[1], item[-1]
            if struct is None:
                continue
            io = PDBIO()
            io.set_structure(struct)
            _id = pdb_id + f'_{i}_{j}'
            save_path = os.path.join(pdb_out_dir, _id + '.pdb')
            io.save(save_path)
            files.append(save_path)
    except Exception:  # pdbs with missing backbone coordinates or failed DSSP runs
        pass
    queue.put((pdb_file, files))
    os.remove(tmp_file)


def process_iterator(indexes, data_dir, tmp_dir, out_dir, pocket_th, n_cpu):
    pdb_out_dir = os.path.join(out_dir, 'pdbs')
    if not os.path.exists(pdb_out_dir):
        os.makedirs(pdb_out_dir)

    pep_filter = Filter()
    file_cnt, pointer, filenames = 0, 0, list(indexes.keys())
    id2task = {}
    queue = mp.Queue()

    # initialize tasks
    for _ in range(min(n_cpu, len(filenames))):
        task_id = filenames[pointer]
        id2task[task_id] = mp.Process(
            target=mp_worker,
            args=(data_dir, tmp_dir, task_id, indexes[task_id], pep_filter, pdb_out_dir, queue)
        )
        id2task[task_id].start()
        pointer += 1

    while True:
        if len(id2task) == 0:
            break
        if queue.empty():  # no finished ones
            time.sleep(1)
            continue
        pdb_file, paths = queue.get()
        file_cnt += 1
        id2task[pdb_file].join()
        del id2task[pdb_file]

        # add the next task
        if pointer < len(filenames):
            task_id = filenames[pointer]
            id2task[task_id] = mp.Process(
                target=mp_worker,
                args=(data_dir, tmp_dir, task_id, indexes[task_id], pep_filter, pdb_out_dir, queue)
            )
            id2task[task_id].start()
            pointer += 1

        # handle processed data
        for save_path in paths:
            _id = get_filename(save_path)
            list_blocks, chains = pdb_to_list_blocks(save_path, return_chain_ids=True)
            if chains[0] == 'L':  # make sure the receptor chain comes first
                list_blocks, chains = (list_blocks[1], list_blocks[0]), (chains[1], chains[0])
            rec_blocks, lig_blocks = list_blocks
            rec_chain, lig_chain = chains
            try:
                _, (pocket_idx, _) = blocks_cb_interface(rec_blocks, lig_blocks, pocket_th)
            except KeyError:
                continue
            rec_num_units = sum([len(block) for block in rec_blocks])
            lig_num_units = sum([len(block) for block in lig_blocks])
            rec_data = [block.to_tuple() for block in rec_blocks]
            lig_data = [block.to_tuple() for block in lig_blocks]
            rec_seq = ''.join([AA3TO1[block.abrv] for block in rec_blocks])
            lig_seq = ''.join([AA3TO1[block.abrv] for block in lig_blocks])
            yield _id, (rec_data, lig_data), [
                len(rec_blocks), len(lig_blocks), rec_num_units, lig_num_units,
                rec_chain, lig_chain, rec_seq, lig_seq,
                ','.join([str(idx) for idx in pocket_idx]),
            ], file_cnt


def main(args):
    indexes = get_non_redundant(args.database_dir)
    cnt = len(indexes)
    tmp_dir = './tmp'
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
    print_log(f'Processing data from directory: {args.pdb_dir}.')
    print_log(f'Number of entries: {cnt}')
    create_mmap(
        process_iterator(indexes, args.pdb_dir, tmp_dir, args.out_dir, args.pocket_th, args.n_cpu),
        args.out_dir, cnt)
    print_log('Finished!')
    shutil.rmtree(tmp_dir)


if __name__ == '__main__':
    main(parse())
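
# Example invocation (a sketch only; the module path and directory names below are
# assumptions, not part of this script -- the relative import of `.pepbind` implies
# the script is run as a module inside its package):
#   python -m data.filter_peptide_loops \
#       --database_dir ./processed/monomers \
#       --pdb_dir ./pdb \
#       --out_dir ./peptide_loops \
#       --pocket_th 10.0 \
#       --n_cpu 8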