File size: 4,136 Bytes
52007f8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
#!/usr/bin/python
# -*- coding:utf-8 -*-
import os
import re
import argparse
from tqdm import tqdm
from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB import PDBIO
from data.mmap_dataset import create_mmap
from data.format import VOCAB
from data.converter.pdb_to_list_blocks import pdb_to_list_blocks
from data.converter.list_blocks_to_pdb import list_blocks_to_pdb
from data.converter.blocks_interface import blocks_interface, blocks_cb_interface
from utils.logger import print_log
def parse():
    """Parse the command-line arguments of the PepBDB processing script.

    Returns:
        argparse.Namespace: with the attributes ``index`` (path of the dataset
        index file), ``out_dir`` (output directory) and ``pocket_th`` (float
        threshold for determining the pocket, 10.0 by default).

    Raises:
        SystemExit: raised by argparse when a required argument is missing or
            a value cannot be converted.
    """
    parser = argparse.ArgumentParser(description='Process PepBDB dataset')
    # The index file is opened unconditionally by main(), so a missing value
    # must be reported here as a usage error instead of failing later on
    # open(None).
    parser.add_argument('--index', type=str, required=True,
                        help='Index file of the dataset')
    parser.add_argument('--out_dir', type=str, required=True,
                        help='Output Directory')
    parser.add_argument('--pocket_th', type=float, default=10.0,
                        help='Threshold for determining pocket')
    return parser.parse_args()
def process_iterator(items, pdb_dir, pdb_out_dir, pocket_th):
    """Yield the processed receptor/peptide complex of every usable entry.

    For each entry the receptor and the peptide are loaded from
    ``<pdb_dir>/<pdb_id>/receptor.pdb`` and ``<pdb_dir>/<pdb_id>/peptide.pdb``,
    the pocket indexes are computed and the complex is written to
    ``<pdb_out_dir>/<pdb_id>.pdb``. Entries that cannot be loaded, have no
    peptide block in contact with the receptor, miss backbone atoms or cannot
    be written are skipped; failures are logged.

    Args:
        items: mapping from entry id to a dict with the keys ``'rec_chain'``
            and ``'pep_chain'``.
        pdb_dir: directory holding one sub-directory per entry id.
        pdb_out_dir: directory receiving the written PDB files; created when
            it does not exist.
        pocket_th: threshold forwarded to ``blocks_cb_interface`` to determine
            the pocket.

    Yields:
        tuple: ``(pdb_id, data, properties, cnt)`` where ``data`` is the pair
        (receptor block tuples, peptide block tuples), ``properties`` is the
        list [number of receptor blocks, number of peptide blocks, number of
        receptor units, number of peptide units, receptor chain, peptide
        chain, receptor sequence, peptide sequence, non-standard flag,
        comma-joined pocket indexes] and ``cnt`` is the zero-based position of
        the entry in ``items`` (skipped entries are counted too).
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(pdb_out_dir, exist_ok=True)

    for cnt, (pdb_id, summary) in enumerate(items.items()):
        rec_chain, lig_chain = summary['rec_chain'], summary['pep_chain']
        entry_dir = os.path.join(pdb_dir, pdb_id)

        try:
            rec_blocks = pdb_to_list_blocks(
                os.path.join(entry_dir, 'receptor.pdb'),
                selected_chains=[rec_chain]
            )[0]
            lig_blocks = pdb_to_list_blocks(
                os.path.join(entry_dir, 'peptide.pdb'),
                selected_chains=[lig_chain]
            )[0]
        except (KeyError, FileNotFoundError) as err:
            # Report the skip instead of dropping the entry silently.
            print_log(f'{pdb_id} skipped, failed to load structure: {err!r}')
            continue

        # 6A for atomic interaction
        _, (_, pep_if_idx) = blocks_interface(rec_blocks, lig_blocks, 6.0)
        # len() rather than truthiness: the index container may not define a
        # plain boolean value.
        if len(pep_if_idx) == 0:
            continue

        try:
            # 10A for pocket size based on CB
            _, (pocket_idx, _) = blocks_cb_interface(rec_blocks, lig_blocks, pocket_th)
        except KeyError:
            print_log(f'{pdb_id} missing backbone atoms')
            continue  # missing both CB and backbone atoms

        rec_num_units = sum(len(block) for block in rec_blocks)
        lig_num_units = sum(len(block) for block in lig_blocks)
        data = (
            [block.to_tuple() for block in rec_blocks],
            [block.to_tuple() for block in lig_blocks],
        )
        rec_seq = ''.join(VOCAB.abrv_to_symbol(block.abrv) for block in rec_blocks)
        lig_seq = ''.join(VOCAB.abrv_to_symbol(block.abrv) for block in lig_blocks)

        # Only the peptide sequence is checked for non-standard amino acids.
        non_standard = 1 if '?' in lig_seq else 0

        try:
            list_blocks_to_pdb(
                [rec_blocks, lig_blocks],
                [rec_chain, lig_chain],
                os.path.join(pdb_out_dir, pdb_id + '.pdb')
            )
        except Exception as err:
            # things like XE1 in 4cin_C, unknown atom; still best-effort, but
            # the reason is logged so dropped entries can be traced.
            print_log(f'{pdb_id} skipped, failed to write PDB: {err!r}')
            continue

        yield pdb_id, data, [
            len(rec_blocks), len(lig_blocks), rec_num_units, lig_num_units,
            rec_chain, lig_chain, rec_seq, lig_seq, non_standard,
            ','.join(str(idx) for idx in pocket_idx),
        ], cnt
def main(args):
    """Read the index file and convert the listed entries into the mmap dataset.

    Args:
        args: namespace with the attributes ``index`` (path of the index
            file), ``out_dir`` (output directory) and ``pocket_th`` (pocket
            threshold), as produced by ``parse``.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(args.out_dir, exist_ok=True)

    # 1. get index file
    indexes = {}
    with open(args.index, 'r') as fin:
        # Stream the file instead of materializing every line first.
        for line_no, raw in enumerate(fin, start=1):
            fields = raw.split()
            # Only lines whose last column is 'prot' are kept.
            if not fields or fields[-1] != 'prot':
                continue
            if len(fields) < 5:
                # Column 4 (receptor chain) would be out of range.
                print_log(f'Line {line_no} of {args.index} has too few columns, skipped')
                continue
            pdb_id, pep_chain, rec_chain = fields[0], fields[1], fields[4]
            indexes[pdb_id + '_' + pep_chain] = {
                'rec_chain': rec_chain,
                'pep_chain': pep_chain
            }
    print_log(f'Total {len(indexes)} entries')

    # 2. process pdb files into our format (mmap)
    create_mmap(
        process_iterator(
            indexes,
            # the structures are read from a 'pepbdb' folder next to the index file
            os.path.join(os.path.dirname(args.index), 'pepbdb'),
            os.path.join(args.out_dir, 'pdbs'),
            args.pocket_th
        ),
        args.out_dir, len(indexes))

    print_log('Finished!')
# Script entry point: parse the command-line arguments and run the pipeline.
if __name__ == '__main__':
    main(parse())