|
|
|
|
|
|
|
|
import os |
|
|
import argparse |
|
|
|
|
|
import numpy as np |
|
|
|
|
|
from utils.logger import print_log |
|
|
|
|
|
|
|
|
def parse():
    """Parse the command-line options of the split script.

    Returns:
        argparse.Namespace with ``train_index``, ``valid_index`` and
        ``processed_dir`` (all required strings) and ``test_index``
        (optional string, ``None`` when not given).
    """
    cli = argparse.ArgumentParser(description='Split peptide data')

    # Keyword arguments shared by every mandatory string option.
    mandatory = dict(type=str, required=True)

    cli.add_argument('--train_index', help='Path for training index', **mandatory)
    cli.add_argument('--valid_index', help='Path for validation index', **mandatory)
    # The test split is the only optional one.
    cli.add_argument('--test_index', type=str, default=None, help='Path for test index')
    cli.add_argument('--processed_dir', help='processed directory', **mandatory)

    namespace = cli.parse_args()
    return namespace
|
|
|
|
|
|
|
|
def read_index(mmap_dir):
    """Load ``index.txt`` from ``mmap_dir`` into a dict keyed by entry id.

    Each non-blank line is stripped and split on tabs; the first field is
    the key and the value is the line itself, so it can be written back
    verbatim. If the same id occurs more than once, the last line wins.

    Args:
        mmap_dir: directory that contains ``index.txt``.

    Returns:
        dict mapping id (str) to the full index line (str). Every value
        ends with a newline.
    """
    items = {}
    index = os.path.join(mmap_dir, 'index.txt')
    with open(index, 'r') as fin:
        # Stream the file instead of materialising it with readlines().
        for line in fin:
            stripped = line.strip()
            if not stripped:
                # Blank lines carry no id; skip them rather than storing
                # a meaningless '' entry.
                continue
            if not line.endswith('\n'):
                # The final line of the file may lack a terminator; add one
                # so records never fuse when written back-to-back later.
                line += '\n'
            items[stripped.split('\t')[0]] = line
    return items
|
|
|
|
|
|
|
|
def transform(items, path, out):
    """Write the index lines of the ids listed in ``path`` to ``out``.

    The id of each non-blank line in ``path`` is its first tab-separated
    field after stripping surrounding whitespace (the same parsing that
    ``read_index`` applies). Ids are de-duplicated, keeping the order of
    first appearance, and the matching ``items`` values are written
    verbatim.

    Args:
        items: mapping from id to the full index line for that id.
        path: text file listing the ids of one split, one per line;
            any extra tab-separated columns are ignored.
        out: path of the file to write.

    Raises:
        KeyError: if an id listed in ``path`` is not a key of ``items``.
            Raised before ``out`` is opened, so no partial file is left.
    """
    ids = {}  # dict used as an insertion-ordered set
    with open(path, 'r') as fin:
        for line in fin:
            # Strip before splitting so a single-column line ("abc\n")
            # yields "abc" and not "abc\n".
            line = line.strip()
            if line:
                ids[line.split('\t')[0]] = 1

    # Validate before touching the output so a bad id cannot leave a
    # truncated file behind.
    missing = [_id for _id in ids if _id not in items]
    if missing:
        raise KeyError('{} id(s) listed in {} are not present in the index, e.g. {!r}'.format(
            len(missing), path, missing[0]))

    with open(out, 'w') as fout:
        fout.writelines(items[_id] for _id in ids)
|
|
|
|
|
|
|
|
def main(args):
    """Write the per-split index files into the processed directory.

    Reads the full index from ``args.processed_dir``, then writes
    ``train_index.txt`` and ``valid_index.txt`` there, plus
    ``test_index.txt`` when ``args.test_index`` is given.

    Args:
        args: namespace with ``processed_dir``, ``train_index``,
            ``valid_index`` and ``test_index`` attributes.
    """
    target_dir = args.processed_dir
    records = read_index(target_dir)

    def _emit(split_file, filename):
        # Write one split's index lines next to the full index.
        transform(records, split_file, os.path.join(target_dir, filename))

    _emit(args.train_index, 'train_index.txt')
    _emit(args.valid_index, 'valid_index.txt')

    # The test split is optional.
    test_split = args.test_index
    if test_split is not None:
        _emit(test_split, 'test_index.txt')

    print_log('Done')
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Seed NumPy's global RNG with a fixed value before doing anything else.
    np.random.seed(12)
    # Parse the command line, then run the split.
    cli_args = parse()
    main(cli_args)