# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""
Preprocess input data into features and store them as a Python shelve DB;
each chunk is a gzipped JSON string.
"""
import argparse
import gzip
import json
import subprocess as sp
import shelve
import os
from os.path import dirname, exists, join

import torch

from lsp_model import GPT2Tokenizer
from tqdm import tqdm

from env import END_OF_TEXT_TOKEN
from gpt2_training.train_utils import InputFeatures_train as InputFeatures


def _get_file_len(corpus):
    n_line = int(sp.check_output(f"wc -l {corpus}".split(),
                                 universal_newlines=True).split()[0])
    return n_line


def _norm_text(text):
    # a turn may be prefixed with a float weight; default to 1.0 when absent
    w, *toks = text.strip().split()
    try:
        w = float(w)
    except Exception:
        toks = [w] + toks
        w = 1.0
    return w, ' '.join(toks)


def _get_inputs_from_text(text, tokenizer):
    # each line is "src1 EOS src2 EOS ... <TAB> tgt"
    srcs, tgt = text.strip().split('\t')
    weights = []
    inputs = []
    for src in srcs.split(' EOS '):
        src_weight, src = _norm_text(src)
        context_id = tokenizer.encode(src)
        weights.append(src_weight)
        inputs.append(context_id)
    tgt_weight, tgt = _norm_text(tgt)
    if tgt_weight != 0:
        response_id = tokenizer.encode(tgt)
        weights.append(tgt_weight)
        inputs.append(response_id)
    return weights, inputs


def _make_features(id_, weights, inputs, tokenizer, max_len):
    # pack consecutive turns into features; flush whenever the running
    # length exceeds max_len, and drop any single turn longer than max_len
    end_of_text_id = tokenizer.encoder[END_OF_TEXT_TOKEN]
    features = []
    sents = []
    ws = []
    len_ = 0
    i = 0
    for ids, w in zip(inputs, weights):
        if len(ids) > max_len:
            if len(sents) >= 2:
                feat = _make_feature(id_ + i, sents, ws, end_of_text_id)
                if feat is not None:
                    features.append(feat)
                    i += 1
            len_ = 0
            sents = []
            ws = []
            continue
        elif len_ > max_len:
            feat = _make_feature(id_ + i, sents, ws, end_of_text_id)
            if feat is not None:
                features.append(feat)
                i += 1
            len_ = len(sents[-1]) + 1
            sents = sents[-1:]
            ws = ws[-1:]
        len_ += (len(ids) + 1)
        sents.append(ids)
        ws.append(w)
    if len(sents) >= 2:
        feat = _make_feature(id_ + i, sents, ws, end_of_text_id)
        if feat is not None:
            features.append(feat)
    return features


def _make_feature(id_, sents, ws, eos):
    # skip examples in which every response turn has zero weight
    if all(w == 0 for w in ws[1:]):
        return None
    # join turns with EOS; only weighted (non-zero) turns contribute LM labels
    input_ids = [i for s in sents for i in s + [eos]][:-1]
    lm_labels = []
    weights = []
    token_type_ids = []  # this becomes round ids
    for i, (s, w) in enumerate(zip(sents, ws)):
        if i == 0:
            lm_labels += [-1] * len(s)
            weights += [0.0] * len(s)
            token_type_ids += [0] * len(s)
            continue
        token_type_ids += [i] * (len(s) + 1)
        if w == 0.0:
            lm_labels += [-1] * (len(s) + 1)
            weights += [0.0] * (len(s) + 1)
        else:
            lm_labels += (s + [eos])
            weights += [w] * (len(s) + 1)

    # handle trailing -1's
    i = len(lm_labels) - 1
    while i >= 0:
        if lm_labels[i] != -1:
            break
        i -= 1
    input_ids = input_ids[:i + 1]
    lm_labels = lm_labels[:i + 1]
    weights = weights[:i + 1]
    token_type_ids = token_type_ids[:i + 1]

    # pad to multiples of 8
    while len(input_ids) % 8 != 0:
        input_ids.append(0)
        token_type_ids.append(0)
        lm_labels.append(-1)
        weights.append(0.0)

    position_ids = list(range(len(input_ids)))
    assert (len(input_ids) == len(position_ids) == len(token_type_ids)
            == len(lm_labels) == len(weights))
    assert len(input_ids) % 8 == 0

    if len(input_ids) == 0:
        import pdb
        pdb.set_trace()
    feature = InputFeatures(id_, input_ids, position_ids, token_type_ids,
                            lm_labels, weights)
    return feature


def main(args):
    toker = GPT2Tokenizer.from_pretrained('gpt2')
    attrs = []
    if args.reverse:
        attrs.append('reverse')
    if args.two_turn:
        attrs.append('2turn')
    if attrs:
        db_path = (f'{args.corpus[:-4]}.{args.max_seq_len}len.'
                   f'{".".join(attrs)}.db/db')
    else:
        db_path = f'{args.corpus[:-4]}.{args.max_seq_len}len.db/db'
    if exists(dirname(db_path)):
        raise ValueError('Found existing DB, please backup')
    else:
        os.makedirs(dirname(db_path))

    with open(args.corpus, "r", encoding="utf-8") as reader, \
            shelve.open(db_path, 'n') as db:
        chunk = []
        n_chunk = 0
        n_example = 0
        for line in tqdm(reader, total=_get_file_len(args.corpus)):
            try:
                if len(chunk) >= args.chunk_size:
                    # save and renew chunk
                    db[f'chunk_{n_chunk}'] = gzip.compress(
                        json.dumps(chunk[:args.chunk_size]).encode('utf-8'))
                    chunk = chunk[args.chunk_size:]
                    n_chunk += 1

                weights, inputs = _get_inputs_from_text(line, toker)
                if args.reverse:
                    weights = list(reversed(weights))
                    inputs = list(reversed(inputs))
                if args.two_turn:
                    weights = weights[:2]
                    inputs = inputs[:2]
                if len(weights) < 2:
                    continue

                features = _make_features(n_example, weights, inputs,
                                          toker, args.max_seq_len)
                for feature in features:
                    chunk.append(vars(feature))
                    n_example += 1
            except Exception as e:
                print('!!! prepro exception !!!', e)
                continue

        # save last chunk
        db[f'chunk_{n_chunk}'] = gzip.compress(
            json.dumps(chunk).encode('utf-8'))

    # save relevant information to reproduce
    meta = {'n_example': n_example,
            'chunk_size': args.chunk_size,
            'max_seq_len': args.max_seq_len,
            'reverse': args.reverse,
            'two_turn': args.two_turn}
    with open(join(dirname(db_path), 'meta.json'), 'w') as writer:
        json.dump(meta, writer, indent=4)
    torch.save(toker, join(dirname(db_path), 'tokenizer.pt'))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--corpus', required=True,
                        help='file name of training corpus (should be .tsv)')
    parser.add_argument('--chunk_size', type=int, default=65536,
                        help='number of data examples per stored chunk')
    parser.add_argument('--max_seq_len', type=int, default=128,
                        help='discard data longer than this')
    parser.add_argument('--reverse', action='store_true',
                        help='reverse the src and tgt')
    parser.add_argument('--two_turn', action='store_true',
                        help='take only the first 2 turns')

    args = parser.parse_args()

    main(args)
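
# The sketch below is not part of the script itself; it is a minimal,
# hypothetical example of how a consumer could read one chunk back from the
# resulting DB. The path 'train.128len.db/db' is an assumption and depends on
# the --corpus and --max_seq_len arguments used above:
#
#     import gzip, json, shelve
#     with shelve.open('train.128len.db/db', 'r') as db:
#         examples = json.loads(gzip.decompress(db['chunk_0']).decode('utf-8'))
#
# `examples` is then a list of dicts, one per feature, holding the fields
# passed to InputFeatures above (input ids, position ids, token type ids,
# LM labels, and weights).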