import pandas as pd
import os

import torch
from transformers import RobertaTokenizerFast, RobertaForMaskedLM, DataCollatorWithPadding

import datasets
from datasets import disable_caching
disable_caching()

# encoder model and inference settings
DEVICE = 'cuda:0'
ENCODER_MODEL_NAME = "entropy/roberta_zinc_480m"
ENCODER_BATCH_SIZE = 1024

# tokenization settings
TOKENIZER_MAX_LEN = 256
TOKENIZATION_NUM_PROC = 32

'''
Data source is expected to be a CSV file with a column of SMILES strings
denoted by `SMILES_COLUMN`. The CSV is processed in chunks of size `PROCESS_CHUNKSIZE`.

Processed chunks are saved to `SAVE_PATH` with the format `SAVE_PATH/processed_shard_{i}.hf`
'''

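# Illustrative CSV layout only (not from the source data); the column name is
# whatever SMILES_COLUMN is set to:
#
#   smiles
#   CCO
#   c1ccccc1
#   CC(=O)Oc1ccccc1C(=O)O
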
# dataset processing settings
DATASET_CSV_FILENAME = None
PROCESS_CHUNKSIZE = 1000000
SMILES_COLUMN = 'smiles'
MAX_CHUNKS = None
MAX_SMILES_LENGTH = 90
MIN_SMILES_LENGTH = 5
FILTER_NUM_PROC = 32
SAVE_PATH = None

assert DATASET_CSV_FILENAME is not None, "must specify dataset filename"
assert SAVE_PATH is not None, "must specify save path"

def tokenization(example):
    # tokenize SMILES strings, truncating to TOKENIZER_MAX_LEN tokens
    return tokenizer(example[SMILES_COLUMN], add_special_tokens=True,
                     truncation=True, max_length=TOKENIZER_MAX_LEN)

def embed(inputs):
    # keep only the tensors the encoder needs, pad to a batch, and move to the device
    inputs = {k: inputs[k] for k in ['input_ids', 'attention_mask']}
    inputs = collator(inputs)
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # last-layer hidden states for every token
    full_embeddings = outputs.hidden_states[-1]
    mask = inputs['attention_mask']

    # masked mean pooling over the sequence dimension, ignoring padding tokens
    mean_embeddings = (full_embeddings * mask.unsqueeze(-1)).sum(1) / mask.sum(-1).unsqueeze(-1)

    return {'encoder_hidden_states': mean_embeddings}

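# Shape sketch for the pooling above (illustrative, not executed):
#   full_embeddings: (batch, seq_len, hidden) ; mask: (batch, seq_len)
#   (full_embeddings * mask.unsqueeze(-1)).sum(1) -> (batch, hidden)
#   mask.sum(-1).unsqueeze(-1)                    -> (batch, 1)
#   mean_embeddings (ratio of the two)            -> (batch, hidden)
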
def length_filter_smiles(example):
    smiles = example[SMILES_COLUMN]
    # reject non-string entries (e.g. NaN from empty CSV rows) before calling len()
    if not isinstance(smiles, str):
        return False
    min_check = (len(smiles) > MIN_SMILES_LENGTH) if (MIN_SMILES_LENGTH is not None) else True
    max_check = (len(smiles) < MAX_SMILES_LENGTH) if (MAX_SMILES_LENGTH is not None) else True
    return min_check and max_check

# load the tokenizer, padding collator, and pretrained encoder
tokenizer = RobertaTokenizerFast.from_pretrained(ENCODER_MODEL_NAME, max_len=TOKENIZER_MAX_LEN)
collator = DataCollatorWithPadding(tokenizer, padding=True, return_tensors='pt')

model = RobertaForMaskedLM.from_pretrained(ENCODER_MODEL_NAME)
model.to(DEVICE)
model.eval()

# stream the CSV in chunks, reading only the SMILES column
df_iter = pd.read_csv(DATASET_CSV_FILENAME, chunksize=PROCESS_CHUNKSIZE, usecols=[SMILES_COLUMN])

for i, df in enumerate(df_iter):
    print(f'processing dataset chunk {i}')

    dataset = datasets.Dataset.from_pandas(df)

    # drop malformed or out-of-length-range SMILES
    dataset = dataset.filter(length_filter_smiles, num_proc=FILTER_NUM_PROC)

    # tokenize, then compute mean-pooled encoder embeddings
    dataset = dataset.map(tokenization, batched=True, num_proc=TOKENIZATION_NUM_PROC)
    dataset = dataset.map(embed, batched=True, batch_size=ENCODER_BATCH_SIZE)

    dataset.save_to_disk(f'{SAVE_PATH}/processed_shard_{i}.hf')

    if (MAX_CHUNKS is not None) and (i >= MAX_CHUNKS - 1):
        break

print('finished data processing')
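
# For downstream use, the saved shards can be reloaded and concatenated, e.g.
# (a minimal sketch; `n_shards` is a placeholder for the number of shards written):
#
#   from datasets import load_from_disk, concatenate_datasets
#   shards = [load_from_disk(f'{SAVE_PATH}/processed_shard_{i}.hf') for i in range(n_shards)]
#   full_dataset = concatenate_datasets(shards)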