| | from typing import List |
| |
|
| | import torch |
| | from datasets import Dataset |
| | from torch.utils.data import DataLoader |
| | from tqdm import tqdm |
| | from transformers import PerceiverTokenizer |
| |
|
| |
|
| | def _map_outputs(predictions): |
| | """ |
| | Map model outputs to classes. |
| | |
| | :param predictions: model ouptut batch |
| | :return: |
| | """ |
| |
|
| | labels = [ |
| | "admiration", |
| | "amusement", |
| | "anger", |
| | "annoyance", |
| | "approval", |
| | "caring", |
| | "confusion", |
| | "curiosity", |
| | "desire", |
| | "disappointment", |
| | "disapproval", |
| | "disgust", |
| | "embarrassment", |
| | "excitement", |
| | "fear", |
| | "gratitude", |
| | "grief", |
| | "joy", |
| | "love", |
| | "nervousness", |
| | "optimism", |
| | "pride", |
| | "realization", |
| | "relief", |
| | "remorse", |
| | "sadness", |
| | "surprise", |
| | "neutral" |
| | ] |
| | classes = [] |
| | for i, example in enumerate(predictions): |
| | out_batch = [] |
| | for j, category in enumerate(example): |
| | out_batch.append(labels[j]) if category > 0.5 else None |
| | classes.append(out_batch) |
| | return classes |
| |
|
| |
|
class MultiLabelPipeline:
    """
    Multi label classification pipeline.

    Loads a serialized Perceiver model and classifies a text dataset into
    the 28 GoEmotions categories via ``_map_outputs``.
    """

    def __init__(self, model_path):
        """
        Init MLC pipeline.

        :param model_path: model to use
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # BUG FIX: the original tested `self.device == 'cuda'`, comparing a
        # torch.device to a string — unreliable (False on many torch
        # versions), so the CPU map_location branch was what actually ran.
        # Passing map_location unconditionally is correct for both devices.
        # NOTE(review): torch.load unpickles arbitrary code — only load
        # trusted model files.
        self.model = torch.load(model_path, map_location=self.device).eval().to(self.device)
        self.tokenizer = PerceiverTokenizer.from_pretrained('deepmind/language-perceiver')

    def __call__(self, dataset, batch_size: int = 4):
        """
        Processing pipeline.

        :param dataset: dataset with a 'text' column
        :param batch_size: inference batch size (default 4)
        :return: one list of predicted label names per input row
        """
        # Tokenize once up front; drop raw text so set_format only has
        # tensor columns to expose.
        dataset = dataset.map(lambda row: self.tokenizer(row['text'], padding="max_length", truncation=True),
                              batched=True, remove_columns=['text'], desc='Tokenizing')
        dataset.set_format('torch', columns=['input_ids', 'attention_mask'])
        dataloader = DataLoader(dataset, batch_size=batch_size)

        classes = []
        mem_logs = []

        with tqdm(dataloader, unit='batches') as progression:
            for batch in progression:
                progression.set_description('Inference')

                outputs = self.model(inputs=batch['input_ids'].to(self.device),
                                     attention_mask=batch['attention_mask'].to(self.device))

                predictions = outputs.logits.detach().cpu().numpy()
                classes.extend(_map_outputs(predictions))

                # BUG FIX: torch.cuda.memory_reserved raises when given a CPU
                # device; only query CUDA memory when actually on GPU, and log
                # 0.0 otherwise so the running average below stays well-defined.
                if self.device.type == 'cuda':
                    memory = round(torch.cuda.memory_reserved(self.device) / 1e9, 2)
                else:
                    memory = 0.0
                mem_logs.append(memory)

                progression.set_postfix(memory=f"{round(sum(mem_logs) / len(mem_logs), 2)}Go")

        return classes
| |
|
| |
|
def inputs_to_dataset(inputs: List[str]):
    """
    Convert a list of strings to a dataset object.

    :param inputs: list (or any iterable) of input strings
    :return: a ``datasets.Dataset`` with a single 'text' column
    """
    # list(inputs) copies the sequence without shadowing the `input` builtin
    # (the original used `[input for input in inputs]`).
    return Dataset.from_dict({'text': list(inputs)})
| |
|