| | |
| | |
| | import warnings |
| | warnings.filterwarnings("ignore", category=FutureWarning) |
| | import nltk |
| | nltk.download('punkt') |
| | import pandas as pd |
| | from nltk import pos_tag |
| | from nltk.corpus import stopwords |
| | import string |
| |
|
| | |
| | import numpy as np |
| | import re |
| | from gensim.models import Word2Vec |
| | import pickle |
| | import os |
| | from pathos.multiprocessing import ProcessingPool as Pool |
| | import itertools |
| | from time import time |
| | nltk.download('stopwords') |
| | |
| | nltk.download('averaged_perceptron_tagger') |
| | import torch |
| | |
| | from torch.utils.data import Dataset |
| | from transformers import BertTokenizer |
| | from ast import literal_eval |
| | import os.path |
| | import os |
| | from torch.nn.utils import clip_grad_norm_ |
| | from torch.utils.data import DataLoader |
| | from torch.nn.functional import softmax |
| | from torch.nn import CrossEntropyLoss |
| | from torch.optim import Adam |
| | import time |
| | from sklearn import metrics |
| | import statistics |
| | from transformers import get_linear_schedule_with_warmup |
| | |
| | import torch |
| | from torch.utils.data import Dataset |
| | from transformers import BertTokenizer |
| | import pandas as pd |
| |
|
| | from ast import literal_eval |
| | import os.path |
| |
|
| |
|
| |
|
| | nltk.download('punkt') |
| | import pandas as pd |
| | import string |
| |
|
| | |
| | |
| |
|
| | from pathos.multiprocessing import ProcessingPool as Pool |
| | import itertools |
| | from time import time |
| | import os |
| | nltk.download('stopwords') |
| | |
| | from torch.nn.utils import clip_grad_norm_ |
| | from torch.utils.data import DataLoader |
| | from transformers import get_linear_schedule_with_warmup |
| | import torch.nn as nn |
| |
|
| |
|
| | from transformers import * |
| |
|
| | nltk.download('punkt') |
| | nltk.download('wordnet') |
| | nltk.download('omw-1.4') |
| |
|
| |
|
| |
|
# Inference runs on CPU; no GPU is assumed for this deployment.
device = torch.device('cpu')
# Shared WordPiece tokenizer; must match the checkpoint used by EntityBertNet.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Maximum wordpiece sequence length fed to BERT (longer inputs are padded/truncated to this).
MAX_SEQ_LEN = 256

# BERT mask token string — defined here but not used in this chunk; TODO confirm usage elsewhere.
MASK_TOKEN = '[MASK]'
# Instances per DataLoader batch at inference time.
BATCH_SIZE=32
| |
|
def generate_production_batch(batch):
    """Collate a list of instances into model-ready tensors.

    Args:
        batch: list of PairRelInstance objects, each exposing .tokens
            (list of wordpiece strings) and .entity_range (start, end).

    Returns:
        (input_ids, attn_mask, entity_indices, batch): the first three are
        tensors; `batch` is passed through so callers can map predictions
        back to their instances.
    """
    # Re-join each instance's wordpiece tokens into one string per instance.
    # Replaces the original list-containing-a-generator plus double
    # chain.from_iterable, which flattened to the same result.
    texts = [' '.join(instance.tokens) for instance in batch]
    # padding='max_length' + truncation=True is the modern equivalent of the
    # deprecated pad_to_max_length=True; tokenizer(...) replaces the odd
    # explicit __call__.
    encoded = tokenizer(texts, add_special_tokens=True,
                        max_length=MAX_SEQ_LEN, padding='max_length',
                        truncation=True, return_tensors='pt')
    input_ids = encoded['input_ids']
    attn_mask = encoded['attention_mask']

    # Gather indices used later to max-pool BERT outputs over each entity span.
    entity_indices = indices_for_entity_ranges([instance.entity_range for instance in batch])

    return input_ids, attn_mask, entity_indices, batch
| |
|
| |
|
def indices_for_entity_ranges(ranges, feature_dim=None):
    """Build gather indices covering each entity's token span.

    Each range (start, end) is expanded to the longest span length in the
    batch; positions past `end` are clamped to `end` so shorter spans repeat
    their last token (harmless under the downstream max-pool). Every position
    index is tiled `feature_dim` times so the tensor can be used directly
    with torch.gather over BERT hidden states.

    Args:
        ranges: list of (start, end) inclusive token index pairs.
        feature_dim: width to tile each index to; defaults to the module-level
            HIDDEN_OUTPUT_FEATURES (BERT hidden size) for backward compatibility.

    Returns:
        LongTensor of shape (len(ranges), max_span_len, feature_dim).
    """
    if feature_dim is None:
        feature_dim = HIDDEN_OUTPUT_FEATURES
    max_e_len = max(end - start for start, end in ranges)
    indices = torch.tensor([[[min(t, end)] * feature_dim
                             for t in range(start, start + max_e_len + 1)]
                            for start, end in ranges])
    return indices
| |
|
| | |
| | open_file = open("./labels.pkl", "rb") |
| | LABELS = pickle.load(open_file) |
| | NUM_CLASSES = len(LABELS) |
| | open_file.close() |
| | with open('./labels_map.pkl', 'rb') as f: |
| | LABEL_MAP = pickle.load(f) |
| |
|
| | open_file = open("./labels.pkl", "rb") |
| | LABELS = pickle.load(open_file) |
| | open_file.close() |
| | with open('./labels_map.pkl', 'rb') as f: |
| | LABEL_MAP = pickle.load(f) |
| |
|
| |
|
class EntityDataset(Dataset):
    """Torch Dataset yielding one PairRelInstance per dataframe row."""

    def __init__(self, df, size=None):
        """Filter `df` to rows that produce a valid instance, optionally sample.

        NOTE(review): instance_from_row is called once per row here and again
        in __getitem__, so every row is tokenized twice.
        """
        self.df = df[df.apply(lambda x: EntityDataset.instance_from_row(x) is not None, axis=1)]
        print(len(self.df))

        # Downsample without replacement when a smaller size is requested.
        if size is not None and size < len(self):
            self.df = self.df.sample(size, replace=False)

    @staticmethod
    def from_df(df, size=None):
        """Alternate constructor that logs the resulting dataset size."""
        dataset = EntityDataset(df, size=size)
        print('Obtained dataset of size', len(dataset))
        return dataset

    @staticmethod
    def instance_from_row(row):
        """Build an instance from a row.

        'entityMentions' may be a stringified list (e.g. from CSV) or an
        already-parsed list of {'text': ...} dicts; only the first mention is used.
        """
        unpacked_arr = literal_eval(row['entityMentions']) if isinstance(row['entityMentions'], str) else row['entityMentions']
        entity = unpacked_arr[0]['text']
        text = row['sentText']
        return EntityDataset.get_instance(text, entity)

    @staticmethod
    def get_instance(text, entity, label=None):
        """Tokenize `text` and wrap it in a PairRelInstance.

        NOTE(review): the entity span is hard-coded to (0, 100) rather than
        located within the token sequence — confirm this placeholder is
        intentional.  (Removed the original's dead `i = 0` / always-True
        `found_entity` flag; the return is unconditional either way.)
        """
        tokens = tokenizer.tokenize(text)
        entity_range = (0, 100)
        return PairRelInstance(tokens, entity, entity_range, None, text)

    def __len__(self):
        return len(self.df.index)

    def __getitem__(self, idx):
        return EntityDataset.instance_from_row(self.df.iloc[idx])
| |
|
| |
|
| |
|
class PairRelInstance:
    """Plain data carrier: tokenized sentence, target entity, span, label, raw text."""

    def __init__(self, tokens, entity, entity_range, label, text):
        # Store everything verbatim; no validation or transformation here.
        self.text = text
        self.label = label
        self.entity_range = entity_range
        self.entity = entity
        self.tokens = tokens
# Pretrained checkpoint name shared by the tokenizer and EntityBertNet.
TRAINED_WEIGHTS = 'bert-base-uncased'
# Hidden size of bert-base (768); used to size the classifier head and to
# tile entity gather indices in indices_for_entity_ranges.
HIDDEN_OUTPUT_FEATURES = 768
| |
|
| |
|
| |
|
# NOTE(review): exact duplicate of the PairRelInstance class defined earlier
# in this file; this redefinition shadows the identical first copy. One of
# the two definitions should be deleted.
class PairRelInstance:

    def __init__(self, tokens, entity, entity_range, label, text):
        # Plain data carrier for a tokenized sentence + entity span.
        self.tokens = tokens
        self.entity = entity
        self.entity_range = entity_range
        self.label = label
        self.text = text
| |
|
def input_text_format(text):
    """Wrap raw text into the (sentText, entityMentions) pair EntityDataset expects.

    Returns None when no text is supplied so callers can filter it out.
    """
    if text is None:
        return None
    return text, [{'text': text}]
def prep(s):
    """Normalize a name: lowercase and turn underscores into spaces."""
    return s.lower().replace('_', ' ')
class BertEntityExtractor:
    """Thin CPU-inference wrapper around EntityBertNet."""

    def __init__(self):
        self.net = EntityBertNet()

    @staticmethod
    def load_saved(path):
        """Load a trained state dict onto CPU and return an extractor in eval mode."""
        extr = BertEntityExtractor()
        extr.net = EntityBertNet()
        extr.net.load_state_dict(torch.load(path, map_location=torch.device('cpu')))
        extr.net.eval()
        return extr

    @staticmethod
    def load_trained_model():
        """Load the default shipped checkpoint.

        The original lacked @staticmethod (no `self` parameter), so it was
        only callable via the class object; the decorator makes that contract
        explicit while keeping BertEntityExtractor.load_trained_model() working.
        """
        entity_extractor_path = './entity_model2.pt'
        return BertEntityExtractor.load_saved(entity_extractor_path)

    def input_text(self, texts):
        """Build a one-row EntityDataset (and its dataframe) from raw text."""
        formatted = [input_text_format(texts)]
        entity_texts = [t for t in formatted if t is not None]
        df = pd.DataFrame(entity_texts, columns=['sentText', 'entityMentions'])
        df['sentText'] = str(df['sentText'][0])
        data = EntityDataset.from_df(df)
        return data, df

    def extract_entity_probabilities(self, file_path=None, dataset=None, size=None):
        """Run the network over a dataset and return softmax scores.

        Returns:
            list of per-class probability lists, one per instance, in loader order.

        Raises:
            AttributeError: if both file_path and dataset are None.
        """
        if file_path is not None:
            # NOTE(review): EntityDataset defines from_df but no from_file in
            # this file — this branch fails unless from_file exists elsewhere.
            data, _ = EntityDataset.from_file(file_path, size=size)
        else:
            if dataset is None:
                raise AttributeError('file_path and data cannot both be None')
            data = dataset

        loader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=False, num_workers=4,
                            collate_fn=generate_production_batch)

        self.net.to(device)
        self.net.eval()

        probs = []
        with torch.no_grad():
            for input_ids, attn_mask, entity_indices, instances in loader:
                input_ids, attn_mask, entity_indices = tuple(
                    t.to(device) for t in [input_ids, attn_mask, entity_indices])
                output_scores = softmax(self.net(input_ids, attn_mask, entity_indices), dim=1)
                probs.extend(output_scores.tolist())
        return probs
        # Removed the original's unreachable trailing statement, which called
        # .items() on `probs` (a list) and would have raised if ever reached.
| |
|
| |
|
class EntityBertNet(nn.Module):
    """BERT encoder plus a linear head classifying a max-pooled entity span."""

    def __init__(self):
        super(EntityBertNet, self).__init__()
        config = BertConfig.from_pretrained(TRAINED_WEIGHTS)
        self.bert_base = BertModel.from_pretrained(TRAINED_WEIGHTS, config=config)
        self.fc = nn.Linear(HIDDEN_OUTPUT_FEATURES, NUM_CLASSES)

    def forward(self, input_ids, attn_mask, entity_indices):
        # return_dict=False yields a (sequence_output, pooled_output) tuple;
        # only the per-token sequence output is used here.
        sequence_output, _ = self.bert_base(input_ids=input_ids,
                                            attention_mask=attn_mask,
                                            return_dict=False)
        # Max-pool hidden states over each entity's token span, then classify.
        span_repr = EntityBertNet.pooled_output(sequence_output, entity_indices)
        return self.fc(span_repr)

    @staticmethod
    def pooled_output(bert_output, indices):
        # `indices` is pre-tiled to the hidden size, so gather selects the
        # span's hidden vectors; element-wise max over the span dimension
        # produces one vector per instance.
        span_states = torch.gather(input=bert_output, dim=1, index=indices)
        pooled, _ = torch.max(span_states, dim=1)
        return pooled
| |
|
| |
|
| |
|