Instructions to use yuneun92/koCSN_SAPR with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use yuneun92/koCSN_SAPR with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="yuneun92/koCSN_SAPR")
```

```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("yuneun92/koCSN_SAPR", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
"""
Utilities for working with the NER model.
"""
import re
import torch
import numpy as np
from collections import Counter

# Torch device used by the inference helpers in this module: the first CUDA
# device when CUDA is available, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
def ner_tokenizer(text, max_seq_length, checkpoint):
    """
    Tokenize text one character (syllable) at a time for the NER model.

    Each character of ``text`` becomes one token. ``'_'`` acts as the word
    separator: a character that directly follows another non-``'_'``
    character is looked up with a ``'##'`` prefix, while the first character
    of the text, any character after ``'_'``, and ``'_'`` itself are looked
    up unprefixed.

    Args:
        text: Text to tokenize. It is cut to ``max_seq_length - 2``
            characters so the two special tokens fit.
        max_seq_length: Length of every list in the returned dictionary.
        checkpoint: Mapping whose ``'tokenizer'`` entry provides
            ``pad_token_id``, ``cls_token_id``, ``sep_token_id`` and
            ``convert_tokens_to_ids``.

    Return:
        A dictionary with three lists of length ``max_seq_length``:
            input_ids: CLS id, one id per character, padding ids, and the
                SEP id in the final slot.
            attention_mask: 1 for CLS, every character and the final SEP
                slot; 0 for padding.
            token_type_ids: all zeros.
    """
    tokenizer = checkpoint['tokenizer']
    pad_id = tokenizer.pad_token_id
    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id

    clipped = text[:max_seq_length - 2]

    # One slot short of the final length: CLS is prepended and SEP appended
    # below, after the last working slot is dropped.
    body_ids = [pad_id] * (max_seq_length - 1)
    body_mask = [0] * (max_seq_length - 1)

    previous_char = "_"
    for position, char in enumerate(clipped):
        # Only a character inside a word (neither a separator itself nor
        # directly after one) gets the continuation prefix.
        inside_word = char != "_" and previous_char != "_"
        piece = '##' + char if inside_word else char
        previous_char = char

        body_ids[position] = tokenizer.convert_tokens_to_ids(piece)
        body_mask[position] = 1

    return {
        "input_ids": [cls_id] + body_ids[:-1] + [sep_id],
        "attention_mask": [1] + body_mask[:-1] + [1],
        "token_type_ids": [0] * max_seq_length,
    }
def get_ner_predictions(text, checkpoint):
    """
    Run the NER model on one text and return its tokens and predicted tags.

    Spaces in ``text`` are replaced with ``'_'`` before tokenization, and the
    sequence length is set to ``len(text) + 2`` so the whole text plus the
    two special tokens fits without padding or truncation.

    Args:
        text: Text to run NER on.
        checkpoint: Mapping with a ``'model'`` entry (the model to call), a
            ``'tag2id'`` entry (tag name -> id mapping) and the
            ``'tokenizer'`` entry that ``ner_tokenizer`` reads.

    Returns:
        tokenized_sent: The dictionary produced by ``ner_tokenizer``.
        pred_tags: Flat list with one predicted tag name per token position,
            including the positions of the two special tokens.
    """
    model = checkpoint['model']
    tag2id = checkpoint['tag2id']
    model.to(device)

    # The tokenizer treats '_' as the word separator.
    text = text.replace(' ', '_')
    tokenized_sent = ner_tokenizer(text, len(text) + 2, checkpoint)

    def as_batch(key):
        # Add a batch dimension of 1 and move the tensor to the model device.
        return torch.tensor(tokenized_sent[key]).unsqueeze(0).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(
            input_ids=as_batch('input_ids'),
            attention_mask=as_batch('attention_mask'),
            token_type_ids=as_batch('token_type_ids'))

    logits = outputs['logits'].detach().cpu().numpy()

    # A predicted id is used as a position in tag2id's key order; build the
    # key list once instead of once per token.
    # NOTE(review): this assumes tag2id was built with ids 0..n-1 in
    # insertion order - confirm against the checkpoint.
    tag_names = list(tag2id.keys())
    pred_tags = [tag_names[tag_id]
                 for row in np.argmax(logits, axis=2)
                 for tag_id in row]
    return tokenized_sent, pred_tags
def ner_inference(tokenized_sent, pred_tags, checkpoint, name_len=5) -> tuple:
    """
    Extract person names and time/place expressions from NER predictions.

    Consecutive tokens tagged ``PER`` are joined into a name. When a name
    ends and is not yet in the result list, up to ``name_len`` tokens
    starting at the current position are appended one by one; if any of the
    extended strings is already in the list, that known form is recorded
    instead of the shorter one. Tokens with other non-``'O'`` tags are
    joined into an entity that is stored under the time key for ``TIM`` and
    ``DAT`` and under the place key for ``LOC``; other types are discarded.

    Spans still open when the tags run out are flushed after the loop, so
    the last time/place entity and a trailing name are not lost.

    Args:
        tokenized_sent: Dictionary with an ``'input_ids'`` list aligned
            position by position with ``pred_tags``.
        pred_tags: Predicted tag name for each token position.
        checkpoint: Mapping whose ``'tokenizer'`` entry provides
            ``convert_ids_to_tokens``.
        name_len: Number of tokens to look ahead when checking whether a
            new name is the start of an already known longer name.

    Returns:
        name_list: Extracted names, one entry per occurrence.
        scene: Dictionary with two lists, places first and times second.
    """
    tokenizer = checkpoint['tokenizer']
    input_ids = tokenized_sent['input_ids']

    # Keys of the returned dictionary, kept verbatim because callers index
    # the result with these exact strings.
    # NOTE(review): they look like mis-encoded Korean words for "place" and
    # "time" - confirm against the upstream file.
    place_key, time_key = '์ฅ์', '์๊ฐ'
    scene = {place_key: [], time_key: []}

    name_list = []
    speaker = ''   # PER span currently being assembled
    target = ''    # non-PER entity currently being assembled
    c_tag = None   # entity type of `target` (tag without its "B-" prefix)

    def token_at(index):
        # Token text with the word-piece '#' markers removed.
        return tokenizer.convert_ids_to_tokens(input_ids[index]).replace('#', '')

    def store_target():
        # File the finished entity under its category.
        if c_tag in ('TIM', 'DAT'):
            scene[time_key].append(target)
        elif c_tag == 'LOC':
            scene[place_key].append(target)

    for i, tag in enumerate(pred_tags):
        token = token_at(i)
        if 'PER' in tag:
            # A new B- tag directly after a name starts another name.
            if 'B' in tag and speaker != '':
                name_list.append(speaker)
                speaker = ''
            speaker += token
        elif speaker != '' and tag != pred_tags[i - 1]:
            # The PER span has just ended.
            if speaker in name_list:
                name_list.append(speaker)
            else:
                candidate = speaker
                found_name = False
                for j in range(name_len):
                    if i + j < len(input_ids):
                        candidate += token_at(i + j)
                        if candidate in name_list:
                            name_list.append(candidate)
                            found_name = True
                            break
                if not found_name:
                    name_list.append(speaker)
            speaker = ''
        elif tag != 'O':
            if tag.startswith('B'):
                # A new entity begins: file the previous one first.
                store_target()
                c_tag = tag[2:]
                target = token
            else:
                target += token.replace('_', ' ')

    # Flush whatever is still open at the end of the sequence.
    store_target()
    if speaker != '':
        name_list.append(speaker)

    return name_list, scene
def make_name_list(ner_inputs, checkpoint):
    """
    Run NER over several texts and pool the extracted names, times and places.

    Args:
        ner_inputs: Iterable of texts; each is passed to
            ``get_ner_predictions`` and its result to ``ner_inference``.
        checkpoint: Passed through unchanged to both helpers.

    Returns:
        Three lists, in this order: all names, all time expressions and all
        place expressions, concatenated in input order.
    """
    all_names, all_times, all_places = [], [], []

    for sentence in ner_inputs:
        tokens, tags = get_ner_predictions(sentence, checkpoint)
        found_names, found_scene = ner_inference(tokens, tags, checkpoint)

        all_names += found_names
        all_times += found_scene['์๊ฐ']
        all_places += found_scene['์ฅ์']

    return all_names, all_times, all_places
def show_name_list(name_list):
    """
    Summarize the extracted names as occurrence counts.

    Arg:
        name_list: Extracted names, one entry per occurrence.

    Return:
        A plain dictionary mapping each distinct name to the number of times
        it appears in ``name_list``.
    """
    tally = Counter()
    tally.update(name_list)
    return dict(tally)
def compare_strings(str1, str2):
    """
    Decide whether two extracted names should be treated as the same name.

    Strings of different length match when the shorter one occurs inside
    the longer one. Strings of equal length match when at least the first
    two characters of ``str1`` each occur somewhere in ``str2``; the scan
    stops at the first character of ``str1`` that is absent from ``str2``.

    Used by ``combine_similar_names`` to group variants of one name.

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Return:
        True when the two strings are judged to be the same name, otherwise
        False.
    """
    if len(str1) == len(str2):
        # Length of the leading run of str1 whose characters appear in str2.
        leading_hits = 0
        for char in str1:
            if char not in str2:
                break
            leading_hits += 1
        return leading_hits >= 2

    shorter, longer = sorted((str1, str2), key=len)
    return shorter in longer
def combine_similar_names(names_dict):
    """
    Group similar names together using ``compare_strings``.

    Every two-character name seeds its own group, since such strings are
    likely to be bare names. Each name is then compared with the members of
    the existing groups in order and joins the first group holding a
    matching member; a name that matches nothing, or is shorter than two
    characters, starts a new group.

    Args:
        names_dict: Mapping from name to its occurrence count.

    Returns:
        A dictionary whose keys are tuples of the group members longer than
        one character and whose values are the summed counts of the group.
        Groups with no such member are omitted. Members that are not keys of
        ``names_dict`` (suffix-stripped forms added during grouping) count
        as 0 instead of raising ``KeyError``.
    """
    names = list(names_dict)
    similar_groups = [[name] for name in names if len(name) == 2]

    for name in names:
        found = False
        for group in similar_groups:
            for item in group:
                if len(name) > 1 and compare_strings(name, item):
                    found = True
                    # Pattern kept verbatim.
                    # NOTE(review): it looks like a mis-encoded pair of
                    # Korean name suffixes - confirm against upstream.
                    cleaned_text = re.sub(r'(์|์ด)$', '', item)
                    if len(name) == len(item):
                        # Characters of `name` that also occur in `item`.
                        same_part = ''.join(
                            char for char in name if char in item)
                        if same_part not in group and cleaned_text not in group:
                            group.append(cleaned_text)
                    else:
                        group.append(name)
                    break
            if found:
                break
        if not found:
            similar_groups.append([name])

    updated_names = {}
    for group in similar_groups:
        members = tuple(member for member in group if len(member) > 1)
        if members:
            updated_names[members] = sum(
                names_dict.get(member, 0) for member in group if member != '')
    return updated_names
def convert_name2codename(codename2name, text):
    """
    Replace every known name in ``text`` with its codename.

    Names are matched literally, so names or codenames containing regular
    expression metacharacters or backslashes are handled as plain text.

    Args:
        codename2name: Mapping from codename to the list of names it stands
            for. Each list is sorted IN PLACE, longest name first (ties in
            reverse lexical order), so a longer name is replaced before any
            shorter name contained in it.
        text: Text in which to replace names.

    Returns:
        ``text`` with every occurrence of every listed name replaced by the
        corresponding codename.
    """
    for n_list in codename2name.values():
        n_list.sort(key=lambda x: (len(x), x), reverse=True)

    for codename, n_list in codename2name.items():
        for subname in n_list:
            text = text.replace(subname, codename)
    return text
def convert_codename2name(codename2name, text):
    """
    Convert codenames back to names.

    Args:
        codename2name: Mapping from codename to a list of names; the first
            name of the list is used.
        text: Iterable of codenames to look up, one output entry per item.

    Returns:
        A list with, for each item of ``text``, the first name registered
        for it, or a fixed placeholder string when the item cannot be
        looked up (unknown or unhashable key, empty or non-indexable entry).
    """
    outputs = []
    for i in text:
        try:
            outputs.append(codename2name[i][0])
        except (KeyError, IndexError, TypeError):
            # Only failed lookups fall back to the placeholder; anything
            # else (e.g. KeyboardInterrupt) propagates.
            outputs.append('์ ์ ์์')
    return outputs