koCSN_SAPR / utils /ner_utils.py
yuneun92's picture
Upload 13 files
bcb1848 verified
raw
history blame
12.9 kB
"""
NER ๋ชจ๋ธ์„ ์ด์šฉํ•˜์—ฌ ์ž‘์—…ํ•˜๋Š” ์ฝ”๋“œ์ž…๋‹ˆ๋‹ค.
"""
import re
import torch
import numpy as np
from collections import Counter
# Device string used for the model and its input tensors: the first CUDA GPU
# when torch reports one as available, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
def ner_tokenizer(text, max_seq_length, checkpoint):
    """
    Encode text character-by-character into fixed-length NER model inputs.

    Each character of ``text`` becomes one token. A character that opens a
    word (the very first character, an underscore, or the character right
    after an underscore) is looked up unchanged; every other character is
    looked up with a ``##`` continuation prefix.

    Args:
        text: Text to encode, with ``_`` acting as the word separator. Only
            the first ``max_seq_length - 2`` characters are used; anything
            beyond that is silently dropped (no chunking is done here).
        max_seq_length: Length of every returned list, including the two
            special tokens.
        checkpoint: Mapping whose ``'tokenizer'`` entry exposes
            ``pad_token_id``, ``cls_token_id``, ``sep_token_id`` and
            ``convert_tokens_to_ids``.

    Return:
        A dict with three lists of length ``max_seq_length``:
            input_ids: CLS id, one id per character, padding ids, then the
                SEP id in the final slot (i.e. after any padding).
            attention_mask: 1 for CLS, every character and the final SEP
                slot; 0 for padding.
            token_type_ids: all zeros.
    """
    tok = checkpoint['tokenizer']
    pad_id = tok.pad_token_id
    cls_id = tok.cls_token_id
    sep_id = tok.sep_token_id

    # Working buffers are one slot short of the final length; the last slot
    # is trimmed again below so that CLS and SEP can wrap the sequence.
    body_len = max_seq_length - 1
    ids = [pad_id] * body_len
    mask = [0] * body_len

    # Starting from "_" makes the first character count as a word start.
    previous = "_"
    for pos, char in enumerate(text[:max_seq_length - 2]):
        opens_word = char == '_' or previous == "_"
        piece = char if opens_word else '##' + char
        previous = piece
        ids[pos] = tok.convert_tokens_to_ids(piece)
        mask[pos] = 1

    return {
        "input_ids": [cls_id] + ids[:-1] + [sep_id],
        "attention_mask": [1] + mask[:-1] + [1],
        "token_type_ids": [0] * max_seq_length,
    }
def get_ner_predictions(text, checkpoint):
    """
    Tokenize ``text`` and predict one NER tag per token.

    Args:
        text: Raw text to tag. Spaces are replaced with ``_`` before
            tokenization.
        checkpoint: Mapping with a ``'model'`` entry (called with
            ``input_ids``/``attention_mask``/``token_type_ids`` keyword
            tensors), a ``'tag2id'`` mapping, and the ``'tokenizer'`` entry
            that ``ner_tokenizer`` reads.

    Returns:
        tokenized_sent: The dict produced by ``ner_tokenizer`` for the whole
            text (sequence length is ``len(text) + 2``, so nothing is
            truncated or padded).
        pred_tags: Flat list with the predicted tag name for every position
            of the input sequence, special tokens included.
    """
    model = checkpoint['model']
    tag2id = checkpoint['tag2id']
    # NOTE(review): the model is moved to the device but not switched to
    # eval mode here — confirm the caller already did that.
    model.to(device)

    text = text.replace(' ', '_')
    tokenized_sent = ner_tokenizer(text, len(text) + 2, checkpoint)

    # Add a batch dimension of 1 and move every input to the model's device.
    input_ids = torch.tensor(
        tokenized_sent['input_ids']).unsqueeze(0).to(device)
    attention_mask = torch.tensor(
        tokenized_sent['attention_mask']).unsqueeze(0).to(device)
    token_type_ids = torch.tensor(
        tokenized_sent['token_type_ids']).unsqueeze(0).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids)

    # assumes logits is (batch, seq_len, num_tags) — argmax runs over axis 2.
    logits = outputs['logits'].detach().cpu().numpy()

    # Predicted ids index into the key order of tag2id; the list is built
    # once instead of once per token.
    # NOTE(review): this relies on tag2id's keys being stored in id order.
    id_to_tag = list(tag2id.keys())
    pred_tags = [id_to_tag[tag_id]
                 for row in np.argmax(logits, axis=2)
                 for tag_id in row]
    return tokenized_sent, pred_tags
def _store_scene_entity(entity_type, entity_text, places, times):
    """Append a finished entity to ``times`` (TIM/DAT) or ``places`` (LOC)."""
    if entity_type in ('TIM', 'DAT'):
        times.append(entity_text)
    elif entity_type == 'LOC':
        places.append(entity_text)


def _resolve_speaker(speaker, start, input_ids, tokenizer, name_list, name_len):
    """Return ``speaker``, or a longer already-known name that it starts."""
    if speaker in name_list:
        return speaker
    candidate = speaker
    # Extend the name with up to name_len following tokens (beginning with
    # the token that ended the PER span) and accept the first extension that
    # has already been seen as a name.
    for pos in range(start, min(start + name_len, len(input_ids))):
        candidate += tokenizer.convert_ids_to_tokens(
            input_ids[pos]).replace('#', '')
        if candidate in name_list:
            return candidate
    return speaker


def ner_inference(tokenized_sent, pred_tags, checkpoint, name_len=5) -> tuple:
    """
    Extract person names and time/place expressions from NER predictions.

    Args:
        tokenized_sent: Dict whose ``'input_ids'`` list is aligned with
            ``pred_tags`` (as returned by ``get_ner_predictions``).
        pred_tags: Predicted tag string for each token. Tags containing
            ``PER`` build names; other non-``O`` tags are read as
            ``B-<TYPE>`` / continuation tags.
        checkpoint: Mapping whose ``'tokenizer'`` entry provides
            ``convert_ids_to_tokens``.
        name_len: How many tokens after a name to look at when checking
            whether a longer, already-known form of the name follows.

    Returns:
        name_list: Every extracted name occurrence, in order (duplicates are
            kept so that they can be counted later).
        scene: Dict with two lists, one for place (``LOC``) entities and one
            for time (``TIM``/``DAT``) entities.

    Entities still open when the tag sequence ends (the last time/place
    entity and a name that runs up to the final token) are stored as well;
    previously they were silently discarded.
    """
    # NOTE(review): both key literals look mis-encoded (probably the Korean
    # words for "place" and "time") — confirm the file's encoding. They are
    # kept exactly as found because callers index the dict with them.
    place_key, time_key = '์žฅ์†Œ', '์‹œ๊ฐ„'

    tokenizer = checkpoint['tokenizer']
    input_ids = tokenized_sent['input_ids']

    name_list = []
    places, times = [], []
    speaker = ''    # name currently being assembled
    target = ''     # time/place text currently being assembled
    c_tag = None    # entity type of ``target``

    for i, tag in enumerate(pred_tags):
        # Strip the '##' continuation marker to get the bare character.
        token = tokenizer.convert_ids_to_tokens(input_ids[i]).replace('#', '')

        if 'PER' in tag:
            # A new B-PER right after another name closes the previous one.
            if 'B' in tag and speaker != '':
                name_list.append(speaker)
                speaker = ''
            speaker += token
        elif speaker != '' and tag != pred_tags[i - 1]:
            # First token after a PER span: close the name.
            name_list.append(_resolve_speaker(
                speaker, i, input_ids, tokenizer, name_list, name_len))
            speaker = ''
        elif tag != 'O':
            if tag.startswith('B'):
                # A new entity begins: store the one collected so far.
                _store_scene_entity(c_tag, target, places, times)
                c_tag = tag[2:]
                target = token
            else:
                target += token.replace('_', ' ')

    # Flush whatever is still open at the end of the sequence.
    _store_scene_entity(c_tag, target, places, times)
    if speaker != '':
        name_list.append(speaker)

    scene = {place_key: places, time_key: times}
    return name_list, scene
def make_name_list(ner_inputs, checkpoint):
    """
    Run NER over every sentence and pool the extracted names, times, places.

    Args:
        ner_inputs: Iterable of sentences; each one is passed to
            ``get_ner_predictions``.
        checkpoint: Forwarded unchanged to ``get_ner_predictions`` and
            ``ner_inference``.

    Returns:
        Three flat lists, in this order: names, time expressions and place
        expressions, concatenated across all sentences.
    """
    all_names, all_times, all_places = [], [], []
    for sentence in ner_inputs:
        encoded, tags = get_ner_predictions(sentence, checkpoint)
        found_names, found_scene = ner_inference(encoded, tags, checkpoint)
        all_names += found_names
        # NOTE(review): both key literals look mis-encoded (probably the
        # Korean words for "time" and "place") — confirm the file's encoding.
        all_times += found_scene['์‹œ๊ฐ„']
        all_places += found_scene['์žฅ์†Œ']
    return all_names, all_times, all_places
def show_name_list(name_list):
    """
    Summarize the extracted names as occurrence counts.

    Arg:
        name_list: Extracted names, duplicates included.

    Return:
        A plain dict mapping each distinct name to the number of times it
        appears, keyed in order of first appearance.
    """
    tally = Counter()
    tally.update(name_list)
    return dict(tally)
def compare_strings(str1, str2):
    """
    Decide whether two NER-extracted names should count as the same name.

    Different lengths: the names match when the shorter string occurs inside
    the longer one.
    Equal lengths: ``str1`` is read from the left while each character occurs
    anywhere in ``str2``; the names match when at least two characters are
    consumed before the first miss.

    Note that the equal-length rule depends on argument order, and that an
    empty string matches any non-empty string (it is trivially contained).

    Args:
        str1: First name.
        str2: Second name.

    Return:
        True when the two strings are judged to be the same name, else False.
    """
    if len(str1) == len(str2):
        leading_hits = 0
        for char in str1:
            if char not in str2:
                break
            leading_hits += 1
        return leading_hits >= 2

    shorter, longer = sorted((str1, str2), key=len)
    return shorter in longer
def combine_similar_names(names_dict):
    """
    Group name variants that ``compare_strings`` judges to be the same name.

    Two-character names are used as the initial group seeds. Every name
    longer than one character is then compared with the members of each
    group in turn and handled by the first match:

    * different length from the matched member: the name joins the group;
    * same length: the name itself is not added. The matched member with its
      trailing suffix (the regex below) removed is added instead, unless that
      form, or the characters of the name that occur in the member, are
      already in the group.

    A name that matches nothing starts its own group.

    Args:
        names_dict: Mapping of name -> occurrence count (for example the
            result of ``show_name_list``).

    Returns:
        Dict mapping a tuple of the group's members longer than one character
        to the summed counts of the group's members. Members that are not
        keys of ``names_dict`` (suffix-stripped forms) contribute 0, where
        the lookup previously raised ``KeyError``. Groups without any member
        longer than one character are omitted.
    """
    names = names_dict.keys()
    similar_groups = [[name] for name in names if len(name) == 2]

    for name in names:
        found = False
        # Names of length 0 or 1 are never merged into an existing group.
        if len(name) > 1:
            for group in similar_groups:
                for item in group:
                    if not compare_strings(name, item):
                        continue
                    found = True
                    if len(name) == len(item):
                        # NOTE(review): the pattern looks mis-encoded
                        # (probably two Korean name suffixes) — confirm the
                        # file's encoding. Kept exactly as found.
                        cleaned_text = re.sub(r'(์•„|์ด)$', '', item)
                        same_part = ''.join(
                            char for char in name if char in item)
                        if (same_part not in group
                                and cleaned_text not in group):
                            group.append(cleaned_text)
                    else:
                        group.append(name)
                    # Only the first matching member is considered.
                    break
                if found:
                    break
        if not found:
            similar_groups.append([name])

    updated_names = {}
    for group in similar_groups:
        members = tuple(name for name in group if len(name) > 1)
        if not members:
            continue
        # One-character members are left out of the key but still counted.
        updated_names[members] = sum(
            names_dict.get(name, 0) for name in group if name != '')
    return updated_names
def convert_name2codename(codename2name, text):
    """
    Replace every known name in ``text`` with its codename.

    Names are matched literally. They were previously handed to ``re.sub``
    as regular-expression patterns (and codenames as replacement templates),
    so a name or codename containing characters such as ``.``, ``(``, ``[``
    or a backslash could raise ``re.error`` or replace the wrong text.

    Args:
        codename2name: Mapping of codename -> list of names/aliases. Each
            list is sorted in place, longest name first (ties broken by
            reverse string order), so that a long alias is replaced before a
            shorter alias contained in it.
        text: Text in which the names are replaced.

    Returns:
        The text with all names replaced. Codenames are processed in the
        mapping's iteration order; empty names are skipped.
    """
    for n_list in codename2name.values():
        n_list.sort(key=lambda x: (len(x), x), reverse=True)

    for codename, n_list in codename2name.items():
        for subname in n_list:
            # An empty name would insert the codename between every character.
            if subname:
                text = text.replace(subname, codename)
    return text
def convert_codename2name(codename2name, text):
    """
    Map codenames back to names.

    Args:
        codename2name: Mapping of codename -> list of names; the first list
            entry is used as the name.
        text: Iterable of codenames (despite the parameter name it is
            iterated element by element).

    Returns:
        A list with one entry per codename: the first name registered for
        it, or a fixed placeholder string when the codename is unknown or
        has no usable name list.
    """
    outputs = []
    for codename in text:
        try:
            outputs.append(codename2name[codename][0])
        # Only lookup failures fall back to the placeholder; a bare except
        # would also swallow KeyboardInterrupt and SystemExit.
        except (KeyError, IndexError, TypeError):
            # NOTE(review): the placeholder looks mis-encoded (probably
            # Korean for "unknown") — confirm the file's encoding.
            outputs.append('์•Œ ์ˆ˜ ์—†์Œ')
    return outputs