#!/usr/bin/env python3
"""Minimal runnable example for FrameByFrame/korean-pii-e5-base.

    pip install "transformers>=4.40" torch safetensors
    python usage.py
"""
import os
import re
import warnings

import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

# The checkpoint can be overridden through the MODEL_ID environment variable.
MODEL_ID = os.environ.get("MODEL_ID", "FrameByFrame/korean-pii-e5-base")

# Loaded once at import time; extract_pii() reads these module-level objects.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
model.eval()
if torch.cuda.is_available():
    model.cuda()

# Trailing particles/suffixes removed from the end of name-like spans.
# Order matters: the first matching entry wins, and multi-character entries
# are listed before the single-character ones.
_TRAILING_JOSA = ["이에요", "이라고", "입니다", "이야", "이랑", "한테", "에게", "으로", "이가", "이는",
                  "에서", "이고", "예요", "씨", "님", "이", "가", "은", "는", "을", "를", "야", "아", "에",
                  "의", "랑", "께", "고"]

# Greedy match up to the LAST date-like character of a span. Besides "일" and
# digits, "년" and "월" are valid endings so that "2023년 5월에" is trimmed to
# "2023년 5월" and not to "2023년 5".
_DATE_END = re.compile(r".*(?:[년월일]|[0-9])", re.S)

# Characters trimmed from both edges of every span.
_EDGE_CHARS = " .,\t\n"

# Labels whose spans get trailing-suffix stripping.
_SUFFIX_STRIPPED_LABELS = ("private_person", "personal_handle", "private_address")


def _normalize(text, label, s, e):
    """Tighten the character span ``text[s:e]`` predicted for ``label``.

    Args:
        text: The full input string the offsets refer to.
        label: Category name of the span (without the tagging prefix).
        s: Start character offset (inclusive).
        e: End character offset (exclusive).

    Returns:
        A ``(start, end)`` tuple with ``s <= start <= end <= e``.
    """
    # Trim spaces, tabs, newlines, periods and commas from both edges.
    while s < e and text[s] in _EDGE_CHARS:
        s += 1
    while e > s and text[e - 1] in _EDGE_CHARS:
        e -= 1

    if label == "private_date":
        # Cut everything after the last date-like character.
        m = _DATE_END.match(text[s:e])
        if m and m.end() > 0:
            e = s + m.end()
    elif label in _SUFFIX_STRIPPED_LABELS:
        # Strip at most two stacked suffixes, always leaving >= 2 characters.
        for _ in range(2):
            seg = text[s:e]
            for josa in _TRAILING_JOSA:
                if seg.endswith(josa) and (e - s) - len(josa) >= 2:
                    e -= len(josa)
                    break
            else:
                # No suffix matched on this pass; nothing more to strip.
                break
    return s, e


def extract_pii(text, max_length=256):
    """Run the token classifier on ``text`` and return the detected spans.

    Labels other than ``"O"`` are expected to look like
    ``"<prefix>-<category>"``. A ``B`` or ``S`` prefix always starts a new
    span; any other prefix extends the open span when the category matches
    and otherwise starts a new one.

    Args:
        text: Input string to scan.
        max_length: Maximum number of tokens passed to the model. Longer
            inputs are truncated by the tokenizer; a ``RuntimeWarning`` is
            emitted when that leaves part of ``text`` unscanned.

    Returns:
        A list of dicts with keys ``"label"``, ``"start"``, ``"end"`` and
        ``"text"``, where ``start``/``end`` are character offsets into
        ``text`` and ``text[start:end]`` equals the ``"text"`` value.
    """
    enc = tokenizer(text, truncation=True, max_length=max_length,
                    return_offsets_mapping=True, return_tensors="pt")
    offsets = enc.pop("offset_mapping")[0].tolist()

    # Truncation must not pass silently: PII in the cut-off tail would be
    # neither reported here nor masked by redact().
    covered = max((end for _, end in offsets), default=0)
    at_limit = max_length is None or len(offsets) >= max_length
    if at_limit and covered < len(text.rstrip()):
        warnings.warn(
            "extract_pii: input was truncated by the tokenizer; characters "
            f"from offset {covered} onward were not scanned for PII",
            RuntimeWarning,
            stacklevel=2,
        )

    with torch.no_grad():
        logits = model(**{k: v.to(model.device) for k, v in enc.items()}).logits
    pred = logits.argmax(-1)[0].tolist()
    id2label = model.config.id2label

    spans = []
    active = None  # currently open span as [category, start, end]
    for (cs, ce), lid in zip(offsets, pred):
        label = id2label[int(lid)]
        # Tokens with an empty offset range and "O" tokens close the open span.
        if cs == ce or label == "O":
            if active:
                spans.append(active)
                active = None
            continue
        prefix, cat = label.split("-", 1)
        if prefix in ("B", "S") or not active or active[0] != cat:
            if active:
                spans.append(active)
            active = [cat, cs, ce]
        else:
            active[2] = ce
    if active:
        spans.append(active)

    out = []
    for cat, s, e in spans:
        s, e = _normalize(text, cat, s, e)
        # Drop spans that normalise down to nothing but whitespace.
        if text[s:e].strip():
            out.append({"label": cat, "start": s, "end": e, "text": text[s:e]})
    return out


def redact(text):
    """Return ``text`` with every detected span replaced by ``[LABEL]``.

    ``LABEL`` is the upper-cased category reported by ``extract_pii``.
    """
    # Replace from the rightmost span first so earlier offsets stay valid.
    found = sorted(extract_pii(text), key=lambda span: span["start"], reverse=True)
    for span in found:
        text = text[:span["start"]] + f"[{span['label'].upper()}]" + text[span["end"]:]
    return text


def main():
    """Print the detected spans and the redacted form of a few sample sentences."""
    samples = [
        "김민수님의 번호는 010-1234-5678입니다.",
        "계좌 110-234-567890으로 입금하고 minsu@example.com으로 알려주세요.",
        "이수진 고객님 생년월일은 1985년 3월 12일입니다.",
    ]
    for t in samples:
        print(t)
        for sp in extract_pii(t):
            print(f" {sp['label']:16} {sp['text']!r}")
        print(" REDACT:", redact(t))
        print()


if __name__ == "__main__":
    main()