# ishaq101's picture
# feat/Catalog Retrieval System (#1)
# 6bff5d9
"""Regex patterns and column-name heuristics for PII detection.
Used by catalog/pii_detector.py at ingestion time. Default policy:
when in doubt, set pii_flag=True. False positives cost nothing; false
negatives leak data.
"""
import re
# Column-name keywords that hint a column may hold personal data.  The
# keywords are grouped by category purely for readability; the resulting set
# is flat, immutable and unordered.
PII_NAME_PATTERNS = frozenset(
    # Contact channels.
    ("email", "phone", "mobile", "telp", "telephone")
    # Government-issued / tax identifiers ("ktp" and "nik" are presumably
    # Indonesian identity-document terms -- confirm with the data owners).
    + ("ssn", "tin", "passport", "ktp", "nik")
    # Personal names.
    + ("name", "fullname", "first_name", "last_name", "surname")
    # Postal location.
    + ("address", "street", "zipcode", "postal")
    # Date of birth.
    + ("birthdate", "dob", "birthday")
)
# Shape check for a value that is, in its entirety, one e-mail address:
#   local part   - one or more word characters, dots, pluses or hyphens
#   "@"
#   first label  - one or more word characters or hyphens (no dots)
#   "."          - at least one literal dot is required in the domain
#   remainder    - one or more word characters, dots or hyphens
# The pattern is anchored at both ends, so text surrounding an address
# (including leading whitespace) prevents a match.
_EMAIL_PATTERN = r"^[\w.+-]+@[\w-]+\.[\w.-]+$"
EMAIL_REGEX = re.compile(_EMAIL_PATTERN)
# Shape check for a value that looks like a phone number: an optional leading
# "+", then at least seven characters drawn from digits, whitespace, hyphens,
# parentheses and dots.
#
# "." is accepted as a separator so that dot-delimited numbers such as
# "555.123.4567" are recognized; leaving it out produced false negatives.
# The check is deliberately loose: long integers, ISO dates and decimal
# numbers also match.  That follows this module's policy of preferring a
# false positive over a missed PII value.
PHONE_REGEX = re.compile(r"^\+?[\d\s\-().]{7,}$")