| | import re |
| | import logging |
| |
|
# Module-level logger used by RegexPIIRemover for its info/debug messages.
logger = logging.getLogger(__name__)


class RegexPIIRemover:
    """Remove PII from free text using a fixed set of regex patterns.

    The compiled patterns live in ``self.patterns`` (name -> pattern) and are
    applied one after another, in insertion order, each one operating on the
    output of the previous one. Every match is replaced with ``[REDACTED]``;
    for the labelled types (``patient_name``, ``dob``, ``mrn``) the label
    captured by group 1 is kept and rewritten as ``"<label>: [REDACTED]"``.
    """

    # Text substituted for every match.
    _PLACEHOLDER = '[REDACTED]'

    # Pattern names whose first capture group is a label (e.g. "MRN") that
    # must survive redaction.
    _LABELLED_TYPES = frozenset({'patient_name', 'dob', 'mrn'})

    def __init__(self):
        """Compile the PII patterns and store them in ``self.patterns``."""
        self.patterns = {
            # US Social Security number in dashed form: 123-45-6789.
            'ssn': re.compile(r'\b\d{3}-\d{2}-\d{4}\b'),

            # 10-digit phone number. The area code may be parenthesised and
            # the groups may be separated by '-', '.', or a single space
            # (a literal space, not \s, so digits on separate lines are not
            # glued together into a "phone number").
            'phone': re.compile(
                r'(?:\(\d{3}\) ?|\b\d{3}[-. ]?)\d{3}[-. ]?\d{4}\b'
            ),

            # Email address. The TLD class is [A-Za-z]; a '|' inside a
            # character class is a literal pipe, which would swallow the
            # delimiter and the following field in pipe-separated text.
            'email': re.compile(
                r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
            ),

            # Numeric date with '/' or '-' separators: 1/2/90, 01-02-1990.
            'date': re.compile(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'),

            # Medical record number introduced by its label.
            'mrn': re.compile(
                r'\b(MRN|Medical Record Number)[:\s]+\w+\b', re.IGNORECASE
            ),

            # US ZIP code, optionally ZIP+4. Matches any standalone
            # 5-digit number.
            'zip': re.compile(r'\b\d{5}(-\d{4})?\b'),

            # "Patient"/"Name" label followed by two words. IGNORECASE
            # applies to the whole pattern, so the two words do not need to
            # be capitalised.
            'patient_name': re.compile(
                r'(Patient|Name)[:\s]+([A-Z][a-z]+\s[A-Z][a-z]+)',
                re.IGNORECASE,
            ),

            # Labelled date of birth. 'date' runs earlier in the iteration
            # order, so the date part has usually been redacted already by
            # the time this pattern is tried.
            'dob': re.compile(
                r'(DOB|Date of Birth)[:\s]+\d{1,2}[/-]\d{1,2}[/-]\d{2,4}',
                re.IGNORECASE,
            ),
        }

        logger.info(
            "RegexPIIRemover initialized with %d patterns", len(self.patterns)
        )

    def remove_pii(self, text: str) -> tuple[str, int]:
        """Redact every pattern match in ``text``.

        Args:
            text: Input text.

        Returns:
            tuple: ``(cleaned_text, count_of_pii_removed)`` where the count is
            the total number of substitutions made across all patterns.
        """
        cleaned_text = text
        total_removed = 0
        labelled_replacement = r'\1: ' + self._PLACEHOLDER

        for pii_type, pattern in self.patterns.items():
            if pii_type in self._LABELLED_TYPES:
                replacement = labelled_replacement
            else:
                replacement = self._PLACEHOLDER

            # subn replaces and counts in one pass over the text.
            cleaned_text, count = pattern.subn(replacement, cleaned_text)

            if count:
                logger.debug("Found %d instances of %s", count, pii_type)
                total_removed += count

        logger.info("Removed %d PII entities", total_removed)

        return cleaned_text, total_removed