""" Document type validation utility Helps identify if uploaded documents are actually patents """ import re from pathlib import Path from typing import Tuple, List from loguru import logger class DocumentValidator: """Validate that uploaded documents are patents""" # Keywords that should appear in patent documents PATENT_KEYWORDS = [ 'patent', 'claim', 'claims', 'invention', 'abstract', 'field of invention', 'background', 'detailed description', 'inventor', 'assignee', 'filing date', 'application' ] # Required sections in patents REQUIRED_SECTIONS = ['abstract', 'claim'] @staticmethod def validate_patent_document(text: str) -> Tuple[bool, List[str]]: """ Validate if document text appears to be a patent Args: text: Extracted document text Returns: Tuple of (is_valid, issues_found) """ text_lower = text.lower() issues = [] # Check minimum length if len(text) < 500: issues.append("Document too short (< 500 characters)") # Check for patent keywords keyword_matches = sum(1 for kw in DocumentValidator.PATENT_KEYWORDS if kw in text_lower) if keyword_matches < 3: issues.append(f"Only {keyword_matches} patent keywords found (expected at least 3)") # Check for required sections missing_sections = [section for section in DocumentValidator.REQUIRED_SECTIONS if section not in text_lower] if missing_sections: issues.append(f"Missing required sections: {', '.join(missing_sections)}") # Check for claim structure (claims usually numbered) claim_pattern = r'claim\s+\d+' claims_found = len(re.findall(claim_pattern, text_lower)) if claims_found == 0: issues.append("No numbered claims found") # Determine if valid is_valid = len(issues) == 0 or (keyword_matches >= 3 and claims_found > 0) if not is_valid: logger.warning(f"Document validation failed: {issues}") return is_valid, issues @staticmethod def identify_document_type(text: str) -> str: """ Try to identify what type of document this is Returns: Document type description """ text_lower = text.lower() # Check for common non-patent document types if 'microsoft' in text_lower and 'windows' in text_lower: return "Microsoft Windows documentation" if any(term in text_lower for term in ['press release', 'news', 'announcement']): return "Press release or news article" if any(term in text_lower for term in ['whitepaper', 'white paper', 'technical report']): return "Technical whitepaper or report" if any(term in text_lower for term in ['terms of service', 'privacy policy', 'license agreement']): return "Legal agreement or policy document" if 'research paper' in text_lower or 'ieee' in text_lower or 'conference' in text_lower: return "Academic research paper" # Check if it's a patent is_patent, _ = DocumentValidator.validate_patent_document(text) if is_patent: return "Patent document" return "Unknown document type (not a patent)" def validate_and_log(text: str, document_name: str = "document") -> bool: """ Convenience function to validate and log results Args: text: Document text document_name: Name of document for logging Returns: True if valid patent, False otherwise """ is_valid, issues = DocumentValidator.validate_patent_document(text) if not is_valid: doc_type = DocumentValidator.identify_document_type(text) logger.error(f"❌ {document_name} is NOT a valid patent") logger.error(f" Detected type: {doc_type}") logger.error(f" Issues: {', '.join(issues)}") return False logger.success(f"✅ {document_name} appears to be a valid patent") return True