# SPARKNET / src/utils/document_validator.py
# MHamdan's picture
# Initial commit: SPARKNET framework
# a9dc537
"""
Document type validation utility
Helps identify if uploaded documents are actually patents
"""
import re
from pathlib import Path
from typing import Tuple, List
from loguru import logger
class DocumentValidator:
    """Heuristically check whether extracted document text looks like a patent.

    Every check is a case-insensitive substring or regex test on the raw
    text; the document structure itself is never parsed.
    """

    # Keywords that should appear in patent documents
    PATENT_KEYWORDS = [
        'patent', 'claim', 'claims', 'invention', 'abstract',
        'field of invention', 'background', 'detailed description',
        'inventor', 'assignee', 'filing date', 'application'
    ]

    # Required sections in patents
    REQUIRED_SECTIONS = ['abstract', 'claim']

    # Thresholds used by validate_patent_document
    _MIN_LENGTH = 500
    _MIN_KEYWORDS = 3

    # "claim 1", "claim 12", ... (matched against lower-cased text)
    _CLAIM_PATTERN = re.compile(r'claim\s+\d+')

    @staticmethod
    def _count_distinct_keywords(text_lower: str) -> int:
        """Count matched patent keywords, ignoring overlapping duplicates.

        Some keywords are substrings of others ('claim' / 'claims',
        'invention' / 'field of invention'). A plain substring count would
        score a single occurrence of the longer keyword twice, so a keyword
        is skipped when another matched keyword already contains it.

        Args:
            text_lower: Lower-cased document text

        Returns:
            Number of distinct keywords found
        """
        matched = [kw for kw in DocumentValidator.PATENT_KEYWORDS
                   if kw in text_lower]
        return sum(
            1 for kw in matched
            if not any(kw != other and kw in other for other in matched)
        )

    @staticmethod
    def validate_patent_document(text: str) -> Tuple[bool, List[str]]:
        """
        Validate if document text appears to be a patent

        The document is accepted when no issue is found, or when it has at
        least 3 distinct patent keywords and at least one numbered claim.
        In the second case the length and missing-section findings are
        advisory only, so ``issues_found`` may be non-empty even though
        ``is_valid`` is True.

        Args:
            text: Extracted document text. ``None`` or an empty string is
                treated as an empty document.

        Returns:
            Tuple of (is_valid, issues_found)
        """
        # A failed extraction may hand us None; treat it as an empty document
        # instead of crashing on .lower().
        text = text or ""
        text_lower = text.lower()
        issues: List[str] = []

        # Check minimum length
        if len(text) < DocumentValidator._MIN_LENGTH:
            issues.append(
                f"Document too short (< {DocumentValidator._MIN_LENGTH} characters)"
            )

        # Check for patent keywords (overlapping keywords count once)
        keyword_matches = DocumentValidator._count_distinct_keywords(text_lower)
        enough_keywords = keyword_matches >= DocumentValidator._MIN_KEYWORDS
        if not enough_keywords:
            issues.append(
                f"Only {keyword_matches} patent keywords found "
                f"(expected at least {DocumentValidator._MIN_KEYWORDS})"
            )

        # Check for required sections
        missing_sections = [section for section in DocumentValidator.REQUIRED_SECTIONS
                            if section not in text_lower]
        if missing_sections:
            issues.append(f"Missing required sections: {', '.join(missing_sections)}")

        # Check for claim structure (claims usually numbered); only presence
        # matters, so a single search is enough.
        has_numbered_claims = (
            DocumentValidator._CLAIM_PATTERN.search(text_lower) is not None
        )
        if not has_numbered_claims:
            issues.append("No numbered claims found")

        # Determine if valid: either nothing was flagged, or the two strong
        # signals (keywords + numbered claims) are both present.
        is_valid = not issues or (enough_keywords and has_numbered_claims)
        if not is_valid:
            logger.warning(f"Document validation failed: {issues}")
        return is_valid, issues

    @staticmethod
    def identify_document_type(text: str) -> str:
        """
        Try to identify what type of document this is

        The non-patent heuristics are evaluated first and are plain substring
        tests, so a text that mentions e.g. 'conference' or 'news' is
        reported as that non-patent type even if it would also pass patent
        validation.

        Args:
            text: Extracted document text. ``None`` or an empty string is
                treated as an empty document.

        Returns:
            Document type description
        """
        text = text or ""
        text_lower = text.lower()

        # Check for common non-patent document types
        if 'microsoft' in text_lower and 'windows' in text_lower:
            return "Microsoft Windows documentation"
        if any(term in text_lower for term in ['press release', 'news', 'announcement']):
            return "Press release or news article"
        if any(term in text_lower for term in ['whitepaper', 'white paper', 'technical report']):
            return "Technical whitepaper or report"
        if any(term in text_lower for term in ['terms of service', 'privacy policy', 'license agreement']):
            return "Legal agreement or policy document"
        if 'research paper' in text_lower or 'ieee' in text_lower or 'conference' in text_lower:
            return "Academic research paper"

        # Check if it's a patent
        is_patent, _ = DocumentValidator.validate_patent_document(text)
        if is_patent:
            return "Patent document"
        return "Unknown document type (not a patent)"
def validate_and_log(text: str, document_name: str = "document") -> bool:
    """
    Run patent validation on a document and report the outcome via the logger

    On success a single success-level message is emitted. On failure the
    detected document type and the list of validation issues are emitted as
    error-level messages.

    Args:
        text: Document text
        document_name: Name of document for logging

    Returns:
        True if valid patent, False otherwise
    """
    passed, problems = DocumentValidator.validate_patent_document(text)

    # Happy path first: nothing more to report for a valid patent.
    if passed:
        logger.success(f"✅ {document_name} appears to be a valid patent")
        return True

    # Only classify the document when validation has failed.
    detected_type = DocumentValidator.identify_document_type(text)
    logger.error(f"❌ {document_name} is NOT a valid patent")
    logger.error(f" Detected type: {detected_type}")
    logger.error(f" Issues: {', '.join(problems)}")
    return False