Spaces:

MHamdan
/

SPARKNET

Sleeping

File size: 4,164 Bytes

a9dc537

"""
Document type validation utility
Helps identify if uploaded documents are actually patents
"""
import re
from pathlib import Path
from typing import Tuple, List
from loguru import logger


class DocumentValidator:
    """Heuristic checks that extracted document text looks like a patent.

    Every check is a case-insensitive substring or regex test on the raw
    text; nothing here parses the document's structure.
    """

    # Keywords that should appear in patent documents
    PATENT_KEYWORDS = [
        'patent', 'claim', 'claims', 'invention', 'abstract',
        'field of invention', 'background', 'detailed description',
        'inventor', 'assignee', 'filing date', 'application'
    ]

    # Required sections in patents
    REQUIRED_SECTIONS = ['abstract', 'claim']

    @staticmethod
    def _count_keywords(text_lower: str) -> int:
        """Count distinct patent keywords present in already-lowercased text.

        Several keywords overlap ('claim' is inside 'claims', 'invention' is
        inside 'field of invention').  A keyword that is merely a substring of
        another matched keyword is not counted separately, so a single word
        such as "claims" contributes one match instead of two.
        """
        matched = [kw for kw in DocumentValidator.PATENT_KEYWORDS
                   if kw in text_lower]
        return sum(
            1 for kw in matched
            if not any(kw != other and kw in other for other in matched)
        )

    @staticmethod
    def _evaluate(text: str) -> Tuple[bool, List[str]]:
        """Run all patent heuristics without logging anything.

        Shared by the public methods so that classification does not emit
        "validation failed" warnings as a side effect.

        Returns:
            Tuple of (is_valid, issues_found)
        """
        # Extractors may hand back None for unreadable files; treat it as empty
        text = text or ""
        text_lower = text.lower()
        issues = []

        # Check minimum length
        if len(text) < 500:
            issues.append("Document too short (< 500 characters)")

        # Check for patent keywords
        keyword_matches = DocumentValidator._count_keywords(text_lower)

        if keyword_matches < 3:
            issues.append(f"Only {keyword_matches} patent keywords found (expected at least 3)")

        # Check for required sections
        missing_sections = [section for section in DocumentValidator.REQUIRED_SECTIONS
                            if section not in text_lower]

        if missing_sections:
            issues.append(f"Missing required sections: {', '.join(missing_sections)}")

        # Check for claim structure (claims usually numbered)
        claim_pattern = r'claim\s+\d+'
        claims_found = len(re.findall(claim_pattern, text_lower))

        if claims_found == 0:
            issues.append("No numbered claims found")

        # The verdict depends only on keywords and numbered claims; the length
        # and section checks are advisory and are reported through `issues`
        # without failing the document on their own.
        is_valid = keyword_matches >= 3 and claims_found > 0

        return is_valid, issues

    @staticmethod
    def validate_patent_document(text: str) -> Tuple[bool, List[str]]:
        """
        Validate if document text appears to be a patent

        A document is considered valid when it contains at least 3 distinct
        patent keywords and at least one numbered claim reference
        ("claim 1", "claim 12", ...).  Other findings (short text, missing
        sections) are returned in the issues list even for valid documents.

        Args:
            text: Extracted document text (None is treated as empty text)

        Returns:
            Tuple of (is_valid, issues_found)
        """
        is_valid, issues = DocumentValidator._evaluate(text)

        if not is_valid:
            logger.warning(f"Document validation failed: {issues}")

        return is_valid, issues

    @staticmethod
    def identify_document_type(text: str) -> str:
        """
        Try to identify what type of document this is

        The patent check runs first: its criteria (keywords plus numbered
        claims) are far more specific than the loose substring heuristics
        below, and real patents routinely mention words such as
        "conference", "IEEE" or "news" in their citations.

        Args:
            text: Extracted document text (None is treated as empty text)

        Returns:
            Document type description
        """
        is_patent, _ = DocumentValidator._evaluate(text)
        if is_patent:
            return "Patent document"

        text_lower = (text or "").lower()

        # Check for common non-patent document types
        if 'microsoft' in text_lower and 'windows' in text_lower:
            return "Microsoft Windows documentation"

        if any(term in text_lower for term in ['press release', 'news', 'announcement']):
            return "Press release or news article"

        if any(term in text_lower for term in ['whitepaper', 'white paper', 'technical report']):
            return "Technical whitepaper or report"

        if any(term in text_lower for term in ['terms of service', 'privacy policy', 'license agreement']):
            return "Legal agreement or policy document"

        if 'research paper' in text_lower or 'ieee' in text_lower or 'conference' in text_lower:
            return "Academic research paper"

        return "Unknown document type (not a patent)"


def validate_and_log(text: str, document_name: str = "document") -> bool:
    """
    Validate a document as a patent and log the outcome

    On success a single success-level message is logged.  On failure three
    error-level messages are logged: the verdict, the detected document type
    and the comma-separated list of validation issues.

    Args:
        text: Document text
        document_name: Name of document for logging

    Returns:
        True if valid patent, False otherwise
    """
    passed, problems = DocumentValidator.validate_patent_document(text)

    # Happy path first: nothing more to report for a valid patent
    if passed:
        logger.success(f"✅ {document_name} appears to be a valid patent")
        return True

    # Only failed documents are classified, to tell the user what was uploaded
    detected_type = DocumentValidator.identify_document_type(text)
    issue_summary = ', '.join(problems)

    logger.error(f"❌ {document_name} is NOT a valid patent")
    logger.error(f"   Detected type: {detected_type}")
    logger.error(f"   Issues: {issue_summary}")
    return False