Spaces:

MHamdan
/

SPARKNET

Sleeping

App Files Files Community

SPARKNET / src /utils /document_validator.py

MHamdan

Initial commit: SPARKNET framework

a9dc537 13 days ago

raw

history blame contribute delete

4.16 kB

	"""
	Document type validation utility
	Helps identify if uploaded documents are actually patents
	"""
	import re
	from pathlib import Path
	from typing import Tuple, List
	from loguru import logger


	class DocumentValidator:
	    """Heuristically decide whether extracted document text is a patent.

	    Every check is a case-insensitive substring or regex test on the text.
	    No instance state is used, so all methods are static.
	    """

	    # Keywords that should appear in patent documents
	    PATENT_KEYWORDS = [
	        'patent', 'claim', 'claims', 'invention', 'abstract',
	        'field of invention', 'background', 'detailed description',
	        'inventor', 'assignee', 'filing date', 'application'
	    ]

	    # Required sections in patents
	    REQUIRED_SECTIONS = ['abstract', 'claim']

	    # Thresholds used by the heuristics below
	    _MIN_LENGTH = 500
	    _MIN_KEYWORDS = 3

	    # Claims are usually referenced by number ("claim 1"); compiled once
	    _CLAIM_PATTERN = re.compile(r'claim\s+\d+')

	    @staticmethod
	    def _evaluate(text: str) -> Tuple[bool, List[str]]:
	        """
	        Run all patent heuristics without logging anything

	        Args:
	            text: Extracted document text (None is treated as empty text)

	        Returns:
	            Tuple of (is_valid, issues_found)
	        """
	        # A failed extraction may hand us None; treat it like an empty document
	        text = text or ""
	        text_lower = text.lower()
	        issues = []

	        # Check minimum length
	        if len(text) < DocumentValidator._MIN_LENGTH:
	            issues.append("Document too short (< 500 characters)")

	        # Check for patent keywords (substring test, so a text containing
	        # "claims" counts for both 'claim' and 'claims')
	        keyword_matches = sum(
	            1 for kw in DocumentValidator.PATENT_KEYWORDS if kw in text_lower
	        )
	        if keyword_matches < DocumentValidator._MIN_KEYWORDS:
	            issues.append(
	                f"Only {keyword_matches} patent keywords found (expected at least 3)"
	            )

	        # Check for required sections
	        missing_sections = [
	            section for section in DocumentValidator.REQUIRED_SECTIONS
	            if section not in text_lower
	        ]
	        if missing_sections:
	            issues.append(f"Missing required sections: {', '.join(missing_sections)}")

	        # Check for claim structure (claims usually numbered)
	        claims_found = len(DocumentValidator._CLAIM_PATTERN.findall(text_lower))
	        if claims_found == 0:
	            issues.append("No numbered claims found")

	        # Lenient verdict: a document with enough keywords and at least one
	        # numbered claim is accepted even if other issues were recorded, so
	        # `issues` may be non-empty when is_valid is True.
	        is_valid = not issues or (
	            keyword_matches >= DocumentValidator._MIN_KEYWORDS and claims_found > 0
	        )
	        return is_valid, issues

	    @staticmethod
	    def validate_patent_document(text: str) -> Tuple[bool, List[str]]:
	        """
	        Validate if document text appears to be a patent

	        Logs a warning listing the issues when validation fails.

	        Args:
	            text: Extracted document text (None is treated as empty text)

	        Returns:
	            Tuple of (is_valid, issues_found); issues_found can be non-empty
	            even when is_valid is True (see the lenient verdict rule)
	        """
	        is_valid, issues = DocumentValidator._evaluate(text)

	        if not is_valid:
	            logger.warning(f"Document validation failed: {issues}")

	        return is_valid, issues

	    @staticmethod
	    def identify_document_type(text: str) -> str:
	        """
	        Try to identify what type of document this is

	        Args:
	            text: Extracted document text (None is treated as empty text)

	        Returns:
	            Document type description
	        """
	        # Check the patent heuristics first: the non-patent checks below are
	        # loose substring tests ('ieee', 'conference', 'news', ...) that also
	        # fire on genuine patents, and the answer must agree with
	        # validate_patent_document. The non-logging helper is used so that
	        # merely identifying a document never emits a validation warning.
	        is_patent, _ = DocumentValidator._evaluate(text)
	        if is_patent:
	            return "Patent document"

	        text_lower = (text or "").lower()

	        # Check for common non-patent document types
	        if 'microsoft' in text_lower and 'windows' in text_lower:
	            return "Microsoft Windows documentation"

	        if any(term in text_lower for term in ['press release', 'news', 'announcement']):
	            return "Press release or news article"

	        if any(term in text_lower for term in ['whitepaper', 'white paper', 'technical report']):
	            return "Technical whitepaper or report"

	        if any(term in text_lower for term in ['terms of service', 'privacy policy', 'license agreement']):
	            return "Legal agreement or policy document"

	        if 'research paper' in text_lower or 'ieee' in text_lower or 'conference' in text_lower:
	            return "Academic research paper"

	        return "Unknown document type (not a patent)"


	def validate_and_log(text: str, document_name: str = "document") -> bool:
	    """
	    Convenience function to validate and log results

	    On failure, logs the detected document type and the issues at error
	    level. On success, logs a success message, plus a warning listing any
	    issues the validator recorded but did not treat as disqualifying.

	    Args:
	        text: Document text
	        document_name: Name of document for logging

	    Returns:
	        True if valid patent, False otherwise
	    """
	    is_valid, issues = DocumentValidator.validate_patent_document(text)

	    if not is_valid:
	        doc_type = DocumentValidator.identify_document_type(text)
	        logger.error(f"❌ {document_name} is NOT a valid patent")
	        logger.error(f" Detected type: {doc_type}")
	        logger.error(f" Issues: {', '.join(issues)}")
	        return False

	    logger.success(f"✅ {document_name} appears to be a valid patent")
	    # The validator can accept a document while still reporting issues
	    # (e.g. too short); surface them instead of silently dropping them.
	    if issues:
	        logger.warning(f" Non-blocking issues: {', '.join(issues)}")
	    return True