dora_pubtype / utils.py
andrehoffmann80's picture
Upload 4 files
9336543 verified
import os
from lxml import etree
from dataclasses import dataclass
from typing import List, Optional, Dict, Tuple
# Constants for DORA XML elements.
# The TAG_* values are namespace-free local names: the helpers below compare
# them against etree.QName(elem).localname, so a namespace prefix or default
# namespace on the document does not affect matching.
TAG_GENRE = "genre"
TAG_MODS = "mods"  # local name of the MODS record element (typically the root)
TAG_ORIGIN_INFO = "originInfo"
TAG_DATE_ISSUED = "dateIssued"
TAG_DATE_OTHER = "dateOther"
# NOTE: despite the ATTR_ prefix this is an attribute *value*, not a name.
# A dateOther element is treated as the reporting year when any one of its
# attributes has exactly this value.
ATTR_REPORTING_YEAR = "reporting year"
@dataclass
class DateInfo:
    """Result of scanning the originInfo blocks for publication dates.

    Produced by ``XmlHelper.find_date_nodes``.
    """

    # True when one single originInfo block contains both a dateIssued
    # element and a reporting-year dateOther element.
    both_dates_in_same_block: bool
    # The dateIssued element that was found, or None if there is none.
    date_issued_node: Optional[etree._Element]
    # The dateOther element flagged as reporting year, or None if there is none.
    reporting_year_node: Optional[etree._Element]
@dataclass
class NodeInfo:
    """Flattened description of one XML element, used for comparing trees."""

    # The lxml element itself. Its parent is not stored separately because
    # lxml already exposes it through node.getparent().
    node: etree._Element
    # Computed "path name" of the element; this is the key used for comparison.
    name: str
    # Whether the element has at least one child element.
    has_child_elements: bool
class XmlHelper:
    """Static helpers for inspecting DORA MODS XML documents with lxml.

    The class is only a namespace; every method is a ``@staticmethod``.
    Elements are matched by local name, so documents are handled the same
    way whether or not they declare an XML namespace.
    """

    @staticmethod
    def _is_element(node) -> bool:
        """Return True for real elements, False for comments/PIs/entities."""
        # lxml represents comments, processing instructions and entities as
        # _Element subclasses whose ``tag`` is a factory function instead of
        # a string. etree.QName() raises ValueError for those nodes, so they
        # have to be filtered out before any local-name lookup.
        return isinstance(node.tag, str)

    @staticmethod
    def parse_xml(file_path_or_content) -> etree._ElementTree:
        """Parse XML from a file path or from in-memory content.

        Args:
            file_path_or_content: One of
                * a ``str`` or ``os.PathLike`` naming an existing file,
                * a ``str`` containing XML markup,
                * ``bytes`` containing XML markup.
                A ``str`` that does not name an existing file is treated as
                XML markup.

        Returns:
            The parsed element tree (whitespace-only text nodes removed).

        Raises:
            ValueError: If the input cannot be read or parsed. The original
                exception is chained as ``__cause__``.
        """
        parser = etree.XMLParser(remove_blank_text=True)
        try:
            source = file_path_or_content
            if isinstance(source, os.PathLike):
                source = os.fspath(source)
            if isinstance(source, str):
                if os.path.exists(source):
                    return etree.parse(source, parser)
                # lxml rejects str input that carries an encoding
                # declaration, so markup is always handed over as bytes.
                source = source.encode("utf-8")
            return etree.fromstring(source, parser).getroottree()
        except Exception as e:
            # Callers rely on ValueError for every failure mode; keep the
            # root cause attached for debugging.
            raise ValueError(f"Error parsing XML: {e}") from e

    @staticmethod
    def get_genre_node(root: etree._Element) -> Optional[etree._Element]:
        """Return the first ``genre`` element in document order, or None.

        The search covers ``root`` itself and all of its descendants and
        ignores namespaces.
        """
        # iter(etree.Element) yields element nodes only, skipping comments
        # and processing instructions that QName() cannot handle.
        return next(
            (
                elem
                for elem in root.iter(etree.Element)
                if etree.QName(elem).localname == TAG_GENRE
            ),
            None,
        )

    @staticmethod
    def find_date_nodes(root: etree._Element) -> DateInfo:
        """Locate the ``dateIssued`` and reporting-year ``dateOther`` nodes.

        Every ``originInfo`` element under ``root`` is inspected in document
        order, looking only at its direct children. A ``dateOther`` child
        counts as the reporting year when any of its attribute values equals
        ``ATTR_REPORTING_YEAR``. Within one block the last matching child of
        each kind wins.

        The scan stops at the first block that contains both kinds; both
        returned nodes then come from that block. Otherwise each returned
        node is the one from the last block that contained that kind.

        Returns:
            A ``DateInfo`` with the nodes found (``None`` where missing) and
            a flag telling whether one block held both.

        Raises:
            ValueError: If the document contains no ``originInfo`` element.
        """
        origin_infos = [
            elem
            for elem in root.iter(etree.Element)
            if etree.QName(elem).localname == TAG_ORIGIN_INFO
        ]
        if not origin_infos:
            raise ValueError("No originInfo elements found in MODS XML")

        date_issued = None
        reporting_year = None
        for origin_info in origin_infos:
            block_issued = None
            block_reporting = None
            for child in origin_info:
                if not XmlHelper._is_element(child):
                    continue
                localname = etree.QName(child).localname
                if localname == TAG_DATE_ISSUED:
                    block_issued = child
                elif (
                    localname == TAG_DATE_OTHER
                    and ATTR_REPORTING_YEAR in child.attrib.values()
                ):
                    block_reporting = child

            # Compare against None explicitly: the truth value of an lxml
            # element reflects whether it has children, not whether it exists.
            if block_issued is not None:
                date_issued = block_issued
            if block_reporting is not None:
                reporting_year = block_reporting
            if block_issued is not None and block_reporting is not None:
                return DateInfo(True, date_issued, reporting_year)

        return DateInfo(False, date_issued, reporting_year)

    @staticmethod
    def get_node_path_name(element: etree._Element, parent_path: str = "") -> str:
        """Build the comparison name of ``element``.

        The name is the element's local tag name followed by one
        `` [key=value]`` suffix per attribute (sorted by attribute name so
        the result does not depend on attribute order). If ``parent_path``
        is non-empty, the result is ``"<parent_path> | <name>"``.

        A ``mods`` element always yields the empty string, regardless of
        ``parent_path``, so its children get names without a prefix.
        """
        tag = etree.QName(element).localname
        if tag == TAG_MODS:
            return ""

        attr_suffix = "".join(
            f" [{k}={v}]" for k, v in sorted(element.attrib.items())
        )
        name = f"{tag}{attr_suffix}"
        return f"{parent_path} | {name}" if parent_path else name

    @staticmethod
    def get_all_nodes_info(root: etree._Element) -> List[NodeInfo]:
        """Flatten the tree below ``root`` into a list of ``NodeInfo``.

        Elements are emitted in post-order: all descendants of an element
        come before the element itself, so ``root`` is the last entry. The
        ``mods`` root is included with an empty name so that the roots of
        two documents compare equal. Comments and processing instructions
        are skipped and do not count as child elements.
        """
        nodes: List[NodeInfo] = []

        def traverse(element, parent_path):
            current_path = XmlHelper.get_node_path_name(element, parent_path)

            has_child_elements = False
            for child in element:
                if not XmlHelper._is_element(child):
                    continue
                has_child_elements = True
                traverse(child, current_path)

            # Appended after the children on purpose (post-order).
            nodes.append(NodeInfo(element, current_path, has_child_elements))

        traverse(root, "")
        return nodes