# Extraction residue removed from the top of this file: hosting-page status
# ("Spaces: Sleeping"), "File size: 6,325 Bytes", commit id 9336543 and the
# 1-162 line-number gutter that had been fused onto the first import line.
import os
from lxml import etree
from dataclasses import dataclass
from typing import List, Optional, Dict, Tuple
# Constants for DORA XML elements
# The TAG_* values are compared against element *local* names (the tag with
# any namespace stripped), so they carry no namespace prefix or URI.
TAG_GENRE = "genre"
TAG_MODS = "mods"
TAG_ORIGIN_INFO = "originInfo"
TAG_DATE_ISSUED = "dateIssued"
TAG_DATE_OTHER = "dateOther"
# Attribute *value* (not attribute name) that marks a dateOther element as the
# reporting-year date; it is matched against every attribute value of the node.
ATTR_REPORTING_YEAR = "reporting year"
@dataclass
class DateInfo:
    """Outcome of searching the originInfo blocks for the two date nodes.

    Produced by ``XmlHelper.find_date_nodes``.
    """

    # True when one single originInfo block held both a dateIssued child and a
    # reporting-year dateOther child.
    both_dates_in_same_block: bool
    # The dateIssued element that was found, or None when there was none.
    date_issued_node: Optional[etree._Element]
    # The reporting-year dateOther element that was found, or None.
    reporting_year_node: Optional[etree._Element]
@dataclass
class NodeInfo:
    """One entry of the flattened XML tree built by ``XmlHelper.get_all_nodes_info``."""

    # The lxml element itself.  Its parent is not stored separately because
    # lxml already exposes it through ``node.getparent()``.
    node: etree._Element
    # Computed "path name" (ancestor names joined with " | ") used for
    # comparing nodes.
    name: str
    # Whether the element has at least one child element.
    has_child_elements: bool
class XmlHelper:
    """Stateless helpers for parsing and inspecting MODS XML with lxml.

    Tags are always compared by *local* name, so the helpers work whether or
    not the document declares a namespace.

    Comments, processing instructions and entities are skipped everywhere.
    lxml exposes them as ``_Element`` instances whose ``tag`` is not a string,
    and ``etree.QName`` raises ``ValueError`` when given one, so they must be
    filtered out before any tag comparison.
    """

    @staticmethod
    def _is_element(node) -> bool:
        """Return True for a real element, False for a comment, PI or entity."""
        # Only real elements have a string tag; for the special node types
        # lxml sets ``tag`` to the factory function (etree.Comment, ...).
        return isinstance(node.tag, str)

    @staticmethod
    def parse_xml(file_path_or_content) -> etree._ElementTree:
        """Parse XML given as a file path or as in-memory content.

        Args:
            file_path_or_content: A ``str`` naming an existing file, or the
                XML document itself as ``str`` or ``bytes``.  A ``str`` that
                does not name an existing path is treated as content.

        Returns:
            The parsed element tree (whitespace-only text nodes are dropped).

        Raises:
            ValueError: If the input cannot be read or parsed.  The original
                exception is attached as ``__cause__``.
        """
        parser = etree.XMLParser(remove_blank_text=True)
        try:
            if isinstance(file_path_or_content, str) and os.path.exists(file_path_or_content):
                return etree.parse(file_path_or_content, parser)
            if isinstance(file_path_or_content, bytes):
                data = file_path_or_content
            else:
                # lxml rejects ``str`` input that carries an encoding
                # declaration, so text is always handed over as bytes.
                data = file_path_or_content.encode('utf-8')
            return etree.fromstring(data, parser).getroottree()
        except Exception as e:
            raise ValueError(f"Error parsing XML: {e}") from e

    @staticmethod
    def get_genre_node(root: etree._Element) -> Optional[etree._Element]:
        """Return the first ``genre`` element in document order, or None.

        Matching is done on the local name to avoid namespace handling.
        """
        # Restricting iteration to etree.Element skips comments and PIs,
        # which would otherwise make etree.QName raise ValueError.
        for elem in root.iter(etree.Element):
            if etree.QName(elem).localname == TAG_GENRE:
                return elem
        return None

    @staticmethod
    def find_date_nodes(root: etree._Element) -> DateInfo:
        """Locate the ``dateIssued`` and reporting-year ``dateOther`` nodes.

        Every ``originInfo`` block is scanned in document order.  The search
        stops at the first block that contains both nodes; otherwise the
        result holds the nodes from the last blocks in which each was seen.
        A ``dateOther`` counts as the reporting year when any of its attribute
        values equals ``ATTR_REPORTING_YEAR``.

        Returns:
            A ``DateInfo`` with the located nodes (``None`` when absent) and
            a flag telling whether both came from the same block.

        Raises:
            ValueError: If the document contains no ``originInfo`` element.
        """
        origin_infos = [
            elem
            for elem in root.iter(etree.Element)
            if etree.QName(elem).localname == TAG_ORIGIN_INFO
        ]
        if not origin_infos:
            raise ValueError("No originInfo elements found in MODS XML")

        date_issued = None
        reporting_year = None
        both_in_same = False

        for origin_info in origin_infos:
            # Tracked per block so we can tell whether THIS block has both.
            block_issued = None
            block_reporting = None

            for child in origin_info:
                if not XmlHelper._is_element(child):
                    continue
                localname = etree.QName(child).localname
                if localname == TAG_DATE_ISSUED:
                    block_issued = child
                elif localname == TAG_DATE_OTHER:
                    if ATTR_REPORTING_YEAR in child.attrib.values():
                        block_reporting = child

            # Compare against None explicitly: an lxml element without
            # children is falsy, so truthiness must not be used here.
            if block_issued is not None:
                date_issued = block_issued
            if block_reporting is not None:
                reporting_year = block_reporting
            if block_issued is not None and block_reporting is not None:
                both_in_same = True
                # Java code breaks on first occurrence of both in same
                break

        return DateInfo(both_in_same, date_issued, reporting_year)

    @staticmethod
    def get_node_path_name(element: etree._Element, parent_path: str = "") -> str:
        """Build a unique-ish comparison name from the tag, attributes and path.

        The name is the local tag followed by `` [key=value]`` for each
        attribute (sorted for consistency), prefixed with
        ``"<parent_path> | "`` when a parent path is given.  An element whose
        local name is ``mods`` always yields the empty string.

        Raises:
            ValueError: If ``element`` is a comment or processing instruction
                (raised by ``etree.QName``).
        """
        tag = etree.QName(element).localname
        if tag == TAG_MODS:
            return ""

        attributes = "".join(
            f" [{k}={v}]" for k, v in sorted(element.attrib.items())
        )
        name = tag + attributes

        if parent_path:
            return f"{parent_path} | {name}"
        return name

    @staticmethod
    def get_all_nodes_info(root: etree._Element) -> List[NodeInfo]:
        """Flatten the XML structure into a list of ``NodeInfo``.

        Nodes are emitted children-first (post-order): every element appears
        after all of its descendants, so ``root`` is the last entry.  The
        root ``mods`` element is included with an empty name, mirroring the
        Java logic this was ported from, so that a template root and an input
        root compare as equal.  Comments and processing instructions are
        neither listed nor counted as child elements.
        """
        nodes: List[NodeInfo] = []

        def traverse(element, parent_path):
            """Append the subtree below ``element``, then ``element`` itself."""
            current_path = XmlHelper.get_node_path_name(element, parent_path)

            has_child_elements = False
            for child in element:
                # isinstance(child, etree._Element) is also true for comments
                # and PIs, so real elements are identified by their tag.
                if XmlHelper._is_element(child):
                    has_child_elements = True
                    traverse(child, current_path)

            nodes.append(NodeInfo(element, current_path, has_child_elements))

        # The root's own path name is "" when it is 'mods', so its direct
        # children get bare names such as "genre" (matches the Java logic).
        traverse(root, "")
        return nodes
# (Trailing table-border character left over from page extraction removed.)