import os
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union

from lxml import etree

# Constants for DORA XML elements.
TAG_GENRE = "genre"
TAG_MODS = "mods"
TAG_ORIGIN_INFO = "originInfo"
TAG_DATE_ISSUED = "dateIssued"
TAG_DATE_OTHER = "dateOther"
# NOTE: this is compared against attribute *values* of dateOther elements,
# not against attribute names.
ATTR_REPORTING_YEAR = "reporting year"


def _local_name(node) -> Optional[str]:
    """Return the namespace-free tag of *node*, or None for non-element nodes.

    lxml yields comments, processing instructions and entities while
    iterating a tree; their ``tag`` is not a string and ``etree.QName``
    raises ``ValueError`` on them, so they are reported as ``None`` here.
    """
    tag = node.tag
    if not isinstance(tag, str):
        return None
    return etree.QName(tag).localname


@dataclass
class DateInfo:
    """Result of :meth:`XmlHelper.find_date_nodes`."""

    # True when a single originInfo block holds both a dateIssued and a
    # reporting-year dateOther child.
    both_dates_in_same_block: bool
    # dateIssued element that was found, if any.
    date_issued_node: Optional[etree._Element]
    # dateOther element carrying the reporting-year attribute value, if any.
    reporting_year_node: Optional[etree._Element]


@dataclass
class NodeInfo:
    """One flattened entry produced by :meth:`XmlHelper.get_all_nodes_info`."""

    # The element itself; its parent is reachable via lxml's getparent().
    node: etree._Element
    # The computed "path name" used for comparison.
    name: str
    # True when the element has at least one child element.
    has_child_elements: bool


class XmlHelper:
    """Static helpers for parsing and inspecting MODS XML documents."""

    @staticmethod
    def parse_xml(file_path_or_content: Union[str, bytes]) -> etree._ElementTree:
        """Parse an XML file or in-memory XML content.

        Args:
            file_path_or_content: A ``str`` naming an existing file is parsed
                as a file. Any other ``str`` is treated as XML text and
                encoded to UTF-8 before parsing; ``bytes`` are parsed as-is.

        Returns:
            The parsed element tree (blank text nodes are removed).

        Raises:
            ValueError: If parsing fails for any reason; the original
                exception is chained as ``__cause__``.
        """
        parser = etree.XMLParser(remove_blank_text=True)
        try:
            if isinstance(file_path_or_content, str) and os.path.exists(
                file_path_or_content
            ):
                return etree.parse(file_path_or_content, parser)

            if isinstance(file_path_or_content, bytes):
                payload = file_path_or_content
            else:
                # Encode explicitly: lxml rejects str input that carries an
                # XML encoding declaration.
                payload = file_path_or_content.encode('utf-8')
            return etree.fromstring(payload, parser).getroottree()
        except Exception as e:
            raise ValueError(f"Error parsing XML: {e}") from e

    @staticmethod
    def get_genre_node(root: etree._Element) -> Optional[etree._Element]:
        """Return the first ``genre`` element in document order, or None.

        Matching is done on the local tag name so the namespace of the
        document does not matter. Comments and processing instructions are
        skipped.
        """
        for elem in root.iter():
            if _local_name(elem) == TAG_GENRE:
                return elem
        return None

    @staticmethod
    def find_date_nodes(root: etree._Element) -> DateInfo:
        """Locate the ``dateIssued`` and reporting-year nodes.

        Every ``originInfo`` element below (and including) *root* is
        inspected in document order. Scanning stops at the first block that
        contains both a ``dateIssued`` child and a ``dateOther`` child having
        an attribute whose value is ``"reporting year"``. Otherwise the
        nodes returned are those of the last block that contained each one.

        Raises:
            ValueError: If no ``originInfo`` element exists.
        """
        origin_infos = [
            elem for elem in root.iter() if _local_name(elem) == TAG_ORIGIN_INFO
        ]
        if not origin_infos:
            raise ValueError("No originInfo elements found in MODS XML")

        date_issued = None
        reporting_year = None
        both_in_same = False

        for origin_info in origin_infos:
            # Tracked per block so we can tell whether THIS block has both.
            block_date_issued = None
            block_reporting_year = None

            for child in origin_info:
                localname = _local_name(child)
                if localname == TAG_DATE_ISSUED:
                    # A later dateIssued in the same block replaces an
                    # earlier one.
                    block_date_issued = child
                elif localname == TAG_DATE_OTHER:
                    if any(
                        value == ATTR_REPORTING_YEAR
                        for value in child.attrib.values()
                    ):
                        block_reporting_year = child

            has_issued = block_date_issued is not None
            has_reporting = block_reporting_year is not None

            if has_issued:
                date_issued = block_date_issued
            if has_reporting:
                reporting_year = block_reporting_year
            if has_issued and has_reporting:
                # Stop at the first block holding both dates (mirrors the
                # Java implementation this was ported from).
                both_in_same = True
                break

        return DateInfo(both_in_same, date_issued, reporting_year)

    @staticmethod
    def get_node_path_name(element: etree._Element, parent_path: str = "") -> str:
        """Build a unique-ish name for *element* from its tag and attributes.

        The name is the local tag followed by `` [key=value]`` for each
        attribute in sorted order. A non-empty *parent_path* is prepended
        with ``" | "`` as separator. A ``mods`` element always yields ``""``.
        """
        tag = etree.QName(element).localname
        if tag == TAG_MODS:
            return ""

        # Attributes are sorted so the name does not depend on their order
        # in the document.
        name = tag + "".join(
            f" [{k}={v}]" for k, v in sorted(element.attrib.items())
        )
        if parent_path:
            return f"{parent_path} | {name}"
        return name

    @staticmethod
    def get_all_nodes_info(root: etree._Element) -> List[NodeInfo]:
        """Flatten the tree below *root* into a list of :class:`NodeInfo`.

        Entries are appended in post-order: all descendants of an element
        come before the element itself. The root is included; a root
        ``mods`` element gets the empty path name (as in the Java original),
        so its direct children have plain names such as ``"genre"``.
        Comments and processing instructions are ignored.
        """
        nodes: List[NodeInfo] = []

        def traverse(element, parent_path):
            current_path = XmlHelper.get_node_path_name(element, parent_path)

            has_child_elements = False
            for child in element:
                # Comment / PI nodes have a non-string tag; they are not
                # elements and have no path name.
                if not isinstance(child.tag, str):
                    continue
                has_child_elements = True
                traverse(child, current_path)

            nodes.append(NodeInfo(element, current_path, has_child_elements))

        traverse(root, "")
        return nodes