Spaces:
Sleeping
Sleeping
| import os | |
| from lxml import etree | |
| from dataclasses import dataclass | |
| from typing import List, Optional, Dict, Tuple | |
| # Constants for DORA XML elements: TAG_* are element local names (compared against namespace-stripped tags); ATTR_REPORTING_YEAR is the attribute *value* that marks a dateOther element as the reporting year | |
| TAG_GENRE = "genre" | |
| TAG_MODS = "mods" | |
| TAG_ORIGIN_INFO = "originInfo" | |
| TAG_DATE_ISSUED = "dateIssued" | |
| TAG_DATE_OTHER = "dateOther" | |
| ATTR_REPORTING_YEAR = "reporting year" | |
@dataclass
class DateInfo:
    """Result of locating the publication-date elements of a MODS record.

    Declared as a dataclass so it can be built positionally, e.g.
    ``DateInfo(both_in_same, date_issued, reporting_year)``.
    """

    # True when a single originInfo block contains both a dateIssued element
    # and a reporting-year dateOther element.
    both_dates_in_same_block: bool
    # The dateIssued element that was found, or None if there is none.
    date_issued_node: Optional[etree._Element]
    # The dateOther element flagged as reporting year, or None if there is none.
    reporting_year_node: Optional[etree._Element]
@dataclass
class NodeInfo:
    """Flattened description of one XML element, used when comparing trees.

    Declared as a dataclass so it can be built positionally, e.g.
    ``NodeInfo(element, path_name, has_child_elements)``.
    """

    # The element itself; its parent is implicit in lxml via getparent().
    node: etree._Element
    # The computed "path name" used for comparison.
    name: str
    # True when the element has at least one child element.
    has_child_elements: bool
class XmlHelper:
    """Static helpers for parsing and inspecting DORA MODS XML documents.

    Elements are matched by local name only, so the helpers work whether or
    not the document declares a namespace. None of the methods use instance
    state; they are static and can be called on the class or on an instance.
    """

    @staticmethod
    def _local_name(node: etree._Element) -> Optional[str]:
        """Return the namespace-free tag of *node*, or None for non-elements.

        lxml exposes comments, processing instructions and entities as nodes
        whose ``tag`` is not a string, and ``etree.QName`` raises ValueError
        for those. Reporting them as ``None`` lets callers skip them safely.
        """
        tag = node.tag
        if not isinstance(tag, str):
            return None
        return etree.QName(tag).localname

    @staticmethod
    def parse_xml(file_path_or_content) -> etree._ElementTree:
        """Parse XML from a file path, an XML string, or XML bytes.

        Args:
            file_path_or_content: A ``str`` naming an existing file (parsed
                from disk), any other ``str`` (treated as XML markup), or
                ``bytes`` containing XML markup.

        Returns:
            The parsed element tree. Ignorable whitespace is dropped because
            the parser is created with ``remove_blank_text=True``.

        Raises:
            ValueError: If the input cannot be read or parsed. The original
                exception is chained as ``__cause__``.
        """
        parser = etree.XMLParser(remove_blank_text=True)
        try:
            if isinstance(file_path_or_content, str) and os.path.exists(file_path_or_content):
                return etree.parse(file_path_or_content, parser)
            if isinstance(file_path_or_content, bytes):
                content = file_path_or_content
            else:
                # lxml refuses str input that carries an encoding declaration,
                # so markup given as text is always handed over as bytes.
                content = file_path_or_content.encode('utf-8')
            return etree.fromstring(content, parser).getroottree()
        except Exception as e:
            raise ValueError(f"Error parsing XML: {e}") from e

    @staticmethod
    def get_genre_node(root: etree._Element) -> Optional[etree._Element]:
        """Return the first ``genre`` element in document order, or None.

        Matching is done on the local name to avoid namespace handling;
        comments and processing instructions are skipped.
        """
        for elem in root.iter():
            if XmlHelper._local_name(elem) == TAG_GENRE:
                return elem
        return None

    @staticmethod
    def find_date_nodes(root: etree._Element) -> DateInfo:
        """Locate the ``dateIssued`` and reporting-year ``dateOther`` elements.

        Every ``originInfo`` block is scanned in document order. A
        ``dateOther`` child counts as the reporting year when any of its
        attribute values equals ``ATTR_REPORTING_YEAR``. Within a block the
        last matching child wins; across blocks a later match replaces an
        earlier one. Scanning stops at the first block that contains both
        kinds of date (the original port notes that the Java code does the
        same).

        Returns:
            A ``DateInfo`` holding the nodes found (``None`` where absent) and
            whether both were found inside one ``originInfo`` block.

        Raises:
            ValueError: If the document contains no ``originInfo`` element.
        """
        origin_infos = [
            elem for elem in root.iter()
            if XmlHelper._local_name(elem) == TAG_ORIGIN_INFO
        ]
        if not origin_infos:
            raise ValueError("No originInfo elements found in MODS XML")

        date_issued = None
        reporting_year = None
        for origin_info in origin_infos:
            # Tracked per block so we can tell whether THIS block has both.
            block_date_issued = None
            block_reporting_year = None
            for child in origin_info:
                localname = XmlHelper._local_name(child)
                if localname == TAG_DATE_ISSUED:
                    block_date_issued = child
                elif localname == TAG_DATE_OTHER:
                    if ATTR_REPORTING_YEAR in child.attrib.values():
                        block_reporting_year = child

            # Compare against None explicitly: the truth value of an lxml
            # element depends on whether it has children.
            if block_date_issued is not None:
                date_issued = block_date_issued
            if block_reporting_year is not None:
                reporting_year = block_reporting_year
            if block_date_issued is not None and block_reporting_year is not None:
                return DateInfo(True, date_issued, reporting_year)

        return DateInfo(False, date_issued, reporting_year)

    @staticmethod
    def get_node_path_name(element: etree._Element, parent_path: str = "") -> str:
        """Build the comparison name of *element* from its tag, attributes and path.

        The name is the local tag followed by `` [key=value]`` for every
        attribute (sorted for a stable order), prefixed with
        ``"<parent_path> | "`` when *parent_path* is non-empty. A ``mods``
        element always yields the empty string, so its children start a fresh
        path.
        """
        tag = etree.QName(element).localname
        if tag == TAG_MODS:
            return ""

        attributes = "".join(
            f" [{k}={v}]" for k, v in sorted(element.attrib.items())
        )
        name = f"{tag}{attributes}"
        if parent_path:
            return f"{parent_path} | {name}"
        return name

    @staticmethod
    def get_all_nodes_info(root: etree._Element) -> List[NodeInfo]:
        """Flatten the element tree under *root* into a list of ``NodeInfo``.

        Entries are appended after their descendants (post-order). The root
        ``mods`` element is included with an empty name; per the original
        porting notes this mirrors the Java implementation, so the roots of a
        template and an input document compare equal. Comments and processing
        instructions are neither listed nor counted as child elements.
        """
        nodes: List[NodeInfo] = []

        def traverse(element, parent_path):
            current_path = XmlHelper.get_node_path_name(element, parent_path)
            has_child_elements = False
            for child in element:
                # Comment/PI nodes are _Element subclasses in lxml, so an
                # isinstance check cannot filter them; their tag is not a str.
                if not isinstance(child.tag, str):
                    continue
                has_child_elements = True
                traverse(child, current_path)
            nodes.append(NodeInfo(element, current_path, has_child_elements))

        traverse(root, "")
        return nodes