File size: 6,325 Bytes
9336543
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import os
from lxml import etree
from dataclasses import dataclass
from typing import List, Optional, Dict, Tuple

# Constants for DORA XML elements.
# The TAG_* values are local element names (namespace stripped) that the
# helpers below compare against when walking a document.
TAG_GENRE = "genre"
TAG_MODS = "mods"
TAG_ORIGIN_INFO = "originInfo"
TAG_DATE_ISSUED = "dateIssued"
TAG_DATE_OTHER = "dateOther"
# NOTE: despite the ATTR_ prefix this is an attribute *value*, not a name:
# a dateOther element is treated as the reporting year when any of its
# attributes equals this string.
ATTR_REPORTING_YEAR = "reporting year"

@dataclass
class DateInfo:
    """Date elements located by ``XmlHelper.find_date_nodes``."""

    # True when one originInfo block held both a dateIssued element and a
    # reporting-year dateOther element.
    both_dates_in_same_block: bool
    # The dateIssued element that was found, or None when there was none.
    date_issued_node: Optional[etree._Element]
    # The reporting-year dateOther element that was found, or None.
    reporting_year_node: Optional[etree._Element]

@dataclass
class NodeInfo:
    """One flattened XML element as produced by ``XmlHelper.get_all_nodes_info``."""

    # The lxml element itself.
    node: etree._Element
    # Parent is implicit in lxml via getparent(), so it is not stored here.
    name: str  # The computed "path name" (see get_node_path_name) used for comparison
    # True when the element has at least one child element.
    has_child_elements: bool

class XmlHelper:
    """Static helpers for parsing MODS XML and flattening it for comparison.

    Tags are always matched on their local name so that namespaced and
    un-namespaced documents are handled the same way.  Comments and
    processing instructions found while walking a tree are skipped.
    """

    @staticmethod
    def _local_name(node) -> Optional[str]:
        """Return the namespace-free tag of *node*, or None for non-elements.

        lxml hands out comments, processing instructions and entities as
        ``_Element`` subclasses whose ``tag`` is not a string, and
        ``etree.QName`` raises ``ValueError`` for them.  Returning None lets
        callers skip such nodes instead of crashing on any document that
        happens to contain an XML comment.
        """
        tag = node.tag
        if not isinstance(tag, str):
            return None
        return etree.QName(tag).localname

    @staticmethod
    def parse_xml(file_path_or_content) -> etree._ElementTree:
        """Parse XML from a file path or from in-memory content.

        Args:
            file_path_or_content: One of
                * an ``os.PathLike`` object - always treated as a file path;
                * a ``str`` naming an existing path - parsed as a file;
                * any other ``str`` - treated as XML text (encoded as UTF-8);
                * ``bytes`` - treated as raw XML content.

        Returns:
            The parsed element tree (ignorable whitespace removed).

        Raises:
            ValueError: If the input cannot be read or parsed.  The original
                exception is attached as ``__cause__``.
        """
        parser = etree.XMLParser(remove_blank_text=True)
        source = file_path_or_content
        try:
            if isinstance(source, os.PathLike):
                return etree.parse(os.fspath(source), parser)
            if isinstance(source, str) and os.path.exists(source):
                return etree.parse(source, parser)
            if not isinstance(source, bytes):
                # Encode text ourselves: lxml rejects str input that carries
                # an XML encoding declaration.
                source = source.encode('utf-8')
            return etree.fromstring(source, parser).getroottree()
        except Exception as e:
            raise ValueError(f"Error parsing XML: {e}") from e

    @staticmethod
    def get_genre_node(root: etree._Element) -> Optional[etree._Element]:
        """Return the first ``genre`` element in document order, or None.

        The search covers *root* itself and all of its descendants.
        """
        # Local-name matching avoids namespace headaches.
        for elem in root.iter():
            if XmlHelper._local_name(elem) == TAG_GENRE:
                return elem
        return None

    @staticmethod
    def find_date_nodes(root: etree._Element) -> DateInfo:
        """Locate the dateIssued and reporting-year nodes under originInfo.

        Every ``originInfo`` element is scanned in document order.  Only its
        direct children are inspected: a ``dateIssued`` child, and a
        ``dateOther`` child with any attribute whose value equals
        ``ATTR_REPORTING_YEAR``.  Scanning stops at the first block that
        contains both; otherwise the last match of each kind is reported.

        Raises:
            ValueError: If the document contains no ``originInfo`` element.
        """
        origin_infos = [
            elem for elem in root.iter()
            if XmlHelper._local_name(elem) == TAG_ORIGIN_INFO
        ]
        if not origin_infos:
            raise ValueError("No originInfo elements found in MODS XML")

        date_issued = None
        reporting_year = None
        both_in_same = False

        for origin_info in origin_infos:
            # Tracked per block so we can tell whether THIS block has both.
            block_issued = None
            block_reporting = None

            for child in origin_info:
                localname = XmlHelper._local_name(child)
                if localname == TAG_DATE_ISSUED:
                    block_issued = child
                elif (localname == TAG_DATE_OTHER
                        and ATTR_REPORTING_YEAR in child.attrib.values()):
                    block_reporting = child

            # Compare with None explicitly: the truth value of an lxml
            # element reflects whether it has children, not whether it exists.
            if block_issued is not None:
                date_issued = block_issued
            if block_reporting is not None:
                reporting_year = block_reporting

            if block_issued is not None and block_reporting is not None:
                both_in_same = True
                # Java code breaks on first occurrence of both in same block.
                break

        return DateInfo(both_in_same, date_issued, reporting_year)

    @staticmethod
    def get_node_path_name(element: etree._Element, parent_path: str = "") -> str:
        """Build the comparison name of *element*.

        The name is the local tag followed by `` [key=value]`` for every
        attribute (sorted by key), prefixed with ``"<parent_path> | "`` when
        *parent_path* is non-empty.  A ``mods`` element always yields ``""``.

        Raises:
            ValueError: If *element* is a comment or processing instruction
                (raised by ``etree.QName``).
        """
        tag = etree.QName(element).localname

        if tag == TAG_MODS:
            return ""

        # Attributes are sorted so the name does not depend on source order.
        parts = [tag]
        parts.extend(f"[{k}={v}]" for k, v in sorted(element.attrib.items()))
        name = " ".join(parts)

        if parent_path:
            return f"{parent_path} | {name}"
        return name

    @staticmethod
    def get_all_nodes_info(root: etree._Element) -> List[NodeInfo]:
        """Flatten the tree under *root* into a list of ``NodeInfo``.

        Elements are emitted children-first (each element appears after all
        of its descendants).  Comments and processing instructions are
        ignored and do not count as child elements.
        """
        nodes: List[NodeInfo] = []

        def traverse(element, parent_path):
            current_path = XmlHelper.get_node_path_name(element, parent_path)

            has_child_elements = False
            for child in element:
                if XmlHelper._local_name(child) is None:
                    # Comment / processing instruction: not a real element.
                    continue
                has_child_elements = True
                traverse(child, current_path)

            # The root 'mods' element is included too, with the empty name
            # "" (matching the Java logic this was ported from).  Its direct
            # children therefore get un-prefixed names such as "genre", so
            # template and input trees line up when compared by name.
            nodes.append(NodeInfo(element, current_path, has_child_elements))

        traverse(root, "")
        return nodes