Spaces:

andrehoffmann80
/

dora_pubtype

Sleeping

App Files Files Community

dora_pubtype / app.py

andrehoffmann80

Upload 4 files

9336543 verified about 1 month ago

raw

history blame contribute delete

14.1 kB

	import html
	import io
	import os
	import traceback
	from urllib.parse import quote, unquote

	import requests
	import streamlit as st
	from lxml import etree

	from converter import ModsConverter

	# Page-level settings: browser tab title and icon, wide layout, and a
	# sidebar that starts collapsed.
	# The title uses a plain "|" separator; the previous "\|" was an invalid
	# escape sequence that rendered a literal backslash in the title.
	st.set_page_config(
	    page_title="PubTypeConverter | DORA Tools",
	    page_icon="🔄",
	    layout="wide",
	    initial_sidebar_state="collapsed",
	)

	# Inject the page's custom look in a single st.markdown call:
	#   * a <style> block (Inter web font, gradient app background, and the
	#     .title-container / .main-title / .sub-title / .result-card /
	#     .step-header / .step-number classes used by the markup emitted later
	#     in this script), and
	#   * the static title banner shown at the top of the page.
	# unsafe_allow_html=True is required so the raw <style>/<div> markup is
	# rendered instead of being escaped; the string is a constant and contains
	# no user input.
	st.markdown("""
	<style>
	@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap');

	html, body, [class*="css"] {
	font-family: 'Inter', sans-serif;
	}

	.stApp {
	background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
	}

	/* Premium Header */
	.title-container {
	padding: 2rem 0;
	text-align: center;
	background: rgba(255, 255, 255, 0.4);
	backdrop-filter: blur(10px);
	border-radius: 20px;
	margin-bottom: 2rem;
	border: 1px solid rgba(255, 255, 255, 0.5);
	box-shadow: 0 8px 32px 0 rgba(31, 38, 135, 0.07);
	}

	.main-title {
	font-size: 3rem;
	font-weight: 700;
	background: linear-gradient(90deg, #1e3a8a, #3b82f6);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	margin-bottom: 0.5rem;
	}

	.sub-title {
	color: #64748b;
	font-size: 1.1rem;
	}

	/* Section Styling */
	.stSelectbox, .stTextInput, .stButton {
	margin-bottom: 1rem;
	}

	/* Card-like containers for results */
	.result-card {
	background: white;
	padding: 1.5rem;
	border-radius: 15px;
	box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
	margin-bottom: 1.5rem;
	border-left: 5px solid #3b82f6;
	}

	/* Step indicators */
	.step-header {
	font-weight: 600;
	color: #1e293b;
	margin-bottom: 1rem;
	display: flex;
	align-items: center;
	gap: 0.5rem;
	}

	.step-number {
	background: #3b82f6;
	color: white;
	width: 24px;
	height: 24px;
	border-radius: 50%;
	display: flex;
	align-items: center;
	justify-content: center;
	font-size: 0.8rem;
	}
	</style>

	<div class="title-container">
	<div class="main-title">DORA PubTypeConverter</div>
	<div class="sub-title">DORA Publication Type Transformation Helper</div>
	</div>
	""", unsafe_allow_html=True)

	# Resource locations, all resolved relative to the directory holding this
	# script: the resource folder, its template sub-folder and the converter
	# configuration file.
	_this_file = os.path.abspath(__file__)
	BASE_DIR = os.path.dirname(_this_file)
	RESOURCE_DIR = os.path.join(BASE_DIR, "PubTypeConverter_resources")
	TEMPLATE_DIR, CONFIG_FILE = (
	    os.path.join(RESOURCE_DIR, leaf)
	    for leaf in ("PubTypeConverter_templates", "PubTypeConverterConfig.xml")
	)

	# One-time session setup. Both entries are created only when they are
	# missing from st.session_state, so an existing converter and the list of
	# fetched records are reused rather than rebuilt.
	if 'converter' not in st.session_state:
	    mods_converter = ModsConverter()
	    if not os.path.exists(CONFIG_FILE):
	        # No config file: the converter is still stored, just without the
	        # rules that load_config() would have supplied.
	        st.warning(f"Configuration file not found at {CONFIG_FILE}. Content moving rules will not be applied.")
	    else:
	        mods_converter.load_config(CONFIG_FILE)
	    st.session_state.converter = mods_converter

	# Records pulled from DORA; each entry is a dict with 'name' (str) and
	# 'content' (bytes) keys.
	if 'loaded_files' not in st.session_state:
	    st.session_state.loaded_files = []

	# Discover the available target templates: every entry in TEMPLATE_DIR
	# whose name ends in ".xml" (case-insensitive), in sorted order. A missing
	# directory is reported in the UI and leaves the list empty.
	templates = []
	if not os.path.exists(TEMPLATE_DIR):
	    st.error(f"Template directory not found: {TEMPLATE_DIR}")
	else:
	    templates = sorted(
	        entry for entry in os.listdir(TEMPLATE_DIR)
	        if entry.lower().endswith(".xml")
	    )


	# Seconds to wait for each individual DORA endpoint before moving on.
	_DORA_TIMEOUT_SECONDS = 10


	def _extract_mods_from_oai(content):
	    """Return the first MODS element of an OAI-PMH response as UTF-8 bytes.

	    Returns None when the payload is not parseable XML, when it contains an
	    OAI ``error`` element, or when no ``mods`` element is present. Elements
	    are matched by local name so any namespace prefix is accepted.
	    """
	    # The payload comes from the network: do not expand entities.
	    parser = etree.XMLParser(resolve_entities=False)
	    try:
	        root = etree.fromstring(content, parser=parser)
	    except (etree.LxmlError, ValueError):
	        return None

	    if root.xpath(".//*[local-name()='error']"):
	        return None

	    mods_nodes = root.xpath(".//*[local-name()='mods']")
	    if mods_nodes:
	        return etree.tostring(mods_nodes[0], encoding='utf-8')
	    return None


	def fetch_from_dora(pid_or_url, repo):
	    """Fetch a MODS record from DORA by PID or by direct URL.

	    Args:
	        pid_or_url: Either a full http(s) URL, a PID with namespace
	            ("psi:84411"), or a bare identifier ("84411"). Percent-encoding,
	            including double encoding, is undone before interpretation.
	        repo: Repository name used when the identifier carries no namespace.
	            A namespace in the PID takes precedence (lower-cased).

	    Returns:
	        ``(content, source_url)`` on success, where content is the MODS XML
	        as bytes, or ``(None, error_message)`` when every attempt failed.

	    For PIDs the endpoints are tried in order: admin host, public www host,
	    then the OAI-PMH interface. A direct URL is fetched as given, with no
	    fallbacks.
	    """
	    url = pid_or_url.strip()

	    # Handle double-encoded PIDs (e.g. psi%253A84411 -> psi%3A84411 ->
	    # psi:84411): unquote until the string stabilizes.
	    prev = None
	    while url != prev:
	        prev = url
	        url = unquote(url)
	    url = url.strip()

	    if not url:
	        # Nothing to look up; avoid firing requests for an empty PID.
	        return None, "Failed to fetch. Details: no PID or URL was provided"

	    headers = {'User-Agent': 'curl/7.68.0'}
	    status_errors = []
	    fallback_oai = None

	    if url.lower().startswith(("http://", "https://")):
	        # NOTE(security): a user-supplied URL is fetched server-side as-is.
	        # Consider restricting the host if this app is exposed publicly.
	        page_candidates = [("URL", url)]
	    else:
	        # Build the PID; input looks like "psi:12345" or just "12345".
	        if ":" in url:
	            namespace, local_id = url.split(":")[:2]
	            # The repository segment of the URL follows the PID namespace.
	            repo = namespace.lower()
	            pid_val = f"{repo}:{local_id}"
	        else:
	            pid_val = f"{repo}:{url}"

	        # URL-encode the PID part - essential for Islandora.
	        quoted_pid = quote(pid_val)
	        object_path = f"{repo}/islandora/object/{quoted_pid}/datastream/MODS/view"
	        page_candidates = [
	            # Primary: Admin (Intranet, preferred)
	            ("Admin", f"https://admin.dora.lib4ri.ch/{object_path}"),
	            # Fallback 1: WWW (Public mirror)
	            ("WWW", f"https://www.dora.lib4ri.ch/{object_path}"),
	        ]
	        # Fallback 2: OAI-PMH, with the quoted PID as identifier.
	        fallback_oai = f"https://www.dora.lib4ri.ch/{repo}/oai2/request?verb=GetRecord&metadataPrefix=mods&identifier={quoted_pid}"

	    for label, candidate in page_candidates:
	        try:
	            response = requests.get(candidate, headers=headers, timeout=_DORA_TIMEOUT_SECONDS)
	            response.raise_for_status()
	            return response.content, candidate
	        except Exception as exc:
	            # Best-effort boundary: record the failure and try the next
	            # endpoint instead of aborting the whole lookup.
	            status_errors.append(f"{label}: {exc}")

	    if fallback_oai:
	        try:
	            response = requests.get(fallback_oai, headers=headers, timeout=_DORA_TIMEOUT_SECONDS)
	        except Exception as exc:
	            status_errors.append(f"OAI: {exc}")
	        else:
	            if response.status_code != 200:
	                status_errors.append(f"OAI: HTTP {response.status_code}")
	            else:
	                mods_content = _extract_mods_from_oai(response.content)
	                if mods_content:
	                    return mods_content, fallback_oai
	                status_errors.append("OAI: Valid HTTP but no MODS found in response")

	    return None, f"Failed to fetch. Details: {'; '.join(status_errors)}"

	# UI layout: a wide left column for choosing the source documents (step 1)
	# and a narrower right column for choosing the target format (step 2).
	main_col1, main_col2 = st.columns([0.6, 0.4], gap="large")

	with main_col1:
	    st.markdown('<div class="step-header"><div class="step-number">1</div><span>Select Source Data</span></div>', unsafe_allow_html=True)

	    input_tab1, input_tab2 = st.tabs(["🌐 Pull from DORA", "📁 Upload Local XML"])

	    with input_tab1:
	        dora_col1, dora_col2 = st.columns([0.7, 0.3])
	        dora_input = dora_col1.text_input("PID or URL", placeholder="e.g. psi:84411", label_visibility="collapsed")
	        repo_select = dora_col2.selectbox("Repo", ["psi", "eawag", "empa", "wsl"], label_visibility="collapsed")

	        if st.button("Fetch and Load Record", use_container_width=True):
	            # Strip first so whitespace-only input counts as empty and pasted
	            # padding does not end up in the generated file name.
	            identifier = dora_input.strip()
	            if identifier:
	                with st.spinner("Retrieving from DORA..."):
	                    content, error_or_url = fetch_from_dora(identifier, repo_select)
	                    if content:
	                        # Derive a file name from the PID; URLs get a generic name.
	                        filename = identifier.replace(":", "_").replace("/", "_") + ".xml"
	                        if filename.startswith("http"):
	                            filename = "dora_record.xml"
	                        st.session_state.loaded_files.append({"name": filename, "content": content, "source": error_or_url})
	                        st.toast(f"Loaded {filename}", icon="✅")
	                    else:
	                        st.error(f"Fetch failed: {error_or_url}")
	            else:
	                st.warning("Please provide a identifier first.")

	    with input_tab2:
	        uploaded_files = st.file_uploader("Upload MODS XML files", type=['xml'], accept_multiple_files=True, label_visibility="collapsed")

	    # List the records fetched from DORA, each with a remove button.
	    if st.session_state.loaded_files:
	        st.markdown("### Loaded Documents")
	        for i, file_data in enumerate(st.session_state.loaded_files):
	            with st.container():
	                f_col1, f_col2 = st.columns([0.85, 0.15])
	                f_col1.markdown(f"📄 {file_data['name']}")
	                if f_col2.button("🗑️", key=f"remove_{i}", help="Remove this file"):
	                    # Rerun immediately so the list is redrawn without the
	                    # removed entry (and iteration over it stops here).
	                    st.session_state.loaded_files.pop(i)
	                    st.rerun()

	with main_col2:
	    st.markdown('<div class="step-header"><div class="step-number">2</div><span>Target Format</span></div>', unsafe_allow_html=True)

	    # Show a placeholder entry when no template was discovered.
	    template_choices = templates if templates else ["No templates found"]
	    selected_template = st.selectbox("Choose the destination publication type", template_choices, label_visibility="collapsed")

	    st.markdown("---")

	    # Conversion needs at least one document (uploaded or fetched) and at
	    # least one template; otherwise the button is disabled.
	    has_documents = bool(uploaded_files or st.session_state.loaded_files)
	    start_clicked = st.button(
	        "🚀 Start Conversion",
	        disabled=not (has_documents and templates),
	        use_container_width=True,
	        type="primary",
	    )
	    # The flag mirrors the click: True only on the run triggered by the button.
	    st.session_state.start_convert = bool(start_clicked)

	# Merge both sources into one work list: uploaded files first (converted to
	# the same name/content dict shape), then the records fetched from DORA.
	all_files = [
	    {"name": upload.name, "content": upload.getvalue()}
	    for upload in (uploaded_files or [])
	]
	if st.session_state.loaded_files:
	    all_files += st.session_state.loaded_files

	# Step 3: convert every collected document against the selected template
	# and render one report card per document.
	# NOTE(review): "start_convert" is reset to False on any rerun in which the
	# start button was not clicked, so interacting with another widget (such as
	# a download button) presumably hides these reports - confirm this is intended.
	if st.session_state.get("start_convert"):
	    st.markdown('<div class="step-header"><div class="step-number">3</div><span>Conversion Reports</span></div>', unsafe_allow_html=True)

	    # The template is the same for every document; resolve it once.
	    template_path = os.path.join(TEMPLATE_DIR, selected_template)

	    for file_index, file_data in enumerate(all_files):
	        content = file_data['content']
	        filename = file_data['name']

	        try:
	            # result_xml is the XML string, log_data is the structured report dict
	            result_xml, log_data = st.session_state.converter.convert(content, template_path)

	            # File names and genre values come from user-supplied documents and
	            # are rendered with unsafe_allow_html, so escape them first.
	            safe_name = html.escape(str(filename))
	            old_genre = html.escape(str(log_data['old_genre']))
	            new_genre = html.escape(str(log_data['new_genre']))

	            with st.container():
	                st.markdown(f"""
	<div class="result-card">
	<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 1rem;">
	<span style="font-weight: 700; font-size: 1.2rem; color: #1e3a8a;">{safe_name}</span>
	<span style="background: #dbeafe; color: #1e40af; padding: 0.2rem 0.8rem; border-radius: 999px; font-size: 0.8rem; font-weight: 600;">TRANSFORMED</span>
	</div>
	<div style="display: flex; align-items: center; gap: 1rem; margin-bottom: 1.5rem;">
	<span style="background: #f1f5f9; padding: 0.4rem 1rem; border-radius: 8px; font-size: 0.9rem; border: 1px solid #e2e8f0;">{old_genre}</span>
	<span style="color: #94a3b8;">➡️</span>
	<span style="background: #ecfdf5; color: #065f46; padding: 0.4rem 1rem; border-radius: 8px; font-size: 0.9rem; border: 1px solid #d1fae5; font-weight: 600;">{new_genre}</span>
	</div>
	""", unsafe_allow_html=True)

	                # Headline counts for the three kinds of change.
	                s_col1, s_col2, s_col3 = st.columns(3)
	                s_col1.metric("Transfers", len(log_data["moves"]))
	                s_col2.metric("Additions", len(log_data["additions"]))
	                s_col3.metric("Removals", len(log_data["deletions"]))

	                with st.expander("Audit Transformation Details", expanded=False):
	                    if log_data["moves"]:
	                        st.write("🔄 Content Transfers")
	                        for m in log_data["moves"]:
	                            st.caption(f"• {m['summary']}")
	                    if log_data["additions"]:
	                        st.write("✨ Smart Additions")
	                        for a in log_data["additions"]:
	                            st.caption(f"• {a['summary']}")
	                    if log_data["deletions"]:
	                        st.write("🗑️ Legacy Cleanup")
	                        del_labels = [d['label'] for d in log_data["deletions"]]
	                        # Only the first eight labels are listed; "..." marks truncation.
	                        st.caption(f"Removed {len(del_labels)} unused fields: " + ", ".join(del_labels[:8]) + ("..." if len(del_labels) > 8 else ""))

	                if log_data["warnings"]:
	                    for w in log_data["warnings"]:
	                        st.warning(w)

	                if result_xml:
	                    st.download_button(
	                        label=f"⬇️ Download {filename}",
	                        data=result_xml,
	                        file_name=f"{os.path.splitext(filename)[0]}_converted.xml",
	                        mime="application/xml",
	                        # The index keeps the key unique when two documents
	                        # share a file name (e.g. the same PID fetched twice).
	                        key=f"dl_{file_index}_{filename}",
	                        use_container_width=True
	                    )
	                st.markdown('</div>', unsafe_allow_html=True)
	        except Exception as e:
	            # Per-document boundary: report the failure and continue with the
	            # remaining documents.
	            st.error(f"Error converting {filename}: {e}")
	            st.code(traceback.format_exc())