""" Tests du générateur METS v1.12 (Sprint 3 — Session B). Vérifie : - XML produit est valide (parseable par lxml) - 6 sections obligatoires présentes : metsHdr, dmdSec, amdSec, fileSec, structMap PHYSICAL, structMap LOGICAL - fileSec : 3 fileGrp (master, derivative_web, alto), 3 fichiers par page - structMap PHYSICAL : ordre respecte PageMaster.sequence (pas l'ordre de la liste) - structMap LOGICAL : 1 seul div TYPE="manuscript" - Chemins ALTO construits depuis corpus_slug + folio_label - Métadonnées Dublin Core présentes dans dmdSec - amdSec : techMD global avec model_id du premier master.processing - manuscript_id, label, corpus_slug obligatoires → ValueError sinon - Liste vide → ValueError explicite - Scénarios réalistes : Beatus HR + BR (1 manuscrit), Grandes Chroniques (autre) """ # 1. stdlib import json from datetime import datetime, timezone from pathlib import Path # 2. third-party import pytest from lxml import etree # 3. local from app.schemas.page_master import EditorialInfo, EditorialStatus, OCRResult, PageMaster, ProcessingInfo from app.services.export.mets import generate_mets, write_mets # ── Namespaces ──────────────────────────────────────────────────────────────── _METS_NS = "http://www.loc.gov/METS/" _DC_NS = "http://purl.org/dc/elements/1.1/" _XLINK_NS = "http://www.w3.org/1999/xlink" _NS = {"m": _METS_NS, "dc": _DC_NS, "xlink": _XLINK_NS} # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _parse(xml_str: str) -> etree._Element: return etree.fromstring(xml_str.encode("utf-8")) def _xp(root: etree._Element, path: str) -> list: return root.xpath(path, namespaces=_NS) def _one(root: etree._Element, path: str) -> etree._Element: results = _xp(root, path) assert len(results) == 1, f"Expected 1 for {path!r}, got {len(results)}: {results}" return results[0] def _make_page( page_id: str, folio_label: str, sequence: int, original_url: str = "", derivative_web: str = "", with_processing: bool = False, ocr_text: str = "", ) -> PageMaster: processing = None if with_processing: processing = ProcessingInfo( provider="google_ai_studio", model_id="gemini-2.0-flash", model_display_name="Gemini 2.0 Flash", prompt_version="prompts/medieval-illuminated/primary_v1.txt", raw_response_path=f"/data/corpora/test/pages/{folio_label}/ai_raw.json", processed_at=datetime(2024, 6, 15, 12, 0, 0, tzinfo=timezone.utc), ) ocr = OCRResult(diplomatic_text=ocr_text, language="la", confidence=0.90) if ocr_text else None return PageMaster( page_id=page_id, corpus_profile="medieval-illuminated", manuscript_id="ms-test", folio_label=folio_label, sequence=sequence, image={ "master": original_url or f"https://example.com/{folio_label}.jpg", "derivative_web": derivative_web or f"/data/deriv/{folio_label}.jpg", "thumbnail": f"/data/thumb/{folio_label}.jpg", "width": 1500, "height": 2000, }, layout={"regions": []}, ocr=ocr, processing=processing, editorial=EditorialInfo(status=EditorialStatus.MACHINE_DRAFT), ) def _base_meta(corpus_slug: str = "test-ms", label: str = "Test Manuscript") -> dict: return { "manuscript_id": "ms-test-001", "label": label, "corpus_slug": corpus_slug, } # ── Fixtures réalistes (3 PageMaster du Sprint 2) ─────────────────────────── @pytest.fixture def beatus_pages(): """2 pages du Beatus (HR + BR) — même manuscrit, 2 folios.""" return [ _make_page( page_id="beatus-lat8878-hr-f233", folio_label="f233-hr", sequence=233, original_url="https://gallica.bnf.fr/iiif/ark:/12148/btv1b52505441p/f233/full/full/0/native.jpg", derivative_web="/data/corpora/beatus-lat8878/derivatives/f233-hr.jpg", with_processing=True, ocr_text="Incipit explanatio", ), _make_page( page_id="beatus-lat8878-br-f233", folio_label="f233-br", sequence=234, original_url="https://gallica.bnf.fr/iiif/ark:/12148/btv1b52505441p/f233/full/600,/0/native.jpg", derivative_web="/data/corpora/beatus-lat8878/derivatives/f233-br.jpg", with_processing=False, ), ] @pytest.fixture def beatus_meta(): return { "manuscript_id": "BnF-Latin-8878", "label": "Beatus de Saint-Sever", "corpus_slug": "beatus-lat8878", "language": "la", "repository": "Bibliothèque nationale de France", "shelfmark": "Latin 8878", "date_label": "XIe siècle", "institution": "BnF", } @pytest.fixture def chroniques_pages(): return [ _make_page( page_id="chroniques-btv1b84472995-f16", folio_label="f16", sequence=16, original_url="https://gallica.bnf.fr/iiif/ark:/12148/btv1b84472995/f16/full/full/0/native.jpg", derivative_web="/data/corpora/grandes-chroniques/derivatives/f16.jpg", with_processing=True, ocr_text="Cy commence le prologue", ), ] @pytest.fixture def chroniques_meta(): return { "manuscript_id": "BnF-btv1b84472995", "label": "Grandes Chroniques de France", "corpus_slug": "grandes-chroniques", "language": "fr", "repository": "Bibliothèque nationale de France", } # --------------------------------------------------------------------------- # Tests — validité XML # --------------------------------------------------------------------------- def test_generate_mets_returns_string(beatus_pages, beatus_meta): result = generate_mets(beatus_pages, beatus_meta) assert isinstance(result, str) def test_generate_mets_valid_xml(beatus_pages, beatus_meta): xml_str = generate_mets(beatus_pages, beatus_meta) root = _parse(xml_str) assert root is not None def test_generate_mets_xml_declaration(beatus_pages, beatus_meta): xml_str = generate_mets(beatus_pages, beatus_meta) assert xml_str.startswith("