IIIF-Studio / backend /app /api /v1 /export.py
Claude
fix: comprehensive repo audit β€” 15 issues fixed
9097545 unverified
"""
Endpoints d'export documentaire (R10 β€” prΓ©fixe /api/v1/).
GET /api/v1/manuscripts/{manuscript_id}/iiif-manifest β†’ JSON
GET /api/v1/manuscripts/{manuscript_id}/mets β†’ XML
GET /api/v1/pages/{page_id}/alto β†’ XML
GET /api/v1/manuscripts/{manuscript_id}/export.zip β†’ ZIP
Règle (R02) : toutes les sorties sont générées depuis les PageMasters
(master.json), jamais depuis les rΓ©ponses brutes de l'IA.
"""
# 1. stdlib
import asyncio
import io
import json
import logging
import zipfile
from pathlib import Path
# 2. third-party
from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import Response, StreamingResponse
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
# 3. local
from app import config as _config_module
from app.models.corpus import CorpusModel, ManuscriptModel, PageModel
from app.models.database import get_db
from app.schemas.page_master import PageMaster
from app.services.export.alto import generate_alto
from app.services.export.iiif import generate_manifest
from app.services.export.mets import generate_mets
logger = logging.getLogger(__name__)
router = APIRouter(tags=["export"])
# ── Helpers ───────────────────────────────────────────────────────────────────
async def _load_manuscript_with_masters(
manuscript_id: str,
db: AsyncSession,
) -> tuple[ManuscriptModel, CorpusModel, list[PageMaster]]:
"""Charge un manuscrit, son corpus et tous ses PageMasters.
Raises:
HTTPException 404: si le manuscrit ou son corpus est introuvable.
HTTPException 404: si aucun master.json n'est disponible.
"""
manuscript = await db.get(ManuscriptModel, manuscript_id)
if manuscript is None:
raise HTTPException(status_code=404, detail="Manuscrit introuvable")
corpus = await db.get(CorpusModel, manuscript.corpus_id)
if corpus is None:
raise HTTPException(status_code=404, detail="Corpus introuvable")
# Pages dans l'ordre de sΓ©quence
result = await db.execute(
select(PageModel)
.where(PageModel.manuscript_id == manuscript_id)
.order_by(PageModel.sequence)
)
pages = list(result.scalars().all())
masters: list[PageMaster] = []
for page in pages:
master = await _read_master_json(corpus.slug, page.folio_label)
if master is not None:
masters.append(master)
if not masters:
raise HTTPException(
status_code=404,
detail="Aucun master.json disponible pour ce manuscrit",
)
return manuscript, corpus, masters
def _read_master_json_sync(corpus_slug: str, folio_label: str) -> PageMaster | None:
"""Lit le master.json d'une page depuis data/. Retourne None si absent (bloquant)."""
path = (
_config_module.settings.data_dir
/ "corpora"
/ corpus_slug
/ "pages"
/ folio_label
/ "master.json"
)
if not path.exists():
return None
raw = json.loads(path.read_text(encoding="utf-8"))
return PageMaster.model_validate(raw)
async def _read_master_json(corpus_slug: str, folio_label: str) -> PageMaster | None:
"""Version async — délègue la lecture au threadpool."""
return await asyncio.to_thread(_read_master_json_sync, corpus_slug, folio_label)
def _build_manuscript_meta(
manuscript: ManuscriptModel, corpus: CorpusModel
) -> dict:
return {
"manuscript_id": manuscript.id,
"label": manuscript.title,
"corpus_slug": corpus.slug,
"shelfmark": manuscript.shelfmark,
"date_label": manuscript.date_label,
}
# ── Endpoints ─────────────────────────────────────────────────────────────────
@router.get("/manuscripts/{manuscript_id}/iiif-manifest")
async def get_iiif_manifest(
manuscript_id: str, db: AsyncSession = Depends(get_db)
) -> dict:
"""Génère et retourne le Manifest IIIF 3.0 du manuscrit (R02)."""
manuscript, corpus, masters = await _load_manuscript_with_masters(
manuscript_id, db
)
meta = _build_manuscript_meta(manuscript, corpus)
manifest = generate_manifest(
masters, meta, _config_module.settings.base_url
)
logger.info(
"Manifest IIIF servi",
extra={"manuscript_id": manuscript_id, "pages": len(masters)},
)
return manifest
@router.get("/manuscripts/{manuscript_id}/mets")
async def get_mets(
manuscript_id: str, db: AsyncSession = Depends(get_db)
) -> Response:
"""Génère et retourne le METS XML du manuscrit (R02)."""
manuscript, corpus, masters = await _load_manuscript_with_masters(
manuscript_id, db
)
meta = _build_manuscript_meta(manuscript, corpus)
mets_xml = generate_mets(masters, meta)
return Response(
content=mets_xml,
media_type="application/xml; charset=utf-8",
)
@router.get("/pages/{page_id}/alto")
async def get_alto(page_id: str, db: AsyncSession = Depends(get_db)) -> Response:
"""Génère et retourne l'ALTO XML d'une page (R02)."""
page = await db.get(PageModel, page_id)
if page is None:
raise HTTPException(status_code=404, detail="Page introuvable")
manuscript = await db.get(ManuscriptModel, page.manuscript_id)
if manuscript is None:
raise HTTPException(status_code=404, detail="Manuscrit introuvable")
corpus = await db.get(CorpusModel, manuscript.corpus_id)
if corpus is None:
raise HTTPException(status_code=404, detail="Corpus introuvable")
master = await _read_master_json(corpus.slug, page.folio_label)
if master is None:
raise HTTPException(
status_code=404,
detail="master.json introuvable β€” la page n'a pas encore Γ©tΓ© analysΓ©e",
)
alto_xml = generate_alto(master)
return Response(
content=alto_xml,
media_type="application/xml; charset=utf-8",
)
@router.get("/manuscripts/{manuscript_id}/export.zip")
async def get_export_zip(
manuscript_id: str, db: AsyncSession = Depends(get_db)
) -> StreamingResponse:
"""Génère et retourne un ZIP contenant manifest.json + mets.xml + alto par page.
Structure du ZIP :
manifest.json
mets.xml
alto/{page_id}.xml
"""
manuscript, corpus, masters = await _load_manuscript_with_masters(
manuscript_id, db
)
if len(masters) > 500:
raise HTTPException(
status_code=413,
detail=f"Le manuscrit contient {len(masters)} pages. "
"L'export ZIP est limitΓ© Γ  500 pages maximum. "
"Exportez les pages individuellement via GET /pages/{id}/alto.",
)
meta = _build_manuscript_meta(manuscript, corpus)
buf = io.BytesIO()
with zipfile.ZipFile(buf, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
# manifest.json
manifest = generate_manifest(
masters, meta, _config_module.settings.base_url
)
zf.writestr(
"manifest.json",
json.dumps(manifest, ensure_ascii=False, indent=2),
)
# mets.xml
mets_xml = generate_mets(masters, meta)
zf.writestr("mets.xml", mets_xml)
# alto/{page_id}.xml
for master in masters:
alto_xml = generate_alto(master)
zf.writestr(f"alto/{master.page_id}.xml", alto_xml)
buf.seek(0)
filename = f"{manuscript_id}.zip"
logger.info(
"Export ZIP gΓ©nΓ©rΓ©",
extra={"manuscript_id": manuscript_id, "pages": len(masters)},
)
return StreamingResponse(
buf,
media_type="application/zip",
headers={"Content-Disposition": f'attachment; filename="{filename}"'},
)