IIIF-Studio / backend /tests /test_api_ingest.py
Claude
fix(sprint-f4-f5): tests faux-positifs, frontend bugs, Dockerfile unique
35a94af unverified
"""
Tests des endpoints d'ingestion /api/v1/corpora/{id}/ingest/* (Sprint 4 — Session B).
Stratégie :
- BDD SQLite en mémoire
- Appels réseau mockés via monkeypatch (_fetch_json_manifest)
- Écriture disque mockée via monkeypatch (Path.mkdir, Path.write_bytes)
Vérifie :
- POST /ingest/files → pages créées, IDs retournés
- POST /ingest/iiif-manifest → manifest parsé, pages créées
- POST /ingest/iiif-images → pages créées depuis liste d'URLs
- 404 si corpus inexistant
- 422 si données invalides
"""
# 1. stdlib
import uuid
from datetime import datetime, timezone
from pathlib import Path
from unittest.mock import AsyncMock, patch
# 2. third-party
import pytest
# 3. local
import app.api.v1.ingest as ingest_module
from app.models.corpus import CorpusModel
from tests.conftest_api import async_client, db_session # noqa: F401
# Timezone-aware UTC timestamp captured once at import time; _make_corpus
# reuses it for both created_at and updated_at of every corpus it builds.
_NOW = datetime.now(timezone.utc)
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
async def _make_corpus(db, slug="test-ingest"):
    """Persist a new ``CorpusModel`` row and return it.

    Args:
        db: Session object exposing ``add`` plus awaitable ``commit`` and
            ``refresh``.
        slug: Slug stored on the corpus.

    Returns:
        The committed and refreshed ``CorpusModel`` instance.
    """
    fields = {
        "id": str(uuid.uuid4()),
        "slug": slug,
        "title": "Corpus Test",
        "profile_id": "medieval-illuminated",
        "created_at": _NOW,
        "updated_at": _NOW,
    }
    record = CorpusModel(**fields)
    db.add(record)
    await db.commit()
    await db.refresh(record)
    return record
def _iiif3_manifest(n_canvases: int = 3) -> dict:
"""Génère un manifest IIIF 3.0 minimal avec n canvases."""
return {
"@context": "http://iiif.io/api/presentation/3/context.json",
"id": "https://example.com/manifest",
"type": "Manifest",
"label": {"fr": ["Beatus de Saint-Sever"]},
"items": [
{
"id": f"https://example.com/canvas/{i}",
"type": "Canvas",
"label": {"none": [f"f{i:03d}r"]},
"width": 1500, "height": 2000,
"items": [
{
"id": f"https://example.com/canvas/{i}/page",
"type": "AnnotationPage",
"items": [
{
"id": f"https://example.com/canvas/{i}/annotation",
"type": "Annotation",
"motivation": "painting",
"body": {
"id": f"https://example.com/images/{i}.jpg",
"type": "Image",
"format": "image/jpeg",
},
"target": f"https://example.com/canvas/{i}",
}
],
}
],
}
for i in range(1, n_canvases + 1)
],
}
def _iiif2_manifest(n_canvases: int = 2) -> dict:
"""Génère un manifest IIIF 2.x minimal."""
return {
"@context": "http://iiif.io/api/presentation/2/context.json",
"@type": "sc:Manifest",
"label": "Test Manuscript 2.x",
"sequences": [
{
"canvases": [
{
"@id": f"https://example.com/canvas/{i}",
"@type": "sc:Canvas",
"label": f"f{i:03d}r",
"images": [
{
"resource": {
"@id": f"https://example.com/images/{i}.jpg"
}
}
],
}
for i in range(1, n_canvases + 1)
]
}
],
}
# ---------------------------------------------------------------------------
# POST /api/v1/corpora/{id}/ingest/files
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_ingest_files_corpus_not_found(async_client):
    """Uploading a file to the corpus ID ``nonexistent`` returns HTTP 404."""
    upload = ("files", ("img.jpg", b"data", "image/jpeg"))
    resp = await async_client.post(
        "/api/v1/corpora/nonexistent/ingest/files",
        files=[upload],
    )
    assert resp.status_code == 404
@pytest.mark.asyncio
async def test_ingest_files_ok(async_client, db_session, tmp_path, monkeypatch):
    """Uploading two files returns 201 with two page IDs and the corpus ID.

    ``settings.data_dir`` is pointed at ``tmp_path`` for the duration of the
    test so that uploaded files never reach the real data directory.
    """
    import app.config as _cfg

    corpus = await _make_corpus(db_session)

    # Patch the attribute on the real settings object (as the sibling tests
    # do) instead of swapping ``app.config.settings`` for a stub: a stub is
    # invisible to any code that already holds a reference to the original
    # settings instance. monkeypatch restores the value on teardown.
    monkeypatch.setattr(_cfg.settings, "data_dir", tmp_path)

    response = await async_client.post(
        f"/api/v1/corpora/{corpus.id}/ingest/files",
        files=[
            ("files", ("f001r.jpg", b"fake_jpeg_data_1", "image/jpeg")),
            ("files", ("f002r.jpg", b"fake_jpeg_data_2", "image/jpeg")),
        ],
    )

    assert response.status_code == 201
    data = response.json()
    assert data["pages_created"] == 2
    assert len(data["page_ids"]) == 2
    assert data["corpus_id"] == corpus.id
@pytest.mark.asyncio
async def test_ingest_files_creates_manuscript(async_client, db_session, tmp_path):
    """The file-upload response contains a non-empty ``manuscript_id``."""
    import app.config as _cfg

    corpus = await _make_corpus(db_session)
    endpoint = f"/api/v1/corpora/{corpus.id}/ingest/files"
    uploads = [("files", ("f001r.jpg", b"data", "image/jpeg"))]

    # Redirect writes to tmp_path; the previous value is restored in finally.
    saved_data_dir = _cfg.settings.data_dir
    _cfg.settings.data_dir = tmp_path
    try:
        resp = await async_client.post(endpoint, files=uploads)
        payload = resp.json()
        assert "manuscript_id" in payload
        assert payload["manuscript_id"]  # must not be empty
    finally:
        _cfg.settings.data_dir = saved_data_dir
@pytest.mark.asyncio
async def test_ingest_files_folio_from_filename(async_client, db_session, tmp_path):
    """The file name without its extension shows up in a returned page ID."""
    import app.config as _cfg

    corpus = await _make_corpus(db_session)
    endpoint = f"/api/v1/corpora/{corpus.id}/ingest/files"
    uploads = [("files", ("f013v.jpg", b"data", "image/jpeg"))]

    # Redirect writes to tmp_path; the previous value is restored in finally.
    saved_data_dir = _cfg.settings.data_dir
    _cfg.settings.data_dir = tmp_path
    try:
        resp = await async_client.post(endpoint, files=uploads)
        page_ids = resp.json()["page_ids"]
        # At least one page ID must embed the folio label.
        matching = [pid for pid in page_ids if "f013v" in pid]
        assert bool(matching)
    finally:
        _cfg.settings.data_dir = saved_data_dir
@pytest.mark.asyncio
async def test_ingest_files_writes_to_disk(async_client, db_session, tmp_path):
    """The upload is stored at ``corpora/{slug}/masters/{folio}/{filename}`` under the data dir."""
    import app.config as _cfg

    corpus = await _make_corpus(db_session, slug="test-write")
    endpoint = f"/api/v1/corpora/{corpus.id}/ingest/files"
    uploads = [("files", ("f001r.jpg", b"JPEG_CONTENT", "image/jpeg"))]

    # Redirect writes to tmp_path; the previous value is restored in finally.
    saved_data_dir = _cfg.settings.data_dir
    _cfg.settings.data_dir = tmp_path
    try:
        await async_client.post(endpoint, files=uploads)
        master_file = tmp_path.joinpath(
            "corpora", "test-write", "masters", "f001r", "f001r.jpg"
        )
        assert master_file.exists()
        assert master_file.read_bytes() == b"JPEG_CONTENT"
    finally:
        _cfg.settings.data_dir = saved_data_dir
# ---------------------------------------------------------------------------
# POST /api/v1/corpora/{id}/ingest/iiif-manifest
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_ingest_manifest_corpus_not_found(async_client):
    """Posting a manifest URL to the corpus ID ``nonexistent`` returns HTTP 404."""
    body = {"manifest_url": "https://example.com/manifest"}
    resp = await async_client.post(
        "/api/v1/corpora/nonexistent/ingest/iiif-manifest",
        json=body,
    )
    assert resp.status_code == 404
@pytest.mark.asyncio
async def test_ingest_manifest_iiif3_ok(async_client, db_session, monkeypatch):
    """A three-canvas IIIF 3.0 manifest yields 201 and three created pages."""
    canned = _iiif3_manifest(n_canvases=3)

    async def _stub_fetch(url: str) -> dict:
        # Stand-in for the network fetch: always hands back the canned manifest.
        return canned

    monkeypatch.setattr(ingest_module, "_fetch_json_manifest", _stub_fetch)
    corpus = await _make_corpus(db_session)

    resp = await async_client.post(
        f"/api/v1/corpora/{corpus.id}/ingest/iiif-manifest",
        json={"manifest_url": "https://example.com/manifest"},
    )

    assert resp.status_code == 201
    payload = resp.json()
    assert payload["pages_created"] == 3
    assert len(payload["page_ids"]) == 3
@pytest.mark.asyncio
async def test_ingest_manifest_iiif2_ok(async_client, db_session, monkeypatch):
    """A two-canvas IIIF 2.x manifest yields 201 and two created pages."""
    canned = _iiif2_manifest(n_canvases=2)

    async def _stub_fetch(url: str) -> dict:
        # Stand-in for the network fetch: always hands back the canned manifest.
        return canned

    monkeypatch.setattr(ingest_module, "_fetch_json_manifest", _stub_fetch)
    corpus = await _make_corpus(db_session)

    resp = await async_client.post(
        f"/api/v1/corpora/{corpus.id}/ingest/iiif-manifest",
        json={"manifest_url": "https://example.com/manifest"},
    )

    assert resp.status_code == 201
    payload = resp.json()
    assert payload["pages_created"] == 2
@pytest.mark.asyncio
async def test_ingest_manifest_extracts_folio_labels(async_client, db_session, monkeypatch):
    """Canvas labels from the manifest appear inside the returned page IDs."""
    canned = _iiif3_manifest(n_canvases=2)

    async def _stub_fetch(url: str) -> dict:
        # Stand-in for the network fetch: always hands back the canned manifest.
        return canned

    monkeypatch.setattr(ingest_module, "_fetch_json_manifest", _stub_fetch)
    corpus = await _make_corpus(db_session)

    resp = await async_client.post(
        f"/api/v1/corpora/{corpus.id}/ingest/iiif-manifest",
        json={"manifest_url": "https://example.com/manifest"},
    )
    payload = resp.json()

    # The helper labels its two canvases "f001r" and "f002r".
    assert any("f001r" in page_id for page_id in payload["page_ids"])
    assert any("f002r" in page_id for page_id in payload["page_ids"])
@pytest.mark.asyncio
async def test_ingest_manifest_empty_canvases_422(async_client, db_session, monkeypatch):
    """A manifest whose ``items`` list is empty is rejected with HTTP 422."""

    async def _stub_fetch(url: str) -> dict:
        # Manifest with no canvases at all.
        return {"type": "Manifest", "items": []}

    monkeypatch.setattr(ingest_module, "_fetch_json_manifest", _stub_fetch)
    corpus = await _make_corpus(db_session)

    resp = await async_client.post(
        f"/api/v1/corpora/{corpus.id}/ingest/iiif-manifest",
        json={"manifest_url": "https://example.com/manifest"},
    )

    assert resp.status_code == 422
@pytest.mark.asyncio
async def test_ingest_manifest_network_error_502(async_client, db_session, monkeypatch):
    """An ``httpx.RequestError`` raised while fetching the manifest maps to HTTP 502."""
    import httpx

    async def _failing_fetch(url: str) -> dict:
        # Simulate a transport-level failure.
        raise httpx.RequestError("Connection refused")

    corpus = await _make_corpus(db_session)
    monkeypatch.setattr(ingest_module, "_fetch_json_manifest", _failing_fetch)

    resp = await async_client.post(
        f"/api/v1/corpora/{corpus.id}/ingest/iiif-manifest",
        json={"manifest_url": "https://example.com/manifest"},
    )

    assert resp.status_code == 502
@pytest.mark.asyncio
async def test_ingest_manifest_returns_corpus_id(async_client, db_session, monkeypatch):
    """The manifest-ingest response echoes the ID of the target corpus."""
    corpus = await _make_corpus(db_session)
    fetch_mock = AsyncMock(return_value=_iiif3_manifest(1))
    monkeypatch.setattr(ingest_module, "_fetch_json_manifest", fetch_mock)

    resp = await async_client.post(
        f"/api/v1/corpora/{corpus.id}/ingest/iiif-manifest",
        json={"manifest_url": "https://example.com/manifest"},
    )
    payload = resp.json()

    assert payload["corpus_id"] == corpus.id
# ---------------------------------------------------------------------------
# POST /api/v1/corpora/{id}/ingest/iiif-images
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_ingest_images_corpus_not_found(async_client):
    """Posting image URLs to the corpus ID ``nonexistent`` returns HTTP 404."""
    body = {"urls": ["https://x.com/1.jpg"], "folio_labels": ["f001r"]}
    resp = await async_client.post(
        "/api/v1/corpora/nonexistent/ingest/iiif-images",
        json=body,
    )
    assert resp.status_code == 404
@pytest.mark.asyncio
async def test_ingest_images_ok(async_client, db_session):
    """Two image URLs with matching labels yield 201 and two created pages."""
    corpus = await _make_corpus(db_session)
    body = {
        "urls": ["https://example.com/img1.jpg", "https://example.com/img2.jpg"],
        "folio_labels": ["f001r", "f002r"],
    }

    resp = await async_client.post(
        f"/api/v1/corpora/{corpus.id}/ingest/iiif-images",
        json=body,
    )

    assert resp.status_code == 201
    payload = resp.json()
    assert payload["pages_created"] == 2
    assert len(payload["page_ids"]) == 2
@pytest.mark.asyncio
async def test_ingest_images_folio_labels_in_ids(async_client, db_session):
    """The supplied folio label appears inside a returned page ID."""
    corpus = await _make_corpus(db_session)
    body = {
        "urls": ["https://example.com/a.jpg"],
        "folio_labels": ["f013v"],
    }

    resp = await async_client.post(
        f"/api/v1/corpora/{corpus.id}/ingest/iiif-images",
        json=body,
    )
    page_ids = resp.json()["page_ids"]

    assert any("f013v" in page_id for page_id in page_ids)
@pytest.mark.asyncio
async def test_ingest_images_mismatched_lengths_422(async_client, db_session):
    """Two URLs paired with a single folio label are rejected with HTTP 422."""
    corpus = await _make_corpus(db_session)
    body = {
        "urls": ["https://a.com/1.jpg", "https://a.com/2.jpg"],
        "folio_labels": ["f001r"],
    }

    resp = await async_client.post(
        f"/api/v1/corpora/{corpus.id}/ingest/iiif-images",
        json=body,
    )

    assert resp.status_code == 422
@pytest.mark.asyncio
async def test_ingest_images_empty_urls_422(async_client, db_session):
    """Empty ``urls`` and ``folio_labels`` lists are rejected with HTTP 422."""
    corpus = await _make_corpus(db_session)
    body = {"urls": [], "folio_labels": []}

    resp = await async_client.post(
        f"/api/v1/corpora/{corpus.id}/ingest/iiif-images",
        json=body,
    )

    assert resp.status_code == 422
@pytest.mark.asyncio
async def test_ingest_images_pages_in_sequence_order(async_client, db_session):
    """Ingesting four image URLs reports four created pages.

    NOTE(review): only ``pages_created`` is asserted here; the sequence
    numbers of the created pages are not inspected.
    """
    corpus = await _make_corpus(db_session)
    n = 4
    indices = range(1, n + 1)
    body = {
        "urls": [f"https://example.com/{i}.jpg" for i in indices],
        "folio_labels": [f"f{i:03d}r" for i in indices],
    }

    resp = await async_client.post(
        f"/api/v1/corpora/{corpus.id}/ingest/iiif-images",
        json=body,
    )
    payload = resp.json()

    assert payload["pages_created"] == n
@pytest.mark.asyncio
async def test_ingest_images_corpus_id_in_response(async_client, db_session):
    """The image-ingest response echoes the ID of the target corpus."""
    corpus = await _make_corpus(db_session)
    body = {"urls": ["https://x.com/1.jpg"], "folio_labels": ["f001r"]}

    resp = await async_client.post(
        f"/api/v1/corpora/{corpus.id}/ingest/iiif-images",
        json=body,
    )
    payload = resp.json()

    assert payload["corpus_id"] == corpus.id
# ---------------------------------------------------------------------------
# Réingestion — pas de 500
# ---------------------------------------------------------------------------
@pytest.mark.asyncio
async def test_reingest_manifest_skips_existing_pages(async_client, db_session, monkeypatch):
    """Ingesting the same manifest twice never fails and creates no duplicates.

    The first call must report ``pages_created=2`` / ``pages_skipped=0``; the
    second must still answer 201 with ``pages_created=0`` / ``pages_skipped=2``.
    Afterwards the manuscript must own exactly two pages in the database.
    """
    from sqlalchemy import select as sa_select
    from app.models.corpus import PageModel

    corpus = await _make_corpus(db_session, slug="reingest")
    canned = _iiif3_manifest(n_canvases=2)

    async def _stub_fetch(url: str) -> dict:
        # Stand-in for the network fetch: always hands back the canned manifest.
        return canned

    monkeypatch.setattr(ingest_module, "_fetch_json_manifest", _stub_fetch)

    async def _ingest_once():
        return await async_client.post(
            f"/api/v1/corpora/{corpus.id}/ingest/iiif-manifest",
            json={"manifest_url": "https://example.com/manifest"},
        )

    # First pass: every canvas becomes a new page.
    first = await _ingest_once()
    assert first.status_code == 201
    first_payload = first.json()
    assert first_payload["pages_created"] == 2
    assert first_payload["pages_skipped"] == 0

    # Second pass with the identical manifest: everything is skipped.
    second = await _ingest_once()
    assert second.status_code == 201
    second_payload = second.json()
    assert second_payload["pages_created"] == 0
    assert second_payload["pages_skipped"] == 2

    # The database must hold exactly the two original pages, no duplicates.
    query = sa_select(PageModel).where(
        PageModel.manuscript_id == first_payload["manuscript_id"]
    )
    rows = await db_session.execute(query)
    stored_pages = list(rows.scalars().all())
    assert len(stored_pages) == 2
@pytest.mark.asyncio
async def test_reingest_images_skips_existing_pages(async_client, db_session):
    """Ingesting the same image list twice answers 201 both times; the repeat is skipped."""
    corpus = await _make_corpus(db_session, slug="reingest2")
    endpoint = f"/api/v1/corpora/{corpus.id}/ingest/iiif-images"
    body = {"urls": ["https://x.com/a.jpg"], "folio_labels": ["f001r"]}

    # First pass creates the page.
    first = await async_client.post(endpoint, json=body)
    assert first.status_code == 201
    assert first.json()["pages_created"] == 1

    # Second pass finds it already present and skips it.
    second = await async_client.post(endpoint, json=body)
    assert second.status_code == 201
    second_payload = second.json()
    assert second_payload["pages_created"] == 0
    assert second_payload["pages_skipped"] == 1
@pytest.mark.asyncio
async def test_ingest_manifest_duplicate_labels_no_collision(async_client, db_session, monkeypatch):
    """Three canvases sharing the label ``NP`` still produce three distinct page IDs."""
    corpus = await _make_corpus(db_session, slug="dupe-labels")

    canvases = []
    for i in range(1, 4):
        painting = {
            "type": "Annotation",
            "motivation": "painting",
            "body": {"id": f"https://example.com/img/{i}.jpg", "type": "Image"},
            "target": f"https://example.com/canvas/{i}",
        }
        canvases.append(
            {
                "id": f"https://example.com/canvas/{i}",
                "type": "Canvas",
                # Every canvas deliberately carries the same label.
                "label": {"none": ["NP"]},
                "items": [{"type": "AnnotationPage", "items": [painting]}],
            }
        )
    canned = {
        "@context": "http://iiif.io/api/presentation/3/context.json",
        "type": "Manifest",
        "label": {"fr": ["Test"]},
        "items": canvases,
    }
    monkeypatch.setattr(
        ingest_module, "_fetch_json_manifest", AsyncMock(return_value=canned)
    )

    response = await async_client.post(
        f"/api/v1/corpora/{corpus.id}/ingest/iiif-manifest",
        json={"manifest_url": "https://example.com/manifest"},
    )

    assert response.status_code == 201
    payload = response.json()
    assert payload["pages_created"] == 3
    # No two pages may share an ID.
    unique_ids = set(payload["page_ids"])
    assert len(unique_ids) == 3