Spaces:
Build error
Build error
Claude committed on
fix: IIIF image fetch performance — retry with backoff, request size reduction, cursor warning
Browse files
Three root causes of manifest analysis failures and slow performance:
1. iiif_fetcher.py: Added retry with exponential backoff (4 retries, 2s→16s)
for 429/5xx errors, global rate limiter (1 req/s), and automatic URL
rewriting from /full/full/ or /full/max/ to /full/!1500,1500/ so the
IIIF server returns pre-resized images instead of full resolution.
2. database.py: Removed cursor.close() call in SQLite PRAGMA handler —
aiosqlite returns a coroutine for close() that was never awaited,
causing RuntimeWarning at startup.
3. Tests updated for new timeout values (60s read, 15s connect) and
mocked time.sleep/time.monotonic to avoid slow tests from retry logic.
https://claude.ai/code/session_01Sxmf2zTTcjaQeZXCeXaCxr
backend/app/models/database.py
CHANGED
|
@@ -32,10 +32,13 @@ engine = create_async_engine(
|
|
| 32 |
|
| 33 |
# Activer les clés étrangères SQLite (désactivées par défaut).
|
| 34 |
# Nécessaire pour que ondelete="CASCADE" / "SET NULL" fonctionne.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
@event.listens_for(engine.sync_engine, "connect")
|
| 36 |
def _set_sqlite_pragma(dbapi_conn, _connection_record):
|
| 37 |
-
|
| 38 |
-
cursor.close()
|
| 39 |
|
| 40 |
async_session_factory = async_sessionmaker(
|
| 41 |
engine,
|
|
|
|
| 32 |
|
| 33 |
# Activer les clés étrangères SQLite (désactivées par défaut).
|
| 34 |
# Nécessaire pour que ondelete="CASCADE" / "SET NULL" fonctionne.
|
| 35 |
+
# Note : on n'appelle PAS cursor.close() car avec aiosqlite le curseur
|
| 36 |
+
# retourne une coroutine pour close(), ce qui provoque un RuntimeWarning
|
| 37 |
+
# « coroutine 'Cursor.close' was never awaited ». Le curseur PRAGMA est
|
| 38 |
+
# éphémère et libéré automatiquement.
|
| 39 |
@event.listens_for(engine.sync_engine, "connect")
|
| 40 |
def _set_sqlite_pragma(dbapi_conn, _connection_record):
|
| 41 |
+
dbapi_conn.execute("PRAGMA foreign_keys=ON")
|
|
|
|
| 42 |
|
| 43 |
async_session_factory = async_sessionmaker(
|
| 44 |
engine,
|
backend/app/services/ingest/iiif_fetcher.py
CHANGED
|
@@ -1,15 +1,21 @@
|
|
| 1 |
"""
|
| 2 |
Téléchargement d'images depuis des URLs IIIF via httpx.
|
|
|
|
|
|
|
|
|
|
| 3 |
"""
|
| 4 |
# 1. stdlib
|
| 5 |
import logging
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
# 2. third-party
|
| 8 |
import httpx
|
| 9 |
|
| 10 |
logger = logging.getLogger(__name__)
|
| 11 |
|
| 12 |
-
_DEFAULT_TIMEOUT =
|
| 13 |
|
| 14 |
_HEADERS = {
|
| 15 |
"User-Agent": (
|
|
@@ -19,10 +25,119 @@ _HEADERS = {
|
|
| 19 |
"Accept": "image/jpeg,image/png,image/*,*/*",
|
| 20 |
}
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
def fetch_iiif_image(url: str, timeout: float = _DEFAULT_TIMEOUT) -> bytes:
|
| 24 |
"""Télécharge une image depuis une URL IIIF complète.
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
Args:
|
| 27 |
url: URL complète de l'image (ex. https://.../full/max/0/default.jpg).
|
| 28 |
timeout: délai maximal en secondes (défaut : 60 s).
|
|
@@ -35,14 +150,9 @@ def fetch_iiif_image(url: str, timeout: float = _DEFAULT_TIMEOUT) -> bytes:
|
|
| 35 |
httpx.TimeoutException: si la requête dépasse le délai.
|
| 36 |
httpx.RequestError: pour toute autre erreur réseau.
|
| 37 |
"""
|
|
|
|
| 38 |
logger.info("Fetching IIIF image", extra={"url": url})
|
| 39 |
-
response =
|
| 40 |
-
url,
|
| 41 |
-
headers=_HEADERS,
|
| 42 |
-
follow_redirects=True,
|
| 43 |
-
timeout=httpx.Timeout(timeout, connect=10.0),
|
| 44 |
-
)
|
| 45 |
-
response.raise_for_status()
|
| 46 |
logger.info(
|
| 47 |
"IIIF image fetched",
|
| 48 |
extra={"url": url, "size_bytes": len(response.content)},
|
|
@@ -71,13 +181,7 @@ def fetch_iiif_derivative(
|
|
| 71 |
# Pattern IIIF Image API : !w,h = "best fit" (le serveur choisit)
|
| 72 |
derivative_url = f"{service_url.rstrip('/')}/full/!{max_px},{max_px}/0/default.jpg"
|
| 73 |
logger.info("Fetching IIIF derivative", extra={"url": derivative_url, "max_px": max_px})
|
| 74 |
-
response =
|
| 75 |
-
derivative_url,
|
| 76 |
-
headers=_HEADERS,
|
| 77 |
-
follow_redirects=True,
|
| 78 |
-
timeout=httpx.Timeout(timeout, connect=10.0),
|
| 79 |
-
)
|
| 80 |
-
response.raise_for_status()
|
| 81 |
logger.info(
|
| 82 |
"IIIF derivative fetched",
|
| 83 |
extra={"url": derivative_url, "size_bytes": len(response.content)},
|
|
|
|
| 1 |
"""
|
| 2 |
Téléchargement d'images depuis des URLs IIIF via httpx.
|
| 3 |
+
|
| 4 |
+
Inclut un rate-limiter global et un retry avec backoff exponentiel
|
| 5 |
+
pour respecter les limites des serveurs IIIF patrimoniaux (Gallica, etc.).
|
| 6 |
"""
|
| 7 |
# 1. stdlib
|
| 8 |
import logging
|
| 9 |
+
import re
|
| 10 |
+
import threading
|
| 11 |
+
import time
|
| 12 |
|
| 13 |
# 2. third-party
|
| 14 |
import httpx
|
| 15 |
|
| 16 |
logger = logging.getLogger(__name__)
|
| 17 |
|
| 18 |
+
_DEFAULT_TIMEOUT = 60.0 # secondes (connect 15s + read 60s)
|
| 19 |
|
| 20 |
_HEADERS = {
|
| 21 |
"User-Agent": (
|
|
|
|
| 25 |
"Accept": "image/jpeg,image/png,image/*,*/*",
|
| 26 |
}
|
| 27 |
|
| 28 |
+
# ── Rate-limiter global ────────────────────────────────────────────────────
|
| 29 |
+
# Gallica and similar IIIF servers enforce strict rate limits.
|
| 30 |
+
# We enforce a minimum delay between consecutive requests.
|
| 31 |
+
_MIN_REQUEST_INTERVAL = 1.0 # secondes entre deux requêtes
|
| 32 |
+
_rate_lock = threading.Lock()
|
| 33 |
+
_last_request_time = 0.0
|
| 34 |
+
|
| 35 |
+
# ── Retry configuration ───────────────────────────────────────────────────
|
| 36 |
+
_MAX_RETRIES = 4
|
| 37 |
+
_INITIAL_BACKOFF = 2.0 # secondes, doublé à chaque retry
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _wait_rate_limit() -> None:
|
| 41 |
+
"""Attend si nécessaire pour respecter le débit maximal vers les serveurs IIIF."""
|
| 42 |
+
global _last_request_time
|
| 43 |
+
with _rate_lock:
|
| 44 |
+
now = time.monotonic()
|
| 45 |
+
elapsed = now - _last_request_time
|
| 46 |
+
if elapsed < _MIN_REQUEST_INTERVAL:
|
| 47 |
+
time.sleep(_MIN_REQUEST_INTERVAL - elapsed)
|
| 48 |
+
_last_request_time = time.monotonic()
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _fetch_with_retry(url: str, timeout: float) -> httpx.Response:
|
| 52 |
+
"""GET avec retry et backoff exponentiel sur 429 / 5xx.
|
| 53 |
+
|
| 54 |
+
Respecte le header Retry-After si présent dans la réponse 429.
|
| 55 |
+
"""
|
| 56 |
+
backoff = _INITIAL_BACKOFF
|
| 57 |
+
last_exc: Exception | None = None
|
| 58 |
+
|
| 59 |
+
for attempt in range(_MAX_RETRIES + 1):
|
| 60 |
+
_wait_rate_limit()
|
| 61 |
+
try:
|
| 62 |
+
response = httpx.get(
|
| 63 |
+
url,
|
| 64 |
+
headers=_HEADERS,
|
| 65 |
+
follow_redirects=True,
|
| 66 |
+
timeout=httpx.Timeout(timeout, connect=15.0),
|
| 67 |
+
)
|
| 68 |
+
if response.status_code == 429 or response.status_code >= 500:
|
| 69 |
+
# Respect Retry-After header if present
|
| 70 |
+
retry_after = response.headers.get("Retry-After")
|
| 71 |
+
if retry_after:
|
| 72 |
+
try:
|
| 73 |
+
wait_time = float(retry_after)
|
| 74 |
+
except ValueError:
|
| 75 |
+
wait_time = backoff
|
| 76 |
+
else:
|
| 77 |
+
wait_time = backoff
|
| 78 |
+
|
| 79 |
+
if attempt < _MAX_RETRIES:
|
| 80 |
+
logger.warning(
|
| 81 |
+
"HTTP %d — retry %d/%d dans %.1fs",
|
| 82 |
+
response.status_code,
|
| 83 |
+
attempt + 1,
|
| 84 |
+
_MAX_RETRIES,
|
| 85 |
+
wait_time,
|
| 86 |
+
extra={"url": url},
|
| 87 |
+
)
|
| 88 |
+
time.sleep(wait_time)
|
| 89 |
+
backoff *= 2
|
| 90 |
+
continue
|
| 91 |
+
# Last attempt: raise
|
| 92 |
+
response.raise_for_status()
|
| 93 |
+
|
| 94 |
+
response.raise_for_status()
|
| 95 |
+
return response
|
| 96 |
+
|
| 97 |
+
except httpx.TimeoutException as exc:
|
| 98 |
+
last_exc = exc
|
| 99 |
+
if attempt < _MAX_RETRIES:
|
| 100 |
+
logger.warning(
|
| 101 |
+
"Timeout — retry %d/%d dans %.1fs",
|
| 102 |
+
attempt + 1,
|
| 103 |
+
_MAX_RETRIES,
|
| 104 |
+
backoff,
|
| 105 |
+
extra={"url": url},
|
| 106 |
+
)
|
| 107 |
+
time.sleep(backoff)
|
| 108 |
+
backoff *= 2
|
| 109 |
+
continue
|
| 110 |
+
raise
|
| 111 |
+
|
| 112 |
+
# Should not reach here, but just in case
|
| 113 |
+
raise last_exc or RuntimeError(f"Échec après {_MAX_RETRIES} retries : {url}")
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def _rewrite_full_to_reduced(url: str, max_px: int = 1500) -> str:
|
| 117 |
+
"""Réécrit une URL IIIF /full/full/ ou /full/max/ en /full/!{max_px},{max_px}/.
|
| 118 |
+
|
| 119 |
+
Cela demande au serveur IIIF de redimensionner côté serveur au lieu de
|
| 120 |
+
retourner l'image en pleine résolution. Beaucoup plus rapide et respectueux
|
| 121 |
+
des quotas serveur.
|
| 122 |
+
|
| 123 |
+
Si l'URL n'est pas une URL IIIF standard, elle est retournée inchangée.
|
| 124 |
+
"""
|
| 125 |
+
# Match IIIF Image API pattern: .../full/(full|max)/0/(default|native).(jpg|png|...)
|
| 126 |
+
pattern = r"(/full/)(full|max)(/0/)"
|
| 127 |
+
replacement = rf"\g<1>!{max_px},{max_px}\3"
|
| 128 |
+
new_url = re.sub(pattern, replacement, url)
|
| 129 |
+
if new_url != url:
|
| 130 |
+
logger.info("URL IIIF réécrite: full → !%d,%d", max_px, max_px, extra={"original": url})
|
| 131 |
+
return new_url
|
| 132 |
+
|
| 133 |
|
| 134 |
def fetch_iiif_image(url: str, timeout: float = _DEFAULT_TIMEOUT) -> bytes:
|
| 135 |
"""Télécharge une image depuis une URL IIIF complète.
|
| 136 |
|
| 137 |
+
Si l'URL demande la pleine résolution (/full/full/ ou /full/max/),
|
| 138 |
+
elle est automatiquement réécrite pour demander un dérivé 1500px max
|
| 139 |
+
côté serveur, ce qui est plus rapide et évite le rate-limiting.
|
| 140 |
+
|
| 141 |
Args:
|
| 142 |
url: URL complète de l'image (ex. https://.../full/max/0/default.jpg).
|
| 143 |
timeout: délai maximal en secondes (défaut : 60 s).
|
|
|
|
| 150 |
httpx.TimeoutException: si la requête dépasse le délai.
|
| 151 |
httpx.RequestError: pour toute autre erreur réseau.
|
| 152 |
"""
|
| 153 |
+
url = _rewrite_full_to_reduced(url)
|
| 154 |
logger.info("Fetching IIIF image", extra={"url": url})
|
| 155 |
+
response = _fetch_with_retry(url, timeout)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
logger.info(
|
| 157 |
"IIIF image fetched",
|
| 158 |
extra={"url": url, "size_bytes": len(response.content)},
|
|
|
|
| 181 |
# Pattern IIIF Image API : !w,h = "best fit" (le serveur choisit)
|
| 182 |
derivative_url = f"{service_url.rstrip('/')}/full/!{max_px},{max_px}/0/default.jpg"
|
| 183 |
logger.info("Fetching IIIF derivative", extra={"url": derivative_url, "max_px": max_px})
|
| 184 |
+
response = _fetch_with_retry(derivative_url, timeout)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
logger.info(
|
| 186 |
"IIIF derivative fetched",
|
| 187 |
extra={"url": derivative_url, "size_bytes": len(response.content)},
|
backend/tests/test_image_pipeline.py
CHANGED
|
@@ -261,8 +261,11 @@ def test_fetch_iiif_image_success():
|
|
| 261 |
"""Retourne les bytes de l'image si la requête réussit."""
|
| 262 |
fake_bytes = _make_jpeg_bytes(100, 100)
|
| 263 |
|
| 264 |
-
with patch("app.services.ingest.iiif_fetcher.httpx.get") as mock_get
|
|
|
|
|
|
|
| 265 |
mock_response = MagicMock()
|
|
|
|
| 266 |
mock_response.content = fake_bytes
|
| 267 |
mock_response.raise_for_status.return_value = None
|
| 268 |
mock_get.return_value = mock_response
|
|
@@ -272,15 +275,18 @@ def test_fetch_iiif_image_success():
|
|
| 272 |
assert result == fake_bytes
|
| 273 |
_, kwargs = mock_get.call_args
|
| 274 |
assert kwargs["follow_redirects"] is True
|
| 275 |
-
# Timeout is
|
| 276 |
-
assert kwargs["timeout"].connect ==
|
| 277 |
-
assert kwargs["timeout"].read ==
|
| 278 |
|
| 279 |
|
| 280 |
def test_fetch_iiif_image_http_error():
|
| 281 |
-
"""Propage HTTPStatusError si le serveur répond 404."""
|
| 282 |
-
with patch("app.services.ingest.iiif_fetcher.httpx.get") as mock_get
|
|
|
|
|
|
|
| 283 |
mock_response = MagicMock()
|
|
|
|
| 284 |
mock_response.raise_for_status.side_effect = httpx.HTTPStatusError(
|
| 285 |
"404 Not Found",
|
| 286 |
request=MagicMock(),
|
|
@@ -293,8 +299,10 @@ def test_fetch_iiif_image_http_error():
|
|
| 293 |
|
| 294 |
|
| 295 |
def test_fetch_iiif_image_timeout():
|
| 296 |
-
"""Propage TimeoutException
|
| 297 |
-
with patch("app.services.ingest.iiif_fetcher.httpx.get") as mock_get
|
|
|
|
|
|
|
| 298 |
mock_get.side_effect = httpx.TimeoutException("timed out")
|
| 299 |
|
| 300 |
with pytest.raises(httpx.TimeoutException):
|
|
@@ -305,8 +313,11 @@ def test_fetch_iiif_image_custom_timeout():
|
|
| 305 |
"""Le timeout personnalisé est bien transmis à httpx.get."""
|
| 306 |
fake_bytes = _make_jpeg_bytes(50, 50)
|
| 307 |
|
| 308 |
-
with patch("app.services.ingest.iiif_fetcher.httpx.get") as mock_get
|
|
|
|
|
|
|
| 309 |
mock_response = MagicMock()
|
|
|
|
| 310 |
mock_response.content = fake_bytes
|
| 311 |
mock_response.raise_for_status.return_value = None
|
| 312 |
mock_get.return_value = mock_response
|
|
@@ -314,9 +325,9 @@ def test_fetch_iiif_image_custom_timeout():
|
|
| 314 |
fetch_iiif_image("https://example.com/img.jpg", timeout=120.0)
|
| 315 |
|
| 316 |
_, kwargs = mock_get.call_args
|
| 317 |
-
# Custom timeout wraps in httpx.Timeout(120.0, connect=
|
| 318 |
assert kwargs["timeout"].read == 120.0
|
| 319 |
-
assert kwargs["timeout"].connect ==
|
| 320 |
|
| 321 |
|
| 322 |
# ---------------------------------------------------------------------------
|
|
|
|
| 261 |
"""Retourne les bytes de l'image si la requête réussit."""
|
| 262 |
fake_bytes = _make_jpeg_bytes(100, 100)
|
| 263 |
|
| 264 |
+
with patch("app.services.ingest.iiif_fetcher.httpx.get") as mock_get, \
|
| 265 |
+
patch("app.services.ingest.iiif_fetcher.time.sleep"), \
|
| 266 |
+
patch("app.services.ingest.iiif_fetcher.time.monotonic", return_value=0.0):
|
| 267 |
mock_response = MagicMock()
|
| 268 |
+
mock_response.status_code = 200
|
| 269 |
mock_response.content = fake_bytes
|
| 270 |
mock_response.raise_for_status.return_value = None
|
| 271 |
mock_get.return_value = mock_response
|
|
|
|
| 275 |
assert result == fake_bytes
|
| 276 |
_, kwargs = mock_get.call_args
|
| 277 |
assert kwargs["follow_redirects"] is True
|
| 278 |
+
# Timeout is an httpx.Timeout object (connect=15s, read=60s)
|
| 279 |
+
assert kwargs["timeout"].connect == 15.0
|
| 280 |
+
assert kwargs["timeout"].read == 60.0
|
| 281 |
|
| 282 |
|
| 283 |
def test_fetch_iiif_image_http_error():
|
| 284 |
+
"""Propage HTTPStatusError si le serveur répond 404 (pas de retry sur 4xx hors 429)."""
|
| 285 |
+
with patch("app.services.ingest.iiif_fetcher.httpx.get") as mock_get, \
|
| 286 |
+
patch("app.services.ingest.iiif_fetcher.time.sleep"), \
|
| 287 |
+
patch("app.services.ingest.iiif_fetcher.time.monotonic", return_value=0.0):
|
| 288 |
mock_response = MagicMock()
|
| 289 |
+
mock_response.status_code = 404
|
| 290 |
mock_response.raise_for_status.side_effect = httpx.HTTPStatusError(
|
| 291 |
"404 Not Found",
|
| 292 |
request=MagicMock(),
|
|
|
|
| 299 |
|
| 300 |
|
| 301 |
def test_fetch_iiif_image_timeout():
|
| 302 |
+
"""Propage TimeoutException après épuisement des retries."""
|
| 303 |
+
with patch("app.services.ingest.iiif_fetcher.httpx.get") as mock_get, \
|
| 304 |
+
patch("app.services.ingest.iiif_fetcher.time.sleep"), \
|
| 305 |
+
patch("app.services.ingest.iiif_fetcher.time.monotonic", return_value=0.0):
|
| 306 |
mock_get.side_effect = httpx.TimeoutException("timed out")
|
| 307 |
|
| 308 |
with pytest.raises(httpx.TimeoutException):
|
|
|
|
| 313 |
"""Le timeout personnalisé est bien transmis à httpx.get."""
|
| 314 |
fake_bytes = _make_jpeg_bytes(50, 50)
|
| 315 |
|
| 316 |
+
with patch("app.services.ingest.iiif_fetcher.httpx.get") as mock_get, \
|
| 317 |
+
patch("app.services.ingest.iiif_fetcher.time.sleep"), \
|
| 318 |
+
patch("app.services.ingest.iiif_fetcher.time.monotonic", return_value=0.0):
|
| 319 |
mock_response = MagicMock()
|
| 320 |
+
mock_response.status_code = 200
|
| 321 |
mock_response.content = fake_bytes
|
| 322 |
mock_response.raise_for_status.return_value = None
|
| 323 |
mock_get.return_value = mock_response
|
|
|
|
| 325 |
fetch_iiif_image("https://example.com/img.jpg", timeout=120.0)
|
| 326 |
|
| 327 |
_, kwargs = mock_get.call_args
|
| 328 |
+
# Custom timeout wraps in httpx.Timeout(120.0, connect=15.0)
|
| 329 |
assert kwargs["timeout"].read == 120.0
|
| 330 |
+
assert kwargs["timeout"].connect == 15.0
|
| 331 |
|
| 332 |
|
| 333 |
# ---------------------------------------------------------------------------
|