Claude committed on
Commit
2e30fe9
·
unverified ·
1 Parent(s): 2b83663

fix: IIIF image fetch performance — retry with backoff, request size reduction, cursor warning

Browse files

Three changes addressing manifest analysis failures and slow performance:

1. iiif_fetcher.py: Added retry with exponential backoff (4 retries, 2s→16s)
for 429/5xx errors, global rate limiter (1 req/s), and automatic URL
rewriting from /full/full/ or /full/max/ to /full/!1500,1500/ so the
IIIF server returns pre-resized images instead of full resolution.

2. database.py: Removed cursor.close() call in SQLite PRAGMA handler —
aiosqlite returns a coroutine for close() that was never awaited,
causing RuntimeWarning at startup.

3. Tests updated for new timeout values (60s read, 15s connect) and
mocked time.sleep/time.monotonic to avoid slow tests from retry logic.

https://claude.ai/code/session_01Sxmf2zTTcjaQeZXCeXaCxr

backend/app/models/database.py CHANGED
@@ -32,10 +32,13 @@ engine = create_async_engine(
32
 
33
  # Activer les clés étrangères SQLite (désactivées par défaut).
34
  # Nécessaire pour que ondelete="CASCADE" / "SET NULL" fonctionne.
 
 
 
 
35
  @event.listens_for(engine.sync_engine, "connect")
36
  def _set_sqlite_pragma(dbapi_conn, _connection_record):
37
- cursor = dbapi_conn.execute("PRAGMA foreign_keys=ON")
38
- cursor.close()
39
 
40
  async_session_factory = async_sessionmaker(
41
  engine,
 
32
 
33
  # Activer les clés étrangères SQLite (désactivées par défaut).
34
  # Nécessaire pour que ondelete="CASCADE" / "SET NULL" fonctionne.
35
# NOTE: the PRAGMA is issued through a cursor obtained from
# ``dbapi_conn.cursor()`` rather than through ``dbapi_conn.execute()``.
# With the aiosqlite driver, ``execute()`` hands back a cursor whose
# ``close()`` is a coroutine; calling it from this synchronous hook triggers
# "RuntimeWarning: coroutine 'Cursor.close' was never awaited", and not
# calling it leaves the cursor open until garbage collection.
# The cursor returned by ``cursor()`` is expected to expose the synchronous
# DBAPI interface (SQLAlchemy's documented recipe for "connect" listeners on
# ``engine.sync_engine``), so it can be closed deterministically.
@event.listens_for(engine.sync_engine, "connect")
def _set_sqlite_pragma(dbapi_conn, _connection_record):
    """Enable SQLite foreign-key enforcement on every new DBAPI connection.

    Args:
        dbapi_conn: the DBAPI connection that was just opened by the pool.
        _connection_record: pool bookkeeping record (unused).
    """
    cursor = dbapi_conn.cursor()
    try:
        cursor.execute("PRAGMA foreign_keys=ON")
    finally:
        # Always release the cursor, even if the PRAGMA statement fails.
        cursor.close()
 
42
 
43
  async_session_factory = async_sessionmaker(
44
  engine,
backend/app/services/ingest/iiif_fetcher.py CHANGED
@@ -1,15 +1,21 @@
1
  """
2
  Téléchargement d'images depuis des URLs IIIF via httpx.
 
 
 
3
  """
4
  # 1. stdlib
5
  import logging
 
 
 
6
 
7
  # 2. third-party
8
  import httpx
9
 
10
  logger = logging.getLogger(__name__)
11
 
12
- _DEFAULT_TIMEOUT = 30.0 # secondes (connect 10s + read 30s)
13
 
14
  _HEADERS = {
15
  "User-Agent": (
@@ -19,10 +25,119 @@ _HEADERS = {
19
  "Accept": "image/jpeg,image/png,image/*,*/*",
20
  }
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  def fetch_iiif_image(url: str, timeout: float = _DEFAULT_TIMEOUT) -> bytes:
24
  """Télécharge une image depuis une URL IIIF complète.
25
 
 
 
 
 
26
  Args:
27
  url: URL complète de l'image (ex. https://.../full/max/0/default.jpg).
28
  timeout: délai maximal en secondes (défaut : 60 s).
@@ -35,14 +150,9 @@ def fetch_iiif_image(url: str, timeout: float = _DEFAULT_TIMEOUT) -> bytes:
35
  httpx.TimeoutException: si la requête dépasse le délai.
36
  httpx.RequestError: pour toute autre erreur réseau.
37
  """
 
38
  logger.info("Fetching IIIF image", extra={"url": url})
39
- response = httpx.get(
40
- url,
41
- headers=_HEADERS,
42
- follow_redirects=True,
43
- timeout=httpx.Timeout(timeout, connect=10.0),
44
- )
45
- response.raise_for_status()
46
  logger.info(
47
  "IIIF image fetched",
48
  extra={"url": url, "size_bytes": len(response.content)},
@@ -71,13 +181,7 @@ def fetch_iiif_derivative(
71
  # Pattern IIIF Image API : !w,h = "best fit" (le serveur choisit)
72
  derivative_url = f"{service_url.rstrip('/')}/full/!{max_px},{max_px}/0/default.jpg"
73
  logger.info("Fetching IIIF derivative", extra={"url": derivative_url, "max_px": max_px})
74
- response = httpx.get(
75
- derivative_url,
76
- headers=_HEADERS,
77
- follow_redirects=True,
78
- timeout=httpx.Timeout(timeout, connect=10.0),
79
- )
80
- response.raise_for_status()
81
  logger.info(
82
  "IIIF derivative fetched",
83
  extra={"url": derivative_url, "size_bytes": len(response.content)},
 
1
  """
2
  Téléchargement d'images depuis des URLs IIIF via httpx.
3
+
4
+ Inclut un rate-limiter global et un retry avec backoff exponentiel
5
+ pour respecter les limites des serveurs IIIF patrimoniaux (Gallica, etc.).
6
  """
7
  # 1. stdlib
8
  import logging
9
+ import re
10
+ import threading
11
+ import time
12
 
13
  # 2. third-party
14
  import httpx
15
 
16
  logger = logging.getLogger(__name__)
17
 
18
+ _DEFAULT_TIMEOUT = 60.0 # secondes (connect 15s + read 60s)
19
 
20
  _HEADERS = {
21
  "User-Agent": (
 
25
  "Accept": "image/jpeg,image/png,image/*,*/*",
26
  }
27
 
28
+ # ── Rate-limiter global ────────────────────────────────────────────────────
29
+ # Gallica and similar IIIF servers enforce strict rate limits.
30
+ # We enforce a minimum delay between consecutive requests.
31
+ _MIN_REQUEST_INTERVAL = 1.0 # secondes entre deux requêtes
32
+ _rate_lock = threading.Lock()
33
+ _last_request_time = 0.0
34
+
35
+ # ── Retry configuration ───────────────────────────────────────────────────
36
+ _MAX_RETRIES = 4
37
+ _INITIAL_BACKOFF = 2.0 # secondes, doublé à chaque retry
38
+
39
+
40
def _wait_rate_limit() -> None:
    """Block until the minimum interval since the previous request has elapsed.

    The lock is held for the whole wait, so concurrent callers are released
    one at a time, each spaced by at least ``_MIN_REQUEST_INTERVAL`` seconds.
    """
    global _last_request_time
    with _rate_lock:
        remaining = _MIN_REQUEST_INTERVAL - (time.monotonic() - _last_request_time)
        if remaining > 0:
            time.sleep(remaining)
        # Stamp after the wait so the next caller measures from the moment
        # this request is actually allowed to go out.
        _last_request_time = time.monotonic()
49
+
50
+
51
def _fetch_with_retry(url: str, timeout: float) -> httpx.Response:
    """GET ``url``, retrying with exponential backoff on 429 / 5xx and timeouts.

    Every attempt first goes through the global rate limiter. A numeric
    ``Retry-After`` header on a retryable response overrides the backoff
    delay for that retry (bounded by ``_MAX_RETRY_AFTER``); the backoff
    itself doubles after each failed attempt.

    Args:
        url: absolute URL to download.
        timeout: default timeout in seconds for the request phases; the
            connect phase is fixed at 15 s.

    Returns:
        The successful ``httpx.Response``.

    Raises:
        httpx.HTTPStatusError: on a non-retryable error status, or when a
            retryable status persists after the last attempt.
        httpx.TimeoutException: when the last attempt times out.
        httpx.RequestError: for any other network error (not retried).
    """
    backoff = _INITIAL_BACKOFF
    request_timeout = httpx.Timeout(timeout, connect=15.0)

    for attempt in range(_MAX_RETRIES + 1):
        is_last_attempt = attempt == _MAX_RETRIES
        _wait_rate_limit()

        # Only the network call can raise TimeoutException, so keep the
        # try body limited to it.
        try:
            response = httpx.get(
                url,
                headers=_HEADERS,
                follow_redirects=True,
                timeout=request_timeout,
            )
        except httpx.TimeoutException:
            if is_last_attempt:
                raise
            logger.warning(
                "Timeout — retry %d/%d dans %.1fs",
                attempt + 1,
                _MAX_RETRIES,
                backoff,
                extra={"url": url},
            )
            time.sleep(backoff)
            backoff *= 2
            continue

        retryable = response.status_code == 429 or response.status_code >= 500
        if retryable and not is_last_attempt:
            wait_time = _retry_delay(response.headers.get("Retry-After"), backoff)
            logger.warning(
                "HTTP %d — retry %d/%d dans %.1fs",
                response.status_code,
                attempt + 1,
                _MAX_RETRIES,
                wait_time,
                extra={"url": url},
            )
            time.sleep(wait_time)
            backoff *= 2
            continue

        # Either a final answer (success / non-retryable error) or the last
        # attempt of a retryable error: surface any error status to the caller.
        response.raise_for_status()
        return response

    # Defensive: every iteration above returns, raises or continues, and the
    # last one cannot continue.
    raise RuntimeError(f"Échec après {_MAX_RETRIES} retries : {url}")


# Upper bound (seconds) on a server-requested Retry-After delay, so a single
# response cannot stall the caller indefinitely.
_MAX_RETRY_AFTER = 60.0


def _retry_delay(retry_after: str | None, fallback: float) -> float:
    """Turn a ``Retry-After`` header value into a safe sleep duration.

    Returns ``fallback`` when the header is absent, is not a plain number
    (e.g. the HTTP-date form), is NaN or is negative — values that
    ``time.sleep`` would reject. Larger values, including infinity, are
    capped at ``_MAX_RETRY_AFTER``.
    """
    if not retry_after:
        return fallback
    try:
        requested = float(retry_after)
    except ValueError:
        return fallback
    # ``requested != requested`` is the NaN check.
    if requested != requested or requested < 0:
        return fallback
    return min(requested, _MAX_RETRY_AFTER)
114
+
115
+
116
def _rewrite_full_to_reduced(url: str, max_px: int = 1500) -> str:
    """Rewrite a full-resolution IIIF URL into a bounded "best fit" request.

    A ``/full/full/0/`` or ``/full/max/0/`` segment becomes
    ``/full/!{max_px},{max_px}/0/`` so that the image server performs the
    downscaling itself instead of returning the full-resolution image.

    Args:
        url: image URL to inspect.
        max_px: maximum width and height, in pixels, of the requested image.

    Returns:
        The rewritten URL, or ``url`` unchanged when it contains no such
        segment.
    """
    best_fit = f"!{max_px},{max_px}"
    rewritten = re.sub(
        r"(/full/)(full|max)(/0/)",
        lambda match: f"{match.group(1)}{best_fit}{match.group(3)}",
        url,
    )
    if rewritten == url:
        # Nothing matched: leave the URL alone and stay silent.
        return url
    logger.info("URL IIIF réécrite: full → !%d,%d", max_px, max_px, extra={"original": url})
    return rewritten
132
+
133
 
134
  def fetch_iiif_image(url: str, timeout: float = _DEFAULT_TIMEOUT) -> bytes:
135
  """Télécharge une image depuis une URL IIIF complète.
136
 
137
+ Si l'URL demande la pleine résolution (/full/full/ ou /full/max/),
138
+ elle est automatiquement réécrite pour demander un dérivé 1500px max
139
+ côté serveur, ce qui est plus rapide et évite le rate-limiting.
140
+
141
  Args:
142
  url: URL complète de l'image (ex. https://.../full/max/0/default.jpg).
143
  timeout: délai maximal en secondes (défaut : 60 s).
 
150
  httpx.TimeoutException: si la requête dépasse le délai.
151
  httpx.RequestError: pour toute autre erreur réseau.
152
  """
153
+ url = _rewrite_full_to_reduced(url)
154
  logger.info("Fetching IIIF image", extra={"url": url})
155
+ response = _fetch_with_retry(url, timeout)
 
 
 
 
 
 
156
  logger.info(
157
  "IIIF image fetched",
158
  extra={"url": url, "size_bytes": len(response.content)},
 
181
  # Pattern IIIF Image API : !w,h = "best fit" (le serveur choisit)
182
  derivative_url = f"{service_url.rstrip('/')}/full/!{max_px},{max_px}/0/default.jpg"
183
  logger.info("Fetching IIIF derivative", extra={"url": derivative_url, "max_px": max_px})
184
+ response = _fetch_with_retry(derivative_url, timeout)
 
 
 
 
 
 
185
  logger.info(
186
  "IIIF derivative fetched",
187
  extra={"url": derivative_url, "size_bytes": len(response.content)},
backend/tests/test_image_pipeline.py CHANGED
@@ -261,8 +261,11 @@ def test_fetch_iiif_image_success():
261
  """Retourne les bytes de l'image si la requête réussit."""
262
  fake_bytes = _make_jpeg_bytes(100, 100)
263
 
264
- with patch("app.services.ingest.iiif_fetcher.httpx.get") as mock_get:
 
 
265
  mock_response = MagicMock()
 
266
  mock_response.content = fake_bytes
267
  mock_response.raise_for_status.return_value = None
268
  mock_get.return_value = mock_response
@@ -272,15 +275,18 @@ def test_fetch_iiif_image_success():
272
  assert result == fake_bytes
273
  _, kwargs = mock_get.call_args
274
  assert kwargs["follow_redirects"] is True
275
- # Timeout is now an httpx.Timeout object (connect=10s, read=30s)
276
- assert kwargs["timeout"].connect == 10.0
277
- assert kwargs["timeout"].read == 30.0
278
 
279
 
280
  def test_fetch_iiif_image_http_error():
281
- """Propage HTTPStatusError si le serveur répond 404."""
282
- with patch("app.services.ingest.iiif_fetcher.httpx.get") as mock_get:
 
 
283
  mock_response = MagicMock()
 
284
  mock_response.raise_for_status.side_effect = httpx.HTTPStatusError(
285
  "404 Not Found",
286
  request=MagicMock(),
@@ -293,8 +299,10 @@ def test_fetch_iiif_image_http_error():
293
 
294
 
295
  def test_fetch_iiif_image_timeout():
296
- """Propage TimeoutException si la requête dépasse le délai."""
297
- with patch("app.services.ingest.iiif_fetcher.httpx.get") as mock_get:
 
 
298
  mock_get.side_effect = httpx.TimeoutException("timed out")
299
 
300
  with pytest.raises(httpx.TimeoutException):
@@ -305,8 +313,11 @@ def test_fetch_iiif_image_custom_timeout():
305
  """Le timeout personnalisé est bien transmis à httpx.get."""
306
  fake_bytes = _make_jpeg_bytes(50, 50)
307
 
308
- with patch("app.services.ingest.iiif_fetcher.httpx.get") as mock_get:
 
 
309
  mock_response = MagicMock()
 
310
  mock_response.content = fake_bytes
311
  mock_response.raise_for_status.return_value = None
312
  mock_get.return_value = mock_response
@@ -314,9 +325,9 @@ def test_fetch_iiif_image_custom_timeout():
314
  fetch_iiif_image("https://example.com/img.jpg", timeout=120.0)
315
 
316
  _, kwargs = mock_get.call_args
317
- # Custom timeout wraps in httpx.Timeout(120.0, connect=10.0)
318
  assert kwargs["timeout"].read == 120.0
319
- assert kwargs["timeout"].connect == 10.0
320
 
321
 
322
  # ---------------------------------------------------------------------------
 
261
  """Retourne les bytes de l'image si la requête réussit."""
262
  fake_bytes = _make_jpeg_bytes(100, 100)
263
 
264
+ with patch("app.services.ingest.iiif_fetcher.httpx.get") as mock_get, \
265
+ patch("app.services.ingest.iiif_fetcher.time.sleep"), \
266
+ patch("app.services.ingest.iiif_fetcher.time.monotonic", return_value=0.0):
267
  mock_response = MagicMock()
268
+ mock_response.status_code = 200
269
  mock_response.content = fake_bytes
270
  mock_response.raise_for_status.return_value = None
271
  mock_get.return_value = mock_response
 
275
  assert result == fake_bytes
276
  _, kwargs = mock_get.call_args
277
  assert kwargs["follow_redirects"] is True
278
+ # Timeout is an httpx.Timeout object (connect=15s, read=60s)
279
+ assert kwargs["timeout"].connect == 15.0
280
+ assert kwargs["timeout"].read == 60.0
281
 
282
 
283
  def test_fetch_iiif_image_http_error():
284
+ """Propage HTTPStatusError si le serveur répond 404 (pas de retry sur 4xx hors 429)."""
285
+ with patch("app.services.ingest.iiif_fetcher.httpx.get") as mock_get, \
286
+ patch("app.services.ingest.iiif_fetcher.time.sleep"), \
287
+ patch("app.services.ingest.iiif_fetcher.time.monotonic", return_value=0.0):
288
  mock_response = MagicMock()
289
+ mock_response.status_code = 404
290
  mock_response.raise_for_status.side_effect = httpx.HTTPStatusError(
291
  "404 Not Found",
292
  request=MagicMock(),
 
299
 
300
 
301
  def test_fetch_iiif_image_timeout():
302
+ """Propage TimeoutException après épuisement des retries."""
303
+ with patch("app.services.ingest.iiif_fetcher.httpx.get") as mock_get, \
304
+ patch("app.services.ingest.iiif_fetcher.time.sleep"), \
305
+ patch("app.services.ingest.iiif_fetcher.time.monotonic", return_value=0.0):
306
  mock_get.side_effect = httpx.TimeoutException("timed out")
307
 
308
  with pytest.raises(httpx.TimeoutException):
 
313
  """Le timeout personnalisé est bien transmis à httpx.get."""
314
  fake_bytes = _make_jpeg_bytes(50, 50)
315
 
316
+ with patch("app.services.ingest.iiif_fetcher.httpx.get") as mock_get, \
317
+ patch("app.services.ingest.iiif_fetcher.time.sleep"), \
318
+ patch("app.services.ingest.iiif_fetcher.time.monotonic", return_value=0.0):
319
  mock_response = MagicMock()
320
+ mock_response.status_code = 200
321
  mock_response.content = fake_bytes
322
  mock_response.raise_for_status.return_value = None
323
  mock_get.return_value = mock_response
 
325
  fetch_iiif_image("https://example.com/img.jpg", timeout=120.0)
326
 
327
  _, kwargs = mock_get.call_args
328
+ # Custom timeout wraps in httpx.Timeout(120.0, connect=15.0)
329
  assert kwargs["timeout"].read == 120.0
330
+ assert kwargs["timeout"].connect == 15.0
331
 
332
 
333
  # ---------------------------------------------------------------------------