# deepshield/tests/test_efficientnet_regression.py
# ar07xd's picture
# Sync from GitHub via hub-sync
# fba30db verified
"""Gate G3 regression harness — EfficientNetAutoAttB4 accuracy on anchor set.
Acceptance criteria (MERGE_PLAN §9.1 G3):
- >=88% accuracy on the anchor set
- <=8% real->fake false-positive rate
Anchor set sources (the anchor_set fixture uses FFPP when present and otherwise falls back to LOCAL):
1. LOCAL — bundled ICPR2020 notebook/samples/ frames (always available, minimal set)
2. FFPP — training/datasets/ffpp/ when present (full G3 gate, 50+ images)
3. DFDC — training/datasets/dfdc/ when present
NOTE: ThisPersonDoesNotExist.com (StyleGAN2) is NOT valid for G3 — EfficientNetAutoAttB4
is trained on DFDC video face-swaps and does NOT generalise to GAN-portrait detection.
The full G3 gate requires FFPP c40 data (run scripts/fit_calibrator.py first).
Run from backend/:
.venv/Scripts/python.exe -m pytest tests/test_efficientnet_regression.py -v
"""
from __future__ import annotations
import io
import sys
import time
import urllib.request
from pathlib import Path
from typing import Tuple
import numpy as np
import pytest
# Prepend the directory two levels above this file to sys.path; the
# ``services`` import used by the detector fixture presumably relies on it.
_ROOT_DIR = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(_ROOT_DIR))

# ---------------------------------------------------------------------------
# Anchor image sources
# ---------------------------------------------------------------------------

# Local: bundled ICPR2020 sample frames (ground-truth labels from their scores).
#   lynaeydofd_fr0.jpg → EfficientNet scores 0.011 (REAL)
#   mqzvfufzoq_fr0.jpg → EfficientNet scores 0.873 (FAKE)
_ICPR_SAMPLES = _ROOT_DIR.joinpath("models", "icpr2020dfdc", "notebook", "samples")
LOCAL_REAL_IMAGES = [_ICPR_SAMPLES.joinpath("lynaeydofd_fr0.jpg")]
LOCAL_FAKE_IMAGES = [_ICPR_SAMPLES.joinpath("mqzvfufzoq_fr0.jpg")]

# FFPP / DFDC local data (full G3 gate — available after running
# training/datasets download scripts).
_FFPP_C40 = _ROOT_DIR.joinpath("training", "datasets", "ffpp", "c40")
_FFPP_REAL = _FFPP_C40 / "real"
_FFPP_FAKE = _FFPP_C40 / "fake"

# Lower-case suffixes accepted as anchor images.
_IMAGE_EXTS = {".jpg", ".jpeg", ".png"}

# Network: thispersondoesnotexist.com — used for G2 gate only (face detection).
# NOT used for G3 accuracy gate: StyleGAN2 faces are a different distribution
# from DFDC video face-swaps (the model's training domain).
TPDNE_URL = "https://thispersondoesnotexist.com/"
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
def _fetch(url: str, timeout: int = 20) -> bytes:
    """Download *url* and return the raw response body.

    The request carries a fixed ``User-Agent`` header; *timeout* is passed
    straight to ``urllib.request.urlopen``.
    """
    request = urllib.request.Request(
        url,
        headers={"User-Agent": "DeepShield-Test/1.0"},
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        payload = response.read()
    return payload
@pytest.fixture(scope="module")
def detector():
    """Provide one EfficientNetDetector instance shared across this module."""
    # Function-local import: it only runs when the fixture is first requested.
    from services.efficientnet_service import EfficientNetDetector

    instance = EfficientNetDetector()
    return instance
@pytest.fixture(scope="module")
def anchor_set(detector) -> Tuple[list, list]:
    """Score anchor images. Returns (real_results, fake_results).

    Priority order:
      1. FFPP c40 images (training/datasets/ffpp/c40/{real,fake}/) — full G3 gate
      2. Bundled ICPR2020 notebook samples — minimal sanity check, used for a
         class only when the FFPP directory produced no results for it.

    Each list holds whatever ``detector.detect_image`` returned for the
    corresponding image, in sorted-path order.
    """
    import warnings

    from PIL import Image

    def score_image(path: Path):
        """Open *path*, convert to RGB and run it through the detector."""
        # The context manager releases the file handle deterministically;
        # ``convert`` returns an independent in-memory image.
        with Image.open(path) as img:
            return detector.detect_image(img.convert("RGB"))

    def score_dir(directory: Path, limit: int = 50) -> list:
        """Score up to *limit* image files found recursively under *directory*."""
        if not directory.is_dir():
            return []
        candidates = sorted(
            p
            for p in directory.rglob("*")
            # is_file() keeps directories with an image-like suffix from
            # using up slots of the limit.
            if p.suffix.lower() in _IMAGE_EXTS and p.is_file()
        )[:limit]
        results = []
        for p in candidates:
            try:
                results.append(score_image(p))
            except Exception as exc:
                # Still best-effort (one bad file must not abort the gate),
                # but report it so a shrunken anchor set is visible.
                warnings.warn(f"anchor_set: skipped {p}: {exc!r}")
        return results

    # --- FFPP c40 (full gate) ---
    real_results = score_dir(_FFPP_REAL)
    fake_results = score_dir(_FFPP_FAKE)

    # --- Fallback: bundled ICPR2020 samples ---
    # Errors here propagate: the bundled samples are expected to be readable.
    if not real_results:
        real_results = [score_image(p) for p in LOCAL_REAL_IMAGES if p.exists()]
    if not fake_results:
        fake_results = [score_image(p) for p in LOCAL_FAKE_IMAGES if p.exists()]

    return real_results, fake_results
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
class TestEfficientNetRegression:
    """Regression gates G1, G2, G3 and G8 plus a no-face sanity check."""

    def test_g1_detector_loads(self, detector):
        """G1: model loaded without crash."""
        assert detector is not None
        assert detector.net is not None
        assert detector.face_extractor is not None

    def test_g2_blazeface_detects_face_on_tpdne(self, detector):
        """G2: BlazeFace detects ≥1 face on a TPDNE image.

        Skipped rather than failed when the image cannot be downloaded, so a
        network outage is not reported as a detector regression.
        """
        from PIL import Image

        try:
            data = _fetch(TPDNE_URL)
        except OSError as exc:
            # urllib.error.URLError and socket timeouts are OSError subclasses.
            pytest.skip(f"TPDNE image could not be downloaded: {exc}")

        pil = Image.open(io.BytesIO(data)).convert("RGB")
        img_np = np.array(pil)
        frame_data = detector.face_extractor.process_image(img=img_np)
        faces = frame_data.get("faces", [])
        assert len(faces) >= 1, "BlazeFace detected 0 faces on TPDNE image"

    def test_g3_accuracy_and_fpr(self, anchor_set):
        """G3: ≥88% accuracy and ≤8% real→fake FPR on the anchor set."""
        real_results, fake_results = anchor_set

        # Only count images where a face was detected (no-face = excluded from G3).
        real_scored = [r for r in real_results if r.get("score") is not None]
        fake_scored = [r for r in fake_results if r.get("score") is not None]

        if len(real_scored) + len(fake_scored) < 5:
            pytest.skip("Too few face-detectable images in anchor set for meaningful G3 check")
        if not real_scored or not fake_scored:
            # With an empty class the FPR (or the fake half of accuracy) is
            # undefined and the gate would pass vacuously.
            pytest.skip("Anchor set needs at least one scored image per class for G3 check")

        # A prediction is correct when the detector's "result" label matches
        # the ground-truth class of the list it came from.
        real_correct = sum(1 for r in real_scored if r["result"] == "REAL")
        fake_correct = sum(1 for r in fake_scored if r["result"] == "FAKE")

        total = len(real_scored) + len(fake_scored)
        accuracy = (real_correct + fake_correct) / total * 100
        # real_scored is non-empty here, so the division is safe.
        fpr = (len(real_scored) - real_correct) / len(real_scored) * 100

        print(f"\n  Anchor set: {len(real_scored)} real | {len(fake_scored)} fake")
        print(f"  Accuracy: {accuracy:.1f}%  (need >=88%)")
        print(f"  FPR:      {fpr:.1f}%  (need <=8%)")
        for tag, results, exp in [("REAL", real_scored, "REAL"), ("FAKE", fake_scored, "FAKE")]:
            for r in results:
                mark = "✓" if r["result"] == exp else "✗"
                print(f"    [{tag}] {mark} score={r['score']:.3f}  cal={r.get('calibrator_applied')}")

        assert accuracy >= 88.0, f"G3 accuracy {accuracy:.1f}% < 88%"
        assert fpr <= 8.0, f"G3 FPR {fpr:.1f}% > 8%"

    def test_no_face_returns_gracefully(self, detector):
        """Noise image with no face should return error='no_face', not raise."""
        from PIL import Image

        # Seeded generator keeps the regression test deterministic; the
        # exclusive upper bound of 256 covers the full uint8 range.
        rng = np.random.default_rng(0)
        pixels = rng.integers(0, 256, size=(224, 224, 3), dtype=np.uint8)
        noise = Image.fromarray(pixels)

        result = detector.detect_image(noise)
        assert result["error"] == "no_face"
        assert result["score"] is None

    def test_g8_memory_under_threshold(self, detector):
        """G8: RSS after model load < 2500 MB.

        Requests the ``detector`` fixture so the model is loaded before the
        measurement, even when this test is run on its own.
        """
        import psutil

        rss_mb = psutil.Process().memory_info().rss / 1024 / 1024
        print(f"\n  RSS: {rss_mb:.0f} MB")
        assert rss_mb < 2500, f"G8: RSS {rss_mb:.0f} MB exceeds 2500 MB threshold"