Spaces:

ar07xd
/

deepshield

Running

App Files Files Community

deepshield / tests /test_efficientnet_regression.py

ar07xd

Sync from GitHub via hub-sync

fba30db verified 12 days ago

raw

history blame contribute delete

7.52 kB

	"""Gate G3 regression harness — EfficientNetAutoAttB4 accuracy on anchor set.

	Acceptance criteria (MERGE_PLAN §9.1 G3):
	- >=88% accuracy on the anchor set
	- <=8% real->fake false-positive rate

	Anchor set sources (the anchor_set fixture tries FFPP first and falls back to LOCAL):
	1. LOCAL — bundled ICPR2020 notebook/samples/ frames (always available, minimal set)
	2. FFPP — training/datasets/ffpp/ when present (full G3 gate, 50+ images)
	3. DFDC — training/datasets/dfdc/ when present

	NOTE: ThisPersonDoesNotExist.com (StyleGAN2) is NOT valid for G3 — EfficientNetAutoAttB4
	is trained on DFDC video face-swaps and does NOT generalise to GAN-portrait detection.
	The full G3 gate requires FFPP c40 data (run scripts/fit_calibrator.py first).

	Run from backend/:
	.venv/Scripts/python.exe -m pytest tests/test_efficientnet_regression.py -v
	"""
	from __future__ import annotations

	import io
	import sys
	import time
	import urllib.request
	from pathlib import Path
	from typing import Tuple

	import numpy as np
	import pytest

	# Directory two levels above this file; every path below is anchored here.
	_BACKEND_DIR = Path(__file__).resolve().parent.parent

	# Put that directory at the front of sys.path so imports made by the tests
	# (e.g. the `services` package) are looked up there first.
	sys.path.insert(0, str(_BACKEND_DIR))

	# ---------------------------------------------------------------------------
	# Anchor image sources
	# ---------------------------------------------------------------------------
	# Local: sample frames bundled with the ICPR2020 notebook. Their ground-truth
	# labels are taken from the scores EfficientNet assigns them:
	#   lynaeydofd_fr0.jpg → 0.011 (REAL)
	#   mqzvfufzoq_fr0.jpg → 0.873 (FAKE)
	_ICPR_SAMPLES = _BACKEND_DIR.joinpath("models", "icpr2020dfdc", "notebook", "samples")
	LOCAL_REAL_IMAGES = [_ICPR_SAMPLES.joinpath("lynaeydofd_fr0.jpg")]
	LOCAL_FAKE_IMAGES = [_ICPR_SAMPLES.joinpath("mqzvfufzoq_fr0.jpg")]

	# FFPP / DFDC local data (full G3 gate). Only present after the
	# training/datasets download scripts have been run.
	_FFPP_C40 = _BACKEND_DIR.joinpath("training", "datasets", "ffpp", "c40")
	_FFPP_REAL = _FFPP_C40 / "real"
	_FFPP_FAKE = _FFPP_C40 / "fake"
	# File suffixes (lower-case) treated as images when scanning a dataset directory.
	_IMAGE_EXTS = {".jpg", ".jpeg", ".png"}

	# Network: thispersondoesnotexist.com serves the G2 gate (face detection) only.
	# It is NOT part of the G3 accuracy gate, because StyleGAN2 portraits come from
	# a different distribution than the DFDC video face-swaps the model was trained on.
	TPDNE_URL = "https://thispersondoesnotexist.com/"


	# ---------------------------------------------------------------------------
	# Fixtures
	# ---------------------------------------------------------------------------

	def _fetch(url: str, timeout: int = 20) -> bytes:
	    """Download *url* and return the complete response body as bytes.

	    The request carries a ``DeepShield-Test/1.0`` User-Agent header, and
	    *timeout* is forwarded unchanged to ``urllib.request.urlopen``. Nothing
	    is caught here, so any error raised while opening or reading the URL
	    propagates to the caller.
	    """
	    request = urllib.request.Request(url)
	    request.add_header("User-Agent", "DeepShield-Test/1.0")
	    with urllib.request.urlopen(request, timeout=timeout) as response:
	        payload = response.read()
	    return payload


	@pytest.fixture(scope="module")
	def detector():
	    """Provide a single EfficientNetDetector shared by every test in this module.

	    The fixture is module-scoped, so the detector is constructed once and
	    reused. The import happens inside the fixture rather than at module top;
	    presumably this keeps test collection from importing the model code
	    (NOTE(review): confirm the intent).
	    """
	    from services.efficientnet_service import EfficientNetDetector as detector_cls

	    instance = detector_cls()
	    return instance


	@pytest.fixture(scope="module")
	def anchor_set(detector) -> Tuple[list, list]:
	    """Score the anchor images once per module.

	    Returns:
	        ``(real_results, fake_results)`` — two lists holding whatever
	        ``detector.detect_image`` returned for each real / fake image.

	    Source selection, decided independently for each class:
	    1. FFPP c40 images (training/datasets/ffpp/c40/{real,fake}/) — full G3 gate
	    2. Bundled ICPR2020 notebook samples — minimal sanity check, used only when
	       the FFPP directory for that class produced no results

	    FFPP images that cannot be opened or scored are skipped (best effort), but
	    each skip is logged so a systematically failing detector is visible instead
	    of silently shrinking the anchor set. Failures on the bundled fallback
	    samples are not caught and propagate.
	    """
	    import logging

	    from PIL import Image

	    log = logging.getLogger(__name__)

	    def score_path(path: Path):
	        """Open one image file as RGB and return the detector's result for it."""
	        # convert() returns a separate image object, so the file can be closed
	        # before the detector runs.
	        with Image.open(path) as handle:
	            rgb = handle.convert("RGB")
	        return detector.detect_image(rgb)

	    def score_dir(directory: Path, limit: int = 50) -> list:
	        """Score at most *limit* images found recursively under *directory*."""
	        if not directory.is_dir():
	            return []
	        # Sort before slicing so the same files are selected on every run.
	        candidates = sorted(
	            p for p in directory.rglob("*") if p.suffix.lower() in _IMAGE_EXTS
	        )[:limit]
	        scored = []
	        for path in candidates:
	            try:
	                scored.append(score_path(path))
	            except Exception as exc:  # best effort: one bad file must not abort the gate
	                log.warning("anchor_set: skipping %s (%r)", path, exc)
	        return scored

	    def score_files(paths) -> list:
	        """Score every listed file that exists on disk."""
	        return [score_path(p) for p in paths if p.exists()]

	    # --- FFPP c40 (full gate) ---
	    real_results = score_dir(_FFPP_REAL)
	    fake_results = score_dir(_FFPP_FAKE)

	    # --- Fallback: bundled ICPR2020 samples ---
	    if not real_results:
	        real_results = score_files(LOCAL_REAL_IMAGES)
	    if not fake_results:
	        fake_results = score_files(LOCAL_FAKE_IMAGES)

	    return real_results, fake_results


	# ---------------------------------------------------------------------------
	# Tests
	# ---------------------------------------------------------------------------

	class TestEfficientNetRegression:
	    """Gate checks G1, G2, G3 and G8 plus a no-face sanity check for the detector."""

	    def test_g1_detector_loads(self, detector):
	        """G1: model loaded without crash."""
	        assert detector is not None
	        assert detector.net is not None
	        assert detector.face_extractor is not None

	    def test_g2_blazeface_detects_face_on_tpdne(self, detector):
	        """G2: BlazeFace detects ≥1 face on a TPDNE image.

	        Skipped (not failed) when the image cannot be downloaded: an
	        unreachable third-party site says nothing about face detection.
	        """
	        from PIL import Image

	        try:
	            data = _fetch(TPDNE_URL)
	        except OSError as exc:  # URLError, HTTPError and socket timeouts are all OSError
	            pytest.skip(f"TPDNE unreachable, cannot run G2: {exc!r}")

	        with Image.open(io.BytesIO(data)) as downloaded:
	            pil = downloaded.convert("RGB")
	        img_np = np.array(pil)
	        frame_data = detector.face_extractor.process_image(img=img_np)
	        faces = frame_data.get("faces", [])
	        assert len(faces) >= 1, "BlazeFace detected 0 faces on TPDNE image"

	    def test_g3_accuracy_and_fpr(self, anchor_set):
	        """G3: ≥88% accuracy and ≤8% real→fake FPR on the anchor set."""
	        real_results, fake_results = anchor_set

	        # Only count images where a face was detected (no-face = excluded from G3).
	        real_scored = [r for r in real_results if r.get("score") is not None]
	        fake_scored = [r for r in fake_results if r.get("score") is not None]

	        total = len(real_scored) + len(fake_scored)
	        if total < 5:
	            pytest.skip("Too few face-detectable images in anchor set for meaningful G3 check")

	        # A result is correct when the detector's verdict matches the folder label.
	        real_correct = sum(1 for r in real_scored if r["result"] == "REAL")
	        fake_correct = sum(1 for r in fake_scored if r["result"] == "FAKE")

	        accuracy = (real_correct + fake_correct) / total * 100

	        # max(..., 1) avoids dividing by zero when no real image was scored.
	        fpr = (len(real_scored) - real_correct) / max(len(real_scored), 1) * 100

	        print(f"\n Anchor set: {len(real_scored)} real | {len(fake_scored)} fake")
	        print(f" Accuracy: {accuracy:.1f}% (need >=88%)")
	        print(f" FPR: {fpr:.1f}% (need <=8%)")
	        for label, results in (("REAL", real_scored), ("FAKE", fake_scored)):
	            for r in results:
	                mark = "✓" if r["result"] == label else "✗"
	                print(f" [{label}] {mark} score={r['score']:.3f} cal={r.get('calibrator_applied')}")

	        assert accuracy >= 88.0, f"G3 accuracy {accuracy:.1f}% < 88%"
	        assert fpr <= 8.0, f"G3 FPR {fpr:.1f}% > 8%"

	    def test_no_face_returns_gracefully(self, detector):
	        """Noise image with no face should return error='no_face', not raise."""
	        from PIL import Image

	        # Seeded generator: the same noise image on every run, so a failure is
	        # reproducible. The upper bound is exclusive, hence 256 for full uint8 range.
	        rng = np.random.default_rng(0)
	        pixels = rng.integers(0, 256, size=(224, 224, 3), dtype=np.uint8)
	        noise = Image.fromarray(pixels)
	        result = detector.detect_image(noise)
	        assert result["error"] == "no_face"
	        assert result["score"] is None

	    def test_g8_memory_under_threshold(self, detector):
	        """G8: RSS after model load < 2500 MB.

	        ``detector`` is requested only for its side effect: it guarantees the
	        model has been loaded before RSS is read, even when this test is run
	        on its own.
	        """
	        import psutil

	        rss_mb = psutil.Process().memory_info().rss / 1024 / 1024
	        print(f"\n RSS: {rss_mb:.0f} MB")
	        assert rss_mb < 2500, f"G8: RSS {rss_mb:.0f} MB exceeds 2500 MB threshold"