"""
Threshold calibration tool.

Use this to find the FACE_MATCH_THRESHOLD that gives you the best
precision/recall tradeoff for YOUR specific data. The default of 0.28 is an
industry average — your data may differ.

Usage:
    1. Collect two kinds of labelled image pairs:
       - Positive pairs: (query_image, gallery_image) of the SAME person
       - Negative pairs: images of DIFFERENT people (hard negatives help most)
    2. Populate TEST_PAIRS below with local image paths, marking each pair
       True (same person) or False (different people)
    3. Run: python scripts/calibrate_threshold.py

Output: a table of thresholds with TP/FP/FN/precision/recall/F1.
"""
import sys
import os
from pathlib import Path
# Add project root to path so `src.*` imports work when running from scripts/
sys.path.insert(0, str(Path(__file__).parent.parent))
import numpy as np
from PIL import Image
# ── EDIT THESE ──────────────────────────────────────────────
# Labelled image pairs used for calibration.
# Each tuple: (path_to_query_image, path_to_gallery_image, is_same_person)
#   - is_same_person=True  -> positive pair (same person, different photos)
#   - is_same_person=False -> negative pair (different people)
# Paths are opened as given, so relative paths resolve against the directory
# the script is launched from. The script exits with a hint while this list
# is empty.
TEST_PAIRS = [
    # Example positives (same person, different photos)
    # ("test_data/alice_1.jpg", "test_data/alice_2.jpg", True),
    # ("test_data/alice_1.jpg", "test_data/alice_3.jpg", True),
    # ("test_data/bob_1.jpg", "test_data/bob_2.jpg", True),
    # Example hard negatives (different people, similar looking)
    # ("test_data/alice_1.jpg", "test_data/carol_1.jpg", False),
    # ("test_data/bob_1.jpg", "test_data/dave_1.jpg", False),
]
# ────────────────────────────────────────────────────────────
def cosine(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of vectors *a* and *b* as a Python float.

    A small epsilon (1e-9) is added to the product of the two norms, so a
    zero-length vector produces 0.0 instead of a division by zero.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return float(dot_product / (norm_product + 1e-9))
def _largest_face(ai, image_path):
    """Return the widest detected face record in *image_path*, or None.

    The image bytes are passed to ``ai.process_image_bytes`` with face
    detection enabled. Only records whose ``"type"`` is ``"face"`` are
    considered, and the one with the greatest ``"face_width_px"`` wins
    (records lacking that key count as width 0).
    """
    with open(image_path, "rb") as image_file:
        vectors = ai.process_image_bytes(image_file.read(), detect_faces=True)
    faces = [v for v in vectors if v["type"] == "face"]
    if not faces:
        return None
    return max(faces, key=lambda face: face.get("face_width_px", 0))


def compute_pair_scores():
    """Score every pair in TEST_PAIRS and return one result dict per pair.

    For each ``(query_path, gallery_path, is_same_person)`` entry the widest
    face of each image is compared:

    * ``arcface`` - cosine similarity of the two ``"arcface_vector"`` values.
    * ``adaface`` - cosine similarity of the two ``"adaface_vector"`` values
      when both faces report ``"has_adaface"``; otherwise a fixed fallback.
    * ``fused``   - weighted sum of the two scores.

    A pair is skipped, with a printed notice, when either file does not exist
    or when no face is found in either image.

    Returns:
        list[dict]: one dict per scored pair with the keys ``"query"``,
        ``"gallery"``, ``"is_positive"``, ``"arcface"``, ``"adaface"`` and
        ``"fused"``.
    """
    # Imported here rather than at module top; presumably to defer loading the
    # model stack until scores are actually computed.
    from src.services.ai_manager import AIModelManager

    # Fusion weights, and the AdaFace score substituted when either face has
    # no AdaFace embedding.
    # NOTE(review): presumably these mirror the production matcher - confirm
    # before changing them.
    arc_weight, ada_weight = 0.6, 0.4
    ada_fallback = 0.15

    print("Loading models...")
    ai = AIModelManager()

    results = []
    for query_path, gallery_path, is_positive in TEST_PAIRS:
        if not (os.path.exists(query_path) and os.path.exists(gallery_path)):
            print(f" Skipping missing: {query_path} or {gallery_path}")
            continue

        qf = _largest_face(ai, query_path)
        gf = _largest_face(ai, gallery_path)
        if qf is None or gf is None:
            print(f" No face in: {query_path} or {gallery_path}")
            continue

        arc_score = cosine(qf["arcface_vector"], gf["arcface_vector"])
        if qf.get("has_adaface") and gf.get("has_adaface"):
            ada_score = cosine(qf["adaface_vector"], gf["adaface_vector"])
        else:
            ada_score = ada_fallback
        fused = arc_weight * arc_score + ada_weight * ada_score

        results.append({
            "query": query_path,
            "gallery": gallery_path,
            "is_positive": is_positive,
            "arcface": arc_score,
            "adaface": ada_score,
            "fused": fused,
        })
        tag = "SAME" if is_positive else "DIFF"
        print(f" [{tag}] arc={arc_score:.3f} ada={ada_score:.3f} fused={fused:.3f}")
    return results
def evaluate_thresholds(results):
    """Sweep threshold combinations and print TP/FP/FN, precision, recall, F1.

    A pair is predicted to be a match only when its ArcFace score reaches the
    ArcFace threshold AND its fused score reaches the fused threshold. One
    table row is printed per (arcface_thr, fused_thr) combination, followed by
    the first combination that achieved the highest F1.

    Args:
        results: list of dicts, each providing the keys ``"is_positive"``,
            ``"arcface"`` and ``"fused"``.

    Returns:
        None. All output goes to stdout. If ``results`` is empty, or if no
        combination yields a true positive (F1 is 0 everywhere), a notice is
        printed instead of a threshold recommendation.
    """
    if not results:
        print("\nNo results to evaluate. Add pairs to TEST_PAIRS above.")
        return

    rule = "=" * 78
    print("\n" + rule)
    print(f"{'arcface_thr':<14}{'fused_thr':<14}{'TP':>6}{'FP':>6}{'FN':>6}"
          f"{'Precision':>12}{'Recall':>10}{'F1':>8}")
    print(rule)

    n_positive = sum(1 for r in results if r["is_positive"])

    # `best` stays None until some combination scores F1 > 0, so the summary
    # below never reads keys that were not recorded.
    best = None
    best_f1 = 0.0
    for arc_thr in [0.20, 0.24, 0.28, 0.32, 0.36, 0.40, 0.45]:
        for fused_thr in [0.22, 0.26, 0.30, 0.34, 0.38]:
            tp = fp = 0
            for r in results:
                # A match passes both thresholds
                if r["arcface"] >= arc_thr and r["fused"] >= fused_thr:
                    if r["is_positive"]:
                        tp += 1
                    else:
                        fp += 1
            # Every positive pair that was not matched is a false negative.
            fn = n_positive - tp

            precision = tp / (tp + fp) if (tp + fp) else 0.0
            recall = tp / (tp + fn) if (tp + fn) else 0.0
            if precision + recall:
                f1 = 2 * precision * recall / (precision + recall)
            else:
                f1 = 0.0

            # Strict '>' keeps the first combination among equal-F1 ties.
            if f1 > best_f1:
                best_f1 = f1
                best = {"f1": f1, "arc_thr": arc_thr, "fused_thr": fused_thr,
                        "tp": tp, "fp": fp, "fn": fn,
                        "precision": precision, "recall": recall}
            print(f"{arc_thr:<14.2f}{fused_thr:<14.2f}{tp:>6}{fp:>6}{fn:>6}"
                  f"{precision:>12.3f}{recall:>10.3f}{f1:>8.3f}")
    print(rule)

    if best is None:
        # No combination produced a true positive; recommending thresholds
        # here would be meaningless.
        print("\nBest F1: 0.000")
        print(f" No threshold combination matched any of the {n_positive} "
              "positive pair(s); cannot recommend thresholds.")
        print(" Add more same-person pairs (with detectable faces) and re-run.")
        return

    print(f"\nBest F1: {best['f1']:.3f}")
    print(f" FACE_MATCH_THRESHOLD = {best['arc_thr']}")
    print(f" FUSED_MATCH_THRESHOLD = {best['fused_thr']}")
    print(f" Precision = {best['precision']:.3f}, Recall = {best['recall']:.3f}")
    print("\nUpdate these in your HF Space env vars.")
if __name__ == "__main__":
    # Nothing can be calibrated until labelled pairs have been supplied, so
    # print setup guidance and exit with a non-zero status.
    if not TEST_PAIRS:
        print("Edit scripts/calibrate_threshold.py and populate TEST_PAIRS with")
        print("10-30 positive pairs and 10-30 hard-negative pairs, then re-run.")
        print("\nTip: export ~50 face photos from your own gallery, hand-label")
        print("the same-person pairs, and use those for calibration.")
        sys.exit(1)

    # Score every pair, then sweep the thresholds over those scores.
    results = compute_pair_scores()
    evaluate_thresholds(results)