""" Threshold calibration tool. Use this to find the FACE_MATCH_THRESHOLD that gives you the best precision/recall tradeoff for YOUR specific data. Default 0.28 is an industry-average — your data may differ. Usage: 1. Build two test sets: - POSITIVE_PAIRS: pairs of (query_image, gallery_image) of the SAME person - NEGATIVE_PAIRS: pairs of DIFFERENT people (hard negatives help most) 2. Populate TEST_PAIRS below with local image paths 3. Run: python scripts/calibrate_threshold.py Output: table of thresholds with TP/FP/FN/precision/recall/F1. """ import sys import os from pathlib import Path # Add project root to path so `src.*` imports work when running from scripts/ sys.path.insert(0, str(Path(__file__).parent.parent)) import numpy as np from PIL import Image # ── EDIT THESE ────────────────────────────────────────────── # Each tuple: (path_to_query_image, path_to_gallery_image, is_same_person) TEST_PAIRS = [ # Example positives (same person, different photos) # ("test_data/alice_1.jpg", "test_data/alice_2.jpg", True), # ("test_data/alice_1.jpg", "test_data/alice_3.jpg", True), # ("test_data/bob_1.jpg", "test_data/bob_2.jpg", True), # Example hard negatives (different people, similar looking) # ("test_data/alice_1.jpg", "test_data/carol_1.jpg", False), # ("test_data/bob_1.jpg", "test_data/dave_1.jpg", False), ] # ──────────────────────────────────────────────────────────── def cosine(a: np.ndarray, b: np.ndarray) -> float: return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9)) def compute_pair_scores(): """Returns list of (fused_score, arcface_score, adaface_score, is_positive).""" from src.services.ai_manager import AIModelManager print("Loading models...") ai = AIModelManager() results = [] for query_path, gallery_path, is_positive in TEST_PAIRS: if not (os.path.exists(query_path) and os.path.exists(gallery_path)): print(f" Skipping missing: {query_path} or {gallery_path}") continue with open(query_path, "rb") as f: q_vectors = ai.process_image_bytes(f.read(), detect_faces=True) with open(gallery_path, "rb") as f: g_vectors = ai.process_image_bytes(f.read(), detect_faces=True) q_faces = [v for v in q_vectors if v["type"] == "face"] g_faces = [v for v in g_vectors if v["type"] == "face"] if not q_faces or not g_faces: print(f" No face in: {query_path} or {gallery_path}") continue # Take largest face from each qf = max(q_faces, key=lambda f: f.get("face_width_px", 0)) gf = max(g_faces, key=lambda f: f.get("face_width_px", 0)) arc_score = cosine(qf["arcface_vector"], gf["arcface_vector"]) if qf.get("has_adaface") and gf.get("has_adaface"): ada_score = cosine(qf["adaface_vector"], gf["adaface_vector"]) else: ada_score = 0.15 fused = 0.6 * arc_score + 0.4 * ada_score results.append({ "query": query_path, "gallery": gallery_path, "is_positive": is_positive, "arcface": arc_score, "adaface": ada_score, "fused": fused, }) tag = "SAME" if is_positive else "DIFF" print(f" [{tag}] arc={arc_score:.3f} ada={ada_score:.3f} fused={fused:.3f}") return results def evaluate_thresholds(results): """Sweep thresholds and compute P/R/F1 for each.""" if not results: print("\nNo results to evaluate. Add pairs to TEST_PAIRS above.") return print("\n" + "=" * 78) print(f"{'arcface_thr':<14}{'fused_thr':<14}{'TP':>6}{'FP':>6}{'FN':>6}" f"{'Precision':>12}{'Recall':>10}{'F1':>8}") print("=" * 78) n_positive = sum(1 for r in results if r["is_positive"]) best = {"f1": 0, "arc_thr": 0, "fused_thr": 0} for arc_thr in [0.20, 0.24, 0.28, 0.32, 0.36, 0.40, 0.45]: for fused_thr in [0.22, 0.26, 0.30, 0.34, 0.38]: tp = fp = fn = 0 for r in results: # A match passes both thresholds predicted_match = (r["arcface"] >= arc_thr and r["fused"] >= fused_thr) if r["is_positive"]: if predicted_match: tp += 1 else: fn += 1 else: if predicted_match: fp += 1 precision = tp / (tp + fp) if (tp + fp) else 0 recall = tp / (tp + fn) if (tp + fn) else 0 f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0 if f1 > best["f1"]: best = {"f1": f1, "arc_thr": arc_thr, "fused_thr": fused_thr, "tp": tp, "fp": fp, "fn": fn, "precision": precision, "recall": recall} print(f"{arc_thr:<14.2f}{fused_thr:<14.2f}{tp:>6}{fp:>6}{fn:>6}" f"{precision:>12.3f}{recall:>10.3f}{f1:>8.3f}") print("=" * 78) print(f"\nBest F1: {best['f1']:.3f}") print(f" FACE_MATCH_THRESHOLD = {best['arc_thr']}") print(f" FUSED_MATCH_THRESHOLD = {best['fused_thr']}") print(f" Precision = {best['precision']:.3f}, Recall = {best['recall']:.3f}") print("\nUpdate these in your HF Space env vars.") if __name__ == "__main__": if not TEST_PAIRS: print("Edit scripts/calibrate_threshold.py and populate TEST_PAIRS with") print("10-30 positive pairs and 10-30 hard-negative pairs, then re-run.") print("\nTip: export ~50 face photos from your own gallery, hand-label") print("the same-person pairs, and use those for calibration.") sys.exit(1) results = compute_pair_scores() evaluate_thresholds(results)