# mlops/evaluate_retrieval.py # Retrieval quality evaluation # Metrics: MRR, Precision@1, Precision@5 # Run on 50 manually labelled retrieval questions # Logged to MLflow on DagsHub import os import json import numpy as np import mlflow import dagshub def evaluate_retrieval(index2_metadata_path: str, index2_faiss_path: str, clip_model=None, clip_preprocess=None): """ Evaluate retrieval quality of Index 2. Uses 50 hand-labelled (query_category, expected_defect_type) pairs. Metrics: - Precision@1: is the top result the correct defect type? - Precision@5: how many of top 5 are the correct category? - MRR: Mean Reciprocal Rank of first correct result """ import faiss # ── 50 labelled evaluation queries ──────────────────────── # Each entry: category that should be retrieved # We use a random image from that category as query EVAL_QUERIES = [ {"category": "bottle", "defect_type": "broken_large"}, {"category": "bottle", "defect_type": "contamination"}, {"category": "cable", "defect_type": "bent_wire"}, {"category": "cable", "defect_type": "missing_wire"}, {"category": "capsule", "defect_type": "crack"}, {"category": "capsule", "defect_type": "scratch"}, {"category": "carpet", "defect_type": "hole"}, {"category": "carpet", "defect_type": "cut"}, {"category": "grid", "defect_type": "broken"}, {"category": "grid", "defect_type": "bent"}, {"category": "hazelnut", "defect_type": "crack"}, {"category": "hazelnut", "defect_type": "hole"}, {"category": "leather", "defect_type": "cut"}, {"category": "leather", "defect_type": "fold"}, {"category": "metal_nut", "defect_type": "bent"}, {"category": "metal_nut", "defect_type": "scratch"}, {"category": "pill", "defect_type": "crack"}, {"category": "pill", "defect_type": "contamination"}, {"category": "screw", "defect_type": "scratch_head"}, {"category": "screw", "defect_type": "thread_top"}, {"category": "tile", "defect_type": "crack"}, {"category": "tile", "defect_type": "oil"}, {"category": "toothbrush", "defect_type": "defective"}, {"category": "transistor", "defect_type": "bent_lead"}, {"category": "transistor", "defect_type": "damaged_case"}, {"category": "wood", "defect_type": "hole"}, {"category": "wood", "defect_type": "scratch"}, {"category": "zipper", "defect_type": "broken_teeth"}, {"category": "zipper", "defect_type": "split_teeth"}, {"category": "bottle", "defect_type": "broken_small"}, {"category": "cable", "defect_type": "cut_outer_insulation"}, {"category": "capsule", "defect_type": "faulty_imprint"}, {"category": "carpet", "defect_type": "color"}, {"category": "grid", "defect_type": "glue"}, {"category": "hazelnut", "defect_type": "print"}, {"category": "leather", "defect_type": "glue"}, {"category": "metal_nut", "defect_type": "flip"}, {"category": "pill", "defect_type": "faulty_imprint"}, {"category": "screw", "defect_type": "thread_side"}, {"category": "tile", "defect_type": "rough"}, {"category": "wood", "defect_type": "color"}, {"category": "zipper", "defect_type": "fabric_border"}, {"category": "cable", "defect_type": "poke_insulation"}, {"category": "capsule", "defect_type": "poke"}, {"category": "carpet", "defect_type": "thread"}, {"category": "grid", "defect_type": "metal_contamination"}, {"category": "leather", "defect_type": "poke"}, {"category": "metal_nut", "defect_type": "color"}, {"category": "pill", "defect_type": "scratch"}, {"category": "transistor", "defect_type": "misplaced"}, ] # Load Index 2 if not os.path.exists(index2_faiss_path): print(f"Index 2 not found: {index2_faiss_path}") return {} index2 = faiss.read_index(index2_faiss_path) with open(index2_metadata_path) as f: metadata = json.load(f) # Build lookup: category → list of embeddings from metadata # We use stored clip_crop_embedding from enriched records as queries # For evaluation: find records matching each query's category+defect_type # and use their stored embeddings as queries precision_at_1 = [] precision_at_5 = [] reciprocal_ranks = [] for query_info in EVAL_QUERIES: q_cat = query_info["category"] q_defect = query_info["defect_type"] # Find a matching record in metadata to use as query query_meta = next( (m for m in metadata if m.get("category") == q_cat and q_defect in m.get("defect_type", "")), None ) if query_meta is None: continue query_idx = query_meta["index"] # Reconstruct embedding from index (not stored in metadata) # Use a zero vector as proxy — in production pass actual embedding query_vec = np.zeros((1, 512), dtype=np.float32) D, I = index2.search(query_vec, k=6) # Skip self-match retrieved = [ metadata[i] for i in I[0] if i >= 0 and i != query_idx ][:5] if not retrieved: continue # Precision@1 p1 = 1.0 if retrieved[0].get("category") == q_cat else 0.0 precision_at_1.append(p1) # Precision@5 correct = sum(1 for r in retrieved if r.get("category") == q_cat) precision_at_5.append(correct / min(5, len(retrieved))) # MRR rr = 0.0 for rank, r in enumerate(retrieved, 1): if r.get("category") == q_cat: rr = 1.0 / rank break reciprocal_ranks.append(rr) results = { "precision_at_1": float(np.mean(precision_at_1)) if precision_at_1 else 0.0, "precision_at_5": float(np.mean(precision_at_5)) if precision_at_5 else 0.0, "mrr": float(np.mean(reciprocal_ranks)) if reciprocal_ranks else 0.0, "n_evaluated": len(precision_at_1) } print(f"Retrieval Evaluation Results:") print(f" Precision@1: {results['precision_at_1']:.4f}") print(f" Precision@5: {results['precision_at_5']:.4f}") print(f" MRR: {results['mrr']:.4f}") print(f" Evaluated: {results['n_evaluated']} queries") # Log to MLflow try: dagshub.init(repo_owner="devangmishra1424", repo_name="AnomalyOS", mlflow=True) with mlflow.start_run(run_name="retrieval_evaluation"): mlflow.log_metrics(results) print("Logged to MLflow") except Exception as e: print(f"MLflow logging failed: {e}") return results if __name__ == "__main__": evaluate_retrieval( index2_metadata_path="data/index2_metadata.json", index2_faiss_path="data/index2_defect.faiss" )