Spaces:

CaffeinatedCoding
/

anomalyOS

Running

App Files Files Community

anomalyOS / mlops /evaluate_retrieval.py

CaffeinatedCoding

Upload folder using huggingface_hub

e72f783 verified about 1 month ago

raw

history blame contribute delete

7.4 kB

	# mlops/evaluate_retrieval.py
	# Retrieval quality evaluation
	# Metrics: MRR, Precision@1, Precision@5
	# Run on 50 manually labelled retrieval questions
	# Logged to MLflow on DagsHub

	import os
	import json
	import numpy as np
	import mlflow
	import dagshub


	def evaluate_retrieval(index2_metadata_path: str,
	                       index2_faiss_path: str,
	                       clip_model=None,
	                       clip_preprocess=None):
	    """
	    Evaluate retrieval quality of Index 2.

	    For each of 50 hand-labelled (category, defect_type) pairs, the first
	    metadata record matching the pair is used as the query. Its embedding
	    is reconstructed from the FAISS index, its nearest neighbours are
	    fetched, the query itself is dropped, and the remaining top 5 are
	    scored by whether their ``category`` equals the query's category.

	    Metrics (all category-based):
	    - Precision@1: is the top result from the query's category?
	    - Precision@5: fraction of the (up to 5) results from that category
	    - MRR: Mean Reciprocal Rank of the first same-category result

	    Args:
	        index2_metadata_path: JSON file holding a list of record dicts;
	            the code reads the "category", "defect_type" and "index" keys.
	        index2_faiss_path: Path to the FAISS index file for Index 2.
	        clip_model: Currently unused; kept for interface compatibility.
	        clip_preprocess: Currently unused; kept for interface compatibility.

	    Returns:
	        Dict with "precision_at_1", "precision_at_5", "mrr" and
	        "n_evaluated", or an empty dict when either input file is missing.
	    """
	    # ── 50 labelled evaluation queries ────────────────────────
	    # Each entry names the category (and defect type) whose first matching
	    # record in the metadata is used as the query.
	    EVAL_QUERIES = [
	        {"category": "bottle", "defect_type": "broken_large"},
	        {"category": "bottle", "defect_type": "contamination"},
	        {"category": "cable", "defect_type": "bent_wire"},
	        {"category": "cable", "defect_type": "missing_wire"},
	        {"category": "capsule", "defect_type": "crack"},
	        {"category": "capsule", "defect_type": "scratch"},
	        {"category": "carpet", "defect_type": "hole"},
	        {"category": "carpet", "defect_type": "cut"},
	        {"category": "grid", "defect_type": "broken"},
	        {"category": "grid", "defect_type": "bent"},
	        {"category": "hazelnut", "defect_type": "crack"},
	        {"category": "hazelnut", "defect_type": "hole"},
	        {"category": "leather", "defect_type": "cut"},
	        {"category": "leather", "defect_type": "fold"},
	        {"category": "metal_nut", "defect_type": "bent"},
	        {"category": "metal_nut", "defect_type": "scratch"},
	        {"category": "pill", "defect_type": "crack"},
	        {"category": "pill", "defect_type": "contamination"},
	        {"category": "screw", "defect_type": "scratch_head"},
	        {"category": "screw", "defect_type": "thread_top"},
	        {"category": "tile", "defect_type": "crack"},
	        {"category": "tile", "defect_type": "oil"},
	        {"category": "toothbrush", "defect_type": "defective"},
	        {"category": "transistor", "defect_type": "bent_lead"},
	        {"category": "transistor", "defect_type": "damaged_case"},
	        {"category": "wood", "defect_type": "hole"},
	        {"category": "wood", "defect_type": "scratch"},
	        {"category": "zipper", "defect_type": "broken_teeth"},
	        {"category": "zipper", "defect_type": "split_teeth"},
	        {"category": "bottle", "defect_type": "broken_small"},
	        {"category": "cable", "defect_type": "cut_outer_insulation"},
	        {"category": "capsule", "defect_type": "faulty_imprint"},
	        {"category": "carpet", "defect_type": "color"},
	        {"category": "grid", "defect_type": "glue"},
	        {"category": "hazelnut", "defect_type": "print"},
	        {"category": "leather", "defect_type": "glue"},
	        {"category": "metal_nut", "defect_type": "flip"},
	        {"category": "pill", "defect_type": "faulty_imprint"},
	        {"category": "screw", "defect_type": "thread_side"},
	        {"category": "tile", "defect_type": "rough"},
	        {"category": "wood", "defect_type": "color"},
	        {"category": "zipper", "defect_type": "fabric_border"},
	        {"category": "cable", "defect_type": "poke_insulation"},
	        {"category": "capsule", "defect_type": "poke"},
	        {"category": "carpet", "defect_type": "thread"},
	        {"category": "grid", "defect_type": "metal_contamination"},
	        {"category": "leather", "defect_type": "poke"},
	        {"category": "metal_nut", "defect_type": "color"},
	        {"category": "pill", "defect_type": "scratch"},
	        {"category": "transistor", "defect_type": "misplaced"},
	    ]

	    # Cheap path checks first, so a missing artefact is reported the same
	    # way for both files and before the FAISS import is attempted.
	    if not os.path.exists(index2_faiss_path):
	        print(f"Index 2 not found: {index2_faiss_path}")
	        return {}
	    if not os.path.exists(index2_metadata_path):
	        print(f"Index 2 metadata not found: {index2_metadata_path}")
	        return {}

	    import faiss

	    index2 = faiss.read_index(index2_faiss_path)

	    with open(index2_metadata_path, encoding="utf-8") as f:
	        metadata = json.load(f)

	    top_k = 5
	    precision_at_1 = []
	    precision_at_5 = []
	    reciprocal_ranks = []

	    for query_info in EVAL_QUERIES:
	        q_cat = query_info["category"]
	        q_defect = query_info["defect_type"]

	        # First record whose category matches and whose defect_type
	        # contains the labelled defect (substring match, as before).
	        match = next(
	            ((pos, m) for pos, m in enumerate(metadata)
	             if m.get("category") == q_cat
	             and q_defect in (m.get("defect_type") or "")),
	            None,
	        )
	        if match is None:
	            continue
	        position, query_meta = match

	        # Assumes the record's "index" is its id in the FAISS index; fall
	        # back to the list position when the key is absent — TODO confirm.
	        query_idx = query_meta.get("index", position)

	        # Query with the record's real stored embedding. A constant
	        # placeholder vector would return the same neighbours for every
	        # query and make the metrics meaningless.
	        try:
	            query_vec = index2.reconstruct(int(query_idx))
	        except (RuntimeError, TypeError, ValueError) as e:
	            print(f"Skipping {q_cat}/{q_defect}: cannot reconstruct "
	                  f"embedding {query_idx}: {e}")
	            continue
	        query_vec = np.asarray(query_vec, dtype=np.float32).reshape(1, -1)

	        # Ask for one extra neighbour because the query itself is expected
	        # to come back as a result.
	        _, neighbour_ids = index2.search(query_vec, k=top_k + 1)

	        # Drop padding ids (-1), ids without metadata, and the self-match.
	        retrieved = [
	            metadata[i]
	            for i in (int(j) for j in neighbour_ids[0])
	            if 0 <= i < len(metadata) and i != query_idx
	        ][:top_k]

	        if not retrieved:
	            continue

	        hits = [r.get("category") == q_cat for r in retrieved]

	        # Precision@1
	        precision_at_1.append(1.0 if hits[0] else 0.0)

	        # Precision@5 (over however many results were actually returned)
	        precision_at_5.append(sum(hits) / len(hits))

	        # MRR: reciprocal rank of the first hit, 0 when there is none
	        first_hit = next(
	            (rank for rank, hit in enumerate(hits, 1) if hit), None
	        )
	        reciprocal_ranks.append(1.0 / first_hit if first_hit else 0.0)

	    results = {
	        "precision_at_1": float(np.mean(precision_at_1)) if precision_at_1 else 0.0,
	        "precision_at_5": float(np.mean(precision_at_5)) if precision_at_5 else 0.0,
	        "mrr": float(np.mean(reciprocal_ranks)) if reciprocal_ranks else 0.0,
	        "n_evaluated": len(precision_at_1),
	    }

	    print("Retrieval Evaluation Results:")
	    print(f" Precision@1: {results['precision_at_1']:.4f}")
	    print(f" Precision@5: {results['precision_at_5']:.4f}")
	    print(f" MRR: {results['mrr']:.4f}")
	    print(f" Evaluated: {results['n_evaluated']} queries")

	    # Log to MLflow. Best-effort: a tracking failure must not discard the
	    # computed metrics, so any error is reported and swallowed.
	    try:
	        dagshub.init(repo_owner="devangmishra1424",
	                     repo_name="AnomalyOS", mlflow=True)
	        with mlflow.start_run(run_name="retrieval_evaluation"):
	            mlflow.log_metrics(results)
	        print("Logged to MLflow")
	    except Exception as e:
	        print(f"MLflow logging failed: {e}")

	    return results


	if __name__ == "__main__":
	    # Script entry point: evaluate Index 2 using the artefact paths
	    # under data/, leaving the optional CLIP arguments at their defaults.
	    evaluate_retrieval(
	        index2_faiss_path="data/index2_defect.faiss",
	        index2_metadata_path="data/index2_metadata.json",
	    )