from fastapi import APIRouter, BackgroundTasks, Depends
from app.dependencies import get_current_user
from app.db.client import get_supabase
import numpy as np
import pandas as pd
from sklearn.metrics import silhouette_score, davies_bouldin_score
import umap

router = APIRouter()
# Paper Targets (Hardcoded benchmarks from Section 5 of the research paper)
PAPER_BENCHMARKS = {
"silhouette": 0.42,
"davies_bouldin": 1.15,
"precision": 0.941,
"f1_score": 0.892,
"latency_ms": 1.2
}
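# These targets are echoed in API responses so the dashboard can chart live metrics against them.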
@router.get("/clustering-metrics")
async def clustering_metrics(user_id: str = Depends(get_current_user)):
    """Silhouette and Davies-Bouldin index over the user's categorised embeddings."""
supabase = get_supabase()
# Fetch embeddings and their labels
res = supabase.table("transaction_embeddings").select(
"embedding, transactions(category_id)"
).eq("user_id", user_id).execute()
data = res.data or []
if len(data) < 10:
return {"status": "insufficient_data", "required": 10}
    embeddings = np.array([t['embedding'] for t in data])
    # Keep labels index-aligned with embeddings; use None when the joined transaction is missing
    labels = [t['transactions']['category_id'] if t['transactions'] else None for t in data]
# Filter out noise (None)
valid_indices = [i for i, l in enumerate(labels) if l is not None]
    if len({labels[i] for i in valid_indices}) < 2:
return {"status": "insufficient_clusters", "required": 2}
X = embeddings[valid_indices]
L = [labels[i] for i in valid_indices]
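    # Silhouette lies in [-1, 1] (higher is better); Davies-Bouldin is >= 0 (lower is better)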
    # Cast from numpy floats so FastAPI can serialise the response
    s_score = float(silhouette_score(X, L))
    db_index = float(davies_bouldin_score(X, L))
return {
"silhouette": round(s_score, 3),
"davies_bouldin": round(db_index, 3),
"paper_targets": PAPER_BENCHMARKS,
"status": "ready"
}
@router.get("/pipeline-stats")
async def pipeline_stats(user_id: str = Depends(get_current_user)):
"""Detailed breakdown of category sources + confidence per lane."""
supabase = get_supabase()
res = supabase.table("transactions").select(
"category_source, confidence_score"
).eq("user_id", user_id).execute()
df = pd.DataFrame(res.data or [])
if df.empty:
return {"total": 0, "distribution": []}
# Ensure source is never null for grouping
df['category_source'] = df['category_source'].fillna('pending')
stats = df.groupby('category_source').agg(
count=('category_source', 'size'),
avg_confidence=('confidence_score', 'mean')
).reset_index().rename(columns={'category_source': 'source'})
stats['percentage'] = (stats['count'] / len(df) * 100).round(1)
stats['avg_confidence'] = stats['avg_confidence'].round(3)
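    # Each record has the shape {"source": ..., "count": ..., "avg_confidence": ..., "percentage": ...}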
return {
"total": len(df),
"distribution": stats.to_dict(orient='records'),
"paper_benchmark_conf": 0.85
}
@router.get("/coldstart-metrics")
async def coldstart_metrics(user_id: str = Depends(get_current_user)):
    """Bucket transactions chronologically to show categorisation coverage improving over time."""
supabase = get_supabase()
res = supabase.table("transactions").select(
"id, category_id, created_at"
).eq("user_id", user_id).order("created_at").execute()
txns = res.data or []
if len(txns) < 15:
return {"status": "insufficient_data", "message": "Need at least 15 transactions to show trends"}
    # Bucket chronologically into 3 stages: Cold-start (first 10), Developing (next 20), Established (the rest)
buckets = [
{"name": "Cold-start", "range": (0, 10)},
{"name": "Developing", "range": (10, 30)},
{"name": "Established", "range": (30, 999999)}
]
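    # Ranges are half-open slice bounds into the chronologically ordered list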
metrics_per_bucket = []
for bucket in buckets:
        subset = txns[bucket['range'][0]:bucket['range'][1]]
        if not subset:
            continue
        # Coverage = fraction of transactions in this bucket with an assigned category_id
        coverage = sum(1 for t in subset if t['category_id'] is not None) / len(subset)
metrics_per_bucket.append({
"stage": bucket['name'],
"coverage": round(coverage, 2),
"count": len(subset)
})
    return {"status": "ready", "buckets": metrics_per_bucket}
@router.get("/gating-analysis")
async def gating_analysis(user_id: str = Depends(get_current_user)):
    """Mean gating alpha per transaction type (recurring, noisy, new merchant)."""
supabase = get_supabase()
res = supabase.table("transactions").select(
"gating_alpha, is_recurring, is_low_descriptiveness"
).eq("user_id", user_id).execute()
df = pd.DataFrame(res.data or [])
    if df.empty:
        return []
# Map to logical groups
analysis = []
# 1. Recurring
rec = df[df['is_recurring'] == True]['gating_alpha'].mean()
analysis.append({"type": "Recurring (Habit)", "alpha": round(rec, 3) if not pd.isna(rec) else 0.5})
    # 2. Noisy (low-descriptiveness descriptions, where context outweighs the text)
noisy = df[df['is_low_descriptiveness'] == True]['gating_alpha'].mean()
analysis.append({"type": "Noisy (Context-heavy)", "alpha": round(noisy, 3) if not pd.isna(noisy) else 0.5})
# 3. New Merchants
new_m = df[df['is_recurring'] == False]['gating_alpha'].mean()
analysis.append({"type": "New Merchant (Semantic)", "alpha": round(new_m, 3) if not pd.isna(new_m) else 0.5})
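    # Groups may overlap (a low-descriptiveness txn can also be recurring); 0.5 is a neutral fallback for empty groups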
return analysis
@router.get("/cluster-map")
async def cluster_map(user_id: str = Depends(get_current_user)):
"""Returns UMAP coordinates (cached or computed) for scatter plot."""
supabase = get_supabase()
# Fetch embeddings and existing coordinates
# We specify categories!transactions_category_id_fkey to resolve the ambiguity
res = supabase.table("transaction_embeddings").select(
"id, embedding, umap_x, umap_y, transactions(merchant_name, confidence_score, category_source, categories!transactions_category_id_fkey(name))"
).eq("user_id", user_id).execute()
data = res.data or []
    if not data:
        return []
# Check if we need to compute (if any umap_x is null)
needs_compute = any(t['umap_x'] is None for t in data)
if needs_compute and len(data) >= 5:
embeddings = np.array([t['embedding'] for t in data])
# UMAP requires n_neighbors < n_samples
n_neighbors = min(15, len(data) - 1)
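        # Fixed random_state keeps the 2-D layout reproducible when the same rows are recomputed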
reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=0.1, n_components=2, random_state=42)
coords = reducer.fit_transform(embeddings)
        # Persist coordinates one row at a time (N round trips; acceptable for small batches)
for i, t in enumerate(data):
supabase.table("transaction_embeddings").update({
"umap_x": float(coords[i, 0]),
"umap_y": float(coords[i, 1])
}).eq("id", t['id']).execute()
t['umap_x'] = float(coords[i, 0])
t['umap_y'] = float(coords[i, 1])
    # Format for Recharts; guard against a missing joined transaction row
    points = []
    for t in data:
        txn = t['transactions'] or {}
        cat = txn.get('categories')
        points.append({
            "x": t['umap_x'],
            "y": t['umap_y'],
            "merchant": txn.get('merchant_name'),
            "category": cat['name'] if cat else "Uncategorized",
            "confidence": txn.get('confidence_score'),
            "source": txn.get('category_source')
        })
    return points
@router.post("/retrain-gating")
async def retrain_gating(
background_tasks: BackgroundTasks,
user_id: str = Depends(get_current_user)
):
    """Schedule gating-network retraining for this user (dashboard entry point)."""
    # Retraining itself runs in the existing background pipeline; the dashboard links here
return {"message": "Gating retraining scheduled"}