"""
MoodSyncAI: Multi-Modal Sentiment & Emotion Analyser
====================================================
Components:
- Visual emotion: ViT (Vision Transformer) - trpakov/vit-face-expression
- Text emotion: DistilRoBERTa transformer - j-hartmann/emotion-english-distilroberta-base
- Fusion: Valence-aligned multimodal fusion + mismatch detection
- Generative: FLAN-T5 (with safe template fallback) for plain-language summary
- Webcam: Short video upload/recording, per-frame emotion timeline
All models are free/open-source from Hugging Face. Runs on CPU.
"""
import os
import warnings
from typing import List, Tuple, Dict
warnings.filterwarnings("ignore")
os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
import numpy as np
import pandas as pd
from PIL import Image
import cv2
import plotly.graph_objects as go
import plotly.express as px
import gradio as gr
import torch
from transformers import (
pipeline,
AutoTokenizer,
AutoModelForSeq2SeqLM,
AutoModelForImageClassification,
AutoModelForSequenceClassification,
AutoImageProcessor,
)
# -------------------------------------------------------------
# Model identifiers (all free / public on Hugging Face Hub)
# -------------------------------------------------------------
VISION_MODEL = "trpakov/vit-face-expression" # ViT for facial emotion
TEXT_MODEL = "j-hartmann/emotion-english-distilroberta-base" # 7 emotions
GEN_MODEL = "google/flan-t5-base" # generative summariser
ASR_MODEL = "openai/whisper-tiny" # speech-to-text (Whisper)
DEVICE = 0 if torch.cuda.is_available() else -1
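# transformers pipelines take an integer device index: 0 = first CUDA GPU, -1 = CPU.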
print(f"[MoodSyncAI] Torch device: {'cuda' if DEVICE == 0 else 'cpu'}")
# -------------------------------------------------------------
# Lazy-loaded model singletons
# -------------------------------------------------------------
_vision_pipe = None
_text_pipe = None
_gen_tokenizer = None
_gen_model = None
_face_cascade = None
_asr_pipe = None
_vit_attn_model = None
_vit_attn_processor = None
_text_attn_model = None
_text_attn_tokenizer = None
def get_vision_pipe():
global _vision_pipe
if _vision_pipe is None:
print("[MoodSyncAI] Loading vision model:", VISION_MODEL)
_vision_pipe = pipeline(
"image-classification",
model=VISION_MODEL,
device=DEVICE,
top_k=None,
)
return _vision_pipe
def get_text_pipe():
global _text_pipe
if _text_pipe is None:
print("[MoodSyncAI] Loading text model:", TEXT_MODEL)
_text_pipe = pipeline(
"text-classification",
model=TEXT_MODEL,
device=DEVICE,
top_k=None,
truncation=True,
)
return _text_pipe
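# Both pipelines are built with top_k=None, so each call returns the full label/score
# distribution rather than only the top class; the valence weighting below relies on this.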
def get_generator():
global _gen_tokenizer, _gen_model
if _gen_model is None:
try:
print("[MoodSyncAI] Loading generator:", GEN_MODEL)
_gen_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL)
_gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL)
if DEVICE == 0:
_gen_model = _gen_model.to("cuda")
except Exception as e:
print("[MoodSyncAI] Generator load failed, will use template fallback:", e)
_gen_tokenizer, _gen_model = None, None
return _gen_tokenizer, _gen_model
def get_face_cascade():
global _face_cascade
if _face_cascade is None:
path = os.path.join(cv2.data.haarcascades, "haarcascade_frontalface_default.xml")
_face_cascade = cv2.CascadeClassifier(path)
return _face_cascade
# -------------------------------------------------------------
# Valence map: used to align textual and visual signals
# -------------------------------------------------------------
VALENCE = {
# text emotions (from distilroberta)
"joy": 1.0,
"love": 1.0,
"surprise": 0.3,
"neutral": 0.0,
"sadness": -1.0,
"fear": -0.8,
"anger": -0.9,
"disgust": -0.8,
# vision labels (ViT face expression labels)
"happy": 1.0,
"happiness": 1.0,
"sad": -1.0,
"angry": -0.9,
"fearful": -0.8,
"fear": -0.8,
"disgusted": -0.8,
"surprised": 0.3,
"contempt": -0.6,
}
def valence_of(label: str) -> float:
return VALENCE.get(label.lower().strip(), 0.0)
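# e.g. valence_of("Happy") -> 1.0, valence_of("calm") -> 0.0 (unknown labels default to neutral).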
# -------------------------------------------------------------
# Face detection (crops to face for better accuracy; falls back to full image)
# -------------------------------------------------------------
def detect_and_crop_face(pil_img: Image.Image) -> Image.Image:
try:
cascade = get_face_cascade()
rgb = np.array(pil_img.convert("RGB"))
gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
faces = cascade.detectMultiScale(gray, scaleFactor=1.2, minNeighbors=5, minSize=(60, 60))
if len(faces) == 0:
return pil_img
# Pick the largest face
x, y, w, h = max(faces, key=lambda b: b[2] * b[3])
pad = int(0.15 * max(w, h))
x0 = max(0, x - pad); y0 = max(0, y - pad)
x1 = min(rgb.shape[1], x + w + pad); y1 = min(rgb.shape[0], y + h + pad)
return Image.fromarray(rgb[y0:y1, x0:x1])
except Exception:
return pil_img
# -------------------------------------------------------------
# Core analysis helpers
# -------------------------------------------------------------
def predict_visual(pil_img: Image.Image) -> List[Dict]:
pipe = get_vision_pipe()
face = detect_and_crop_face(pil_img)
preds = pipe(face)
# normalise into list of {label,score}
return [{"label": p["label"], "score": float(p["score"])} for p in preds]
def predict_text(text: str) -> List[Dict]:
if not text or not text.strip():
return [{"label": "neutral", "score": 1.0}]
pipe = get_text_pipe()
preds = pipe(text)[0] # top_k=None -> list of all
return [{"label": p["label"], "score": float(p["score"])} for p in preds]
def top1(preds: List[Dict]) -> Tuple[str, float]:
p = max(preds, key=lambda d: d["score"])
return p["label"], p["score"]
def weighted_valence(preds: List[Dict]) -> float:
return sum(p["score"] * valence_of(p["label"]) for p in preds)
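# Illustrative example (hypothetical scores):
#   preds = [{"label": "joy", "score": 0.7}, {"label": "neutral", "score": 0.2},
#            {"label": "sadness", "score": 0.1}]
#   -> weighted valence = 0.7*1.0 + 0.2*0.0 + 0.1*(-1.0) = +0.6 (clearly positive).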
def fuse(visual_preds: List[Dict], text_preds: List[Dict]) -> Dict:
v_label, v_conf = top1(visual_preds)
t_label, t_conf = top1(text_preds)
v_val = weighted_valence(visual_preds)
t_val = weighted_valence(text_preds)
delta = v_val - t_val
    # mismatch: valences have opposite signs (product below -0.05) or differ by more than 0.9
    mismatch = (v_val * t_val < -0.05) or (abs(delta) > 0.9)
if mismatch:
status = "MISMATCH DETECTED"
badge = "🟠"
elif abs(delta) < 0.35:
status = "ALIGNED"
badge = "🟢"
else:
status = "PARTIALLY ALIGNED"
badge = "🟡"
# overall valence (weighted average favoring visual when mismatch)
if mismatch:
overall_val = 0.6 * v_val + 0.4 * t_val
else:
overall_val = 0.5 * (v_val + t_val)
return {
"visual_label": v_label,
"visual_conf": v_conf,
"text_label": t_label,
"text_conf": t_conf,
"visual_valence": v_val,
"text_valence": t_val,
"delta": delta,
"status": status,
"badge": badge,
"overall_valence": overall_val,
}
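# Example outcome (hypothetical numbers): a confidently "happy" face with confidently
# "sadness" text gives v_val ≈ +0.9 and t_val ≈ -0.9, so v_val * t_val < -0.05 and the
# result is flagged "MISMATCH DETECTED" with overall valence 0.6*0.9 + 0.4*(-0.9) = +0.18.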
# -------------------------------------------------------------
# Generative summary
# -------------------------------------------------------------
def template_summary(fusion: Dict) -> str:
v = fusion["visual_label"]; vc = fusion["visual_conf"]
t = fusion["text_label"]; tc = fusion["text_conf"]
if fusion["status"].startswith("MISMATCH"):
return (
f"Despite expressing **{t}** sentiment verbally ({tc*100:.0f}% confidence), "
f"the speaker's facial cues indicate **{v}** ({vc*100:.0f}% confidence). "
f"This incongruence between words and expression is worth noting in the "
f"context of the conversation - the spoken message may not fully reflect "
f"how the person actually feels."
)
if fusion["status"] == "ALIGNED":
return (
f"The speaker's words ({t}, {tc*100:.0f}%) and facial expression "
f"({v}, {vc*100:.0f}%) are consistent. The overall emotional state "
f"appears genuine and uncomplicated."
)
return (
f"The speaker shows mild divergence between facial expression ({v}, "
f"{vc*100:.0f}%) and spoken sentiment ({t}, {tc*100:.0f}%). The signals "
f"are not contradictory but suggest some nuance in the emotional state."
)
def generative_summary(fusion: Dict, text_input: str) -> str:
tok, model = get_generator()
fallback = template_summary(fusion)
if model is None or tok is None:
return fallback
try:
mismatch = fusion["status"].startswith("MISMATCH")
instr = (
"rewrite as one empathetic paragraph (2-3 sentences) that explicitly "
"highlights the mismatch between facial expression and spoken words"
if mismatch else
"rewrite as one empathetic paragraph (2-3 sentences) noting the emotional state"
)
prompt = (
f"You are an empathetic psychologist. Given the analysis below, {instr}. "
f"Begin with the word 'The'.\n\n"
f"Analysis:\n"
f"- Spoken sentence: \"{text_input or '(none provided)'}\"\n"
f"- Facial emotion detected: {fusion['visual_label']} "
f"({fusion['visual_conf']*100:.0f}% confidence)\n"
f"- Sentiment of the words: {fusion['text_label']} "
f"({fusion['text_conf']*100:.0f}% confidence)\n"
f"- Alignment: {fusion['status']}\n\n"
f"Paragraph:"
)
inputs = tok(prompt, return_tensors="pt", truncation=True, max_length=512)
if DEVICE == 0:
inputs = {k: v.to("cuda") for k, v in inputs.items()}
out = model.generate(
**inputs,
max_new_tokens=140,
min_new_tokens=30,
num_beams=4,
do_sample=False,
no_repeat_ngram_size=3,
early_stopping=True,
)
text = tok.decode(out[0], skip_special_tokens=True).strip()
        # Reject obvious echoes, too-short or off-topic outputs, and outputs that
        # mention neither of the detected emotion labels.
        bad = (
            len(text) < 50
            or text.lower().startswith(("tell ", "write ", "give "))
            or "story" in text.lower()[:40]
            or (fusion["visual_label"].lower() not in text.lower()
                and fusion["text_label"].lower() not in text.lower())
        )
if bad:
return fallback
return text
except Exception as e:
print("[MoodSyncAI] Generation error:", e)
return fallback
# -------------------------------------------------------------
# Plotly charts
# -------------------------------------------------------------
def bar_chart(preds: List[Dict], title: str, color: str) -> go.Figure:
df = pd.DataFrame(preds).sort_values("score", ascending=True)
df["pct"] = (df["score"] * 100).round(1)
fig = go.Figure(go.Bar(
x=df["pct"], y=df["label"], orientation="h",
marker=dict(color=color),
text=df["pct"].astype(str) + "%",
textposition="outside",
))
fig.update_layout(
title=title,
xaxis_title="Confidence (%)",
yaxis_title=None,
xaxis=dict(range=[0, 110]),
height=320, margin=dict(l=10, r=10, t=40, b=10),
template="plotly_white",
)
return fig
def empty_fig(msg="No data") -> go.Figure:
fig = go.Figure()
fig.add_annotation(text=msg, xref="paper", yref="paper",
x=0.5, y=0.5, showarrow=False, font=dict(size=14))
fig.update_layout(height=320, template="plotly_white",
margin=dict(l=10, r=10, t=20, b=10))
return fig
# -------------------------------------------------------------
# Tab 1: Image + Text analysis
# -------------------------------------------------------------
def analyse_image_text(image: Image.Image, text: str):
if image is None:
return (empty_fig("Please upload an image"),
empty_fig("Awaiting input"),
"### ⚠️ Please upload an image of a face.", "")
visual_preds = predict_visual(image)
text_preds = predict_text(text or "")
fusion = fuse(visual_preds, text_preds)
summary = generative_summary(fusion, text)
vfig = bar_chart(visual_preds, "👁️ Visual Emotion (ViT)", "#4C78A8")
tfig = bar_chart(text_preds, "💬 Text Sentiment (Transformer)", "#54A24B")
fusion_md = f"""
### {fusion['badge']} Fusion Result: **{fusion['status']}**
| Modality | Top Prediction | Confidence | Valence |
|---|---|---|---|
| 👁️ Visual | **{fusion['visual_label']}** | {fusion['visual_conf']*100:.1f}% | {fusion['visual_valence']:+.2f} |
| 💬 Text | **{fusion['text_label']}** | {fusion['text_conf']*100:.1f}% | {fusion['text_valence']:+.2f} |
| 🔗 Overall valence | — | — | **{fusion['overall_valence']:+.2f}** |
"""
summary_md = f"### 🧠 Generative Summary\n\n> {summary}"
return vfig, tfig, fusion_md, summary_md
# -------------------------------------------------------------
# Tab 2: Webcam / short video → emotion timeline
# -------------------------------------------------------------
def sample_frames(video_path: str, max_frames: int = 12) -> List[Tuple[float, Image.Image]]:
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
return []
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
# If total frames is unknown, read sequentially to count.
if total <= 0:
total = 0
while True:
ok, _ = cap.read()
if not ok:
break
total += 1
cap.release()
cap = cv2.VideoCapture(video_path)
if total <= 0:
return []
duration = total / fps if fps > 0 else 1.0
n = min(max_frames, max(3, int(duration * 2))) # ~2 fps target
target_idxs = set(np.linspace(0, total - 1, n).astype(int).tolist())
out: List[Tuple[float, Image.Image]] = []
idx = 0
while True:
ok, frame = cap.read()
if not ok:
break
if idx in target_idxs:
ts = idx / fps if fps > 0 else float(idx)
pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
out.append((float(ts), pil))
if len(out) >= n:
break
idx += 1
cap.release()
return out
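# sample_frames() returns up to `max_frames` (timestamp_seconds, PIL.Image) pairs,
# spread roughly evenly across the clip (about two samples per second, minimum three).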
def analyse_video_text(video_path, text: str):
if not video_path:
return (empty_fig("Record or upload a short video"),
empty_fig("Awaiting input"),
empty_fig("Awaiting input"),
"### ⚠️ Please provide a webcam video.", "")
frames = sample_frames(video_path, max_frames=12)
if not frames:
return (empty_fig("Could not read video"),
empty_fig(""), empty_fig(""),
"### ⚠️ Could not decode the video file.", "")
timeline = [] # list of dict: ts, label->score
aggregated: Dict[str, float] = {}
for ts, pil in frames:
preds = predict_visual(pil)
row = {"timestamp": ts}
for p in preds:
row[p["label"]] = p["score"]
aggregated[p["label"]] = aggregated.get(p["label"], 0.0) + p["score"]
timeline.append(row)
# Average the aggregated visual prediction across frames
n = len(frames)
avg_visual = [{"label": k, "score": v / n} for k, v in aggregated.items()]
text_preds = predict_text(text or "")
fusion = fuse(avg_visual, text_preds)
summary = generative_summary(fusion, text)
# Timeline figure (line per emotion)
df = pd.DataFrame(timeline).fillna(0.0)
label_cols = [c for c in df.columns if c != "timestamp"]
tl_fig = go.Figure()
palette = px.colors.qualitative.Set2
for i, lbl in enumerate(label_cols):
tl_fig.add_trace(go.Scatter(
x=df["timestamp"], y=df[lbl] * 100,
mode="lines+markers", name=lbl,
line=dict(color=palette[i % len(palette)], width=2),
))
tl_fig.update_layout(
title="📈 Emotion Timeline (per frame)",
xaxis_title="Time (s)", yaxis_title="Confidence (%)",
height=360, template="plotly_white",
margin=dict(l=10, r=10, t=40, b=10),
yaxis=dict(range=[0, 100]),
)
vfig = bar_chart(avg_visual, "👁️ Average Visual Emotion", "#4C78A8")
tfig = bar_chart(text_preds, "💬 Text Sentiment", "#54A24B")
fusion_md = f"""
### {fusion['badge']} Fusion Result: **{fusion['status']}**
| Modality | Top Prediction | Confidence | Valence |
|---|---|---|---|
| 👁️ Visual (avg) | **{fusion['visual_label']}** | {fusion['visual_conf']*100:.1f}% | {fusion['visual_valence']:+.2f} |
| 💬 Text | **{fusion['text_label']}** | {fusion['text_conf']*100:.1f}% | {fusion['text_valence']:+.2f} |
| 🔗 Overall valence | — | — | **{fusion['overall_valence']:+.2f}** |
*Analysed {n} frames from the video.*
"""
summary_md = f"### 🧠 Generative Summary\n\n> {summary}"
return tl_fig, vfig, tfig, fusion_md, summary_md
# =============================================================
# NEW FEATURE BLOCK (additive — does not touch Tab 1 / Tab 2)
# =============================================================
# 1) Whisper ASR (audio → text channel)
# 2) Video with audio (transcribe + frame timeline + fusion)
# 3) Attention visualisation (ViT rollout heatmap + text token attention)
# =============================================================
import tempfile
import subprocess
import html as _html
def get_asr_pipe():
global _asr_pipe
if _asr_pipe is None:
print("[MoodSyncAI] Loading ASR model:", ASR_MODEL)
_asr_pipe = pipeline(
"automatic-speech-recognition",
model=ASR_MODEL,
device=DEVICE,
chunk_length_s=30,
return_timestamps=False,
)
return _asr_pipe
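# Whisper expects 16 kHz mono float32 audio; transcribe_audio() below converts any
# input to that format before calling the ASR pipeline.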
def transcribe_audio(audio_path: str) -> str:
if not audio_path:
return ""
try:
# Load audio ourselves (soundfile/librosa) so we don't depend on
# whisper's internal ffmpeg-via-PATH lookup.
import soundfile as sf
try:
audio, sr = sf.read(audio_path, dtype="float32", always_2d=False)
except Exception:
import librosa
audio, sr = librosa.load(audio_path, sr=16000, mono=True)
if audio.ndim > 1:
audio = audio.mean(axis=1)
if sr != 16000:
import librosa
audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
sr = 16000
if audio.size == 0:
return ""
pipe = get_asr_pipe()
out = pipe(
{"array": audio, "sampling_rate": sr},
generate_kwargs={"language": "en", "task": "transcribe"},
)
text = out.get("text", "") if isinstance(out, dict) else str(out)
return (text or "").strip()
except Exception as e:
print("[MoodSyncAI] Transcription error:", e)
return ""
def _ffmpeg_exe() -> str:
try:
import imageio_ffmpeg
return imageio_ffmpeg.get_ffmpeg_exe()
except Exception:
return "ffmpeg"
def extract_audio_from_video(video_path: str) -> str:
"""Extract mono 16 kHz wav from video. Returns wav path or '' on failure."""
if not video_path:
return ""
try:
out_path = tempfile.NamedTemporaryFile(
suffix=".wav", delete=False
).name
cmd = [
_ffmpeg_exe(), "-y", "-i", video_path,
"-vn", "-ac", "1", "-ar", "16000",
"-f", "wav", out_path,
]
proc = subprocess.run(cmd, capture_output=True, timeout=120)
if proc.returncode != 0 or not os.path.exists(out_path) or os.path.getsize(out_path) < 1024:
return ""
return out_path
except Exception as e:
print("[MoodSyncAI] Audio-extract error:", e)
return ""
# -------------------------------------------------------------
# Attention visualisation
# -------------------------------------------------------------
def _get_vit_attn():
global _vit_attn_model, _vit_attn_processor
if _vit_attn_model is None:
print("[MoodSyncAI] Loading ViT (eager attn) for attention rollout")
_vit_attn_processor = AutoImageProcessor.from_pretrained(VISION_MODEL)
_vit_attn_model = AutoModelForImageClassification.from_pretrained(
VISION_MODEL, attn_implementation="eager"
)
_vit_attn_model.eval()
if DEVICE == 0:
_vit_attn_model = _vit_attn_model.to("cuda")
return _vit_attn_model, _vit_attn_processor
def _get_text_attn():
global _text_attn_model, _text_attn_tokenizer
if _text_attn_model is None:
print("[MoodSyncAI] Loading text classifier (eager attn) for token attention")
_text_attn_tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL)
_text_attn_model = AutoModelForSequenceClassification.from_pretrained(
TEXT_MODEL, attn_implementation="eager"
)
_text_attn_model.eval()
if DEVICE == 0:
_text_attn_model = _text_attn_model.to("cuda")
return _text_attn_model, _text_attn_tokenizer
def vit_attention_heatmap(pil_img: Image.Image) -> Image.Image:
"""Attention-rollout heatmap overlaid on the (face-cropped) image."""
try:
face = detect_and_crop_face(pil_img).convert("RGB")
model, processor = _get_vit_attn()
inputs = processor(images=face, return_tensors="pt")
if DEVICE == 0:
inputs = {k: v.to("cuda") for k, v in inputs.items()}
with torch.no_grad():
out = model(**inputs, output_attentions=True)
attns = out.attentions # tuple(L) of (1, H, S, S)
if not attns:
return face
# Attention rollout: avg heads, add identity, normalise, multiply layers
result = None
for a in attns:
a = a.mean(dim=1).squeeze(0) # (S, S)
a = a + torch.eye(a.size(0), device=a.device)
a = a / a.sum(dim=-1, keepdim=True)
result = a if result is None else a @ result
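        # `result` now holds the rolled-out attention: A_L @ A_{L-1} @ ... @ A_1.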
# CLS-token row, drop CLS index → patch importances
cls_attn = result[0, 1:].detach().cpu().numpy()
side = int(np.sqrt(cls_attn.shape[0]))
if side * side != cls_attn.shape[0]:
return face
grid = cls_attn.reshape(side, side)
grid = (grid - grid.min()) / (grid.max() - grid.min() + 1e-8)
# Resize heatmap to face image
w, h = face.size
heat = cv2.resize(grid, (w, h), interpolation=cv2.INTER_CUBIC)
heat_u8 = (heat * 255).astype(np.uint8)
color = cv2.applyColorMap(heat_u8, cv2.COLORMAP_JET)
color = cv2.cvtColor(color, cv2.COLOR_BGR2RGB)
base = np.array(face)
overlay = (0.55 * base + 0.45 * color).clip(0, 255).astype(np.uint8)
return Image.fromarray(overlay)
except Exception as e:
print("[MoodSyncAI] ViT attention error:", e)
return pil_img
def text_token_attention_html(text: str) -> str:
"""Render input text with per-token attention intensity (last layer, [CLS] row)."""
if not text or not text.strip():
return "<em>(no text)</em>"
try:
model, tok = _get_text_attn()
enc = tok(text, return_tensors="pt", truncation=True, max_length=256)
if DEVICE == 0:
enc = {k: v.to("cuda") for k, v in enc.items()}
with torch.no_grad():
out = model(**enc, output_attentions=True)
attns = out.attentions # tuple(L) of (1, H, S, S)
if not attns:
return _html.escape(text)
last = attns[-1].mean(dim=1).squeeze(0) # (S, S)
cls_row = last[0].detach().cpu().numpy() # importance of each token to CLS
ids = enc["input_ids"][0].detach().cpu().tolist()
tokens = tok.convert_ids_to_tokens(ids)
# Skip special tokens for normalisation range
specials = set(tok.all_special_tokens)
keep_mask = np.array([t not in specials for t in tokens])
if keep_mask.sum() == 0:
return _html.escape(text)
scores = cls_row.copy()
scores_disp = scores[keep_mask]
lo, hi = scores_disp.min(), scores_disp.max()
norm = (scores - lo) / (hi - lo + 1e-8)
norm = np.clip(norm, 0.0, 1.0)
        # Build HTML: render each token as its own span; strip the word-start markers
        # ('Ġ' for RoBERTa BPE, '▁' for SentencePiece) and restore the leading space.
spans = []
for i, t in enumerate(tokens):
if t in specials:
continue
display = t
prefix_space = ""
if display.startswith("Ġ"):
display = display[1:]
prefix_space = " "
elif display.startswith("▁"):
display = display[1:]
prefix_space = " "
intensity = float(norm[i])
# red highlight, alpha from intensity
bg = f"rgba(220,38,38,{intensity:.2f})"
color = "#fff" if intensity > 0.55 else "#111"
safe = _html.escape(display)
spans.append(
f"{prefix_space}<span style=\"background:{bg};color:{color};"
f"padding:2px 4px;border-radius:4px;margin:1px;"
f"font-family:monospace\" title=\"{intensity:.2f}\">{safe}</span>"
)
body = "".join(spans).strip()
legend = (
"<div style='margin-top:8px;font-size:12px;color:#555'>"
"Darker red = higher attention weight from [CLS] to that token "
"(last transformer layer, averaged over heads)."
"</div>"
)
return f"<div style='line-height:2;font-size:15px'>{body}</div>{legend}"
except Exception as e:
print("[MoodSyncAI] Text attention error:", e)
return _html.escape(text)
# -------------------------------------------------------------
# Tab 1 wrapper: existing outputs + (optional) attention viz
# -------------------------------------------------------------
def analyse_image_text_with_attention(image: Image.Image, text: str, show_attn: bool):
vfig, tfig, fusion_md, summary_md = analyse_image_text(image, text)
if not show_attn or image is None:
return (vfig, tfig, fusion_md, summary_md,
None, "<em>Toggle 'Show attention visualisation' to view.</em>")
heat = vit_attention_heatmap(image)
token_html = text_token_attention_html(text or "")
return vfig, tfig, fusion_md, summary_md, heat, token_html
# -------------------------------------------------------------
# Tab 3: Audio + Image
# -------------------------------------------------------------
def analyse_audio_image(audio_path, image: Image.Image):
if image is None and not audio_path:
return ("",
empty_fig("Provide an image"),
empty_fig("Provide audio"),
"### ⚠️ Please provide both an image and audio.", "")
transcript = transcribe_audio(audio_path) if audio_path else ""
if not transcript:
transcript = "(no speech detected)"
if image is None:
return (transcript,
empty_fig("No image provided"),
empty_fig("(transcript only)"),
"### ⚠️ Please also provide a face image.", "")
visual_preds = predict_visual(image)
spoken = "" if transcript.startswith("(") else transcript
text_preds = predict_text(spoken)
fusion = fuse(visual_preds, text_preds)
summary = generative_summary(fusion, spoken)
vfig = bar_chart(visual_preds, "👁️ Visual Emotion (ViT)", "#4C78A8")
tfig = bar_chart(text_preds, "💬 Sentiment of Transcribed Speech", "#54A24B")
fusion_md = f"""
### {fusion['badge']} Fusion Result: **{fusion['status']}**
| Modality | Top Prediction | Confidence | Valence |
|---|---|---|---|
| 👁️ Visual (image) | **{fusion['visual_label']}** | {fusion['visual_conf']*100:.1f}% | {fusion['visual_valence']:+.2f} |
| 🎙️ Audio → Text | **{fusion['text_label']}** | {fusion['text_conf']*100:.1f}% | {fusion['text_valence']:+.2f} |
| 🔗 Overall valence | — | — | **{fusion['overall_valence']:+.2f}** |
"""
summary_md = f"### 🧠 Generative Summary\n\n> {summary}"
return transcript, vfig, tfig, fusion_md, summary_md
# -------------------------------------------------------------
# Tab 4: Video WITH audio (frames timeline + audio transcript → text channel)
# -------------------------------------------------------------
def analyse_video_with_audio(video_path):
if not video_path:
return ("",
empty_fig("Record or upload a video"),
empty_fig(""), empty_fig(""),
"### ⚠️ Please provide a video.", "")
frames = sample_frames(video_path, max_frames=12)
if not frames:
return ("",
empty_fig("Could not read video"),
empty_fig(""), empty_fig(""),
"### ⚠️ Could not decode the video file.", "")
# 1) Audio → transcript
wav = extract_audio_from_video(video_path)
transcript = transcribe_audio(wav) if wav else ""
if wav and os.path.exists(wav):
try: os.remove(wav)
except Exception: pass
if not transcript:
transcript = "(no speech detected in the audio track)"
spoken = "" if transcript.startswith("(") else transcript
# 2) Per-frame visual + aggregate
timeline = []
aggregated: Dict[str, float] = {}
for ts, pil in frames:
preds = predict_visual(pil)
row = {"timestamp": ts}
for p in preds:
row[p["label"]] = p["score"]
aggregated[p["label"]] = aggregated.get(p["label"], 0.0) + p["score"]
timeline.append(row)
n = len(frames)
avg_visual = [{"label": k, "score": v / n} for k, v in aggregated.items()]
# 3) Text channel from transcript
text_preds = predict_text(spoken)
fusion = fuse(avg_visual, text_preds)
summary = generative_summary(fusion, spoken)
# Timeline figure
df = pd.DataFrame(timeline).fillna(0.0)
label_cols = [c for c in df.columns if c != "timestamp"]
tl_fig = go.Figure()
palette = px.colors.qualitative.Set2
for i, lbl in enumerate(label_cols):
tl_fig.add_trace(go.Scatter(
x=df["timestamp"], y=df[lbl] * 100,
mode="lines+markers", name=lbl,
line=dict(color=palette[i % len(palette)], width=2),
))
tl_fig.update_layout(
title="📈 Emotion Timeline (per frame) — audio transcript drives text channel",
xaxis_title="Time (s)", yaxis_title="Confidence (%)",
height=360, template="plotly_white",
margin=dict(l=10, r=10, t=40, b=10),
yaxis=dict(range=[0, 100]),
)
vfig = bar_chart(avg_visual, "👁️ Avg Visual Emotion (frames)", "#4C78A8")
tfig = bar_chart(text_preds, "💬 Sentiment of Spoken Audio", "#54A24B")
fusion_md = f"""
### {fusion['badge']} Fusion Result: **{fusion['status']}**
| Modality | Top Prediction | Confidence | Valence |
|---|---|---|---|
| 👁️ Visual (avg of {n} frames) | **{fusion['visual_label']}** | {fusion['visual_conf']*100:.1f}% | {fusion['visual_valence']:+.2f} |
| 🎙️ Audio transcript | **{fusion['text_label']}** | {fusion['text_conf']*100:.1f}% | {fusion['text_valence']:+.2f} |
| 🔗 Overall valence | — | — | **{fusion['overall_valence']:+.2f}** |
*Spoken words (auto-transcribed):* "{spoken or '—'}"
"""
summary_md = f"### 🧠 Generative Summary\n\n> {summary}"
return transcript, tl_fig, vfig, tfig, fusion_md, summary_md
# -------------------------------------------------------------
# Gradio UI
# -------------------------------------------------------------
CSS = """
.gradio-container {max-width: 1200px !important;}
#title {text-align:center;}
footer {display: none !important;}
.show-api, .built-with, .settings {display: none !important;}
"""
with gr.Blocks(title="MoodSyncAI", theme=gr.themes.Soft(), css=CSS) as demo:
gr.Markdown("# 🎭 MoodSyncAI", elem_id="title")
gr.Markdown(
"**Multi-Modal Sentiment & Emotion Analyser** — combines a Vision "
"Transformer (face), a Transformer text classifier (words), a fusion "
"layer (mismatch detection), and a generative model (plain-language "
"summary). 100% open-source."
)
with gr.Tabs():
# ---------------- Tab 1 ----------------
with gr.Tab("🖼️ Image + Text"):
with gr.Row():
with gr.Column(scale=1):
img_in = gr.Image(type="pil", label="Face photo", height=320)
txt_in = gr.Textbox(
label="What the person said",
placeholder="e.g., No, I think the project is going really well.",
lines=2,
)
btn1 = gr.Button("🔍 Analyse", variant="primary")
attn_toggle1 = gr.Checkbox(
label="🔬 Show attention visualisation (ViT rollout + text tokens)",
value=False,
)
gr.Examples(
examples=[
[None, "No, I think the project is going really well."],
[None, "I'm absolutely thrilled about the results!"],
[None, "I'm fine, really, don't worry about me."],
],
inputs=[img_in, txt_in],
)
with gr.Column(scale=2):
fusion_md1 = gr.Markdown()
summary_md1 = gr.Markdown()
with gr.Row():
vbar1 = gr.Plot(label="Visual emotion")
tbar1 = gr.Plot(label="Text sentiment")
with gr.Accordion("🔬 Attention visualisation", open=False):
attn_img1 = gr.Image(
label="ViT attention rollout (face)",
height=320, interactive=False,
)
attn_html1 = gr.HTML(label="Text token attention")
btn1.click(analyse_image_text_with_attention,
inputs=[img_in, txt_in, attn_toggle1],
outputs=[vbar1, tbar1, fusion_md1, summary_md1,
attn_img1, attn_html1])
# ---------------- Tab 2 ----------------
with gr.Tab("📹 Webcam / Video + Text"):
gr.Markdown(
"Record a short clip from your webcam (3–10 s recommended) **or** "
"upload a short video. The system samples frames and builds an "
"emotion timeline."
)
with gr.Row():
with gr.Column(scale=1):
vid_in = gr.Video(
label="Webcam / video",
sources=["webcam", "upload"],
height=300,
)
txt_in2 = gr.Textbox(
label="What the person said",
placeholder="Type the spoken sentence here…",
lines=2,
)
btn2 = gr.Button("🔍 Analyse video", variant="primary")
with gr.Column(scale=2):
timeline_plot = gr.Plot(label="Emotion timeline")
fusion_md2 = gr.Markdown()
summary_md2 = gr.Markdown()
with gr.Row():
vbar2 = gr.Plot(label="Avg visual emotion")
tbar2 = gr.Plot(label="Text sentiment")
btn2.click(analyse_video_text,
inputs=[vid_in, txt_in2],
outputs=[timeline_plot, vbar2, tbar2, fusion_md2, summary_md2])
# ---------------- Tab 3 : Audio + Image ----------------
with gr.Tab("🎙️ Audio + Image"):
gr.Markdown(
"Speak (or upload audio) **and** provide a face image. Whisper "
"transcribes the audio; the words become the *text channel* fed "
"into the multimodal fusion."
)
with gr.Row():
with gr.Column(scale=1):
audio_in3 = gr.Audio(
label="🎙️ Audio (microphone or upload)",
sources=["microphone", "upload"],
type="filepath",
)
img_in3 = gr.Image(type="pil", label="Face photo", height=300)
btn3 = gr.Button("🔍 Transcribe & analyse", variant="primary")
with gr.Column(scale=2):
transcript3 = gr.Textbox(
label="Auto-transcript (Whisper)",
interactive=False, lines=2,
)
fusion_md3 = gr.Markdown()
summary_md3 = gr.Markdown()
with gr.Row():
vbar3 = gr.Plot(label="Visual emotion")
tbar3 = gr.Plot(label="Audio→text sentiment")
btn3.click(analyse_audio_image,
inputs=[audio_in3, img_in3],
outputs=[transcript3, vbar3, tbar3, fusion_md3, summary_md3])
# ---------------- Tab 4 : Video WITH audio ----------------
with gr.Tab("🎬 Video with Audio"):
gr.Markdown(
"Record or upload a short video **with sound**. The system extracts "
"the audio track, transcribes it (Whisper), samples frames for an "
"emotion timeline, then fuses the visual signal with the spoken-word "
"sentiment — no manual typing needed."
)
with gr.Row():
with gr.Column(scale=1):
vid_in4 = gr.Video(
label="Webcam / video (with audio)",
sources=["webcam", "upload"],
height=300,
)
btn4 = gr.Button("🔍 Transcribe & analyse video", variant="primary")
with gr.Column(scale=2):
transcript4 = gr.Textbox(
label="Auto-transcript (Whisper)",
interactive=False, lines=2,
)
timeline_plot4 = gr.Plot(label="Emotion timeline")
fusion_md4 = gr.Markdown()
summary_md4 = gr.Markdown()
with gr.Row():
vbar4 = gr.Plot(label="Avg visual emotion")
tbar4 = gr.Plot(label="Audio→text sentiment")
btn4.click(analyse_video_with_audio,
inputs=[vid_in4],
outputs=[transcript4, timeline_plot4, vbar4, tbar4,
fusion_md4, summary_md4])
        # ---------------- Tab 5 : About ----------------
with gr.Tab("ℹ️ About"):
gr.Markdown(f"""
### Architecture
| Stage | Model | Type |
|---|---|---|
| Visual emotion | `{VISION_MODEL}` | **Vision Transformer (ViT)** |
| Text sentiment | `{TEXT_MODEL}` | **Transformer (DistilRoBERTa)** |
| Speech-to-text | `{ASR_MODEL}` | **Encoder-Decoder Transformer (Whisper)** |
| Fusion | Valence-aligned multimodal fusion (custom) | rule + weighted |
| Generative summary | `{GEN_MODEL}` | **Encoder-Decoder Transformer (FLAN-T5)** |
| Attention viz | ViT attention rollout + last-layer text attention | interpretability |
### Fusion logic
1. Each modality produces a probability distribution over emotion labels.
2. Labels are mapped to a *valence* score in `[-1, +1]`.
3. We compute weighted valence per modality, then a delta.
4. Opposite signs → **MISMATCH** (amber). Small delta → **ALIGNED** (green).
5. Generative model receives the structured signals and writes plain-language output.
### Privacy
All processing happens on the machine hosting the app (your computer or the Space's hardware);
no data is sent to third-party services after the initial model download from the Hugging Face Hub.
""")
if __name__ == "__main__":
# Warm up small models so first request is snappy
try:
get_text_pipe()
except Exception as e:
print("[MoodSyncAI] Warmup text failed:", e)
    _on_spaces = bool(os.environ.get("SPACE_ID"))
demo.queue().launch(
server_name="0.0.0.0" if _on_spaces else "127.0.0.1",
server_port=7860,
inbrowser=not _on_spaces,
show_error=True,
show_api=False,
ssr_mode=False,
)