"""
MoodSyncAI: Multi-Modal Sentiment & Emotion Analyser
====================================================
Components:
- Visual emotion: ViT (Vision Transformer) - trpakov/vit-face-expression
- Text emotion: DistilRoBERTa transformer - j-hartmann/emotion-english-distilroberta-base
- Fusion: Valence-aligned multimodal fusion + mismatch detection
- Generative: FLAN-T5 (with safe template fallback) for plain-language summary
- Webcam: Short video upload/recording, per-frame emotion timeline
All models are free/open-source from Hugging Face. Runs on CPU.
"""
import os
import warnings
from typing import List, Tuple, Dict
warnings.filterwarnings("ignore")
os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
import numpy as np
import pandas as pd
from PIL import Image
import cv2
import plotly.graph_objects as go
import plotly.express as px
import gradio as gr
import torch
from transformers import (
pipeline,
AutoTokenizer,
AutoModelForSeq2SeqLM,
AutoModelForImageClassification,
AutoModelForSequenceClassification,
AutoImageProcessor,
)
# -------------------------------------------------------------
# Model identifiers (all free / public on Hugging Face Hub)
# -------------------------------------------------------------
VISION_MODEL = "trpakov/vit-face-expression" # ViT for facial emotion
TEXT_MODEL = "j-hartmann/emotion-english-distilroberta-base" # 7 emotions
GEN_MODEL = "google/flan-t5-base" # generative summariser
ASR_MODEL = "openai/whisper-tiny" # speech-to-text (Whisper)
DEVICE = 0 if torch.cuda.is_available() else -1
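# transformers pipelines take an integer device index: 0 = first CUDA GPU, -1 = CPU.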
print(f"[MoodSyncAI] Torch device: {'cuda' if DEVICE == 0 else 'cpu'}")
# -------------------------------------------------------------
# Lazy-loaded model singletons
# -------------------------------------------------------------
_vision_pipe = None
_text_pipe = None
_gen_tokenizer = None
_gen_model = None
_face_cascade = None
_asr_pipe = None
_vit_attn_model = None
_vit_attn_processor = None
_text_attn_model = None
_text_attn_tokenizer = None
def get_vision_pipe():
global _vision_pipe
if _vision_pipe is None:
print("[MoodSyncAI] Loading vision model:", VISION_MODEL)
_vision_pipe = pipeline(
"image-classification",
model=VISION_MODEL,
device=DEVICE,
top_k=None,
)
return _vision_pipe
def get_text_pipe():
global _text_pipe
if _text_pipe is None:
print("[MoodSyncAI] Loading text model:", TEXT_MODEL)
_text_pipe = pipeline(
"text-classification",
model=TEXT_MODEL,
device=DEVICE,
top_k=None,
truncation=True,
)
return _text_pipe
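# Both pipelines are built with top_k=None, so each call returns the full label/score
# distribution rather than only the top class; the valence weighting below relies on this.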
def get_generator():
global _gen_tokenizer, _gen_model
if _gen_model is None:
try:
print("[MoodSyncAI] Loading generator:", GEN_MODEL)
_gen_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL)
_gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL)
if DEVICE == 0:
_gen_model = _gen_model.to("cuda")
except Exception as e:
print("[MoodSyncAI] Generator load failed, will use template fallback:", e)
_gen_tokenizer, _gen_model = None, None
return _gen_tokenizer, _gen_model
def get_face_cascade():
global _face_cascade
if _face_cascade is None:
path = os.path.join(cv2.data.haarcascades, "haarcascade_frontalface_default.xml")
_face_cascade = cv2.CascadeClassifier(path)
return _face_cascade
# -------------------------------------------------------------
# Valence map: used to align textual and visual signals
# -------------------------------------------------------------
VALENCE = {
# text emotions (from distilroberta)
"joy": 1.0,
"love": 1.0,
"surprise": 0.3,
"neutral": 0.0,
"sadness": -1.0,
"fear": -0.8,
"anger": -0.9,
"disgust": -0.8,
# vision labels (ViT face expression labels)
"happy": 1.0,
"happiness": 1.0,
"sad": -1.0,
"angry": -0.9,
"fearful": -0.8,
"fear": -0.8,
"disgusted": -0.8,
"surprised": 0.3,
"contempt": -0.6,
}
def valence_of(label: str) -> float:
return VALENCE.get(label.lower().strip(), 0.0)
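# e.g. valence_of("Happy") -> 1.0, valence_of("calm") -> 0.0 (unknown labels default to neutral).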
# -------------------------------------------------------------
# Face detection (crops to face for better accuracy; falls back to full image)
# -------------------------------------------------------------
def detect_and_crop_face(pil_img: Image.Image) -> Image.Image:
try:
cascade = get_face_cascade()
rgb = np.array(pil_img.convert("RGB"))
gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
faces = cascade.detectMultiScale(gray, scaleFactor=1.2, minNeighbors=5, minSize=(60, 60))
if len(faces) == 0:
return pil_img
# Pick the largest face
x, y, w, h = max(faces, key=lambda b: b[2] * b[3])
pad = int(0.15 * max(w, h))
x0 = max(0, x - pad); y0 = max(0, y - pad)
x1 = min(rgb.shape[1], x + w + pad); y1 = min(rgb.shape[0], y + h + pad)
return Image.fromarray(rgb[y0:y1, x0:x1])
except Exception:
return pil_img
# -------------------------------------------------------------
# Core analysis helpers
# -------------------------------------------------------------
def predict_visual(pil_img: Image.Image) -> List[Dict]:
pipe = get_vision_pipe()
face = detect_and_crop_face(pil_img)
preds = pipe(face)
# normalise into list of {label,score}
return [{"label": p["label"], "score": float(p["score"])} for p in preds]
def predict_text(text: str) -> List[Dict]:
if not text or not text.strip():
return [{"label": "neutral", "score": 1.0}]
pipe = get_text_pipe()
preds = pipe(text)[0] # top_k=None -> list of all
return [{"label": p["label"], "score": float(p["score"])} for p in preds]
def top1(preds: List[Dict]) -> Tuple[str, float]:
p = max(preds, key=lambda d: d["score"])
return p["label"], p["score"]
def weighted_valence(preds: List[Dict]) -> float:
return sum(p["score"] * valence_of(p["label"]) for p in preds)
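# Illustrative example (hypothetical scores):
#   preds = [{"label": "joy", "score": 0.7}, {"label": "neutral", "score": 0.2},
#            {"label": "sadness", "score": 0.1}]
#   -> weighted valence = 0.7*1.0 + 0.2*0.0 + 0.1*(-1.0) = +0.6 (clearly positive).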
def fuse(visual_preds: List[Dict], text_preds: List[Dict]) -> Dict:
v_label, v_conf = top1(visual_preds)
t_label, t_conf = top1(text_preds)
v_val = weighted_valence(visual_preds)
t_val = weighted_valence(text_preds)
delta = v_val - t_val
    # mismatch: valences have opposite signs (product below -0.05) or differ by more than 0.9
    mismatch = (v_val * t_val < -0.05) or (abs(delta) > 0.9)
if mismatch:
status = "MISMATCH DETECTED"
badge = "🟠"
elif abs(delta) < 0.35:
status = "ALIGNED"
badge = "🟢"
else:
status = "PARTIALLY ALIGNED"
badge = "🟡"
# overall valence (weighted average favoring visual when mismatch)
if mismatch:
overall_val = 0.6 * v_val + 0.4 * t_val
else:
overall_val = 0.5 * (v_val + t_val)
return {
"visual_label": v_label,
"visual_conf": v_conf,
"text_label": t_label,
"text_conf": t_conf,
"visual_valence": v_val,
"text_valence": t_val,
"delta": delta,
"status": status,
"badge": badge,
"overall_valence": overall_val,
}
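# Example outcome (hypothetical numbers): a confidently "happy" face with confidently
# "sadness" text gives v_val ≈ +0.9 and t_val ≈ -0.9, so v_val * t_val < -0.05 and the
# result is flagged "MISMATCH DETECTED" with overall valence 0.6*0.9 + 0.4*(-0.9) = +0.18.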
# -------------------------------------------------------------
# Generative summary
# -------------------------------------------------------------
def template_summary(fusion: Dict) -> str:
v = fusion["visual_label"]; vc = fusion["visual_conf"]
t = fusion["text_label"]; tc = fusion["text_conf"]
if fusion["status"].startswith("MISMATCH"):
return (
f"Despite expressing **{t}** sentiment verbally ({tc*100:.0f}% confidence), "
f"the speaker's facial cues indicate **{v}** ({vc*100:.0f}% confidence). "
f"This incongruence between words and expression is worth noting in the "
f"context of the conversation - the spoken message may not fully reflect "
f"how the person actually feels."
)
if fusion["status"] == "ALIGNED":
return (
f"The speaker's words ({t}, {tc*100:.0f}%) and facial expression "
f"({v}, {vc*100:.0f}%) are consistent. The overall emotional state "
f"appears genuine and uncomplicated."
)
return (
f"The speaker shows mild divergence between facial expression ({v}, "
f"{vc*100:.0f}%) and spoken sentiment ({t}, {tc*100:.0f}%). The signals "
f"are not contradictory but suggest some nuance in the emotional state."
)
def generative_summary(fusion: Dict, text_input: str) -> str:
tok, model = get_generator()
fallback = template_summary(fusion)
if model is None or tok is None:
return fallback
try:
mismatch = fusion["status"].startswith("MISMATCH")
instr = (
"rewrite as one empathetic paragraph (2-3 sentences) that explicitly "
"highlights the mismatch between facial expression and spoken words"
if mismatch else
"rewrite as one empathetic paragraph (2-3 sentences) noting the emotional state"
)
prompt = (
f"You are an empathetic psychologist. Given the analysis below, {instr}. "
f"Begin with the word 'The'.\n\n"
f"Analysis:\n"
f"- Spoken sentence: \"{text_input or '(none provided)'}\"\n"
f"- Facial emotion detected: {fusion['visual_label']} "
f"({fusion['visual_conf']*100:.0f}% confidence)\n"
f"- Sentiment of the words: {fusion['text_label']} "
f"({fusion['text_conf']*100:.0f}% confidence)\n"
f"- Alignment: {fusion['status']}\n\n"
f"Paragraph:"
)
inputs = tok(prompt, return_tensors="pt", truncation=True, max_length=512)
if DEVICE == 0:
inputs = {k: v.to("cuda") for k, v in inputs.items()}
out = model.generate(
**inputs,
max_new_tokens=140,
min_new_tokens=30,
num_beams=4,
do_sample=False,
no_repeat_ngram_size=3,
early_stopping=True,
)
text = tok.decode(out[0], skip_special_tokens=True).strip()
        # Reject obvious echoes, too-short or off-topic outputs, and outputs that
        # mention neither of the detected emotion labels.
        bad = (
            len(text) < 50
            or text.lower().startswith(("tell ", "write ", "give "))
            or "story" in text.lower()[:40]
            or (fusion["visual_label"].lower() not in text.lower()
                and fusion["text_label"].lower() not in text.lower())
        )
if bad:
return fallback
return text
except Exception as e:
print("[MoodSyncAI] Generation error:", e)
return fallback
# -------------------------------------------------------------
# Plotly charts
# -------------------------------------------------------------
def bar_chart(preds: List[Dict], title: str, color: str) -> go.Figure:
df = pd.DataFrame(preds).sort_values("score", ascending=True)
df["pct"] = (df["score"] * 100).round(1)
fig = go.Figure(go.Bar(
x=df["pct"], y=df["label"], orientation="h",
marker=dict(color=color),
text=df["pct"].astype(str) + "%",
textposition="outside",
))
fig.update_layout(
title=title,
xaxis_title="Confidence (%)",
yaxis_title=None,
xaxis=dict(range=[0, 110]),
height=320, margin=dict(l=10, r=10, t=40, b=10),
template="plotly_white",
)
return fig
def empty_fig(msg="No data") -> go.Figure:
fig = go.Figure()
fig.add_annotation(text=msg, xref="paper", yref="paper",
x=0.5, y=0.5, showarrow=False, font=dict(size=14))
fig.update_layout(height=320, template="plotly_white",
margin=dict(l=10, r=10, t=20, b=10))
return fig
# -------------------------------------------------------------
# Tab 1: Image + Text analysis
# -------------------------------------------------------------
def analyse_image_text(image: Image.Image, text: str):
if image is None:
return (empty_fig("Please upload an image"),
empty_fig("Awaiting input"),
"### ⚠️ Please upload an image of a face.", "")
visual_preds = predict_visual(image)
text_preds = predict_text(text or "")
fusion = fuse(visual_preds, text_preds)
summary = generative_summary(fusion, text)
vfig = bar_chart(visual_preds, "👁️ Visual Emotion (ViT)", "#4C78A8")
tfig = bar_chart(text_preds, "💬 Text Sentiment (Transformer)", "#54A24B")
fusion_md = f"""
### {fusion['badge']} Fusion Result: **{fusion['status']}**
| Modality | Top Prediction | Confidence | Valence |
|---|---|---|---|
| 👁️ Visual | **{fusion['visual_label']}** | {fusion['visual_conf']*100:.1f}% | {fusion['visual_valence']:+.2f} |
| 💬 Text | **{fusion['text_label']}** | {fusion['text_conf']*100:.1f}% | {fusion['text_valence']:+.2f} |
| 🔗 Overall valence | — | — | **{fusion['overall_valence']:+.2f}** |
"""
summary_md = f"### 🧠 Generative Summary\n\n> {summary}"
return vfig, tfig, fusion_md, summary_md
# -------------------------------------------------------------
# Tab 2: Webcam / short video → emotion timeline
# -------------------------------------------------------------
def sample_frames(video_path: str, max_frames: int = 12) -> List[Tuple[float, Image.Image]]:
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
return []
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
# If total frames is unknown, read sequentially to count.
if total <= 0:
total = 0
while True:
ok, _ = cap.read()
if not ok:
break
total += 1
cap.release()
cap = cv2.VideoCapture(video_path)
if total <= 0:
return []
duration = total / fps if fps > 0 else 1.0
n = min(max_frames, max(3, int(duration * 2))) # ~2 fps target
target_idxs = set(np.linspace(0, total - 1, n).astype(int).tolist())
out: List[Tuple[float, Image.Image]] = []
idx = 0
while True:
ok, frame = cap.read()
if not ok:
break
if idx in target_idxs:
ts = idx / fps if fps > 0 else float(idx)
pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
out.append((float(ts), pil))
if len(out) >= n:
break
idx += 1
cap.release()
return out
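# sample_frames() returns up to `max_frames` (timestamp_seconds, PIL.Image) pairs,
# spread roughly evenly across the clip (about two samples per second, minimum three).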
def analyse_video_text(video_path, text: str):
if not video_path:
return (empty_fig("Record or upload a short video"),
empty_fig("Awaiting input"),
empty_fig("Awaiting input"),
"### ⚠️ Please provide a webcam video.", "")
frames = sample_frames(video_path, max_frames=12)
if not frames:
return (empty_fig("Could not read video"),
empty_fig(""), empty_fig(""),
"### ⚠️ Could not decode the video file.", "")
timeline = [] # list of dict: ts, label->score
aggregated: Dict[str, float] = {}
for ts, pil in frames:
preds = predict_visual(pil)
row = {"timestamp": ts}
for p in preds:
row[p["label"]] = p["score"]
aggregated[p["label"]] = aggregated.get(p["label"], 0.0) + p["score"]
timeline.append(row)
# Average the aggregated visual prediction across frames
n = len(frames)
avg_visual = [{"label": k, "score": v / n} for k, v in aggregated.items()]
text_preds = predict_text(text or "")
fusion = fuse(avg_visual, text_preds)
summary = generative_summary(fusion, text)
# Timeline figure (line per emotion)
df = pd.DataFrame(timeline).fillna(0.0)
label_cols = [c for c in df.columns if c != "timestamp"]
tl_fig = go.Figure()
palette = px.colors.qualitative.Set2
for i, lbl in enumerate(label_cols):
tl_fig.add_trace(go.Scatter(
x=df["timestamp"], y=df[lbl] * 100,
mode="lines+markers", name=lbl,
line=dict(color=palette[i % len(palette)], width=2),
))
tl_fig.update_layout(
title="📈 Emotion Timeline (per frame)",
xaxis_title="Time (s)", yaxis_title="Confidence (%)",
height=360, template="plotly_white",
margin=dict(l=10, r=10, t=40, b=10),
yaxis=dict(range=[0, 100]),
)
vfig = bar_chart(avg_visual, "👁️ Average Visual Emotion", "#4C78A8")
tfig = bar_chart(text_preds, "💬 Text Sentiment", "#54A24B")
fusion_md = f"""
### {fusion['badge']} Fusion Result: **{fusion['status']}**
| Modality | Top Prediction | Confidence | Valence |
|---|---|---|---|
| 👁️ Visual (avg) | **{fusion['visual_label']}** | {fusion['visual_conf']*100:.1f}% | {fusion['visual_valence']:+.2f} |
| 💬 Text | **{fusion['text_label']}** | {fusion['text_conf']*100:.1f}% | {fusion['text_valence']:+.2f} |
| 🔗 Overall valence | — | — | **{fusion['overall_valence']:+.2f}** |
*Analysed {n} frames from the video.*
"""
summary_md = f"### 🧠 Generative Summary\n\n> {summary}"
return tl_fig, vfig, tfig, fusion_md, summary_md
# =============================================================
# NEW FEATURE BLOCK (additive — does not touch Tab 1 / Tab 2)
# =============================================================
# 1) Whisper ASR (audio → text channel)
# 2) Video with audio (transcribe + frame timeline + fusion)
# 3) Attention visualisation (ViT rollout heatmap + text token attention)
# =============================================================
import tempfile
import subprocess
import html as _html
def get_asr_pipe():
global _asr_pipe
if _asr_pipe is None:
print("[MoodSyncAI] Loading ASR model:", ASR_MODEL)
_asr_pipe = pipeline(
"automatic-speech-recognition",
model=ASR_MODEL,
device=DEVICE,
chunk_length_s=30,
return_timestamps=False,
)
return _asr_pipe
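# Whisper expects 16 kHz mono float32 audio; transcribe_audio() below converts any
# input to that format before calling the ASR pipeline.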
def transcribe_audio(audio_path: str) -> str:
if not audio_path:
return ""
try:
# Load audio ourselves (soundfile/librosa) so we don't depend on
# whisper's internal ffmpeg-via-PATH lookup.
import soundfile as sf
try:
audio, sr = sf.read(audio_path, dtype="float32", always_2d=False)
except Exception:
import librosa
audio, sr = librosa.load(audio_path, sr=16000, mono=True)
if audio.ndim > 1:
audio = audio.mean(axis=1)
if sr != 16000:
import librosa
audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
sr = 16000
if audio.size == 0:
return ""
pipe = get_asr_pipe()
out = pipe(
{"array": audio, "sampling_rate": sr},
generate_kwargs={"language": "en", "task": "transcribe"},
)
text = out.get("text", "") if isinstance(out, dict) else str(out)
return (text or "").strip()
except Exception as e:
print("[MoodSyncAI] Transcription error:", e)
return ""
def _ffmpeg_exe() -> str:
try:
import imageio_ffmpeg
return imageio_ffmpeg.get_ffmpeg_exe()
except Exception:
return "ffmpeg"
def extract_audio_from_video(video_path: str) -> str:
"""Extract mono 16 kHz wav from video. Returns wav path or '' on failure."""
if not video_path:
return ""
try:
out_path = tempfile.NamedTemporaryFile(
suffix=".wav", delete=False
).name
cmd = [
_ffmpeg_exe(), "-y", "-i", video_path,
"-vn", "-ac", "1", "-ar", "16000",
"-f", "wav", out_path,
]
proc = subprocess.run(cmd, capture_output=True, timeout=120)
if proc.returncode != 0 or not os.path.exists(out_path) or os.path.getsize(out_path) < 1024:
return ""
return out_path
except Exception as e:
print("[MoodSyncAI] Audio-extract error:", e)
return ""
# -------------------------------------------------------------
# Attention visualisation
# -------------------------------------------------------------
def _get_vit_attn():
global _vit_attn_model, _vit_attn_processor
if _vit_attn_model is None:
print("[MoodSyncAI] Loading ViT (eager attn) for attention rollout")
_vit_attn_processor = AutoImageProcessor.from_pretrained(VISION_MODEL)
_vit_attn_model = AutoModelForImageClassification.from_pretrained(
VISION_MODEL, attn_implementation="eager"
)
_vit_attn_model.eval()
if DEVICE == 0:
_vit_attn_model = _vit_attn_model.to("cuda")
return _vit_attn_model, _vit_attn_processor
def _get_text_attn():
global _text_attn_model, _text_attn_tokenizer
if _text_attn_model is None:
print("[MoodSyncAI] Loading text classifier (eager attn) for token attention")
_text_attn_tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL)
_text_attn_model = AutoModelForSequenceClassification.from_pretrained(
TEXT_MODEL, attn_implementation="eager"
)
_text_attn_model.eval()
if DEVICE == 0:
_text_attn_model = _text_attn_model.to("cuda")
return _text_attn_model, _text_attn_tokenizer
def vit_attention_heatmap(pil_img: Image.Image) -> Image.Image:
"""Attention-rollout heatmap overlaid on the (face-cropped) image."""
try:
face = detect_and_crop_face(pil_img).convert("RGB")
model, processor = _get_vit_attn()
inputs = processor(images=face, return_tensors="pt")
if DEVICE == 0:
inputs = {k: v.to("cuda") for k, v in inputs.items()}
with torch.no_grad():
out = model(**inputs, output_attentions=True)
attns = out.attentions # tuple(L) of (1, H, S, S)
if not attns:
return face
# Attention rollout: avg heads, add identity, normalise, multiply layers
result = None
for a in attns:
a = a.mean(dim=1).squeeze(0) # (S, S)
a = a + torch.eye(a.size(0), device=a.device)
a = a / a.sum(dim=-1, keepdim=True)
result = a if result is None else a @ result
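        # `result` now holds the rolled-out attention: A_L @ A_{L-1} @ ... @ A_1.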
# CLS-token row, drop CLS index → patch importances
cls_attn = result[0, 1:].detach().cpu().numpy()
side = int(np.sqrt(cls_attn.shape[0]))
if side * side != cls_attn.shape[0]:
return face
grid = cls_attn.reshape(side, side)
grid = (grid - grid.min()) / (grid.max() - grid.min() + 1e-8)
# Resize heatmap to face image
w, h = face.size
heat = cv2.resize(grid, (w, h), interpolation=cv2.INTER_CUBIC)
heat_u8 = (heat * 255).astype(np.uint8)
color = cv2.applyColorMap(heat_u8, cv2.COLORMAP_JET)
color = cv2.cvtColor(color, cv2.COLOR_BGR2RGB)
base = np.array(face)
overlay = (0.55 * base + 0.45 * color).clip(0, 255).astype(np.uint8)
return Image.fromarray(overlay)
except Exception as e:
print("[MoodSyncAI] ViT attention error:", e)
return pil_img
def text_token_attention_html(text: str) -> str:
"""Render input text with per-token attention intensity (last layer, [CLS] row)."""
if not text or not text.strip():
return "<em>(no text)</em>"
try:
model, tok = _get_text_attn()
enc = tok(text, return_tensors="pt", truncation=True, max_length=256)
if DEVICE == 0:
enc = {k: v.to("cuda") for k, v in enc.items()}
with torch.no_grad():
out = model(**enc, output_attentions=True)
attns = out.attentions # tuple(L) of (1, H, S, S)
if not attns:
return _html.escape(text)
last = attns[-1].mean(dim=1).squeeze(0) # (S, S)
cls_row = last[0].detach().cpu().numpy() # importance of each token to CLS
ids = enc["input_ids"][0].detach().cpu().tolist()
tokens = tok.convert_ids_to_tokens(ids)
# Skip special tokens for normalisation range
specials = set(tok.all_special_tokens)
keep_mask = np.array([t not in specials for t in tokens])
if keep_mask.sum() == 0:
return _html.escape(text)
scores = cls_row.copy()
scores_disp = scores[keep_mask]
lo, hi = scores_disp.min(), scores_disp.max()
norm = (scores - lo) / (hi - lo + 1e-8)
norm = np.clip(norm, 0.0, 1.0)
        # Build HTML: render each token as its own span; strip the word-start markers
        # ('Ġ' for RoBERTa BPE, '▁' for SentencePiece) and restore the leading space.
spans = []
for i, t in enumerate(tokens):
if t in specials:
continue
display = t
prefix_space = ""
if display.startswith("Ġ"):
display = display[1:]
prefix_space = " "
elif display.startswith("▁"):
display = display[1:]
prefix_space = " "
intensity = float(norm[i])
# red highlight, alpha from intensity
bg = f"rgba(220,38,38,{intensity:.2f})"
color = "#fff" if intensity > 0.55 else "#111"
safe = _html.escape(display)
spans.append(
f"{prefix_space}<span style=\"background:{bg};color:{color};"
f"padding:2px 4px;border-radius:4px;margin:1px;"
f"font-family:monospace\" title=\"{intensity:.2f}\">{safe}</span>"
)
body = "".join(spans).strip()
legend = (
"<div style='margin-top:8px;font-size:12px;color:#555'>"
"Darker red = higher attention weight from [CLS] to that token "
"(last transformer layer, averaged over heads)."
"</div>"
)
return f"<div style='line-height:2;font-size:15px'>{body}</div>{legend}"
except Exception as e:
print("[MoodSyncAI] Text attention error:", e)
return _html.escape(text)
# -------------------------------------------------------------
# Tab 1 wrapper: existing outputs + (optional) attention viz
# -------------------------------------------------------------
def analyse_image_text_with_attention(image: Image.Image, text: str, show_attn: bool):
vfig, tfig, fusion_md, summary_md = analyse_image_text(image, text)
if not show_attn or image is None:
return (vfig, tfig, fusion_md, summary_md,
None, "<em>Toggle 'Show attention visualisation' to view.</em>")
heat = vit_attention_heatmap(image)
token_html = text_token_attention_html(text or "")
return vfig, tfig, fusion_md, summary_md, heat, token_html
# -------------------------------------------------------------
# Tab 3: Audio + Image
# -------------------------------------------------------------
def analyse_audio_image(audio_path, image: Image.Image):
if image is None and not audio_path:
return ("",
empty_fig("Provide an image"),
empty_fig("Provide audio"),
"### ⚠️ Please provide both an image and audio.", "")
transcript = transcribe_audio(audio_path) if audio_path else ""
if not transcript:
transcript = "(no speech detected)"
if image is None:
return (transcript,
empty_fig("No image provided"),
empty_fig("(transcript only)"),
"### ⚠️ Please also provide a face image.", "")
visual_preds = predict_visual(image)
spoken = "" if transcript.startswith("(") else transcript
text_preds = predict_text(spoken)
fusion = fuse(visual_preds, text_preds)
summary = generative_summary(fusion, spoken)
vfig = bar_chart(visual_preds, "👁️ Visual Emotion (ViT)", "#4C78A8")
tfig = bar_chart(text_preds, "💬 Sentiment of Transcribed Speech", "#54A24B")
fusion_md = f"""
### {fusion['badge']} Fusion Result: **{fusion['status']}**
| Modality | Top Prediction | Confidence | Valence |
|---|---|---|---|
| 👁️ Visual (image) | **{fusion['visual_label']}** | {fusion['visual_conf']*100:.1f}% | {fusion['visual_valence']:+.2f} |
| 🎙️ Audio → Text | **{fusion['text_label']}** | {fusion['text_conf']*100:.1f}% | {fusion['text_valence']:+.2f} |
| 🔗 Overall valence | — | — | **{fusion['overall_valence']:+.2f}** |
"""
summary_md = f"### 🧠 Generative Summary\n\n> {summary}"
return transcript, vfig, tfig, fusion_md, summary_md
# -------------------------------------------------------------
# Tab 4: Video WITH audio (frames timeline + audio transcript → text channel)
# -------------------------------------------------------------
def analyse_video_with_audio(video_path):
if not video_path:
return ("",
empty_fig("Record or upload a video"),
empty_fig(""), empty_fig(""),
"### ⚠️ Please provide a video.", "")
frames = sample_frames(video_path, max_frames=12)
if not frames:
return ("",
empty_fig("Could not read video"),
empty_fig(""), empty_fig(""),
"### ⚠️ Could not decode the video file.", "")
# 1) Audio → transcript
wav = extract_audio_from_video(video_path)
transcript = transcribe_audio(wav) if wav else ""
if wav and os.path.exists(wav):
try: os.remove(wav)
except Exception: pass
if not transcript:
transcript = "(no speech detected in the audio track)"
spoken = "" if transcript.startswith("(") else transcript
# 2) Per-frame visual + aggregate
timeline = []
aggregated: Dict[str, float] = {}
for ts, pil in frames:
preds = predict_visual(pil)
row = {"timestamp": ts}
for p in preds:
row[p["label"]] = p["score"]
aggregated[p["label"]] = aggregated.get(p["label"], 0.0) + p["score"]
timeline.append(row)
n = len(frames)
avg_visual = [{"label": k, "score": v / n} for k, v in aggregated.items()]
# 3) Text channel from transcript
text_preds = predict_text(spoken)
fusion = fuse(avg_visual, text_preds)
summary = generative_summary(fusion, spoken)
# Timeline figure
df = pd.DataFrame(timeline).fillna(0.0)
label_cols = [c for c in df.columns if c != "timestamp"]
tl_fig = go.Figure()
palette = px.colors.qualitative.Set2
for i, lbl in enumerate(label_cols):
tl_fig.add_trace(go.Scatter(
x=df["timestamp"], y=df[lbl] * 100,
mode="lines+markers", name=lbl,
line=dict(color=palette[i % len(palette)], width=2),
))
tl_fig.update_layout(
title="📈 Emotion Timeline (per frame) — audio transcript drives text channel",
xaxis_title="Time (s)", yaxis_title="Confidence (%)",
height=360, template="plotly_white",
margin=dict(l=10, r=10, t=40, b=10),
yaxis=dict(range=[0, 100]),
)
vfig = bar_chart(avg_visual, "👁️ Avg Visual Emotion (frames)", "#4C78A8")
tfig = bar_chart(text_preds, "💬 Sentiment of Spoken Audio", "#54A24B")
fusion_md = f"""
### {fusion['badge']} Fusion Result: **{fusion['status']}**
| Modality | Top Prediction | Confidence | Valence |
|---|---|---|---|
| 👁️ Visual (avg of {n} frames) | **{fusion['visual_label']}** | {fusion['visual_conf']*100:.1f}% | {fusion['visual_valence']:+.2f} |
| 🎙️ Audio transcript | **{fusion['text_label']}** | {fusion['text_conf']*100:.1f}% | {fusion['text_valence']:+.2f} |
| 🔗 Overall valence | — | — | **{fusion['overall_valence']:+.2f}** |
*Spoken words (auto-transcribed):* "{spoken or '—'}"
"""
summary_md = f"### 🧠 Generative Summary\n\n> {summary}"
return transcript, tl_fig, vfig, tfig, fusion_md, summary_md
# -------------------------------------------------------------
# Gradio UI
# -------------------------------------------------------------
CSS = """
.gradio-container {max-width: 1200px !important;}
#title {text-align:center;}
footer {display: none !important;}
.show-api, .built-with, .settings {display: none !important;}
"""
with gr.Blocks(title="MoodSyncAI", theme=gr.themes.Soft(), css=CSS) as demo:
gr.Markdown("# 🎭 MoodSyncAI", elem_id="title")
gr.Markdown(
"**Multi-Modal Sentiment & Emotion Analyser** — combines a Vision "
"Transformer (face), a Transformer text classifier (words), a fusion "
"layer (mismatch detection), and a generative model (plain-language "
"summary). 100% open-source."
)
with gr.Tabs():
# ---------------- Tab 1 ----------------
with gr.Tab("🖼️ Image + Text"):
with gr.Row():
with gr.Column(scale=1):
img_in = gr.Image(type="pil", label="Face photo", height=320)
txt_in = gr.Textbox(
label="What the person said",
placeholder="e.g., No, I think the project is going really well.",
lines=2,
)
btn1 = gr.Button("🔍 Analyse", variant="primary")
attn_toggle1 = gr.Checkbox(
label="🔬 Show attention visualisation (ViT rollout + text tokens)",
value=False,
)
gr.Examples(
examples=[
[None, "No, I think the project is going really well."],
[None, "I'm absolutely thrilled about the results!"],
[None, "I'm fine, really, don't worry about me."],
],
inputs=[img_in, txt_in],
)
with gr.Column(scale=2):
fusion_md1 = gr.Markdown()
summary_md1 = gr.Markdown()
with gr.Row():
vbar1 = gr.Plot(label="Visual emotion")
tbar1 = gr.Plot(label="Text sentiment")
with gr.Accordion("🔬 Attention visualisation", open=False):
attn_img1 = gr.Image(
label="ViT attention rollout (face)",
height=320, interactive=False,
)
attn_html1 = gr.HTML(label="Text token attention")
btn1.click(analyse_image_text_with_attention,
inputs=[img_in, txt_in, attn_toggle1],
outputs=[vbar1, tbar1, fusion_md1, summary_md1,
attn_img1, attn_html1])
# ---------------- Tab 2 ----------------
with gr.Tab("📹 Webcam / Video + Text"):
gr.Markdown(
"Record a short clip from your webcam (3–10 s recommended) **or** "
"upload a short video. The system samples frames and builds an "
"emotion timeline."
)
with gr.Row():
with gr.Column(scale=1):
vid_in = gr.Video(
label="Webcam / video",
sources=["webcam", "upload"],
height=300,
)
txt_in2 = gr.Textbox(
label="What the person said",
placeholder="Type the spoken sentence here…",
lines=2,
)
btn2 = gr.Button("🔍 Analyse video", variant="primary")
with gr.Column(scale=2):
timeline_plot = gr.Plot(label="Emotion timeline")
fusion_md2 = gr.Markdown()
summary_md2 = gr.Markdown()
with gr.Row():
vbar2 = gr.Plot(label="Avg visual emotion")
tbar2 = gr.Plot(label="Text sentiment")
btn2.click(analyse_video_text,
inputs=[vid_in, txt_in2],
outputs=[timeline_plot, vbar2, tbar2, fusion_md2, summary_md2])
# ---------------- Tab 3 : Audio + Image ----------------
with gr.Tab("🎙️ Audio + Image"):
gr.Markdown(
"Speak (or upload audio) **and** provide a face image. Whisper "
"transcribes the audio; the words become the *text channel* fed "
"into the multimodal fusion."
)
with gr.Row():
with gr.Column(scale=1):
audio_in3 = gr.Audio(
label="🎙️ Audio (microphone or upload)",
sources=["microphone", "upload"],
type="filepath",
)
img_in3 = gr.Image(type="pil", label="Face photo", height=300)
btn3 = gr.Button("🔍 Transcribe & analyse", variant="primary")
with gr.Column(scale=2):
transcript3 = gr.Textbox(
label="Auto-transcript (Whisper)",
interactive=False, lines=2,
)
fusion_md3 = gr.Markdown()
summary_md3 = gr.Markdown()
with gr.Row():
vbar3 = gr.Plot(label="Visual emotion")
tbar3 = gr.Plot(label="Audio→text sentiment")
btn3.click(analyse_audio_image,
inputs=[audio_in3, img_in3],
outputs=[transcript3, vbar3, tbar3, fusion_md3, summary_md3])
# ---------------- Tab 4 : Video WITH audio ----------------
with gr.Tab("🎬 Video with Audio"):
gr.Markdown(
"Record or upload a short video **with sound**. The system extracts "
"the audio track, transcribes it (Whisper), samples frames for an "
"emotion timeline, then fuses the visual signal with the spoken-word "
"sentiment — no manual typing needed."
)
with gr.Row():
with gr.Column(scale=1):
vid_in4 = gr.Video(
label="Webcam / video (with audio)",
sources=["webcam", "upload"],
height=300,
)
btn4 = gr.Button("🔍 Transcribe & analyse video", variant="primary")
with gr.Column(scale=2):
transcript4 = gr.Textbox(
label="Auto-transcript (Whisper)",
interactive=False, lines=2,
)
timeline_plot4 = gr.Plot(label="Emotion timeline")
fusion_md4 = gr.Markdown()
summary_md4 = gr.Markdown()
with gr.Row():
vbar4 = gr.Plot(label="Avg visual emotion")
tbar4 = gr.Plot(label="Audio→text sentiment")
btn4.click(analyse_video_with_audio,
inputs=[vid_in4],
outputs=[transcript4, timeline_plot4, vbar4, tbar4,
fusion_md4, summary_md4])
        # ---------------- Tab 5 : About ----------------
with gr.Tab("ℹ️ About"):
gr.Markdown(f"""
### Architecture
| Stage | Model | Type |
|---|---|---|
| Visual emotion | `{VISION_MODEL}` | **Vision Transformer (ViT)** |
| Text sentiment | `{TEXT_MODEL}` | **Transformer (DistilRoBERTa)** |
| Speech-to-text | `{ASR_MODEL}` | **Encoder-Decoder Transformer (Whisper)** |
| Fusion | Valence-aligned multimodal fusion (custom) | rule + weighted |
| Generative summary | `{GEN_MODEL}` | **Encoder-Decoder Transformer (FLAN-T5)** |
| Attention viz | ViT attention rollout + last-layer text attention | interpretability |
### Fusion logic
1. Each modality produces a probability distribution over emotion labels.
2. Labels are mapped to a *valence* score in `[-1, +1]`.
3. We compute weighted valence per modality, then a delta.
4. Opposite signs → **MISMATCH** (amber). Small delta → **ALIGNED** (green).
5. Generative model receives the structured signals and writes plain-language output.
### Privacy
All processing happens on the machine hosting the app (your computer or the Space's hardware);
no data is sent to third-party services after the initial model download from the Hugging Face Hub.
""")
if __name__ == "__main__":
# Warm up small models so first request is snappy
try:
get_text_pipe()
except Exception as e:
print("[MoodSyncAI] Warmup text failed:", e)
    _on_spaces = bool(os.environ.get("SPACE_ID"))
demo.queue().launch(
server_name="0.0.0.0" if _on_spaces else "127.0.0.1",
server_port=7860,
inbrowser=not _on_spaces,
show_error=True,
show_api=False,
ssr_mode=False,
)