Update pipeline.py
Browse files

pipeline.py  +78 -98  CHANGED
@@ -8,7 +8,7 @@ import subprocess
 import tempfile
 import numpy as np
 import tensorflow as tf
-from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
+# AutoFeatureExtractor / AutoModelForAudioClassification removed → using AASISTDeepFake instead

 try:
     import noisereduce as nr
@@ -35,33 +35,29 @@ efficientnet_model = tf.keras.layers.TFSMLayer(
 )

 # ─────────────────────────────────────────────────────────────────────────────
-# Audio
+# Audio: AASISTDeepFake (our trained model)
+# Replaces the 3-model HuggingFace ensemble.
 # ─────────────────────────────────────────────────────────────────────────────
-        print(f"  ✅ Loaded: {model_id} | labels: {m.config.id2label}")
-    except Exception as e:
-        print(f"  ⚠️ Skipped {model_id}: {e}")
-
-print(f"Ensemble ready with {len(ensemble)} models.")
+AUDIO_SAMPLE_RATE = 16000
+AUDIO_CHECKPOINT = "best_aasist.pt"
+# Update this to the optimal F1 threshold printed at the end of your training run
+# (Cell 14 output: "Optimal threshold: X.XXXX")
+AUDIO_THRESHOLD = 0.5
+
+_audio_detector = None  # lazy-loaded on first audio call
+
+def _get_audio_detector():
+    """Lazy-load AASISTDeepFake → avoids startup delay if the audio tab isn't used."""
+    global _audio_detector
+    if _audio_detector is None:
+        from audio_detector_inference import AudioDetectorInference
+        print("[Audio] Loading AASISTDeepFake ...")
+        _audio_detector = AudioDetectorInference(
+            checkpoint=AUDIO_CHECKPOINT,
+            threshold=AUDIO_THRESHOLD,
+        )
+        print("[Audio] ✅ AASISTDeepFake ready")
+    return _audio_detector


 # ─────────────────────────────────────────────────────────────────────────────
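A note on the AUDIO_THRESHOLD placeholder above: the training notebook prints an optimal F1 threshold (the "Cell 14" output the comment refers to), and a sweep like the following is the usual way such a value is found. This is a minimal sketch; val_probs and val_labels are hypothetical stand-ins, not names from the actual notebook.

```python
import numpy as np
from sklearn.metrics import f1_score

# Hypothetical validation outputs; replace with the real arrays from training.
val_probs = np.array([0.12, 0.81, 0.43, 0.94, 0.35, 0.67])  # predicted P(fake)
val_labels = np.array([0, 1, 0, 1, 1, 1])                   # 1 = fake

# Sweep candidate thresholds and keep the one that maximizes F1.
thresholds = np.linspace(0.05, 0.95, 91)
scores = [f1_score(val_labels, (val_probs >= t).astype(int)) for t in thresholds]
best = thresholds[int(np.argmax(scores))]
print(f"Optimal threshold: {best:.4f}")  # the value to paste into AUDIO_THRESHOLD
```

Whatever the notebook actually computes, the point stands that 0.5 is only a default until that printed value is filled in.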
@@ -219,7 +215,7 @@ class DetectionPipeline:
             return faces

         elif self.input_modality == 'image':
-            image = filename  # Gradio already delivers RGB
+            image = filename  # Gradio already delivers RGB → no conversion needed
             return cv2.resize(image, (224, 224))

         else:
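Context for the "Gradio already delivers RGB" comment: OpenCV's own loader returns BGR, so skipping a color conversion is only safe because the array comes from Gradio rather than cv2.imread. A small sketch of the distinction, using a hypothetical photo.jpg:

```python
import cv2

# Loaded from disk with OpenCV: channels arrive in BGR order and need swapping.
bgr = cv2.imread("photo.jpg")
rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

# Passed in by a Gradio image component (type="numpy"): already RGB,
# so the pipeline can go straight to resizing, as the diff does.
resized = cv2.resize(rgb, (224, 224))
```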
@@ -304,85 +300,70 @@ def fake_processing_steps(x: np.ndarray, sr: int):
         print("[Audio] Final decision: real")


-def get_real_fake_probs(probs, id2label):
-    real_prob, fake_prob = None, None
-    for idx, prob in enumerate(probs):
-        label = id2label[idx].lower().strip()
-        if label in ("real", "label_1", "genuine", "bonafide", "1"):
-            real_prob = float(prob)
-        elif label in ("fake", "label_0", "spoof", "synthetic", "0"):
-            fake_prob = float(prob)
-    if real_prob is None or fake_prob is None:
-        print("[Audio] Warning: unknown labels → falling back to probs[0]=fake, probs[1]=real")
-        fake_prob = float(probs[0])
-        real_prob = float(probs[1])
-    return real_prob, fake_prob
+# get_real_fake_probs() removed → was only used by the HF ensemble
-
-
-def single_model_vote(x, entry):
-    model_id = entry["id"]
-    fe = entry["extractor"]
-    m = entry["model"]
-
-    inputs = fe(x, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt", padding=True)
-    with torch.no_grad():
-        logits = m(**inputs).logits
-
-    probs = torch.softmax(logits, dim=-1)[0]
-    real_prob, fake_prob = get_real_fake_probs(probs, m.config.id2label)
-    print(f"[Audio] {model_id} → real={real_prob:.4f} fake={fake_prob:.4f}")
-
-    if real_prob >= REAL_THRESHOLD:
-        vote = "real"
-    elif fake_prob >= FAKE_THRESHOLD:
-        vote = "fake"
-    else:
-        vote = "ai_synth"
-
-    print(f"[Audio] {model_id} → vote: {vote}")
-    return vote, real_prob, fake_prob
+# single_model_vote() removed → was only used by the HF ensemble


-def run_ensemble(x: np.ndarray) -> str:
-    votes = {"real": 0, "ai_synth": 0, "fake": 0}
-    for entry in ensemble:
-        try:
-            vote, real_prob, fake_prob = single_model_vote(x, entry)
-            votes[vote] += 1
-        except Exception as e:
-            print(f"[Audio] Model {entry['id']} failed: {e}")
-
-    if "real" in winners:
-        ensemble_result = "real"
-    elif "ai_synth" in winners:
-        ensemble_result = "ai_synth"
-    else:
-        ensemble_result = "fake"
+def run_aasist(x: np.ndarray) -> str:
+    """
+    Run AASISTDeepFake on a preprocessed (16 kHz, float32, mono) waveform.
+    Acoustic feature override is applied on top: if the model says Real but
+    acoustic analysis detects TTS-like smoothness, the result is upgraded to
+    AI Synthesized.
+    """
+    detector = _get_audio_detector()
+    result = detector.predict(x, AUDIO_SAMPLE_RATE)
+
+    if "error" in result:
+        print(f"[Audio] ❌ AASIST error: {result['error']}")
+        return f"❌ Audio detection failed: {result['error']}"
+
+    aasist_label = result["label"]  # "Real" or "Fake"
+    real_prob = result["real_prob"]
+    fake_prob = result["fake_prob"]
+    confidence = result["confidence"]
+
+    print(f"[Audio] AASIST → {aasist_label} "
+          f"(real={real_prob:.4f} fake={fake_prob:.4f})")
+
+    # ── Acoustic override (catches TTS content AASIST may miss) ──────────────
     acoustic = analyze_acoustic_features(x, AUDIO_SAMPLE_RATE)

-    if
+    if aasist_label == "Fake":
         final = "fake"
-    elif
-        print(
+    elif aasist_label == "Real" and acoustic["is_ai_synthesized"]:
+        print(
+            f"[Audio] Acoustic override: AASIST=Real but "
+            f"ai_synth_score={acoustic['ai_synth_score']:.4f} > {AI_SYNTH_THRESHOLD}"
+            f" → AI Synthesized"
+        )
         final = "ai_synth"
     else:
-        final =
+        final = "real"

     print(f"[Audio] Final decision: {final}")

     if final == "real":
+        conf_pct = f"{real_prob*100:.1f}"
+        return (
+            f"✅ Real Human Voice\n\n"
+            f"Confidence {conf_pct}% (P(real)={real_prob:.4f})"
+        )
     elif final == "ai_synth":
-        return
+        return (
+            f"🤖 AI Synthesized / Voice Cloned\n\n"
+            f"Model said Real ({real_prob*100:.1f}%) but acoustic features\n"
+            f"detected unnaturally smooth synthesis patterns.\n"
+            f"AI synthesis score: {acoustic['ai_synth_score']:.4f}"
+        )
     else:
+        conf_pct = f"{fake_prob*100:.1f}"
+        return (
+            f"🚨 Fake / Manipulated Audio\n\n"
+            f"Confidence {conf_pct}% (P(fake)={fake_prob:.4f})"
+        )


 def deepfakes_audio_predict(input_audio):
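run_aasist() leans on a small contract from the local audio_detector_inference module, which is not part of this diff: predict() returns a dict carrying either an "error" key or the four fields read above. The stub below sketches that assumed shape; the internals are illustrative stand-ins, not the real checkpoint code.

```python
# Illustrative stub of audio_detector_inference.AudioDetectorInference,
# matching only what run_aasist() reads from the result dict.
import numpy as np

class AudioDetectorInference:
    def __init__(self, checkpoint: str, threshold: float = 0.5):
        self.checkpoint = checkpoint  # e.g. "best_aasist.pt"
        self.threshold = threshold    # decision boundary on P(fake)

    def predict(self, x: np.ndarray, sample_rate: int) -> dict:
        try:
            fake_prob = self._score(x, sample_rate)  # P(fake) in [0, 1]
        except Exception as e:
            return {"error": str(e)}
        real_prob = 1.0 - fake_prob
        return {
            "label": "Fake" if fake_prob >= self.threshold else "Real",
            "real_prob": real_prob,
            "fake_prob": fake_prob,
            "confidence": max(real_prob, fake_prob),  # assumed definition
        }

    def _score(self, x: np.ndarray, sample_rate: int) -> float:
        # The real class would run the AASIST checkpoint here.
        raise NotImplementedError
```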
@@ -407,12 +388,13 @@ def deepfakes_audio_predict(input_audio):
         x = librosa.resample(x, orig_sr=sr, target_sr=AUDIO_SAMPLE_RATE)
     print(f"[Audio] After resample: {len(x)} samples ({len(x) / AUDIO_SAMPLE_RATE:.2f}s)")

-    # Cap at 30 seconds to prevent OOM on long
-    if len(x) >
-        print(f"[Audio] Trimming
-        x = x[:
-    return run_ensemble(x)
+    # Cap at 30 seconds to prevent OOM on very long uploads
+    MAX_AUDIO = AUDIO_SAMPLE_RATE * 30
+    if len(x) > MAX_AUDIO:
+        print(f"[Audio] Trimming to 30s ({len(x)} → {MAX_AUDIO} samples)")
+        x = x[:MAX_AUDIO]
+
+    return run_aasist(x)


 # ─────────────────────────────────────────────────────────────────────────────
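Taken together, the audio hunks suggest a quick smoke test of the new path. The following is a sketch that assumes pipeline.py is importable and a local sample.wav exists; both are hypothetical.

```python
import librosa
import numpy as np
from pipeline import run_aasist  # defined in the hunk above

# Mirror the preprocessing in deepfakes_audio_predict: mono, 16 kHz, 30 s cap.
x, sr = librosa.load("sample.wav", sr=None, mono=True)
if sr != 16000:
    x = librosa.resample(x, orig_sr=sr, target_sr=16000)
x = x[: 16000 * 30]

print(run_aasist(x.astype(np.float32)))
```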
@@ -483,9 +465,7 @@ def deepfakes_text_predict(input_text: str) -> str:
         f"P(Human-Written) : {human_prob*100:.1f}%\n"
         f"\n"
         f"Words analysed : {word_count}\n"
-        f"(First 128 tokens used → ~100 words)
-        f"\n"
-        f"{'Model: HybridAI TextDetector (your checkpoint)' if result.get('source') == 'custom_model' else 'Model: Pretrained fallback (chatgpt-detector-roberta)'}"
+        f"(First 128 tokens used → ~100 words)"
     )
     return output