pavankumarvk committed on
Commit
09dc27f
·
verified ·
1 Parent(s): 28c4d49

Update pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +78 -98
pipeline.py CHANGED
@@ -8,7 +8,7 @@ import subprocess
8
  import tempfile
9
  import numpy as np
10
  import tensorflow as tf
11
- from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
12
 
13
  try:
14
  import noisereduce as nr
@@ -35,33 +35,29 @@ efficientnet_model = tf.keras.layers.TFSMLayer(
35
  )
36
 
37
  # ─────────────────────────────────────────────────────────────────────────────
38
- # Audio Ensemble: 3 models vote — majority wins (for uploaded files only)
 
39
  # ─────────────────────────────────────────────────────────────────────────────
40
- AUDIO_MODELS = [
41
- "MelodyMachine/Deepfake-audio-detection-V2",
42
- "MelodyMachine/Deepfake-audio-detection",
43
- "Gustking/wav2vec2-large-xlsr-deepfake-audio-classification",
44
- ]
45
- AUDIO_SAMPLE_RATE = 16000
46
-
47
- # ─── Model Thresholds ────────────────────────────────────────────────────────
48
- REAL_THRESHOLD = 0.55
49
- FAKE_THRESHOLD = 0.70
50
-
51
- print("Loading audio ensemble models ...")
52
- ensemble = []
53
- for model_id in AUDIO_MODELS:
54
- print(f" Loading {model_id} ...")
55
- try:
56
- fe = AutoFeatureExtractor.from_pretrained(model_id)
57
- m = AutoModelForAudioClassification.from_pretrained(model_id)
58
- m.eval()
59
- ensemble.append({"id": model_id, "extractor": fe, "model": m})
60
- print(f" βœ… Loaded: {model_id} | labels: {m.config.id2label}")
61
- except Exception as e:
62
- print(f" ⚠️ Skipped {model_id}: {e}")
63
-
64
- print(f"Ensemble ready with {len(ensemble)} models.")
65
 
66
 
67
  # ─────────────────────────────────────────────────────────────────────────────
@@ -219,7 +215,7 @@ class DetectionPipeline:
219
  return faces
220
 
221
  elif self.input_modality == 'image':
222
- image = filename # Gradio already delivers RGB; no conversion needed
223
  return cv2.resize(image, (224, 224))
224
 
225
  else:
@@ -304,85 +300,70 @@ def fake_processing_steps(x: np.ndarray, sr: int):
304
  print("[Audio] Final decision: real")
305
 
306
 
307
- def get_real_fake_probs(probs, id2label: dict):
308
- real_prob, fake_prob = None, None
309
- for idx, prob in enumerate(probs):
310
- label = id2label[idx].lower().strip()
311
- if label in ("real", "label_1", "genuine", "bonafide", "1"):
312
- real_prob = float(prob)
313
- elif label in ("fake", "label_0", "spoof", "synthetic", "0"):
314
- fake_prob = float(prob)
315
- if real_prob is None or fake_prob is None:
316
- print("[Audio] Warning: unknown labels β€” falling back to probs[0]=fake, probs[1]=real")
317
- fake_prob = float(probs[0])
318
- real_prob = float(probs[1])
319
- return real_prob, fake_prob
320
-
321
-
322
- def single_model_vote(x, entry):
323
- model_id = entry["id"]
324
- fe = entry["extractor"]
325
- m = entry["model"]
326
-
327
- inputs = fe(x, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt", padding=True)
328
- with torch.no_grad():
329
- logits = m(**inputs).logits
330
-
331
- probs = torch.softmax(logits, dim=-1)[0]
332
- real_prob, fake_prob = get_real_fake_probs(probs, m.config.id2label)
333
- print(f"[Audio] {model_id} β†’ real={real_prob:.4f} fake={fake_prob:.4f}")
334
-
335
- if real_prob >= REAL_THRESHOLD:
336
- vote = "real"
337
- elif fake_prob >= FAKE_THRESHOLD:
338
- vote = "fake"
339
- else:
340
- vote = "ai_synth"
341
 
342
- print(f"[Audio] {model_id} β†’ vote: {vote}")
343
- return vote, real_prob, fake_prob
344
 
 
345
 
346
- def run_ensemble(x: np.ndarray) -> str:
347
- votes = {"real": 0, "ai_synth": 0, "fake": 0}
348
- for entry in ensemble:
349
- try:
350
- vote, real_prob, fake_prob = single_model_vote(x, entry)
351
- votes[vote] += 1
352
- except Exception as e:
353
- print(f"[Audio] Model {entry['id']} failed: {e}")
354
 
355
- print(f"[Audio] Vote tally: {votes}")
 
 
 
 
 
 
 
 
356
 
357
- max_votes = max(votes.values())
358
- winners = [label for label, count in votes.items() if count == max_votes]
359
- if "real" in winners:
360
- ensemble_result = "real"
361
- elif "ai_synth" in winners:
362
- ensemble_result = "ai_synth"
363
- else:
364
- ensemble_result = "fake"
365
 
366
- print(f"[Audio] Ensemble decision: {ensemble_result}")
 
 
 
367
 
 
 
 
 
368
  acoustic = analyze_acoustic_features(x, AUDIO_SAMPLE_RATE)
369
 
370
- if ensemble_result == "fake":
371
  final = "fake"
372
- elif ensemble_result == "real" and acoustic["is_ai_synthesized"]:
373
- print(f"[Audio] Acoustic override: ensemble=real but ai_synth_score={acoustic['ai_synth_score']:.4f} > {AI_SYNTH_THRESHOLD} β†’ AI Synthesized")
 
 
 
 
374
  final = "ai_synth"
375
  else:
376
- final = ensemble_result
377
 
378
  print(f"[Audio] Final decision: {final}")
379
 
380
  if final == "real":
381
- return "βœ… Real Human Voice"
 
 
 
 
382
  elif final == "ai_synth":
383
- return "πŸ€– AI Synthesized / Voice Cloned"
 
 
 
 
 
384
  else:
385
- return "🚨 Fake / Manipulated Audio"
 
 
 
 
386
 
387
 
388
  def deepfakes_audio_predict(input_audio):
@@ -407,12 +388,13 @@ def deepfakes_audio_predict(input_audio):
407
  x = librosa.resample(x, orig_sr=sr, target_sr=AUDIO_SAMPLE_RATE)
408
  print(f"[Audio] After resample: {len(x)} samples ({len(x) / AUDIO_SAMPLE_RATE:.2f}s)")
409
 
410
- # Cap at 30 seconds to prevent OOM on long audio uploads
411
- MAX_SAMPLES = AUDIO_SAMPLE_RATE * 30
412
- if len(x) > MAX_SAMPLES:
413
- print(f"[Audio] Trimming audio to 30s ({len(x)} β†’ {MAX_SAMPLES} samples)")
414
- x = x[:MAX_SAMPLES]
415
- return run_ensemble(x)
 
416
 
417
 
418
  # ─────────────────────────────────────────────────────────────────────────────
@@ -483,9 +465,7 @@ def deepfakes_text_predict(input_text: str) -> str:
483
  f"P(Human-Written) : {human_prob*100:.1f}%\n"
484
  f"\n"
485
  f"Words analysed : {word_count}\n"
486
- f"(First 128 tokens used β€” ~100 words)\n"
487
- f"\n"
488
- f"{'Model: HybridAI TextDetector (your checkpoint)' if result.get('source') == 'custom_model' else 'Model: Pretrained fallback (chatgpt-detector-roberta)'}"
489
  )
490
  return output
491
 
 
8
  import tempfile
9
  import numpy as np
10
  import tensorflow as tf
11
+ # AutoFeatureExtractor / AutoModelForAudioClassification removed — using AASISTDeepFake instead
12
 
13
  try:
14
  import noisereduce as nr
 
35
  )
36
 
37
  # ─────────────────────────────────────────────────────────────────────────────
38
+ # Audio: AASISTDeepFake (our trained model)
39
+ # Replaces the 3-model HuggingFace ensemble.
40
  # ─────────────────────────────────────────────────────────────────────────────
41
+ AUDIO_SAMPLE_RATE = 16000
42
+ AUDIO_CHECKPOINT = "best_aasist.pt"
43
+ # Update this to the optimal F1 threshold printed at the end of your training run
44
+ # (Cell 14 output: "Optimal threshold: X.XXXX")
45
+ AUDIO_THRESHOLD = 0.5
46
+
47
+ _audio_detector = None # lazy-loaded on first audio call
48
+
49
+ def _get_audio_detector():
50
+ """Lazy-load AASISTDeepFake — avoids startup delay if tab isn't used."""
51
+ global _audio_detector
52
+ if _audio_detector is None:
53
+ from audio_detector_inference import AudioDetectorInference
54
+ print("[Audio] Loading AASISTDeepFake ...")
55
+ _audio_detector = AudioDetectorInference(
56
+ checkpoint=AUDIO_CHECKPOINT,
57
+ threshold=AUDIO_THRESHOLD,
58
+ )
59
+ print("[Audio] βœ… AASISTDeepFake ready")
60
+ return _audio_detector
 
 
 
 
 
61
 
62
 
63
  # ─────────────────────────────────────────────────────────────────────────────
 
215
  return faces
216
 
217
  elif self.input_modality == 'image':
218
+ image = filename # Gradio already delivers RGB — no conversion needed
219
  return cv2.resize(image, (224, 224))
220
 
221
  else:
 
300
  print("[Audio] Final decision: real")
301
 
302
 
303
+ # get_real_fake_probs() removed — was only used by the HF ensemble
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
 
 
 
305
 
306
+ # single_model_vote() removed — was only used by the HF ensemble
307
 
 
 
 
 
 
 
 
 
308
 
309
+ def run_aasist(x: np.ndarray) -> str:
310
+ """
311
+ Run AASISTDeepFake on a preprocessed (16 kHz, float32, mono) waveform.
312
+ Acoustic feature override is applied on top: if the model says Real but
313
+ acoustic analysis detects TTS-like smoothness, the result is upgraded to
314
+ AI Synthesized.
315
+ """
316
+ detector = _get_audio_detector()
317
+ result = detector.predict(x, AUDIO_SAMPLE_RATE)
318
 
319
+ if "error" in result:
320
+ print(f"[Audio] ❌ AASIST error: {result['error']}")
321
+ return f"❌ Audio detection failed: {result['error']}"
 
 
 
 
 
322
 
323
+ aasist_label = result["label"] # "Real" or "Fake"
324
+ real_prob = result["real_prob"]
325
+ fake_prob = result["fake_prob"]
326
+ confidence = result["confidence"]
327
 
328
+ print(f"[Audio] AASIST β†’ {aasist_label} "
329
+ f"(real={real_prob:.4f} fake={fake_prob:.4f})")
330
+
331
+ # ── Acoustic override (catches TTS content AASIST may miss) ──────────────
332
  acoustic = analyze_acoustic_features(x, AUDIO_SAMPLE_RATE)
333
 
334
+ if aasist_label == "Fake":
335
  final = "fake"
336
+ elif aasist_label == "Real" and acoustic["is_ai_synthesized"]:
337
+ print(
338
+ f"[Audio] Acoustic override: AASIST=Real but "
339
+ f"ai_synth_score={acoustic['ai_synth_score']:.4f} > {AI_SYNTH_THRESHOLD}"
340
+ f" β†’ AI Synthesized"
341
+ )
342
  final = "ai_synth"
343
  else:
344
+ final = "real"
345
 
346
  print(f"[Audio] Final decision: {final}")
347
 
348
  if final == "real":
349
+ conf_pct = f"{real_prob*100:.1f}"
350
+ return (
351
+ f"βœ… Real Human Voice\n\n"
352
+ f"Confidence {conf_pct}% (P(real)={real_prob:.4f})"
353
+ )
354
  elif final == "ai_synth":
355
+ return (
356
+ f"πŸ€– AI Synthesized / Voice Cloned\n\n"
357
+ f"Model said Real ({real_prob*100:.1f}%) but acoustic features\n"
358
+ f"detected unnaturally smooth synthesis patterns.\n"
359
+ f"AI synthesis score: {acoustic['ai_synth_score']:.4f}"
360
+ )
361
  else:
362
+ conf_pct = f"{fake_prob*100:.1f}"
363
+ return (
364
+ f"🚨 Fake / Manipulated Audio\n\n"
365
+ f"Confidence {conf_pct}% (P(fake)={fake_prob:.4f})"
366
+ )
367
 
368
 
369
  def deepfakes_audio_predict(input_audio):
 
388
  x = librosa.resample(x, orig_sr=sr, target_sr=AUDIO_SAMPLE_RATE)
389
  print(f"[Audio] After resample: {len(x)} samples ({len(x) / AUDIO_SAMPLE_RATE:.2f}s)")
390
 
391
+ # Cap at 30 seconds to prevent OOM on very long uploads
392
+ MAX_AUDIO = AUDIO_SAMPLE_RATE * 30
393
+ if len(x) > MAX_AUDIO:
394
+ print(f"[Audio] Trimming to 30s ({len(x)} β†’ {MAX_AUDIO} samples)")
395
+ x = x[:MAX_AUDIO]
396
+
397
+ return run_aasist(x)
398
 
399
 
400
  # ─────────────────────────────────────────────────────────────────────────────
 
465
  f"P(Human-Written) : {human_prob*100:.1f}%\n"
466
  f"\n"
467
  f"Words analysed : {word_count}\n"
468
+ f"(First 128 tokens used β€” ~100 words)"
 
 
469
  )
470
  return output
471