Spaces:
Running
Running
Apurv committed on
Commit ·
b8630cb
0
Parent(s):
Deploying AegisAI Hackathon Backend
Browse files- .gitattributes +1 -0
- agents/__init__.py +5 -0
- agents/agent1_external.py +168 -0
- agents/agent2_content.py +281 -0
- agents/agent3_synthesizer.py +236 -0
- agents/agent4_prompt.py +117 -0
- app.py +120 -0
- models/model_new.pkl +3 -0
- models/phishing_new.pkl +3 -0
- models/vectorizer_new.pkl +3 -0
- models/vectorizerurl_new.pkl +3 -0
- requirements.txt +8 -0
- utils/__init__.py +3 -0
- utils/preprocessor.py +40 -0
.gitattributes
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
agents/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Package initializer for the AegisAI agent pipeline: re-exports the three
# analysis agents so callers can simply do ``from agents import ...``.
from .agent1_external import ExternalAnalysisAgent
from .agent2_content import ContentAnalysisAgent
from .agent3_synthesizer import SynthesizerAgent

# Explicit public API of the ``agents`` package.
__all__ = ['ExternalAnalysisAgent', 'ContentAnalysisAgent', 'SynthesizerAgent']
|
agents/agent1_external.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from sentence_transformers import SentenceTransformer
|
| 3 |
+
import numpy as np
|
| 4 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 5 |
+
from urllib.parse import urlparse
|
| 6 |
+
from difflib import SequenceMatcher
|
| 7 |
+
import os
|
| 8 |
+
import pickle
|
| 9 |
+
|
| 10 |
+
class ExternalAnalysisAgent:
    """Agent 1: scores external signals of a message — URLs, domains, and
    similarity to known phishing phrasing.

    Combines rule-based URL heuristics, an optional pickled scikit-learn
    URL classifier, fuzzy domain comparison against well-known brands, and
    sentence-embedding similarity against canonical phishing patterns.
    """

    def __init__(self):
        print("Loading External Analysis Agent...")
        # Embedding model used for semantic similarity to phishing phrasing.
        self.model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

        # Load pickled URL models; they are optional — heuristics still work
        # without them, so failures are downgraded to a flag.
        model_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'models')
        try:
            with open(os.path.join(model_dir, 'phishing_new.pkl'), 'rb') as f:
                self.url_ml_model = pickle.load(f)
            with open(os.path.join(model_dir, 'vectorizerurl_new.pkl'), 'rb') as f:
                self.url_vectorizer = pickle.load(f)
            self.has_url_ml = True
            print("Successfully loaded URL ML models.")
        except Exception as e:
            # Missing or version-incompatible pickles are non-fatal.
            print(f"Failed to load URL ML models: {e}")
            self.has_url_ml = False

        # Canonical phishing phrasings used for embedding similarity.
        self.phishing_patterns = [
            "verify your account immediately",
            "suspicious activity detected",
            "click here to confirm",
            "your account will be suspended",
            "update your payment information",
            "unusual sign-in attempt",
            "secure your account now",
            "limited time offer",
            "you have won a prize",
            "inheritance money transfer"
        ]

        # TLDs disproportionately used by throwaway phishing domains.
        self.suspicious_tlds = ['.xyz', '.top', '.club', '.online', '.site', '.win', '.bid']

        # Frequently impersonated brands, used for look-alike detection.
        self.legitimate_domains = ['google.com', 'microsoft.com', 'amazon.com', 'paypal.com', 'apple.com']

        # Pre-compute pattern embeddings once so analyze() only embeds the message.
        self.pattern_embeddings = self.model.encode(self.phishing_patterns)
        print("External Analysis Agent loaded successfully!")

    def analyze_url_risk(self, url):
        """Score a single URL for phishing risk.

        Returns a tuple ``(risk_score, reasons, url_ml_prob)``:
        ``risk_score`` clamped to [0, 1], ``reasons`` a list of
        human-readable findings, and ``url_ml_prob`` the raw ML phishing
        probability (0.0 when the ML model is unavailable or errors).
        """
        risk_score = 0.0
        reasons = []
        # Hoisted: every heuristic below is case-insensitive.
        url_lower = url.lower()

        # Suspicious TLD anywhere in the URL (the substring test subsumes the
        # previous redundant endswith() check and also catches "evil.xyz/path").
        for tld in self.suspicious_tlds:
            if tld in url_lower:
                risk_score += 0.3
                reasons.append(f"Suspicious TLD: {tld}")
                break

        # Raw IPv4 address — legitimate services virtually never link this way.
        if re.search(r'\d+\.\d+\.\d+\.\d+', url):
            risk_score += 0.4
            reasons.append("IP address used instead of domain name")

        # Many dots usually means nested subdomains used to disguise the host.
        if url.count('.') > 3:
            risk_score += 0.2
            reasons.append("Excessive subdomains")

        # URL shorteners hide the true destination.
        shortening_services = ['bit.ly', 'tinyurl', 'goo.gl', 'ow.ly', 'tiny.cc']
        for service in shortening_services:
            if service in url_lower:
                risk_score += 0.3
                reasons.append(f"URL shortening service detected: {service}")
                break

        # Authentication-flavoured keywords often mark credential-harvest pages.
        suspicious_keywords = ['login', 'signin', 'verify', 'account', 'secure', 'update', 'confirm']
        for keyword in suspicious_keywords:
            if keyword in url_lower:
                risk_score += 0.1
                reasons.append(f"Suspicious keyword in URL: '{keyword}'")
                break

        # Look-alike (typosquatting) check against known brands.
        domain_similarity = self.check_domain_similarity(url)
        if domain_similarity > 0.7:
            risk_score += 0.3
            reasons.append("Domain similar to legitimate brand")

        url_ml_prob = 0.0
        if self.has_url_ml:
            try:
                features = self.url_vectorizer.transform([url])
                # phishing_new.pkl exposes predict_proba; column 1 is the
                # phishing class probability.
                url_ml_prob = self.url_ml_model.predict_proba(features)[0][1]

                # Hybrid logic: a confident ML verdict overrides weaker heuristics.
                if url_ml_prob > 0.8:
                    risk_score = max(risk_score, 0.9)
                    reasons.append(f"ML model identified highly malicious URL structure (Score: {url_ml_prob:.1%})")
                elif url_ml_prob > 0.5:
                    risk_score = max(risk_score, 0.6)
                    reasons.append(f"ML model flagged suspicious URL structure (Score: {url_ml_prob:.1%})")
            except Exception as e:
                # Prediction failure keeps the heuristic score; ML prob stays 0.
                print(f"Error predicting URL with ML model: {e}")

        return min(risk_score, 1.0), reasons, url_ml_prob

    def check_domain_similarity(self, url):
        """Return the highest fuzzy-match ratio (0..1) between this URL's
        domain and any known legitimate brand domain."""
        domain = self.extract_domain(url)
        max_similarity = 0.0

        for legit_domain in self.legitimate_domains:
            similarity = SequenceMatcher(None, domain.lower(), legit_domain).ratio()
            max_similarity = max(max_similarity, similarity)

        return max_similarity

    def extract_domain(self, url):
        """Extract the host part of *url*; falls back to the first path
        segment for scheme-less strings like ``example.com/page``."""
        parsed = urlparse(url)
        domain = parsed.netloc or parsed.path.split('/')[0]
        return domain

    def analyze(self, input_data):
        """Run the full external analysis.

        ``input_data`` must provide ``cleaned_text`` (str) and ``urls``
        (list[str]).  Returns a dict with per-signal scores, collected
        ``risk_factors`` strings, and a blended ``overall_risk`` in [0, 1].
        """
        text = input_data['cleaned_text']
        urls = input_data['urls']

        results = {
            'url_risk': 0.0,
            'url_ml_risk': 0.0,
            'domain_similarity': 0.0,
            'suspicious_patterns': [],
            'risk_factors': [],
            'overall_risk': 0.0
        }

        if urls:
            url_risks = []
            url_ml_risks = []
            for url in urls:
                risk, reasons, ml_prob = self.analyze_url_risk(url)
                url_risks.append(risk)
                url_ml_risks.append(ml_prob)
                results['risk_factors'].extend(reasons)

            results['url_risk'] = np.mean(url_risks) if url_risks else 0
            # Worst-case ML verdict across all URLs.
            results['url_ml_risk'] = max(url_ml_risks) if url_ml_risks else 0
            # NOTE(review): only the first URL is checked for brand look-alikes
            # — presumably a speed trade-off; confirm this is intended.
            results['domain_similarity'] = self.check_domain_similarity(urls[0])

        try:
            text_embedding = self.model.encode([text])
            similarities = cosine_similarity(text_embedding, self.pattern_embeddings)[0]

            if max(similarities) > 0.6:
                results['suspicious_patterns'].append("Text similar to known phishing patterns")
        except Exception as e:
            print(f"Error in semantic similarity: {e}")

        # Weighted blend; each detected text pattern contributes 0.1 via the
        # len() term.  (A previous in-place "+= 0.3" on overall_risk was dead
        # code — unconditionally overwritten by this assignment — and has
        # been removed without changing behavior.)
        results['overall_risk'] = min(
            results['url_risk'] * 0.6 +
            results['domain_similarity'] * 0.4 +
            len(results['suspicious_patterns']) * 0.1,
            1.0
        )

        return results
|
agents/agent2_content.py
ADDED
|
@@ -0,0 +1,281 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, pipeline
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
import os
|
| 5 |
+
import pickle
|
| 6 |
+
import re
|
| 7 |
+
|
| 8 |
+
class ContentAnalysisAgent:
    """Agent 2: analyses the message body itself.

    Blends keyword/urgency heuristics, a DeBERTa sequence classifier, an
    optional pickled scikit-learn spam model, MiniLM embeddings for
    text-vs-URL coherence, and optional Hugging Face pipelines for
    sentiment.  All model loading happens in ``__init__``; ``analyze`` is
    the single entry point.
    """
    def __init__(self):
        # Device detection: prefer Apple-Silicon MPS, otherwise CPU.
        # NOTE(review): CUDA is never considered here — confirm that is intended
        # for the deployment target.
        self.device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
        print(f"Using device: {self.device} for inference optimization.")

        # Primary phishing/benign classifier: 2-label head on DeBERTa-v3-small.
        # ignore_mismatched_sizes lets the fresh classification head load over
        # the pretrained checkpoint.
        self.model_name = "microsoft/deberta-v3-small"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=False)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=2,
            ignore_mismatched_sizes=True
        ).to(self.device)

        # Sentence embedder (all-MiniLM-L6-v2) loaded through AutoModel so we
        # can do our own mean pooling in get_minilm_embeddings().
        self.minilm_name = "sentence-transformers/all-MiniLM-L6-v2"
        self.minilm_tokenizer = AutoTokenizer.from_pretrained(self.minilm_name)
        self.minilm_model = AutoModel.from_pretrained(self.minilm_name).to(self.device)

        # Optimization: half-precision weights when running on MPS.
        if self.device.type == "mps":
            self.model = self.model.half()
            self.minilm_model = self.minilm_model.half()

        # Inference only — disable dropout / training-mode layers.
        self.model.eval()
        self.minilm_model.eval()

        # Optional auxiliary pipelines; failures only disable sentiment scoring.
        print("Loading Hugging Face pipelines...")
        try:
            self.mask_pipeline = pipeline("fill-mask", model="microsoft/deberta-v3-small")
            self.sentiment_pipeline = pipeline("text-classification", model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
            self.has_pipelines = True
            print("Successfully loaded HF pipelines.")
        except Exception as e:
            print(f"Failed to load HF pipelines: {e}")
            self.has_pipelines = False

        # Optional pickled scikit-learn spam model + vectorizer; failures are
        # non-fatal (heuristics take over).
        print("Loading local text ML models...")
        model_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'models')
        try:
            with open(os.path.join(model_dir, 'model_new.pkl'), 'rb') as f:
                self.scikit_model = pickle.load(f)
            with open(os.path.join(model_dir, 'vectorizer_new.pkl'), 'rb') as f:
                self.scikit_vectorizer = pickle.load(f)
            self.has_text_ml = True
            print("Successfully loaded text ML models.")
        except Exception as e:
            print(f"Failed to load text ML models: {e}")
            self.has_text_ml = False

        # Substrings counted by analyze_phishing(); matched case-insensitively.
        self.phishing_keywords = [
            'verify', 'account', 'bank', 'login', 'password', 'credit card',
            'ssn', 'social security', 'suspended', 'limited', 'unusual activity',
            'confirm identity', 'update information', 'click here', 'urgent'
        ]

        # Pressure-tactic phrases; each match raises the urgency sub-score.
        self.urgency_phrases = [
            'immediately', 'within 24 hours', 'as soon as possible',
            'urgent', 'action required', 'deadline', 'expire soon'
        ]

        # Classic jailbreak phrasings; any single match flags prompt injection.
        self.prompt_injection_patterns = [
            'ignore previous instructions',
            'ignore all previous',
            'disregard previous',
            'system prompt',
            'you are now',
            'act as',
            'new role:',
            'forget your instructions'
        ]

    def analyze_phishing(self, text):
        """Heuristic phishing score for *text*.

        Returns ``(phishing_score, keyword_matches, urgency_matches)`` where
        the score is a weighted blend of keyword density (0.4), urgency
        density (0.3), and a personal-information-request flag (0.3).
        """
        text_lower = text.lower()

        # Collect every phishing keyword present in the text.
        keyword_matches = []
        for keyword in self.phishing_keywords:
            if keyword in text_lower:
                keyword_matches.append(keyword)

        # Collect every urgency phrase present in the text.
        urgency_matches = []
        for phrase in self.urgency_phrases:
            if phrase in text_lower:
                urgency_matches.append(phrase)

        # Saturate: 5 keywords or 3 urgency phrases already count as maximal.
        keyword_score = min(len(keyword_matches) / 5, 1.0)
        urgency_score = min(len(urgency_matches) / 3, 1.0)

        # Requests for credentials / identity data are a strong signal on their own.
        has_personal_info_request = any([
            'password' in text_lower and 'send' in text_lower,
            'credit card' in text_lower,
            'ssn' in text_lower,
            'social security' in text_lower
        ])

        if has_personal_info_request:
            personal_info_score = 0.8
        else:
            personal_info_score = 0.0

        phishing_score = (keyword_score * 0.4 + urgency_score * 0.3 + personal_info_score * 0.3)

        return phishing_score, keyword_matches, urgency_matches

    def analyze_prompt_injection(self, text):
        """Check *text* for known prompt-injection phrasings.

        Returns ``(detected, patterns)`` — ``detected`` is True on the first
        matching pattern; ``patterns`` holds a single descriptive string.
        """
        text_lower = text.lower()

        for pattern in self.prompt_injection_patterns:
            if pattern in text_lower:
                return True, [f"Prompt injection pattern detected: '{pattern}'"]

        return False, []

    def analyze_ai_generated(self, text):
        """Heuristic detection of AI-assistant boilerplate in *text*.

        Returns ``(score, matches)``: 0.7 for two or more indicator phrases,
        0.4 for exactly one, otherwise 0.0.
        """
        # Phrases typical of LLM-generated replies.
        ai_indicators = [
            'as an ai', 'i am an ai', 'as a language model',
            'i cannot', 'i apologize', 'i am unable to',
            'unfortunately', 'i must inform you'
        ]

        text_lower = text.lower()
        matches = [ind for ind in ai_indicators if ind in text_lower]

        if len(matches) > 1:
            return 0.7, matches
        elif len(matches) > 0:
            return 0.4, matches
        else:
            return 0.0, []

    def analyze_with_transformer(self, text):
        """Run the DeBERTa classifier and return the phishing-class
        probability; falls back to a neutral 0.5 on any error."""
        try:
            inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(self.device)

            with torch.inference_mode():  # Faster than no_grad for pure inference
                outputs = self.model(**inputs)
                # Cast logits up to float32 before softmax for numerical stability
                # (weights may be half-precision on MPS).
                probabilities = F.softmax(outputs.logits.float(), dim=-1)

            # Index 1 is treated as the phishing class.
            phishing_prob = probabilities[0][1].item()
            return phishing_prob

        except Exception as e:
            print(f"Transformer error: {e}")
            return 0.5

    def get_minilm_embeddings(self, text):
        """Embed *text* with all-MiniLM-L6-v2 using attention-masked mean
        pooling; returns a (batch, hidden) tensor."""
        inputs = self.minilm_tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=512).to(self.device)
        with torch.inference_mode():
            model_output = self.minilm_model(**inputs)

        # Mean pooling over real (non-padding) tokens only.
        attention_mask = inputs['attention_mask']
        # Cast half-precision (float16) activations up to float32 for pooling stability.
        token_embeddings = model_output[0].float()
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # Clamp avoids division by zero for fully-masked (degenerate) inputs.
        embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return embeddings

    def analyze_connection(self, text, urls):
        """Measure semantic coherence between the message text and its URLs.

        Returns ``(avg_cosine_similarity, message)``; low similarity
        (divergence) suggests the link does not belong to the message and is
        treated as a phishing indicator downstream.
        """
        if not urls:
            return 1.0, "No URLs to analyze"

        text_emb = self.get_minilm_embeddings(text)

        connection_scores = []
        for url in urls:
            # Strip scheme/www and split on separators so the URL reads like
            # words for the embedder.
            url_parts = url.replace('http://', '').replace('https://', '').replace('www.', '')
            url_parts = re.sub(r'[/.\-_]', ' ', url_parts)
            url_emb = self.get_minilm_embeddings(url_parts)

            similarity = F.cosine_similarity(text_emb, url_emb).item()
            connection_scores.append(similarity)

        avg_connection = sum(connection_scores) / len(connection_scores)

        # A very low connection score (divergence) is an indicator of phishing.
        if avg_connection < 0.2:
            return avg_connection, "High divergence: URL content does not match email context"
        elif avg_connection < 0.4:
            return avg_connection, "Moderate divergence: URL seems loosely related to email context"
        else:
            return avg_connection, "Stable: URL matches email context"

    def analyze(self, input_data):
        """Main entry point: combine all heuristics and models into one
        result dict.

        ``input_data`` must provide ``cleaned_text`` (str) and ``urls``
        (list[str]).  Short benign greetings short-circuit with a near-zero
        risk payload.
        """
        text = input_data['cleaned_text']
        urls = input_data['urls']

        # Benign baseline check: trivially safe one-liners skip the models
        # entirely (cheap guard against false positives on "hi", "test", ...).
        benign_greetings = ['hi', 'hii', 'hiii', 'hello', 'hey', 'how are you', 'how is this', 'test']
        clean_msg = text.lower().strip().replace('?', '').replace('!', '')
        if clean_msg in benign_greetings and not urls:
            return {
                'phishing_probability': 0.01,
                'urgency_matches': [],
                'keyword_matches': [],
                'prompt_injection': False,
                'ai_generated_probability': 0.05,
                'spam_probability': 0.01,
                'connection_score': 1.0,
                'connection_message': "Safe: Benign conversational text",
                'sentiment_label': "POSITIVE",
                'sentiment_score': 0.99
            }

        phishing_score, keyword_matches, urgency_matches = self.analyze_phishing(text)
        prompt_injection, injection_patterns = self.analyze_prompt_injection(text)
        ai_generated_score, ai_patterns = self.analyze_ai_generated(text)
        transformer_score = self.analyze_with_transformer(text)

        # Hybrid text analysis: blend the pickled scikit-learn score (30%)
        # into the transformer score (70%) when the local model is available.
        spam_probability = 0.0
        spam_ml_prob = 0.0
        if self.has_text_ml:
            try:
                features = self.scikit_vectorizer.transform([text])
                spam_ml_prob = self.scikit_model.predict_proba(features)[0][1]
                transformer_score = (transformer_score * 0.7) + (spam_ml_prob * 0.3)
                spam_probability = spam_ml_prob
            except Exception as e:
                print(f"Text ML model unavailable (sklearn version mismatch), using fallback: {e}")
                self.has_text_ml = False  # disable to avoid repeated errors

        # Text-vs-URL coherence.
        connection_score, connection_msg = self.analyze_connection(text, urls)

        # High divergence (low connection) raises the phishing probability by
        # up to 0.5; no penalty once connection >= 0.4.
        connection_penalty = max(0, 0.5 - connection_score) if connection_score < 0.4 else 0
        combined_phishing = min(max(phishing_score, transformer_score) + connection_penalty, 1.0)

        # Heuristic spam fallback when the ML spam signal is weak:
        # 1 match ≈ 0.17 (safe), 3 matches = 0.5 (medium), 6+ = 1.0.
        if spam_probability < 0.3:
            spam_indicators = ['free', 'win', 'winner', 'prize', 'click here', 'offer', 'limited time', 'lottery', 'congratulations', 'cash', 'money', 'claim', 'award']
            spam_matches = [ind for ind in spam_indicators if ind in text.lower()]
            heuristic_spam = min(len(spam_matches) / 6, 1.0)
            spam_probability = max(spam_probability, heuristic_spam)

        # Optional sentiment analysis; score is oriented so that higher means
        # more negative (NEGATIVE keeps its confidence, POSITIVE is inverted).
        sentiment_score = 0.0
        sentiment_label = "UNKNOWN"
        if self.has_pipelines:
            try:
                sent_result = self.sentiment_pipeline(text[:512])[0]
                sentiment_label = sent_result['label']
                sentiment_score = sent_result['score'] if sentiment_label == 'NEGATIVE' else (1.0 - sent_result['score'])
            except Exception as e:
                print(f"Error predicting sentiment: {e}")

        results = {
            'phishing_probability': combined_phishing,
            'prompt_injection': prompt_injection,
            'prompt_injection_patterns': injection_patterns,
            'ai_generated_probability': ai_generated_score,
            'spam_probability': spam_probability,
            'spam_ml_score': spam_ml_prob,
            'keyword_matches': keyword_matches,
            'urgency_matches': urgency_matches,
            'ai_patterns': ai_patterns,
            'transformer_score': transformer_score,
            'using_transformer': True,
            'sentiment_score': sentiment_score,
            'sentiment_label': sentiment_label,
            'connection_score': connection_score,
            'connection_message': connection_msg
        }

        return results
|
agents/agent3_synthesizer.py
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
class SynthesizerAgent:
|
| 2 |
+
    def __init__(self):
        """Configure the scoring thresholds and component weights used by
        calculate_risk_score() and determine_risk_level()."""
        # Minimum score for each categorical risk level (inclusive bounds).
        self.thresholds = {
            'low': 0.22,
            'medium': 0.5,
            'high': 0.8
        }

        # Relative weight of each agent signal in the blended risk score.
        # The first five sum to 1.0; prompt_injection is additive on top.
        self.weights = {
            'phishing': 0.4,
            'url_risk': 0.3,
            'spam': 0.15,
            'ai_generated': 0.1,
            'domain_similarity': 0.05,
            'prompt_injection': 0.3  # High impact when detected
        }
|
| 17 |
+
|
| 18 |
+
def calculate_risk_score(self, agent1_results, agent2_results, agent4_results):
|
| 19 |
+
"""Calculate overall risk score"""
|
| 20 |
+
risk_score = 0.0
|
| 21 |
+
|
| 22 |
+
# Give higher priority to ML based scores if available
|
| 23 |
+
url_risk_val = agent1_results['url_ml_risk'] if agent1_results.get('url_ml_risk', 0) > agent1_results['url_risk'] else agent1_results['url_risk']
|
| 24 |
+
spam_val = agent2_results['spam_ml_score'] if agent2_results.get('spam_ml_score', 0) > agent2_results.get('spam_probability', 0) else agent2_results.get('spam_probability', 0)
|
| 25 |
+
|
| 26 |
+
risk_score += agent2_results['phishing_probability'] * self.weights['phishing']
|
| 27 |
+
risk_score += url_risk_val * self.weights['url_risk']
|
| 28 |
+
risk_score += spam_val * self.weights['spam']
|
| 29 |
+
risk_score += agent2_results['ai_generated_probability'] * self.weights['ai_generated']
|
| 30 |
+
risk_score += agent1_results['domain_similarity'] * self.weights['domain_similarity']
|
| 31 |
+
|
| 32 |
+
# Integrate Agent 4 Prompt Injection Score
|
| 33 |
+
risk_score += agent4_results['confidence'] * self.weights['prompt_injection']
|
| 34 |
+
|
| 35 |
+
# New: Factor in connection score (divergence)
|
| 36 |
+
connection_score = agent2_results.get('connection_score', 1.0)
|
| 37 |
+
if connection_score < 0.4:
|
| 38 |
+
# Low connection = higher risk
|
| 39 |
+
divergence_penalty = (0.4 - connection_score) * 0.5
|
| 40 |
+
risk_score += divergence_penalty
|
| 41 |
+
|
| 42 |
+
# Adjust based on aggressive sentiment
|
| 43 |
+
if agent2_results.get('sentiment_label') == 'NEGATIVE' and agent2_results.get('sentiment_score', 0) > 0.8:
|
| 44 |
+
risk_score += 0.1
|
| 45 |
+
|
| 46 |
+
# Combine Prompt Injection flags from Agent 2 (heuristic) and Agent 4 (transformer)
|
| 47 |
+
if agent2_results['prompt_injection'] or agent4_results['prompt_injection_detected']:
|
| 48 |
+
risk_score = max(risk_score, 0.7) # Ensure at least HIGH risk if injection is detected
|
| 49 |
+
|
| 50 |
+
return min(risk_score, 1.0)
|
| 51 |
+
|
| 52 |
+
def determine_risk_level(self, risk_score):
|
| 53 |
+
"""Convert numerical score to risk level"""
|
| 54 |
+
if risk_score >= self.thresholds['high']:
|
| 55 |
+
return "HIGH"
|
| 56 |
+
elif risk_score >= self.thresholds['medium']:
|
| 57 |
+
return "MEDIUM"
|
| 58 |
+
elif risk_score >= self.thresholds['low']:
|
| 59 |
+
return "LOW"
|
| 60 |
+
else:
|
| 61 |
+
return "MINIMAL"
|
| 62 |
+
|
| 63 |
+
def determine_threat_type(self, risk_score, agent1_results, agent2_results, agent4_results):
|
| 64 |
+
"""Classify the type of threat"""
|
| 65 |
+
threats = []
|
| 66 |
+
|
| 67 |
+
if agent2_results['phishing_probability'] > 0.7:
|
| 68 |
+
threats.append("Phishing")
|
| 69 |
+
|
| 70 |
+
if agent1_results['url_risk'] > 0.7 or agent1_results.get('url_ml_risk', 0) > 0.7:
|
| 71 |
+
threats.append("Malicious URL")
|
| 72 |
+
|
| 73 |
+
if agent2_results['prompt_injection'] or agent4_results['prompt_injection_detected']:
|
| 74 |
+
threats.append("Prompt Injection")
|
| 75 |
+
|
| 76 |
+
if agent2_results['ai_generated_probability'] > 0.6:
|
| 77 |
+
threats.append("AI-Generated Scam")
|
| 78 |
+
|
| 79 |
+
if agent2_results.get('spam_probability', 0) > 0.7 or agent2_results.get('spam_ml_score', 0) > 0.7:
|
| 80 |
+
threats.append("Spam")
|
| 81 |
+
|
| 82 |
+
if not threats and risk_score > 0.3:
|
| 83 |
+
threats.append("Suspicious Content")
|
| 84 |
+
elif not threats:
|
| 85 |
+
threats.append("Benign")
|
| 86 |
+
|
| 87 |
+
return threats
|
| 88 |
+
|
| 89 |
+
def generate_explanation(self, agent1_results, agent2_results, agent4_results, threat_types, risk_score):
    """Generate detailed, context-aware forensic reasoning like a security expert.

    Builds a capped list of human-readable 'reasons' from the three agents'
    findings, plus a capped list of recommended 'actions' keyed off the
    classified threat types. Never returns empty lists.

    Args:
        agent1_results: URL/domain analysis output (risk_factors, domain_similarity, url_risk, ...).
        agent2_results: content analysis output (keyword/urgency matches, sentiment, spam scores, ...).
        agent4_results: prompt-injection agent output (detected flag, confidence, attack_categories).
        threat_types: labels from determine_threat_type(), used to pick actions.
        risk_score: overall combined risk score in [0, 1].

    Returns:
        dict: {'reasons': list[str] (max 6), 'actions': list[str] (max 4)}.
    """
    reasons = []

    # ── URL / Domain Forensics ──
    # Enrich each raw risk factor from Agent 1 with an explanation of why
    # that signal matters; unrecognized factors pass through unchanged.
    for factor in agent1_results.get('risk_factors', []):
        factor_lower = factor.lower()
        if 'suspicious tld' in factor_lower:
            reasons.append(f"URL Analysis: {factor} — uncommon TLDs are frequently used by phishing campaigns to evade domain blocklists")
        elif 'ip address' in factor_lower:
            reasons.append(f"URL Analysis: {factor} — legitimate services almost never use raw IP addresses in their links")
        elif 'shortening' in factor_lower:
            reasons.append(f"URL Analysis: {factor} — URL shorteners hide the true destination, commonly abused by attackers")
        elif 'ml model' in factor_lower:
            reasons.append(f"URL Analysis (ML): {factor}")
        elif 'similar to legitimate' in factor_lower:
            reasons.append(f"Sender Spoofing: {factor} — this domain uses visual similarity (homoglyph attack) to impersonate a trusted brand")
        elif 'suspicious keyword' in factor_lower:
            reasons.append(f"URL Analysis: {factor} — authentication keywords in URLs often indicate credential-harvesting pages")
        elif 'subdomain' in factor_lower:
            reasons.append(f"URL Analysis: {factor} — excessive subdomains are a technique to disguise malicious domains")
        else:
            reasons.append(f"URL Analysis: {factor}")

    # Domain similarity warning (homoglyph / brand impersonation).
    if agent1_results.get('domain_similarity', 0) > 0.5:
        reasons.append(f"Sender Spoofing: Domain is {agent1_results['domain_similarity']:.0%} similar to a known legitimate brand — possible impersonation attempt")

    # ── Content Forensics ──
    keyword_matches = agent2_results.get('keyword_matches', [])
    if keyword_matches:
        kw_str = ', '.join(f"'{k}'" for k in keyword_matches[:4])
        reasons.append(f"Content Analysis: Detected high-risk keywords [{kw_str}] — these are hallmarks of social engineering and credential theft attempts")

    urgency_matches = agent2_results.get('urgency_matches', [])
    if urgency_matches:
        urg_str = ', '.join(f"'{u}'" for u in urgency_matches[:3])
        reasons.append(f"Behavioral Threat: Urgency/pressure language detected [{urg_str}] — creates artificial time pressure to bypass critical thinking")

    # ── Prompt Injection (Agent 4 Integration) ──
    if agent4_results.get('prompt_injection_detected'):
        cats = agent4_results.get('attack_categories', [])
        detail = f"Detected Categories: {', '.join(cats)}" if cats else "AI instruction override attempt"
        # FIX: use .get() for 'confidence' (was a bare [] subscript) so a
        # partial Agent 4 payload cannot raise KeyError mid-explanation —
        # consistent with every other access in this method.
        reasons.append(f"Prompt Injection Agent: {detail} (Risk: {agent4_results.get('confidence', 0):.0%}) — advanced hijacking pattern identified via transformer analysis")
    elif agent2_results.get('prompt_injection'):
        reasons.append("Prompt Injection: Heuristic pattern match — suspicious instruction override pattern detected in input text")

    # ── AI Generated Content ──
    ai_prob = agent2_results.get('ai_generated_probability', 0)
    if ai_prob > 0.5:
        reasons.append(f"Content Analysis: Text shows AI-generation patterns (Score: {ai_prob:.0%}) — machine-written scam content designed to appear legitimate")

    # ── Semantic Divergence ──
    # Low connection_score means link text and actual URL targets disagree.
    connection_score = agent2_results.get('connection_score', 1.0)
    connection_msg = agent2_results.get('connection_message', '')
    if connection_score < 0.4:
        reasons.append(f"Hidden Threat: {connection_msg} (Divergence Score: {connection_score:.0%}) — link text says one thing but URL points somewhere completely different")
    elif connection_score < 0.6 and agent1_results.get('url_risk', 0) > 0.3:
        reasons.append(f"Content Analysis: Weak semantic link between email text and embedded URLs ({connection_score:.0%}) — potentially deceptive link labels")

    # ── Sentiment / Tone ──
    sentiment_label = agent2_results.get('sentiment_label', 'UNKNOWN')
    sentiment_score = agent2_results.get('sentiment_score', 0)
    if sentiment_label == 'NEGATIVE' and sentiment_score > 0.8:
        reasons.append(f"Behavioral Threat: Highly aggressive/threatening tone detected (Score: {sentiment_score:.1%}) — intimidation tactics used to provoke panic-driven actions")
    elif sentiment_label == 'NEGATIVE' and sentiment_score > 0.5:
        reasons.append(f"Content Analysis: Negative sentiment detected (Score: {sentiment_score:.1%}) — may use fear-based language to manipulate recipient")

    # ── Spam Signals ──
    # Take the stronger of the heuristic and ML spam scores.
    spam_prob = max(agent2_results.get('spam_probability', 0), agent2_results.get('spam_ml_score', 0))
    if spam_prob > 0.7:
        reasons.append(f"Content Analysis: High spam probability ({spam_prob:.0%}) — message matches known bulk/unsolicited mail patterns")

    # ── Safe fallback: never return empty reasoning ──
    if not reasons:
        if risk_score < 0.2:
            reasons.append("Content Analysis: No suspicious patterns, malicious URLs, or social engineering tactics detected — message appears legitimate")
            reasons.append("URL Analysis: No links found, or all URLs point to verified, trusted domains")
        else:
            reasons.append(f"Content Analysis: Minor risk signals detected (combined score: {risk_score:.0%}) but no single strong threat indicator found")

    # ── Recommended Actions ──
    # Branch priority mirrors threat severity: phishing/URL first, then
    # injection, spam, AI-generated; generic fallbacks last.
    actions = []
    if "Phishing" in threat_types or "Malicious URL" in threat_types:
        actions.extend([
            "Do not click any links in this message",
            "Do not provide personal information or credentials",
            "Block the sender and report to your security team"
        ])
    elif "Prompt Injection" in threat_types:
        actions.extend([
            "Do not execute any instructions contained in this message",
            "Report this message to security team"
        ])
    elif "Spam" in threat_types:
        actions.extend([
            "Mark as spam and block sender",
            "Do not unsubscribe via links — this confirms your address"
        ])
    elif "AI-Generated Scam" in threat_types:
        actions.extend([
            "Verify the sender through an independent channel",
            "Do not act on any financial requests in this message"
        ])

    if risk_score < 0.3 and not actions:
        actions.append("No immediate action required")
    elif not actions:
        actions.append("Report this message to security team")

    return {
        'reasons': reasons[:6],
        'actions': actions[:4]
    }
| 204 |
+
def synthesize(self, agent1_results, agent2_results, agent4_results):
    """Fuse the three agents' outputs into the final threat verdict.

    Returns a dict carrying the threat labels, risk level and score,
    a verdict confidence, the human-readable explanation, and the raw
    per-agent results for downstream display.
    """
    score = self.calculate_risk_score(agent1_results, agent2_results, agent4_results)
    level = self.determine_risk_level(score)
    labels = self.determine_threat_type(score, agent1_results, agent2_results, agent4_results)
    explanation = self.generate_explanation(
        agent1_results, agent2_results, agent4_results, labels, score
    )

    # Confidence reflects certainty in the verdict: scores near the
    # extremes (0.0 / 1.0) are confident, scores near 0.5 are borderline,
    # so confidence = 0.5 + distance from the 0.5 midpoint (capped at 1).
    confidence = min(0.5 + abs(score - 0.5), 1.0)

    return {
        'threat_types': labels,
        'risk_level': level,
        'risk_score': score,
        'confidence': confidence,
        'explanation': explanation,
        'detailed_results': {
            'agent1': agent1_results,
            'agent2': agent2_results,
            'agent4': agent4_results,
        },
    }
agents/agent4_prompt.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Agent 4: AI Prompt Injection Detection Module
|
| 3 |
+
Uses a fine‑tuned DeBERTa model (MNLI) + rule‑based patterns.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import re
|
| 7 |
+
import torch
|
| 8 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 9 |
+
|
| 10 |
+
class PromptInjectionAgent:
    """
    Detects prompt injection and jailbreak attempts in user inputs.
    Combines a transformer model (trained on MNLI) with heuristic rules.
    """
    def __init__(self, model_name="mrm8488/deberta-v3-small-finetuned-mnli", threshold=0.6):
        """
        Args:
            model_name: Hugging Face model identifier for a DeBERTa MNLI model.
            threshold: Confidence threshold above which input is flagged as injection.
        """
        print("Loading Prompt Injection Agent (MNLI-based)...")
        self.threshold = threshold

        # Load tokenizer and model once; inference-only, so eval mode.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.model.eval()  # inference mode
        print("✓ Model loaded successfully")

        # Rule-based patterns (covers common jailbreak attempts).
        # FIX: anchor each phrase with \b word boundaries — the previous
        # bare-substring patterns produced false positives (e.g. r"dan"
        # matched "abundant"/"Danielle", r"act as" matched "impact
        # assessment"). Patterns are compiled once here instead of being
        # re-parsed by re.search on every analyze() call.
        raw_patterns = [
            (r"\bignore previous instructions\b", "instruction_override"),
            (r"\bignore all previous\b", "instruction_override"),
            (r"\bdisregard previous\b", "instruction_override"),
            (r"\bsystem prompt\b", "system_override"),
            (r"\byou are now\b", "role_playing"),
            (r"\bact as\b", "role_playing"),
            (r"\bnew role:", "role_playing"),
            (r"\bforget your instructions\b", "instruction_override"),
            (r"\bdo anything now\b", "privilege_escalation"),
            (r"\byou must\b", "privilege_escalation"),
            (r"\byou are free\b", "jailbreak"),
            (r"\bno restrictions\b", "jailbreak"),
            (r"\boverride\b", "instruction_override"),
            (r"\bjailbreak\b", "jailbreak"),
            (r"\bdan\b", "jailbreak"),  # DAN mode
            (r"\bdeveloper mode\b", "jailbreak"),
            (r"\bchatgpt, you are now\b", "role_playing"),
            (r"\byou are an ai with no ethics\b", "role_playing"),
            (r"\boutput raw\b", "attention_diversion"),
            (r"\bbase64 decode\b", "attention_diversion"),
        ]
        self.injection_patterns = [(re.compile(p), cat) for p, cat in raw_patterns]

    def analyze(self, text: str) -> dict:
        """
        Analyze input text for prompt injection.

        Returns:
            dict with keys:
                prompt_injection_detected (bool): final decision
                confidence (float): combined risk score
                risk_score (float): same as confidence (for backward compatibility)
                matched_patterns (list): regex patterns that fired
                attack_categories (list): types of injection detected
                explanation (list): human-readable reasons
        """
        # -------------------- Rule-based scan --------------------
        # Patterns are lowercase, so scan a lowercased copy of the input.
        text_lower = text.lower()
        rule_score = 0.0
        matched_patterns = []
        attack_categories = []

        for pattern, category in self.injection_patterns:
            if pattern.search(text_lower):
                rule_score += 0.3  # fixed increment per matched heuristic
                matched_patterns.append(pattern.pattern)
                attack_categories.append(category)

        # -------------------- Transformer inference --------------------
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=512,
            padding=True
        )

        with torch.no_grad():
            outputs = self.model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

        # MNLI classes: 0 = entailment, 1 = neutral, 2 = contradiction
        contradiction_prob = probs[0][2].item()

        # -------------------- Combine scores --------------------
        # 70% weight on contradiction probability, 30% on rule-based
        # (rule score is capped at 1.0 before weighting).
        combined_risk = 0.7 * contradiction_prob + 0.3 * min(rule_score, 1.0)
        detected = combined_risk > self.threshold

        # -------------------- Build explanation --------------------
        explanation = []
        explanation.append(f"Contradiction probability: {contradiction_prob:.1%}")
        if attack_categories:
            unique_cats = list(set(attack_categories))
            explanation.append(f"Rule matches: {', '.join(unique_cats)}")
        if detected:
            explanation.append(f"Combined risk {combined_risk:.1%} exceeds threshold {self.threshold:.0%}")

        return {
            "prompt_injection_detected": detected,
            "confidence": combined_risk,
            "risk_score": combined_risk,  # alias for compatibility
            "matched_patterns": matched_patterns,
            "attack_categories": list(set(attack_categories)),
            "explanation": explanation
        }
app.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
import json
|
| 5 |
+
import gradio as gr
|
| 6 |
+
from utils.preprocessor import TextPreprocessor
|
| 7 |
+
from agents.agent1_external import ExternalAnalysisAgent
|
| 8 |
+
from agents.agent2_content import ContentAnalysisAgent
|
| 9 |
+
from agents.agent3_synthesizer import SynthesizerAgent
|
| 10 |
+
from agents.agent4_prompt import PromptInjectionAgent
|
| 11 |
+
|
| 12 |
+
class ThreatDetectionSystem:
    """Wires the preprocessor and all detection agents into one pipeline."""

    def __init__(self):
        # Constructing the agents loads their ML models, so this is slow;
        # the instance is created once at module import time.
        print("Initializing Threat Detection System...")
        self.preprocessor = TextPreprocessor()
        self.agent1 = ExternalAnalysisAgent()
        self.agent2 = ContentAnalysisAgent()
        self.agent3 = SynthesizerAgent()
        self.agent4 = PromptInjectionAgent()
        print("System initialized!")

    def analyze(self, user_input):
        """Run the full preprocess → agents → synthesis pipeline."""
        started = time.time()

        # Normalise the raw text and pull out URLs/domains.
        prepared = self.preprocessor.preprocess(user_input)

        # Independent analyses: external/URL, content, prompt injection.
        # Note: agent4 works on the raw input, not the preprocessed form.
        external = self.agent1.analyze(prepared)
        content = self.agent2.analyze(prepared)
        injection = self.agent4.analyze(user_input)

        # Fuse everything into the final verdict and record latency.
        verdict = self.agent3.synthesize(external, content, injection)
        verdict['processing_time'] = time.time() - started
        return verdict
| 39 |
+
|
| 40 |
+
# Initialize the system globally for HF
|
| 41 |
+
system = ThreatDetectionSystem()
|
| 42 |
+
|
| 43 |
+
# --- Gradio UI Logic ---
|
| 44 |
+
def ui_analyze(text):
    """Gradio handler: analyze *text* and format a human-readable report.

    Returns a 3-tuple for the UI outputs: (report string, per-agent
    detail dict, recommended actions list).
    """
    if not text or not text.strip():
        return "Please enter some text", {}, {}

    result = system.analyze(text)

    # Emoji badge per risk level; anything below LOW renders green.
    badges = {"HIGH": "🔴", "MEDIUM": "🟠", "LOW": "🟡"}
    badge = badges.get(result['risk_level'], "🟢")

    lines = [
        f"{badge} {result['risk_level']} RISK DETECTED",
        f"Confidence: {result['confidence']:.1%}",
        f"Type: {', '.join(result['threat_types'])}",
        "",
        "Forensic Reasons:",
    ]
    lines.extend(f"- {reason}" for reason in result['explanation']['reasons'])
    report = "\n".join(lines)

    return report, result['detailed_results'], result['explanation']['actions']
| 58 |
+
|
| 59 |
+
# --- Next.js Backend Compatibility API ---
|
| 60 |
+
# This endpoint is what the Vercel frontend calls
|
| 61 |
+
def api_analyze(text):
    """JSON API handler consumed by the Next.js/Vercel frontend.

    Translates the internal analysis result into the frontend's schema;
    any failure (including empty input) is reported as {"error": ...}
    rather than raised, so the HTTP endpoint never 500s on bad input.
    """
    try:
        if not text or not text.strip():
            return {"error": "No input provided"}

        result = system.analyze(text)
        details = result["detailed_results"]
        agent1 = details["agent1"]
        agent2 = details["agent2"]
        agent4 = details["agent4"]

        # Translate internal risk levels to the frontend's vocabulary;
        # unknown levels default to "Medium".
        level_names = {"MINIMAL": "Safe", "LOW": "Low", "MEDIUM": "Medium", "HIGH": "High"}

        return {
            "riskLevel": level_names.get(result["risk_level"], "Medium"),
            "threatType": ", ".join(result["threat_types"]),
            "confidenceScore": round(result["confidence"] * 100, 1),
            "riskScore": round(result["risk_score"], 4),
            "explanation": " ".join(result["explanation"]["reasons"]),
            "indicators": result["explanation"]["reasons"],
            "recommendations": result["explanation"]["actions"],
            "detailedScores": {
                "phishingProb": round(agent2.get("phishing_probability", 0), 3),
                "spamProb": round(agent2.get("spam_probability", 0), 3),
                "urlRisk": round(agent1.get("url_risk", 0), 3),
                "sentimentLabel": agent2.get("sentiment_label", "UNKNOWN"),
                "sentimentScore": round(agent2.get("sentiment_score", 0), 3),
                "promptInjectionScore": round(agent4.get("confidence", 0), 3),
                "promptInjectionDetected": agent4.get("prompt_injection_detected", False),
            },
        }
    except Exception as e:
        return {"error": str(e)}
| 92 |
+
|
| 93 |
+
# --- Theme and Layout ---
# Two-column layout: input + buttons on the left, report/JSON panels on the right.
with gr.Blocks(theme="soft", title="🛡️ AegisAI Security") as demo:
    gr.Markdown("# 🛡️ AegisAI: Advanced Phishing & Fraud Detector")
    gr.Markdown("Drop an email body or URL here to get a full forensic breakdown.")

    with gr.Row():
        with gr.Column(scale=2):
            input_box = gr.Textbox(label="Message Content", lines=8, placeholder="Paste email content...")
            with gr.Row():
                clear_btn = gr.Button("Clear")
                submit_btn = gr.Button("Analyze Threat", variant="primary")

        with gr.Column(scale=3):
            out_report = gr.Textbox(label="Analysis Report", lines=10, interactive=False)
            out_actions = gr.JSON(label="Recommended Actions")
            out_scores = gr.JSON(label="Agent Confidence Scores")

    # Connect UI
    # ui_analyze returns (report, detailed_results, actions) matching this output order.
    submit_btn.click(fn=ui_analyze, inputs=input_box, outputs=[out_report, out_scores, out_actions])
    # Clear resets all four components (input + the three output panels).
    clear_btn.click(lambda: ["", "", {}, {}], outputs=[input_box, out_report, out_scores, out_actions])

    # HIDDEN API ENDPOINT FOR VERCEL
    # Note: Hugging Face exposes this as an endpoint /run/predict or via api_name
    # The invisible button only exists so Gradio registers /analyze for the frontend.
    api_endpoint = gr.Button("API", visible=False)
    api_endpoint.click(fn=api_analyze, inputs=input_box, outputs=out_scores, api_name="analyze")

if __name__ == "__main__":
    demo.launch()
models/model_new.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b127302853205a24d23d79d543a17b0bc1aeecf152754f7ad9d1d77106acbe64
|
| 3 |
+
size 40720
|
models/phishing_new.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:910e6db7e9bf78ed8f52ceed3a813541c749f412ceeeddfbb3758726eb7267e8
|
| 3 |
+
size 40720
|
models/vectorizer_new.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a16317dca2e5aef5768222113e2a345bbef0e84e9f4115cd5394670506f5938b
|
| 3 |
+
size 191803
|
models/vectorizerurl_new.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e594d26eb86e52c21d67640d7a0485fd519bbeb83411fe49a721f317a14ab181
|
| 3 |
+
size 189473
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
pandas
|
| 3 |
+
numpy
|
| 4 |
+
scikit-learn
|
| 5 |
+
torch
|
| 6 |
+
transformers
|
| 7 |
+
sentence-transformers
|
| 8 |
+
# Optional: flask, gunicorn, flask-cors are not strictly needed for Gradio Spaces
|
utils/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .preprocessor import TextPreprocessor
|
| 2 |
+
|
| 3 |
+
__all__ = ['TextPreprocessor']
|
utils/preprocessor.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from urllib.parse import urlparse
|
| 3 |
+
|
| 4 |
+
class TextPreprocessor:
    """Normalises raw message text and extracts URLs/domains for the agents."""

    def __init__(self):
        # Stateless; kept for interface symmetry with the agent classes.
        pass

    def clean_text(self, text):
        """Collapse all runs of whitespace (including newlines) into single spaces."""
        return ' '.join(text.split())

    def extract_urls(self, text):
        """Return every http/https URL found in *text* (possibly empty list)."""
        # Matches the scheme followed by any run of URL-safe characters;
        # a URL ends at the first whitespace character.
        url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        return re.findall(url_pattern, text)

    def extract_domain(self, url):
        """Extract the host part of *url*; returns '' when it cannot be parsed."""
        try:
            parsed = urlparse(url)
            # Scheme-less inputs land in .path, so fall back to its first segment.
            return parsed.netloc or parsed.path.split('/')[0]
        except ValueError:
            # FIX: was a bare `except:` which also swallowed KeyboardInterrupt
            # and SystemExit. urlparse raises ValueError for malformed ports /
            # IPv6 literals; treat those as "no domain" instead of crashing.
            return ""

    def preprocess(self, text):
        """Clean *text* and bundle it with extracted URLs/domains.

        Returns:
            dict with keys: 'cleaned_text', 'urls', 'domains',
            'has_urls', 'text_length'.
        """
        cleaned_text = self.clean_text(text)
        urls = self.extract_urls(cleaned_text)
        domains = [self.extract_domain(url) for url in urls]

        return {
            'cleaned_text': cleaned_text,
            'urls': urls,
            'domains': domains,
            'has_urls': len(urls) > 0,
            'text_length': len(cleaned_text)
        }
|