Samarthrr committed on
Commit 76e3fab · verified · 1 Parent(s): 879b56d

Update app.py

Files changed (1)
  1. app.py +143 -111
app.py CHANGED
@@ -1,8 +1,9 @@
 import ast
 import torch
 import torch.nn as nn
-from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from transformers import (
     T5ForConditionalGeneration,
     RobertaTokenizer,
@@ -11,176 +12,207 @@ from transformers import (
 )
 import pandas as pd
 import os

-app = FastAPI(title="Revcode AI Strong Orchestrator")

 # ---------------------------------------------------------
-# 1. ADVANCED SECURITY SCANNER (The "Brain")
 # ---------------------------------------------------------
 class DeepVulnerabilityScanner:
     def __init__(self):
-        print("Loading Deep Security Scanner (DistilRoBERTa)...")
-        self.model_name = "distilroberta-base"
-        try:
-            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-            self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=2)
-            self.model.eval()
-        except Exception as e:
-            print(f"Failed to load Deep Scanner: {e}")
-            self.model = None
-
-    def scan(self, code: str) -> dict:
-        if not self.model:
-            return {"is_vulnerable": False, "risk_score": 0.0, "details": "Scanner unavailable"}

-        inputs = self.tokenizer(code, return_tensors="pt", truncation=True, max_length=512)
         with torch.no_grad():
             logits = self.model(**inputs).logits

         probs = torch.softmax(logits, dim=1)
         vuln_prob = probs[0][1].item()

         return {
-            "is_vulnerable": vuln_prob > 0.5,
-            "risk_score": round(vuln_prob * 100, 2),
-            "details": "Potential vulnerability detected in code logic." if vuln_prob > 0.5 else "Clean code."
         }

 # ---------------------------------------------------------
-# 2. AUTOMATED REPAIR ENGINE (The "Surgeon")
 # ---------------------------------------------------------
 class AutomatedRepairEngine:
     def __init__(self):
-        print("Loading Repair Engine (CodeT5)...")
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model_name = "Salesforce/codet5p-220m"
-        try:
-            self.tokenizer = RobertaTokenizer.from_pretrained(self.model_name)
-            self.model = T5ForConditionalGeneration.from_pretrained(self.model_name).to(self.device)
-            self.model.eval()
-        except Exception as e:
-            print(f"Failed to load Repair Engine: {e}")
-            self.model = None
-
-    def repair(self, buggy_code: str) -> str:
-        if not self.model:
-            return buggy_code
-
-        prompt = f"Fix the security vulnerability: {buggy_code}"
-        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256).to(self.device)

         with torch.no_grad():
             outputs = self.model.generate(
                 **inputs,
-                max_length=256,
                 num_beams=5,
-                temperature=0.7,
                 early_stopping=True
             )

         return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

 # ---------------------------------------------------------
-# 3. SYNTAX & LOGIC VALIDATOR (The "Quality Control")
-# ---------------------------------------------------------
-class CodeValidator:
-    @staticmethod
-    def is_syntax_valid(code: str) -> bool:
-        try:
-            ast.parse(code)
-            return True
-        except Exception:
-            return False
-
-    @staticmethod
-    def check_security_patterns(code: str) -> list:
-        issues = []
-        dangerous_calls = ["eval(", "exec(", "os.system(", "subprocess.call("]
-        for call in dangerous_calls:
-            if call in code:
-                issues.append(f"Dangerous call found: {call}")
-        return issues
-
-# ---------------------------------------------------------
-# 4. GLOBAL HANDLERS (Lazy Loading)
 # ---------------------------------------------------------
 _scanner = None
 _repairer = None

-def get_scanner():
     global _scanner
-    if _scanner is None:
-        _scanner = DeepVulnerabilityScanner()
     return _scanner

 def get_repairer():
     global _repairer
-    if _repairer is None:
-        _repairer = AutomatedRepairEngine()
     return _repairer

-# ---------------------------------------------------------
-# 5. API ENDPOINTS
-# ---------------------------------------------------------
-class CodeInput(BaseModel):
-    code: str

 @app.post("/analyze")
 async def analyze_security(data: CodeInput):
     scanner = get_scanner()
-    result = scanner.scan(data.code)

     return {
-        "is_vulnerable": result["is_vulnerable"],
-        "confidence": result["risk_score"],
-        "verdict": "VULNERABLE" if result["is_vulnerable"] else "SECURE",
-        "details": result["details"],
-        "provider": "DistilRoBERTa-Strong"
     }

 @app.post("/fix")
 async def fix_code(data: CodeInput):
     repairer = get_repairer()
-    validator = CodeValidator()

-    # ML Repair
-    suggestion = repairer.repair(data.code)

-    # Validation Loop
-    status = "PASSED"
-    msg = "Valid syntax"
-
-    if not validator.is_syntax_valid(suggestion):
-        status = "FAILED"
-        msg = "Repair generated invalid syntax"
-        # Heuristic fallback (from user's logic)
-        suggestion = data.code.replace("eval(", "safe_eval(")
-
-    # Final Security Pattern Check
-    final_issues = validator.check_security_patterns(suggestion)
-    if final_issues:
-        for issue in final_issues:
-            call_name = issue.split(": ")[1]
-            suggestion = suggestion.replace(call_name, f"# BLOCKED_{call_name.replace('(', '')}")
-        msg += f" | Blocked {len(final_issues)} dangerous calls"
-
     return {
         "suggestion": suggestion,
-        "guardrail_status": status,
-        "guardrail_msg": msg
     }

 @app.post("/feedback")
 async def store_feedback(data: dict):
     feedback_file = "feedback_dataset.csv"
-    df = pd.DataFrame([data])
-    df.to_csv(feedback_file, mode='a', header=not os.path.exists(feedback_file), index=False)
-    return {"status": "Feedback stored for retraining"}
-
-@app.get("/")
-async def health():
-    return {"status": "Revcode AI Strong Engine is alive"}
-
-if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)

 import ast
 import torch
 import torch.nn as nn
+from fastapi import FastAPI, HTTPException, BackgroundTasks
 from pydantic import BaseModel
+from typing import Optional, List
 from transformers import (
     T5ForConditionalGeneration,
     RobertaTokenizer,

 )
 import pandas as pd
 import os
+import threading
+import re

+# Import the training function
+from train_engine import train_on_devign
+
+app = FastAPI(title="Revcode AI Precision Engine")
+
+# Global State
+training_lock = threading.Lock()
+is_training = False
+
+class CodeInput(BaseModel):
+    code: str
+    filename: Optional[str] = "snippet.js"

 # ---------------------------------------------------------
+# 1. PRECISION SCANNER (CodeBERT-Devign)
 # ---------------------------------------------------------
 class DeepVulnerabilityScanner:
     def __init__(self):
+        # Prefer locally trained model if it exists
+        local_model = "./trained_model"
+        if os.path.exists(local_model):
+            self.model_name = local_model
+            self.tokenizer_name = local_model
+        else:
+            self.model_name = "mahdin70/codebert-devign-code-vulnerability-detector"
+            self.tokenizer_name = "microsoft/codebert-base"

+        print(f"Loading Precision Scanner ({self.model_name})...")
+        self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name)
+        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)
+        self.model.eval()
+
+    def scan(self, code: str) -> dict:
+        inputs = self.tokenizer(code, return_tensors="pt", truncation=True, padding=True, max_length=512)
         with torch.no_grad():
             logits = self.model(**inputs).logits

         probs = torch.softmax(logits, dim=1)
         vuln_prob = probs[0][1].item()

+        # RAISED THRESHOLD: Only flag as 'is_vulnerable' if we are > 85% certain
+        is_vuln = vuln_prob > 0.85
+
+        verdict = "SECURE"
+        if vuln_prob > 0.9: verdict = "CRITICAL"
+        elif vuln_prob > 0.7: verdict = "WARNING"
+        elif vuln_prob > 0.4: verdict = "POTENTIAL"
+
         return {
+            "is_vulnerable": is_vuln,
+            "confidence": round(vuln_prob * 100, 2),
+            "threat_level": verdict,
+            "reasoning": self._generate_reasoning(vuln_prob, code)
         }

+    def _generate_reasoning(self, prob, code):
+        if prob > 0.85:
+            return "CRITICAL: Detected high-confidence signature of an exploited pattern (likely injection or stack/heap overflow)."
+        if prob > 0.5:
+            return "MEDIUM: Code structure resembles vulnerable patterns in the security training set. Recommended audit."
+        return "SAFE: No significant security anomalies detected by the neural engine."
+
+# ---------------------------------------------------------
+# 2. RULE-BASED PATTERN FILTER (Hardened)
+# ---------------------------------------------------------
+class StructuralScanner:
+    @staticmethod
+    def scan(code: str, filename: str) -> List[dict]:
+        findings = []
+
+        # Rule 1: Code Injection (Detecting RAW eval, excluding json/safe wraps)
+        if "eval(" in code:
+            if not any(x in code for x in ["JSON.parse(", "safe_eval", "ast.literal_eval"]):
+                findings.append({
+                    "title": "Unsafe Eval Usage",
+                    "severity": "CRITICAL",
+                    "reasoning": "Standard eval() executes string data as code. Use JSON.parse() or ast.literal_eval() for data."
+                })
+
+        # Rule 2: RAW Command Injection
+        if any(x in code for x in ["os.system(", "subprocess.Popen(..., shell=True)"]):
+            findings.append({
+                "title": "Direct Shell Execution",
+                "severity": "HIGH",
+                "reasoning": "Detected shell invocation with shell=True. This is highly susceptible to command injection."
+            })
+
+        return findings
+
 # ---------------------------------------------------------
+# 3. CONSERVATIVE REPAIR ENGINE (Minimal Changes)
 # ---------------------------------------------------------
 class AutomatedRepairEngine:
     def __init__(self):
+        print("Loading Conservative Repair Engine (CodeT5+)...")
         self.model_name = "Salesforce/codet5p-220m"
+        self.tokenizer = RobertaTokenizer.from_pretrained(self.model_name)
+        self.model = T5ForConditionalGeneration.from_pretrained(self.model_name)
+        self.model.eval()
+
+    def repair(self, buggy_code: str, filename: str) -> str:
+        # CONSTRAINED PROMPT: Focus only on the security fix
+        prompt = f"Fix the security scan vulnerability in this {filename} file accurately and with minimal changes: {buggy_code}"
+        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)

         with torch.no_grad():
             outputs = self.model.generate(
                 **inputs,
+                max_length=512,
                 num_beams=5,
+                temperature=0.2, # LOWER TEMPERATURE for less creativity/more precision
                 early_stopping=True
             )

         return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

 # ---------------------------------------------------------
+# 4. ORCHESTRATION & API
 # ---------------------------------------------------------
 _scanner = None
 _repairer = None
+_struct = StructuralScanner()

+def get_scanner(reload=False):
     global _scanner
+    if _scanner is None or reload: _scanner = DeepVulnerabilityScanner()
     return _scanner

 def get_repairer():
     global _repairer
+    if _repairer is None: _repairer = AutomatedRepairEngine()
     return _repairer

+@app.get("/")
+async def health():
+    return {"status": "Revcode Precision Engine Live", "is_training": is_training}

 @app.post("/analyze")
 async def analyze_security(data: CodeInput):
     scanner = get_scanner()

+    # 1. Neural Analysis
+    res = scanner.scan(data.code)
+
+    # 2. Structural Analysis
+    struct_findings = _struct.scan(data.code, data.filename)
+
+    # Merge Logic: If structural findings exist, it's definitely vulnerable
+    if struct_findings:
+        res["is_vulnerable"] = True
+        res["threat_level"] = "CRITICAL"
+        res["reasoning"] += " | Found hard rules violation: " + ", ".join([f['title'] for f in struct_findings])
+
     return {
+        "is_vulnerable": res["is_vulnerable"],
+        "confidence": res["confidence"],
+        "threat_level": res["threat_level"],
+        "reasoning": res["reasoning"],
+        "structural_findings": struct_findings,
+        "is_training": is_training
     }

 @app.post("/fix")
 async def fix_code(data: CodeInput):
     repairer = get_repairer()

+    # 1. Primary generative fix
+    suggestion = repairer.repair(data.code, data.filename)

+    # 2. Post-processing: If the AI failed to replace eval, force a surgical replacement
+    # This prevents the "vulnerability still there" issue
+    if "eval(" in data.code and "eval(" in suggestion:
+        suggestion = suggestion.replace("eval(", "JSON.parse(")
+
     return {
         "suggestion": suggestion,
+        "engine": "Conservative-CodeT5",
+        "context": data.filename
     }

+@app.post("/train")
+async def trigger_training(background_tasks: BackgroundTasks):
+    global is_training
+    if is_training: return {"status": "error", "message": "Training in progress"}
+
+    def run():
+        global is_training
+        is_training = True
+        try:
+            train_on_devign(output_dir="./trained_model")
+            get_scanner(reload=True)
+        finally: is_training = False
+
+    background_tasks.add_task(run)
+    return {"status": "success", "message": "Training started"}
+
 @app.post("/feedback")
 async def store_feedback(data: dict):
     feedback_file = "feedback_dataset.csv"
+    pd.DataFrame([data]).to_csv(feedback_file, mode='a', header=not os.path.exists(feedback_file), index=False)
+    return {"status": "stored"}