File size: 13,403 Bytes
7b4f5dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
"""
OWASP Top-10 (2021) + OWASP LLM Top-10 knowledge base.
Used by the security agent as a structured reference during analysis.
"""
from __future__ import annotations

from typing import Dict, List


# ──────────────────────────────────────────────
# OWASP LLM Top-10 (2023 v1.1 edition numbering; partial list)
# ──────────────────────────────────────────────

def _llm_entry(
    cat_id: str,
    name: str,
    description: str,
    examples: List[str],
    severity: str,
    cwe: str,
    patterns: List[str],
) -> Dict:
    """Build one LLM category record with the table's fixed key order."""
    return {
        "id": cat_id,
        "name": name,
        "description": description,
        "examples": examples,
        "severity": severity,
        "cwe": cwe,
        "patterns": patterns,
    }


# LLM category records keyed by their own "id" field, so each identifier is
# written exactly once. "patterns" holds regex source strings.
OWASP_LLM_TOP10: Dict[str, Dict] = {
    record["id"]: record
    for record in (
        _llm_entry(
            cat_id="LLM01",
            name="Prompt Injection",
            description=(
                "User-supplied input alters the intended behaviour of a model prompt. "
                "Direct injections override system prompts; indirect injections are embedded "
                "in external content the model processes."
            ),
            examples=[
                "Concatenating user input directly into a prompt string",
                "Trusting model output for routing/tool calls without sanitisation",
                "Allowing retrieval of attacker-controlled documents in RAG pipelines",
            ],
            severity="critical",
            cwe="CWE-74",
            patterns=[
                r"f['\"].*\{.*user.*\}",
                r"prompt\s*=\s*.*\+.*request",
                r"format\(.*user_input",
                r"\.format\(.*query",
            ],
        ),
        _llm_entry(
            cat_id="LLM02",
            name="Insecure Output Handling",
            description=(
                "LLM-generated text is passed to downstream components (shell, SQL, browser) "
                "without validation or sanitisation."
            ),
            examples=[
                "Passing model response to eval()",
                "Executing model-generated SQL without parameterisation",
                "Rendering model HTML output without escaping",
            ],
            severity="critical",
            cwe="CWE-116",
            patterns=[
                r"(?<!\.)eval\s*\(",
                r"(?<!\.)exec\s*\(",
                r"subprocess.*shell\s*=\s*True",
                r"os\.system\s*\(",
            ],
        ),
        _llm_entry(
            cat_id="LLM03",
            name="Training Data Poisoning",
            description=(
                "Malicious or corrupted data introduced into training / fine-tuning pipelines "
                "causing biased, backdoored, or degraded model behaviour."
            ),
            examples=[
                "No data validation before fine-tuning",
                "Loading training datasets from unverified URLs",
                "Accepting user-supplied training examples without filtering",
            ],
            severity="high",
            cwe="CWE-20",
            patterns=[
                r"download.*dataset",
                r"load_dataset\(.*http",
                r"requests\.get.*train",
                r"urllib.*train",
            ],
        ),
        _llm_entry(
            cat_id="LLM04",
            name="Model Denial of Service",
            description=(
                "Inputs crafted to consume excessive compute resources "
                "(token bombs, unbounded context, recursive prompts)."
            ),
            examples=[
                "No max_tokens / max_length enforcement",
                "Accepting arbitrarily long user prompts",
                "Recursive agent calls without depth limit",
            ],
            severity="high",
            cwe="CWE-400",
            patterns=[
                r"max_tokens\s*=\s*None",
                r"max_length\s*=\s*None",
                r"while True.*generate",
            ],
        ),
        _llm_entry(
            cat_id="LLM06",
            name="Sensitive Information Disclosure",
            description=(
                "Model reveals confidential training data, system prompts, API keys, "
                "or PII due to insufficient access controls or prompt engineering."
            ),
            examples=[
                "Hardcoded API keys passed in prompts",
                "PII embedded in embedding vectors",
                "System prompt leaked via adversarial queries",
            ],
            severity="high",
            cwe="CWE-200",
            patterns=[
                r"(?i)(api_key|hf_token|openai_api_key|secret_key)\s*=\s*['\"][A-Za-z0-9_\-]{10,}",
                r"(?i)bearer\s+[A-Za-z0-9_\-\.]{20,}",
                r"(?i)sk-[A-Za-z0-9]{32,}",
                r"(?i)hf_[A-Za-z0-9]{20,}",
            ],
        ),
        _llm_entry(
            cat_id="LLM08",
            name="Excessive Agency",
            description=(
                "An LLM agent is granted more permissions or capabilities than needed, "
                "allowing it to take unintended high-impact actions."
            ),
            examples=[
                "Agent has filesystem write access with no scope limit",
                "Agent can call any external API without allowlist",
                "No human-in-the-loop for destructive operations",
            ],
            severity="high",
            cwe="CWE-269",
            patterns=[
                r"tools\s*=\s*\[.*all_tools",
                r"allow_dangerous_requests\s*=\s*True",
                r"run_manager.*no.*confirm",
            ],
        ),
        _llm_entry(
            cat_id="LLM09",
            name="Overreliance",
            description=(
                "System depends on LLM output for critical decisions without human oversight "
                "or validation layers."
            ),
            examples=[
                "Auto-executing LLM-suggested shell commands",
                "Financial decisions made purely from model output",
                "No fallback when model returns malformed data",
            ],
            severity="medium",
            cwe="CWE-636",
            patterns=[
                r"auto_run\s*=\s*True",
                r"autonomous.*mode",
                r"no.*human.*loop",
            ],
        ),
    )
}


# ──────────────────────────────────────────────
# OWASP Web Top-10 applied to ML serving
# ──────────────────────────────────────────────

# Web-application categories keyed by ID. Every record carries "id", "name",
# "description", "severity", "cwe" and a list of regex source strings under
# "patterns". Unlike the LLM table, these records have no "examples" key.
OWASP_WEB_TOP10: Dict[str, Dict] = {
    "A01": {
        "id": "A01",
        "name": "Broken Access Control",
        "description": "Model endpoints exposed without authentication.",
        "severity": "critical",
        "cwe": "CWE-284",
        # NOTE(review): these match any route declaration, not the absence of
        # an auth check; presumably the caller inspects the hit further.
        "patterns": [
            r"@app\.route.*methods.*POST",
            r"router\.(post|get|put)\s*\(",
        ],
    },
    "A02": {
        "id": "A02",
        "name": "Cryptographic Failures",
        "description": "Sensitive data transmitted or stored without encryption.",
        "severity": "high",
        "cwe": "CWE-311",
        "patterns": [
            # Plain-HTTP URLs, except loopback targets.
            r"http://(?!localhost|127\.0\.0\.1)",
            r"verify\s*=\s*False",
        ],
    },
    "A03": {
        "id": "A03",
        "name": "Injection",
        "description": "SQL/command injection in RAG pipeline queries or model serving endpoints.",
        "severity": "critical",
        "cwe": "CWE-89",
        "patterns": [
            r"cursor\.execute\s*\(\s*f['\"]",
            # Require the `%` formatting operator after the string literal.
            # A bare "%s" inside the query is the DB-API placeholder used by
            # safe parameterized calls such as execute("... %s", (value,)),
            # which must not be reported as injection.
            r'cursor\.execute\s*\(\s*".*%s.*"\s*%',
            r"\.format\(.*user",
            r"SELECT.*\+.*user_input",
        ],
    },
    "A04": {
        "id": "A04",
        "name": "Insecure Design",
        # NOTE(review): OWASP Top-10 2021 files insecure deserialization
        # (CWE-502) under A08 rather than A04 -- confirm the intended mapping.
        "description": "Pickle deserialization from untrusted model file sources.",
        "severity": "critical",
        "cwe": "CWE-502",
        "patterns": [
            r"pickle\.load\s*\(",
            r"pickle\.loads\s*\(",
            r"torch\.load\s*\(.*map_location",
            r"joblib\.load\s*\(",
        ],
    },
    "A05": {
        "id": "A05",
        "name": "Security Misconfiguration",
        "description": "Debug mode enabled, CORS unrestricted, or default credentials.",
        "severity": "medium",
        "cwe": "CWE-16",
        "patterns": [
            r"debug\s*=\s*True",
            r'allow_origins\s*=\s*\["\*"\]',
            r"cors.*\*",
        ],
    },
    "A07": {
        "id": "A07",
        "name": "Identification and Authentication Failures",
        "description": "Hardcoded API keys or tokens in source code.",
        "severity": "critical",
        "cwe": "CWE-798",
        "patterns": [
            r"(?i)(password|passwd|pwd)\s*=\s*['\"].{4,}['\"]",
            r"(?i)(api_key|apikey|api_secret)\s*=\s*['\"][^'\"]{6,}['\"]",
            r"(?i)token\s*=\s*['\"][A-Za-z0-9_\-\.]{10,}['\"]",
        ],
    },
    "A08": {
        "id": "A08",
        "name": "Software and Data Integrity Failures",
        "description": "Loading model weights or packages from unverified sources without integrity checks.",
        "severity": "high",
        "cwe": "CWE-494",
        "patterns": [
            r"torch\.hub\.load\s*\(",
            r"from_pretrained\s*\(.*http",
            r"requests\.get.*model.*verify\s*=\s*False",
        ],
    },
    "A10": {
        "id": "A10",
        "name": "Server-Side Request Forgery",
        "description": "User-controlled URLs fetched by the server (e.g. model download path).",
        "severity": "high",
        "cwe": "CWE-918",
        "patterns": [
            r"requests\.get\s*\(\s*request\.",
            r"urllib\.request\.urlopen\s*\(\s*(user|param|input|query)",
        ],
    },
}


# ──────────────────────────────────────────────
# ML-specific vulnerability patterns
# ──────────────────────────────────────────────

# ML-serving findings that fall outside the OWASP tables. Each record has
# "id", "name", "description", "severity", "cwe" and "patterns" (regex source
# strings); some also list "anti_patterns".
# NOTE(review): how "anti_patterns" are consumed is not visible in this module;
# presumably a match suppresses the finding -- confirm against the scanner.
ML_SPECIFIC_VULNS: List[Dict] = [
    {
        "id": "ML01",
        # The em dash was stored as mojibake ("β€”"); restored to the real character.
        "name": "GPU Memory Leak — Tensor Not Released",
        "description": "GPU tensors retained on device after inference causing progressive VRAM exhaustion.",
        "severity": "high",
        "cwe": "CWE-401",
        "patterns": [
            r"\.cuda\(\)",
            r"\.to\(['\"]cuda['\"]",
            r"\.to\(device\)",
        ],
        "anti_patterns": [
            r"\.cpu\(\)",
            r"del\s+",
            r"torch\.cuda\.empty_cache",
        ],
    },
    {
        "id": "ML02",
        "name": "Missing @torch.no_grad on Inference",
        "description": "Running inference without no_grad() computes unnecessary gradients, wasting 2x memory.",
        "severity": "medium",
        "cwe": "CWE-400",
        "patterns": [
            r"def\s+(predict|infer|inference|generate|forward)\s*\(",
        ],
        "anti_patterns": [
            r"@torch\.no_grad",
            r"with torch\.no_grad",
        ],
    },
    {
        "id": "ML03",
        "name": "N+1 Embedding Calls",
        "description": "Embedding model called once per item in a loop instead of in a single batch call.",
        "severity": "medium",
        "cwe": "CWE-405",
        # These patterns span a newline, so they can only match when the
        # scanned text contains more than one line.
        "patterns": [
            r"for .* in .*:\s*\n.*embed",
            r"for .* in .*:\s*\n.*encode",
        ],
    },
    {
        "id": "ML04",
        # The em dash was stored as mojibake ("β€”"); restored to the real character.
        "name": "FP32 Inference — Should Use FP16/BF16",
        "description": "Model loaded in float32 wastes 2x VRAM vs float16/bfloat16.",
        "severity": "low",
        "cwe": "CWE-400",
        "patterns": [
            r"torch_dtype\s*=\s*torch\.float32",
            r"\.float\(\)",
        ],
        # NOTE(review): "torch_dtype" below also matches the first positive
        # pattern above, so if anti-patterns suppress findings that pattern
        # can never fire. Left unchanged pending confirmation.
        "anti_patterns": [
            r"float16|bfloat16|fp16|bf16|torch_dtype",
        ],
    },
    {
        "id": "ML05",
        "name": "Synchronous Model Loading in Request Handler",
        "description": "Loading model weights inside a per-request handler blocks the event loop and causes timeouts.",
        "severity": "high",
        "cwe": "CWE-400",
        "patterns": [
            r"(AutoModel|AutoTokenizer|from_pretrained).*inside.*route",
            r"def\s+(predict|infer).*:\s*\n.*from_pretrained",
        ],
    },
]


# ──────────────────────────────────────────────
# Convenience accessors
# ──────────────────────────────────────────────

# Combined ID -> category lookup: LLM categories first, then the web ones.
# On a key clash the web entry would win. ML_SPECIFIC_VULNS is not merged in.
ALL_CATEGORIES: Dict[str, Dict] = dict(OWASP_LLM_TOP10)
ALL_CATEGORIES.update(OWASP_WEB_TOP10)


def get_category(category_id: str) -> Dict:
    """Return a vulnerability category dict by ID (e.g. 'LLM01', 'A03', 'ML02').

    The lookup is case-insensitive. ``ALL_CATEGORIES`` is consulted first;
    the entries of ``ML_SPECIFIC_VULNS`` are then searched by their ``"id"``
    so that every ``category_id`` emitted by ``get_all_patterns`` resolves.

    Args:
        category_id: Category identifier in any letter case.

    Returns:
        The matching module-level category dict (not a copy), or an empty
        dict when the ID is unknown.
    """
    key = category_id.upper()
    found = ALL_CATEGORIES.get(key)
    if found is not None:
        return found
    # ML-specific entries live in a list rather than the merged dict, so
    # they need a linear scan by their "id" field.
    for vuln in ML_SPECIFIC_VULNS:
        if vuln.get("id") == key:
            return vuln
    return {}


def get_all_patterns() -> List[Dict]:
    """Flatten every category into one record per regex pattern.

    Records for ``ALL_CATEGORIES`` come first (in dict order, keyed by the
    dict key), followed by records for ``ML_SPECIFIC_VULNS`` (keyed by each
    entry's ``"id"``). Entries without a ``"patterns"`` key contribute nothing.
    """

    def _row(regex: str, ident: str, entry: Dict) -> Dict:
        # One flat scan record; "cwe" is optional and defaults to "".
        return {
            "pattern": regex,
            "category_id": ident,
            "category_name": entry["name"],
            "severity": entry["severity"],
            "cwe": entry.get("cwe", ""),
            "description": entry["description"],
        }

    rows = [
        _row(regex, key, entry)
        for key, entry in ALL_CATEGORIES.items()
        for regex in entry.get("patterns", [])
    ]
    rows.extend(
        _row(regex, item["id"], item)
        for item in ML_SPECIFIC_VULNS
        for regex in item.get("patterns", [])
    )
    return rows