MartyNattakit committed on
Commit
7e7d487
·
unverified ·
1 Parent(s): 74b2969

Add files via upload

Browse files
scripts/check_file_cache.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import glob
3
+
# Root of the local Juliet C/C++ checkout that gets searched.
base_dir = "C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support"

# Recursive pattern: every CWE121 stack-overflow ``.c`` file at any depth below base_dir.
pattern = os.path.join(base_dir, "**", "CWE121_Stack_Based_Buffer_Overflow*.c")

# Index the matches by bare file name. If two paths share a name, the one
# returned later by glob replaces the earlier one, so the count printed
# below is the number of distinct file names.
files = {}
for found_path in glob.glob(pattern, recursive=True):
    files[os.path.basename(found_path)] = found_path
print(f"Found {len(files)} .c files")

# Spot-check a few specific test cases and report whether each was indexed.
expected_names = (
    "CWE121_Stack_Based_Buffer_Overflow__CWE805_wchar_t_declare_memcpy_32.c",
    "CWE121_Stack_Based_Buffer_Overflow__CWE131_memmove_18.c",
    "CWE121_Stack_Based_Buffer_Overflow__CWE135_01.c",
)
for expected in expected_names:
    status = "Found" if expected in files else "Not found"
    print(f"{expected}: {status}")
scripts/dataleakanalysis.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
scripts/extract_all_cwes.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+ import re
4
+ import xml.etree.ElementTree as ET
5
+
input_file = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\all_juliet_files.csv"
output_csv = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\all_cwes_dataset.csv"

# Directory names whose files may be labelled "bad" by the path heuristic
# (used only when a file has no ``.label.xml`` sidecar).
bad_paths = ["s01", "s03", "s05", "s07"]
# Number of input paths handled per output CSV.
batch_size = 10000

# Compiled once: both patterns are evaluated for every listed file.
_CWE_ID = re.compile(r"CWE\d+")
_BAD_VARIANT_SUFFIX = re.compile(r"_(0[13579]|[1-9][0-9])\.c$")


def _is_bad_path(file_path, file_name):
    """Return True when the path heuristic marks the file as "bad".

    Both conditions must hold: one directory component of the lower-cased,
    slash-normalised path equals an entry of ``bad_paths``, and the file
    name ends in ``_NN.c`` with NN being 01/03/05/07/09 or 10-99.
    """
    # Compare whole directory components so that e.g. "s01" cannot match by
    # accident inside an unrelated folder or file name.
    directories = file_path.lower().replace("\\", "/").split("/")[:-1]
    in_bad_dir = any(part in bad_paths for part in directories)
    return in_bad_dir and _BAD_VARIANT_SUFFIX.search(file_name) is not None


def _label_from_sidecar(file_path):
    """Return "bad"/"good" from the ``.label.xml`` sidecar, or None if absent.

    The sidecar sits next to the source file with the extension swapped.
    A sidecar containing any ``<flaw>`` element means "bad"; a sidecar
    without one means "good".
    """
    # splitext swaps only the final extension; str.replace(".c", ...) would
    # also rewrite ".c" occurring earlier in the path (e.g. "my.code/").
    stem, _ext = os.path.splitext(file_path)
    xml_path = stem + ".label.xml"
    if not os.path.exists(xml_path):
        return None
    has_flaw = ET.parse(xml_path).find(".//flaw") is not None
    return "bad" if has_flaw else "good"


def build_row(file_path):
    """Build one dataset row for *file_path*.

    Returns:
        A dict with keys ``file``, ``cwe``, ``label`` and ``code``, or None
        when the file is skipped (path contains "testcasesupport", or the
        file name has no ``CWE<digits>`` id).

    Raises:
        OSError: if the source file cannot be read.
        xml.etree.ElementTree.ParseError: if the sidecar is malformed.
    """
    file_name = os.path.basename(file_path)

    # Skip non-CWE files
    cwe_match = _CWE_ID.search(file_name)
    if "testcasesupport" in file_path.lower() or cwe_match is None:
        print(f"Skipped: {file_name} (non-CWE or testcasesupport)")
        return None

    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        code = f.read()
    print(f"Processing: {file_name}")

    cwe = cwe_match.group(0)
    is_bad_path = _is_bad_path(file_path, file_name)

    # The XML sidecar wins when present; the path heuristic is the fallback.
    label = _label_from_sidecar(file_path)
    if label is None:
        label = "bad" if is_bad_path else "good"

    print(f"File: {file_name}, CWE: {cwe}, Path: {is_bad_path}, Label: {label}")
    return {"file": file_name, "cwe": cwe, "label": label, "code": code}


def main():
    """Label every file listed in ``input_file``, writing one CSV per batch.

    Batch *k* is written to ``f"{output_csv}.{k}.csv"``.
    """
    df = pd.read_csv(input_file)
    for start in range(0, len(df), batch_size):
        batch_no = start // batch_size
        rows = []
        for file_path in df["file_path"].iloc[start:start + batch_size]:
            try:
                row = build_row(file_path)
            except Exception as e:
                # Best-effort: one unreadable or malformed file must not
                # abort the whole batch; report it and move on.
                print(f"Error: {file_path}: {e}")
                continue
            if row is not None:
                rows.append(row)

        # Explicit columns keep the CSV header stable even for an empty batch.
        batch_df = pd.DataFrame(rows, columns=["file", "cwe", "label", "code"])
        batch_df.to_csv(f"{output_csv}.{batch_no}.csv", index=False)
        print(f"Saved batch {batch_no} with {len(batch_df)} rows")


if __name__ == "__main__":
    main()
scripts/generate_cwe_top5_sampled.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
# Paths
all_cwes_csv = "C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\Demo\\all_cwes_dataset.csv"
output_csv = "C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled.csv"

# Top 5 CWEs (from all_cwes_dataset.csv)
top_cwes = ["CWE121", "CWE78", "CWE190", "CWE191", "CWE122"]


def sample_per_cwe(df, cwes, per_cwe=400, random_state=42):
    """Return up to *per_cwe* randomly sampled rows for each id in *cwes*.

    Args:
        df: DataFrame with a ``cwe`` column.
        cwes: CWE ids to keep; the result follows this order.
        per_cwe: maximum rows per id (all rows are taken when fewer exist).
        random_state: seed passed to ``DataFrame.sample`` so the selection
            is reproducible.

    Returns:
        The sampled rows with their original index, or an empty DataFrame
        when none of the ids occur in *df*.
    """
    # Collect first and concatenate once: growing a frame with pd.concat
    # inside the loop recopies every earlier row on each iteration and
    # mixes an empty placeholder frame into the result.
    samples = []
    for cwe in cwes:
        cwe_df = df[df['cwe'] == cwe]
        sample_size = min(per_cwe, len(cwe_df))
        if sample_size > 0:
            samples.append(cwe_df.sample(n=sample_size, random_state=random_state))
    return pd.concat(samples) if samples else pd.DataFrame()


def main():
    """Sample the top-5 CWE rows from ``all_cwes_csv`` into ``output_csv``."""
    # Load all CWEs dataset
    df = pd.read_csv(all_cwes_csv)

    # Sample 400 files per CWE (or all if fewer)
    df_sampled = sample_per_cwe(df, top_cwes)

    # Save
    df_sampled.to_csv(output_csv, index=False, encoding='utf-8')
    print(f"Created {output_csv} with {len(df_sampled)} files")
    if not df_sampled.empty:
        print(f"CWE counts:\n{df_sampled['cwe'].value_counts().to_string()}")
    else:
        print("No data sampled. Check column name or top_cwes list.")


if __name__ == "__main__":
    main()
scripts/list_all_juliet_files.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+
juliet_root = r"C:\Users\MartyNattakit\Desktop\Datasets\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support"
output_file = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\all_juliet_files.csv"

# Walk the whole tree and keep the full path of every file whose name ends
# in ".c" (case-sensitive suffix test).
files = [
    os.path.join(folder, entry)
    for folder, _, entries in os.walk(juliet_root)
    for entry in entries
    if entry.endswith(".c")
]

# One-column listing: a "file_path" header followed by one path per row.
df = pd.DataFrame(files, columns=["file_path"])
df.to_csv(output_file, index=False)
print(f"Saved {len(files)} files to {output_file}")
scripts/nonecategory.ipynb ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "ca65c59e",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import pandas as pd\n",
11
+ "import os\n",
12
+ "import glob\n",
13
+ "import re\n",
14
+ "\n",
def strip_comments_and_cwe(code):
    """Strip comments and CWE-related identifiers from C source text.

    Removes ``/* ... */`` and ``// ...`` comments, replaces every identifier
    of the form ``CWE<digits>_<word chars>`` with ``var``, collapses blank
    lines and trims surrounding whitespace.

    Args:
        code: source text. Any non-string value (e.g. ``None`` or the NaN
            pandas yields for an empty CSV cell) is treated as empty.

    Returns:
        The cleaned source as a string.
    """
    if not isinstance(code, str):
        return ''
    # NOTE: purely textual; comment markers inside string literals are
    # stripped too.
    code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
    # Match up to (not including) the newline so that a comment on the last
    # line without a trailing newline is removed as well.
    code = re.sub(r'//[^\n]*', '', code)
    # \d+ rather than \d{3}: ids such as CWE78 have only two digits and would
    # otherwise stay in the text and leak the label.
    code = re.sub(r'\bCWE\d+_\w+', 'var', code)
    code = re.sub(r'\n\s*\n', '\n', code).strip()
    return code
22
+ "\n",
def extract_none_samples_from_juliet(juliet_dir):
    """Collect Juliet ``*good*.c`` files for the target CWEs, labelled 'none'.

    Directories whose name starts with one of the target CWE ids are located
    at any depth below *juliet_dir*; every ``*good*.c`` file at any depth
    inside them becomes one sample.

    Args:
        juliet_dir: root directory of the Juliet dataset.

    Returns:
        DataFrame with columns ``cwe`` (always ``'none'``), ``code`` and
        ``file`` (base name); empty when nothing matches.
    """
    # NOTE(review): this relies on "good" appearing in the file name; confirm
    # that the dataset actually names non-vulnerable C files this way.
    cwes = ['CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191']
    good_samples = []
    seen_paths = set()

    for cwe in cwes:
        # The id must not be followed by another digit, otherwise the
        # wildcard for CWE78 would also accept e.g. CWE780.
        exact_id = re.compile(rf'{cwe}(?!\d)', re.IGNORECASE)
        # '**' matches zero or more directory levels, so CWE folders directly
        # under the root are still found, as are nested ones.
        candidates = glob.glob(os.path.join(juliet_dir, '**', f'{cwe}*'), recursive=True)
        for dir_path in sorted(candidates):
            # Plain files can match the wildcard too (e.g. a results .txt).
            if not os.path.isdir(dir_path):
                continue
            if not exact_id.match(os.path.basename(dir_path)):
                continue
            good_files = glob.glob(os.path.join(dir_path, '**', '*good*.c'), recursive=True)
            for file_path in sorted(good_files):
                # Nested matching directories would yield the same file twice.
                if file_path in seen_paths:
                    continue
                seen_paths.add(file_path)
                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        code = f.read()
                except OSError as e:
                    print(f"Error reading {file_path}: {e}")
                    continue
                good_samples.append({
                    'cwe': 'none',
                    'code': code,
                    'file': os.path.basename(file_path)
                })

    # Explicit columns keep the schema stable when no sample was found.
    return pd.DataFrame(good_samples, columns=['cwe', 'code', 'file'])
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": null,
51
+ "id": "ef098685",
52
+ "metadata": {},
53
+ "outputs": [
54
+ {
55
+ "name": "stdout",
56
+ "output_type": "stream",
57
+ "text": [
58
+ "Juliet directory exists: True\n",
59
+ "\n",
60
+ "Looking for CWE121 directories: ['C:\\\\Users\\\\MartyNattakit\\\\Desktop\\\\Datasets\\\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\\\\cwe121_results.txt']\n",
61
+ "Good files in C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\\cwe121_results.txt: []\n",
62
+ "\n",
63
+ "Looking for CWE78 directories: []\n",
64
+ "\n",
65
+ "Looking for CWE122 directories: []\n",
66
+ "\n",
67
+ "Looking for CWE190 directories: []\n",
68
+ "\n",
69
+ "Looking for CWE191 directories: []\n"
70
+ ]
71
+ }
72
+ ],
73
+ "source": [
74
+ "import os\n",
75
+ "import glob\n",
76
+ "\n",
# Updated Juliet path
juliet_dir = r"C:\Users\MartyNattakit\Desktop\Datasets\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support"

# Report whether the dataset root exists at all.
root_present = os.path.exists(juliet_dir)
print(f"Juliet directory exists: {root_present}")

# For each CWE id, show the root-level entries whose name starts with it and
# the '*good*.c' files found directly inside each of those entries.
cwes = ['CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191']
for cwe_id in cwes:
    candidates = glob.glob(os.path.join(juliet_dir, f'{cwe_id}*'))
    print(f"\nLooking for {cwe_id} directories: {candidates}")
    for candidate in candidates:
        matches = glob.glob(os.path.join(candidate, '*good*.c'))
        print(f"Good files in {candidate}: {matches}")
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": null,
97
+ "id": "b95cd6d2",
98
+ "metadata": {},
99
+ "outputs": [
100
+ {
101
+ "name": "stdout",
102
+ "output_type": "stream",
103
+ "text": [
104
+ "Loaded original dataset with 2000 samples.\n",
105
+ "Original CWE Distribution:\n",
106
+ " cwe\n",
107
+ "CWE121 400\n",
108
+ "CWE78 400\n",
109
+ "CWE190 400\n",
110
+ "CWE191 400\n",
111
+ "CWE122 400\n",
112
+ "Name: count, dtype: int64\n",
113
+ "Extracted 0 'none' samples from Juliet.\n",
114
+ "No 'none' samples found in Juliet. Adding synthetic 'none' samples instead...\n",
115
+ "Updated dataset saved as C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled_with_juliet_none.csv with 2400 samples.\n",
116
+ "Final CWE Distribution:\n",
117
+ " cwe\n",
118
+ "CWE121 400\n",
119
+ "CWE78 400\n",
120
+ "CWE190 400\n",
121
+ "CWE191 400\n",
122
+ "CWE122 400\n",
123
+ "none 400\n",
124
+ "Name: count, dtype: int64\n",
125
+ "Unique CWE labels: ['CWE121' 'CWE78' 'CWE190' 'CWE191' 'CWE122' 'none']\n"
126
+ ]
127
+ }
128
+ ],
129
+ "source": [
#Prepare and Save Dataset
original_csv_path = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\cwe_top5_sampled.csv"
juliet_dir = r"C:\Users\MartyNattakit\Desktop\Datasets\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support"
output_csv_path = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\cwe_top5_sampled_with_juliet_none.csv"

# Read the sampled dataset and show its per-CWE counts.
full_df = pd.read_csv(original_csv_path)
print(f"Loaded original dataset with {len(full_df)} samples.")
print("Original CWE Distribution:\n", full_df['cwe'].value_counts())

# Try to obtain 'none' rows from the Juliet tree.
none_df = extract_none_samples_from_juliet(juliet_dir)
print(f"Extracted {len(none_df)} 'none' samples from Juliet.")

# Fallback when Juliet yielded nothing: 400 synthetic rows built by repeating
# five fixed snippets 80 times each, with sequentially numbered file names.
if not len(none_df):
    print("No 'none' samples found in Juliet. Adding synthetic 'none' samples instead...")
    snippets = [
        'int main() { printf("Hello, World!"); return 0; }',
        'void func() { int x = 5; printf("%d", x); }',
        'int add(int a, int b) { return a + b; }',
        'void loop() { for(int i = 0; i < 10; i++) { printf("."); } }',
        'int main() { char str[] = "test"; puts(str); return 0; }',
    ]
    synthetic_names = ['synthetic_none_' + str(idx) + '.c' for idx in range(400)]
    none_df = pd.DataFrame({
        'cwe': ['none'] * 400,
        'code': snippets * 80,
        'file': synthetic_names,
    })

# Append the 'none' rows after the original rows, renumbering the index.
full_df = pd.concat([full_df, none_df], ignore_index=True)

# Run every code cell through the comment / CWE-name stripper.
cleaned_code = full_df['code'].apply(strip_comments_and_cwe)
full_df['code'] = cleaned_code

# Write the combined dataset and report the final distribution.
full_df.to_csv(output_csv_path, index=False)
print(f"Updated dataset saved as {output_csv_path} with {len(full_df)} samples.")
print("Final CWE Distribution:\n", full_df['cwe'].value_counts())
print("Unique CWE labels:", full_df['cwe'].unique())
171
+ ]
172
+ }
173
+ ],
174
+ "metadata": {
175
+ "kernelspec": {
176
+ "display_name": "Python 3",
177
+ "language": "python",
178
+ "name": "python3"
179
+ },
180
+ "language_info": {
181
+ "codemirror_mode": {
182
+ "name": "ipython",
183
+ "version": 3
184
+ },
185
+ "file_extension": ".py",
186
+ "mimetype": "text/x-python",
187
+ "name": "python",
188
+ "nbconvert_exporter": "python",
189
+ "pygments_lexer": "ipython3",
190
+ "version": "3.12.6"
191
+ }
192
+ },
193
+ "nbformat": 4,
194
+ "nbformat_minor": 5
195
+ }