MartyNattakit committed on
Commit
6b0305e
·
unverified ·
0 Parent(s):

Add files via upload

Browse files
cleaned/cwe_top5_sampled.csv ADDED
The diff for this file is too large to render. See raw diff
 
cleaned/cwe_top5_sampled_with_juliet_none.csv ADDED
The diff for this file is too large to render. See raw diff
 
codebertfinal1.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
data/check_file_cache.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import glob
3
+
# Root directory that is searched (recursively) for Juliet test-case sources.
base_dir = "C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support"

# "**" together with recursive=True lets the pattern match at any depth.
pattern = os.path.join(base_dir, "**", "CWE121_Stack_Based_Buffer_Overflow*.c")

# Index every match by its bare file name; if two matches share a name,
# the one returned later by glob wins.
files = {}
for matched_path in glob.glob(pattern, recursive=True):
    files[os.path.basename(matched_path)] = matched_path
print(f"Found {len(files)} .c files")

# Spot-check a few file names and report whether each was indexed.
expected_names = (
    "CWE121_Stack_Based_Buffer_Overflow__CWE805_wchar_t_declare_memcpy_32.c",
    "CWE121_Stack_Based_Buffer_Overflow__CWE131_memmove_18.c",
    "CWE121_Stack_Based_Buffer_Overflow__CWE135_01.c",
)
for file in expected_names:
    status = "Found" if file in files else "Not found"
    print(f"{file}: {status}")
data/dataleakanalysis.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
data/extract_all_cwes.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+ import re
4
+ import xml.etree.ElementTree as ET
5
+
# CSV with a "file_path" column listing the source files to process.
input_file = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\all_juliet_files.csv"
# Prefix for the per-batch output files ("<output_csv>.<batch index>.csv").
output_csv = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\all_cwes_dataset.csv"

# Path fragments that, combined with BAD_SUFFIX_RE, mark a file as "bad"
# when no label XML is available.
bad_paths = ["s01", "s03", "s05", "s07"]
batch_size = 10000

# Compiled once instead of on every file.
CWE_ID_RE = re.compile(r"CWE\d+")
BAD_SUFFIX_RE = re.compile(r"_(0[13579]|[1-9][0-9])\.c$")


def label_xml_path(file_path):
    """Return the path of the ``.label.xml`` file that sits next to *file_path*.

    Only the final extension is swapped. The previous
    ``file_path.replace(".c", ".label.xml")`` rewrote every ".c" occurrence,
    so a directory such as ``juliet.corpus`` corrupted the result.
    """
    stem, _ext = os.path.splitext(file_path)
    return stem + ".label.xml"


def is_bad_by_path(file_path, file_name):
    """Return True when the path heuristic classifies the file as "bad".

    The heuristic requires both that the lower-cased path contains one of
    ``bad_paths`` and that *file_name* matches ``BAD_SUFFIX_RE``. The result
    is always a real ``bool`` (it used to be ``False``/``None``/``re.Match``),
    so it prints cleanly in the progress log.
    """
    normalized_path = file_path.lower().replace("\\", "/")
    in_bad_dir = any(fragment in normalized_path for fragment in bad_paths)
    return in_bad_dir and BAD_SUFFIX_RE.search(file_name) is not None


def label_for(file_path, is_bad_path):
    """Return "bad" or "good" for *file_path*.

    If a label XML exists it is authoritative: "bad" only when it contains a
    ``flaw`` element. Otherwise the path heuristic result *is_bad_path*
    decides.

    Raises:
        xml.etree.ElementTree.ParseError: if the label XML is malformed.
    """
    xml_path = label_xml_path(file_path)
    if os.path.exists(xml_path):
        tree = ET.parse(xml_path)
        return "bad" if tree.find(".//flaw") is not None else "good"
    return "bad" if is_bad_path else "good"


def main():
    """Read the file list and write one labelled CSV per batch of rows."""
    df = pd.read_csv(input_file)
    for i in range(0, len(df), batch_size):
        data = {"file": [], "cwe": [], "label": [], "code": []}
        batch = df[i:i+batch_size]
        for file_path in batch["file_path"]:
            # Best-effort per file: any failure is reported and the file is
            # skipped so one bad entry cannot abort the whole batch.
            try:
                file_name = os.path.basename(file_path)

                # Skip non-CWE files
                cwe_match = CWE_ID_RE.search(file_name)
                if "testcasesupport" in file_path.lower() or not cwe_match:
                    print(f"Skipped: {file_name} (non-CWE or testcasesupport)")
                    continue

                with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                    code = f.read()
                print(f"Processing: {file_name}")

                # The skip check above guarantees a match, so no fallback
                # value is needed here.
                cwe = cwe_match.group(0)

                is_bad_path = is_bad_by_path(file_path, file_name)
                label = label_for(file_path, is_bad_path)

                print(f"File: {file_name}, CWE: {cwe}, Path: {is_bad_path}, Label: {label}")

                data["file"].append(file_name)
                data["cwe"].append(cwe)
                data["label"].append(label)
                data["code"].append(code)
            except Exception as e:
                print(f"Error: {file_path}: {e}")

        batch_df = pd.DataFrame(data)
        batch_df.to_csv(f"{output_csv}.{i//batch_size}.csv", index=False)
        print(f"Saved batch {i//batch_size} with {len(batch_df)} rows")


if __name__ == "__main__":
    main()
data/generate_cwe_top5_sampled.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
# Paths
all_cwes_csv = "C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\Demo\\all_cwes_dataset.csv"
output_csv = "C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled.csv"

# Top 5 CWEs (from all_cwes_dataset.csv)
top_cwes = ["CWE121", "CWE78", "CWE190", "CWE191", "CWE122"]


def sample_per_cwe(df, cwes, per_cwe=400, random_state=42):
    """Return up to *per_cwe* randomly sampled rows of *df* for each CWE.

    Args:
        df: DataFrame with a ``cwe`` column.
        cwes: CWE labels to keep; the output follows this order.
        per_cwe: maximum rows per label (all rows are kept if fewer exist).
        random_state: seed passed to ``DataFrame.sample`` for reproducibility.

    Returns:
        The concatenated samples with their original index values, or an
        empty DataFrame when none of *cwes* occurs in *df*.

    Raises:
        KeyError: if *df* has no ``cwe`` column.
    """
    if "cwe" not in df.columns:
        raise KeyError(f"'cwe' column not found; available columns: {list(df.columns)}")

    # Collect the per-label samples and concatenate once, instead of
    # re-copying the growing result on every iteration.
    parts = []
    for cwe in cwes:
        cwe_df = df[df["cwe"] == cwe]
        if cwe_df.empty:
            continue
        sample_size = min(per_cwe, len(cwe_df))
        parts.append(cwe_df.sample(n=sample_size, random_state=random_state))

    if not parts:
        return pd.DataFrame()
    return pd.concat(parts)


def main():
    """Load the full dataset, sample 400 files per top CWE, and save the result."""
    # Load all CWEs dataset
    df = pd.read_csv(all_cwes_csv)

    # Sample 400 files per CWE (or all if fewer)
    df_sampled = sample_per_cwe(df, top_cwes)

    # Save
    df_sampled.to_csv(output_csv, index=False, encoding='utf-8')
    print(f"Created {output_csv} with {len(df_sampled)} files")
    if not df_sampled.empty:
        print(f"CWE counts:\n{df_sampled['cwe'].value_counts().to_string()}")
    else:
        print("No data sampled. Check column name or top_cwes list.")


if __name__ == "__main__":
    main()
data/list_all_juliet_files.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+
# Directory tree to scan and the CSV that receives the resulting file list.
juliet_root = r"C:\Users\MartyNattakit\Desktop\Datasets\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support"
output_file = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\all_juliet_files.csv"


def _iter_c_sources(top):
    """Yield the full path of every file under *top* whose name ends in ".c"."""
    for root, _, filenames in os.walk(top):
        for fname in filenames:
            if not fname.endswith(".c"):
                continue
            yield os.path.join(root, fname)


files = list(_iter_c_sources(juliet_root))

# One row per file, in a single "file_path" column.
df = pd.DataFrame(files, columns=["file_path"])
df.to_csv(output_file, index=False)
print(f"Saved {len(files)} files to {output_file}")
data/nonecategory.ipynb ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "ca65c59e",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import pandas as pd\n",
11
+ "import os\n",
12
+ "import glob\n",
13
+ "import re\n",
14
+ "\n",
# One pass over the source: comments and string/character literals are matched
# together so that "//" or "/*" inside a literal is never mistaken for a comment.
_C_TOKEN_RE = re.compile(
    r'//[^\n]*'                  # line comment, up to (not including) the newline
    r'|/\*.*?\*/'                # block comment, possibly spanning lines
    r'|"(?:\\.|[^"\\\n])*"'      # string literal
    r"|'(?:\\.|[^'\\\n])*'",     # character literal
    re.DOTALL,
)


def strip_comments_and_cwe(code):
    """Strip comments and CWE-related identifiers from C source text.

    Steps, in order:
      1. Remove ``//`` and ``/* */`` comments. Comment markers inside string
         or character literals are left alone, and a ``//`` comment on the
         final line is removed even without a trailing newline.
      2. Replace every identifier of the form ``CWE<digits>_<word chars>``
         with ``var``. Any number of digits is accepted, so two-digit IDs
         such as ``CWE78_...`` are replaced as well as ``CWE121_...``.
      3. Collapse runs of blank lines and trim leading/trailing whitespace.

    Args:
        code: source text as ``str``.

    Returns:
        The cleaned source text.
    """
    def _drop_comment(match):
        token = match.group(0)
        # Comments start with "/", literals start with a quote.
        return "" if token.startswith("/") else token

    code = _C_TOKEN_RE.sub(_drop_comment, code)
    code = re.sub(r'\bCWE\d+_\w+', 'var', code)
    code = re.sub(r'\n\s*\n', '\n', code).strip()
    return code
22
+ "\n",
def extract_none_samples_from_juliet(juliet_dir):
    """Extract 'good' (non-vulnerable) samples from Juliet dataset and label as 'none'.

    For each target CWE, every directory named ``<CWE>_*`` at any depth under
    *juliet_dir* is searched recursively for files matching ``*good*.c``.

    Differences from the earlier top-level-only search:
      * the search is recursive, so CWE directories nested below
        *juliet_dir* are found;
      * the directory pattern is ``<CWE>_*`` rather than ``<CWE>*``, so
        ``CWE78`` no longer also matches longer IDs such as ``CWE780``/``CWE789``;
      * non-directories (e.g. a ``cwe121_results.txt`` file) are ignored.

    NOTE(review): whether the dataset actually contains files named
    ``*good*.c`` cannot be determined from this code - confirm the matching
    rule against the dataset layout.

    Args:
        juliet_dir: root directory of the Juliet checkout.

    Returns:
        DataFrame with columns ``cwe`` (always ``'none'``), ``code`` and
        ``file`` (base name). It has zero rows when nothing matched.
    """
    cwes = ['CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191']
    good_samples = []
    seen_paths = set()  # guards against visiting a file via two nested CWE dirs

    for cwe in cwes:
        dir_pattern = os.path.join(juliet_dir, '**', f'{cwe}_*')
        cwe_dirs = sorted(
            path for path in glob.glob(dir_pattern, recursive=True)
            if os.path.isdir(path)
        )
        for dir_path in cwe_dirs:
            file_pattern = os.path.join(dir_path, '**', '*good*.c')
            for file_path in sorted(glob.glob(file_pattern, recursive=True)):
                if file_path in seen_paths:
                    continue
                seen_paths.add(file_path)
                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        code = f.read()
                except OSError as e:
                    # Unreadable files are reported and skipped.
                    print(f"Error reading {file_path}: {e}")
                    continue
                good_samples.append({
                    'cwe': 'none',
                    'code': code,
                    'file': os.path.basename(file_path)
                })

    return pd.DataFrame(good_samples, columns=['cwe', 'code', 'file'])
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": null,
51
+ "id": "ef098685",
52
+ "metadata": {},
53
+ "outputs": [
54
+ {
55
+ "name": "stdout",
56
+ "output_type": "stream",
57
+ "text": [
58
+ "Juliet directory exists: True\n",
59
+ "\n",
60
+ "Looking for CWE121 directories: ['C:\\\\Users\\\\MartyNattakit\\\\Desktop\\\\Datasets\\\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\\\\cwe121_results.txt']\n",
61
+ "Good files in C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\\cwe121_results.txt: []\n",
62
+ "\n",
63
+ "Looking for CWE78 directories: []\n",
64
+ "\n",
65
+ "Looking for CWE122 directories: []\n",
66
+ "\n",
67
+ "Looking for CWE190 directories: []\n",
68
+ "\n",
69
+ "Looking for CWE191 directories: []\n"
70
+ ]
71
+ }
72
+ ],
73
+ "source": [
74
+ "import os\n",
75
+ "import glob\n",
76
+ "\n",
# Location of the Juliet checkout that is inspected below.
juliet_dir = r"C:\Users\MartyNattakit\Desktop\Datasets\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support"

# Report whether the root directory is present at all.
root_present = os.path.exists(juliet_dir)
print(f"Juliet directory exists: {root_present}")

# For each target CWE, list the top-level entries whose name starts with the
# CWE id, then the "*good*.c" files directly inside each entry.
cwes = ['CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191']
for cwe in cwes:
    cwe_dir = os.path.join(juliet_dir, f'{cwe}*')
    cwe_dirs = glob.glob(cwe_dir)
    print(f"\nLooking for {cwe} directories: {cwe_dirs}")
    for dir_path in cwe_dirs:
        good_pattern = os.path.join(dir_path, '*good*.c')
        good_files = glob.glob(good_pattern)
        print(f"Good files in {dir_path}: {good_files}")
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": null,
97
+ "id": "b95cd6d2",
98
+ "metadata": {},
99
+ "outputs": [
100
+ {
101
+ "name": "stdout",
102
+ "output_type": "stream",
103
+ "text": [
104
+ "Loaded original dataset with 2000 samples.\n",
105
+ "Original CWE Distribution:\n",
106
+ " cwe\n",
107
+ "CWE121 400\n",
108
+ "CWE78 400\n",
109
+ "CWE190 400\n",
110
+ "CWE191 400\n",
111
+ "CWE122 400\n",
112
+ "Name: count, dtype: int64\n",
113
+ "Extracted 0 'none' samples from Juliet.\n",
114
+ "No 'none' samples found in Juliet. Adding synthetic 'none' samples instead...\n",
115
+ "Updated dataset saved as C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled_with_juliet_none.csv with 2400 samples.\n",
116
+ "Final CWE Distribution:\n",
117
+ " cwe\n",
118
+ "CWE121 400\n",
119
+ "CWE78 400\n",
120
+ "CWE190 400\n",
121
+ "CWE191 400\n",
122
+ "CWE122 400\n",
123
+ "none 400\n",
124
+ "Name: count, dtype: int64\n",
125
+ "Unique CWE labels: ['CWE121' 'CWE78' 'CWE190' 'CWE191' 'CWE122' 'none']\n"
126
+ ]
127
+ }
128
+ ],
129
+ "source": [
# Build the final dataset: original samples plus a 'none' class, then clean and save.
original_csv_path = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\cwe_top5_sampled.csv"
juliet_dir = r"C:\Users\MartyNattakit\Desktop\Datasets\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support"
output_csv_path = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\cwe_top5_sampled_with_juliet_none.csv"

# Read the sampled per-CWE dataset and report its class balance.
full_df = pd.read_csv(original_csv_path)
print(f"Loaded original dataset with {len(full_df)} samples.")
print("Original CWE Distribution:\n", full_df['cwe'].value_counts())

# Try to obtain 'none' samples from the Juliet checkout.
none_df = extract_none_samples_from_juliet(juliet_dir)
print(f"Extracted {len(none_df)} 'none' samples from Juliet.")

# If Juliet yielded nothing, substitute 400 synthetic rows: five fixed
# snippets, each repeated 80 times, with unique synthetic file names.
if not len(none_df):
    print("No 'none' samples found in Juliet. Adding synthetic 'none' samples instead...")
    synthetic_snippets = [
        'int main() { printf("Hello, World!"); return 0; }',
        'void func() { int x = 5; printf("%d", x); }',
        'int add(int a, int b) { return a + b; }',
        'void loop() { for(int i = 0; i < 10; i++) { printf("."); } }',
        'int main() { char str[] = "test"; puts(str); return 0; }',
    ]
    none_samples = pd.DataFrame({
        'cwe': ['none'] * 400,
        'code': synthetic_snippets * 80,
        'file': [f'synthetic_none_{i}.c' for i in range(400)],
    })
    none_df = none_samples

# Append the 'none' rows after the original rows with a fresh 0..n-1 index.
full_df = pd.concat([full_df, none_df], ignore_index=True)

# Apply the comment/CWE-identifier cleaning to every code sample.
full_df['code'] = full_df['code'].apply(strip_comments_and_cwe)

# Write the combined dataset and report the resulting class balance.
full_df.to_csv(output_csv_path, index=False)
print(f"Updated dataset saved as {output_csv_path} with {len(full_df)} samples.")
print("Final CWE Distribution:\n", full_df['cwe'].value_counts())
print("Unique CWE labels:", full_df['cwe'].unique())
171
+ ]
172
+ }
173
+ ],
174
+ "metadata": {
175
+ "kernelspec": {
176
+ "display_name": "Python 3",
177
+ "language": "python",
178
+ "name": "python3"
179
+ },
180
+ "language_info": {
181
+ "codemirror_mode": {
182
+ "name": "ipython",
183
+ "version": 3
184
+ },
185
+ "file_extension": ".py",
186
+ "mimetype": "text/x-python",
187
+ "name": "python",
188
+ "nbconvert_exporter": "python",
189
+ "pygments_lexer": "ipython3",
190
+ "version": "3.12.6"
191
+ }
192
+ },
193
+ "nbformat": 4,
194
+ "nbformat_minor": 5
195
+ }