MartyNattakit committed on
Commit
74b2969
·
unverified ·
1 Parent(s): 6b0305e

Delete data directory

Browse files
data/check_file_cache.py DELETED
@@ -1,13 +0,0 @@
1
- import os
2
- import glob
3
-
4
# Sanity check: index every CWE121 C test case under the Juliet tree by its
# base name, then report whether a few known files are present.
base_dir = "C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support"
pattern = os.path.join(base_dir, "**", "CWE121_Stack_Based_Buffer_Overflow*.c")

# Map base name -> full path; a later path with the same base name wins.
files = {}
for found_path in glob.glob(pattern, recursive=True):
    files[os.path.basename(found_path)] = found_path
print(f"Found {len(files)} .c files")

expected_names = (
    "CWE121_Stack_Based_Buffer_Overflow__CWE805_wchar_t_declare_memcpy_32.c",
    "CWE121_Stack_Based_Buffer_Overflow__CWE131_memmove_18.c",
    "CWE121_Stack_Based_Buffer_Overflow__CWE135_01.c",
)
for file in expected_names:
    status = 'Found' if file in files else 'Not found'
    print(f"{file}: {status}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/dataleakanalysis.ipynb DELETED
The diff for this file is too large to render. See raw diff
 
data/extract_all_cwes.py DELETED
@@ -1,58 +0,0 @@
1
- import pandas as pd
2
- import os
3
- import re
4
- import xml.etree.ElementTree as ET
5
-
6
# Build a labelled dataset (file, cwe, label, code) from the list of Juliet
# source files. Results are written one CSV per batch so the full corpus is
# never held in memory at once.
input_file = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\all_juliet_files.csv"
output_csv = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\all_cwes_dataset.csv"

# Directory names treated as "bad" by the path-based fallback heuristic.
# NOTE(review): presumably these are Juliet split directories; confirm that
# they really correlate with vulnerable samples.
bad_paths = ["s01", "s03", "s05", "s07"]
batch_size = 10000

# Compiled once: both patterns are applied to every file in the list.
cwe_pattern = re.compile(r"CWE\d+")
variant_pattern = re.compile(r"_(0[13579]|[1-9][0-9])\.c$")

df = pd.read_csv(input_file)
for i in range(0, len(df), batch_size):
    data = {"file": [], "cwe": [], "label": [], "code": []}
    batch = df[i:i+batch_size]
    for file_path in batch["file_path"]:
        # Best effort per file: one unreadable or malformed file is reported
        # and skipped instead of aborting the whole batch.
        try:
            file_name = os.path.basename(file_path)

            # Skip non-CWE files
            cwe_match = cwe_pattern.search(file_name)
            if "testcasesupport" in file_path.lower() or not cwe_match:
                print(f"Skipped: {file_name} (non-CWE or testcasesupport)")
                continue

            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                code = f.read()
            print(f"Processing: {file_name}")

            # Extract CWE (the match is guaranteed by the skip check above)
            cwe = cwe_match.group(0)

            # Path-based labeling: compare whole path segments so that a
            # marker such as "s01" cannot match inside an unrelated name.
            normalized_path = file_path.lower().replace("\\", "/")
            path_segments = set(normalized_path.split("/"))
            is_bad_path = (
                any(s in path_segments for s in bad_paths)
                and variant_pattern.search(file_name) is not None
            )

            # XML-based labeling: swap only the extension. A plain
            # str.replace(".c", ...) would rewrite every ".c" in the path.
            xml_path = os.path.splitext(file_path)[0] + ".label.xml"
            label = "good"
            if os.path.exists(xml_path):
                tree = ET.parse(xml_path)
                if tree.find(".//flaw") is not None:
                    label = "bad"
            elif is_bad_path:
                label = "bad"

            print(f"File: {file_name}, CWE: {cwe}, Path: {is_bad_path}, Label: {label}")

            data["file"].append(file_name)
            data["cwe"].append(cwe)
            data["label"].append(label)
            data["code"].append(code)
        except Exception as e:
            print(f"Error: {file_path}: {e}")

    batch_df = pd.DataFrame(data)
    batch_df.to_csv(f"{output_csv}.{i//batch_size}.csv", index=False)
    print(f"Saved batch {i//batch_size} with {len(batch_df)} rows")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/generate_cwe_top5_sampled.py DELETED
@@ -1,30 +0,0 @@
1
- import pandas as pd
2
-
3
# Draw a fixed-size, reproducible sample for each of the five selected CWE
# classes and write the combined sample to one CSV.

# Paths
all_cwes_csv = "C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\Demo\\all_cwes_dataset.csv"
output_csv = "C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled.csv"

# Load all CWEs dataset
df = pd.read_csv(all_cwes_csv)

# Top 5 CWEs (from all_cwes_dataset.csv)
top_cwes = ["CWE121", "CWE78", "CWE190", "CWE191", "CWE122"]

# Filter for top 5 CWEs (using 'cwe' column)
df_top5 = df[df['cwe'].isin(top_cwes)]

# Sample 400 files per CWE (or all if fewer). The per-class samples are
# collected in a list and concatenated once. Growing a DataFrame with
# pd.concat inside the loop copies the data each time, and seeding it with an
# empty frame relies on a deprecated concat behaviour.
sampled_parts = []
for cwe in top_cwes:
    cwe_df = df_top5[df_top5['cwe'] == cwe]
    sample_size = min(400, len(cwe_df))
    if sample_size > 0:
        sampled_parts.append(cwe_df.sample(n=sample_size, random_state=42))

# pd.concat raises on an empty list, so fall back to an empty frame.
df_sampled = pd.concat(sampled_parts) if sampled_parts else pd.DataFrame()

# Save
df_sampled.to_csv(output_csv, index=False, encoding='utf-8')
print(f"Created {output_csv} with {len(df_sampled)} files")
if not df_sampled.empty:
    print(f"CWE counts:\n{df_sampled['cwe'].value_counts().to_string()}")
else:
    print("No data sampled. Check column name or top_cwes list.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/list_all_juliet_files.py DELETED
@@ -1,15 +0,0 @@
1
- import os
2
- import pandas as pd
3
-
4
# Walk the Juliet tree and record the full path of every ".c" file in a
# one-column CSV ("file_path").
juliet_root = r"C:\Users\MartyNattakit\Desktop\Datasets\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support"
output_file = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\all_juliet_files.csv"

files = [
    os.path.join(dirpath, name)
    for dirpath, _subdirs, names in os.walk(juliet_root)
    for name in names
    if name.endswith(".c")
]

df = pd.DataFrame(files, columns=["file_path"])
df.to_csv(output_file, index=False)
print(f"Saved {len(files)} files to {output_file}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/nonecategory.ipynb DELETED
@@ -1,195 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": null,
6
- "id": "ca65c59e",
7
- "metadata": {},
8
- "outputs": [],
9
- "source": [
10
- "import pandas as pd\n",
11
- "import os\n",
12
- "import glob\n",
13
- "import re\n",
14
- "\n",
15
def strip_comments_and_cwe(code):
    """Strip comments and CWE-related identifiers from C source text.

    Args:
        code: C source code as a string.

    Returns:
        The code with ``/* ... */`` and ``// ...`` comments removed, every
        identifier of the form ``CWE<digits>_<name>`` replaced by ``var``,
        blank lines collapsed, and surrounding whitespace stripped.
    """
    def _drop_comment(match):
        # String and character literals are matched only so that comment
        # markers inside them (e.g. "http://...") are left untouched.
        token = match.group(0)
        return '' if token.startswith('/') else token

    # One pass over comments and literals. `[^\n]*` also handles a trailing
    # `//` comment on the last line when the file has no final newline.
    code = re.sub(
        r'//[^\n]*|/\*.*?\*/|"(?:\\.|[^"\\])*"|\'(?:\\.|[^\'\\])*\'',
        _drop_comment,
        code,
        flags=re.DOTALL,
    )
    # `\d+` rather than `\d{3}`: two-digit IDs such as CWE78 must be masked
    # too, otherwise the label leaks through the identifier names.
    code = re.sub(r'\bCWE\d+_\w+', 'var', code)
    code = re.sub(r'\n\s*\n', '\n', code).strip()
    return code
22
- "\n",
23
def _is_cwe_dir_name(name, cwe):
    """Return True if *name* starts with *cwe* and the ID does not continue with a digit."""
    folded_name = os.path.normcase(name)
    folded_cwe = os.path.normcase(cwe)
    if not folded_name.startswith(folded_cwe):
        return False
    # "CWE78" must not claim "CWE780_..." or "CWE789_...".
    return not folded_name[len(folded_cwe):][:1].isdigit()


def _find_cwe_dirs(juliet_dir, cwe):
    """Return every directory below *juliet_dir*, at any depth, that belongs to *cwe*."""
    matches = []
    for root, dirnames, _filenames in os.walk(juliet_dir):
        dirnames.sort()  # deterministic traversal order
        hits = [d for d in dirnames if _is_cwe_dir_name(d, cwe)]
        matches.extend(os.path.join(root, d) for d in hits)
        # A matched directory is scanned in full by the caller, so do not
        # descend into it again here.
        dirnames[:] = [d for d in dirnames if d not in hits]
    return matches


def extract_none_samples_from_juliet(juliet_dir):
    """Extract 'good' (non-vulnerable) samples from Juliet dataset and label as 'none'.

    Looks for directories belonging to the five target CWEs anywhere below
    ``juliet_dir`` and collects every ``*good*.c`` file found inside them,
    including files in nested sub-directories.

    Args:
        juliet_dir: Root directory of the Juliet test suite.

    Returns:
        A DataFrame with columns ``cwe`` (always ``'none'``), ``code`` and
        ``file``. It is empty, with no columns, when nothing matches.
    """
    # NOTE(review): this only picks up files with "good" in the file name.
    # Confirm the Juliet C test cases actually ship such files; otherwise the
    # result stays empty.
    cwes = ['CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191']
    good_samples = []

    for cwe in cwes:
        for dir_path in _find_cwe_dirs(juliet_dir, cwe):
            for root, dirnames, filenames in os.walk(dir_path):
                dirnames.sort()
                for fname in sorted(filenames):
                    folded = os.path.normcase(fname)
                    if 'good' not in folded or not folded.endswith('.c'):
                        continue
                    file_path = os.path.join(root, fname)
                    # Best effort: report an unreadable file and keep going.
                    try:
                        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                            code = f.read()
                        good_samples.append({
                            'cwe': 'none',
                            'code': code,
                            'file': os.path.basename(file_path)
                        })
                    except Exception as e:
                        print(f"Error reading {file_path}: {e}")

    return pd.DataFrame(good_samples)
46
- ]
47
- },
48
- {
49
- "cell_type": "code",
50
- "execution_count": null,
51
- "id": "ef098685",
52
- "metadata": {},
53
- "outputs": [
54
- {
55
- "name": "stdout",
56
- "output_type": "stream",
57
- "text": [
58
- "Juliet directory exists: True\n",
59
- "\n",
60
- "Looking for CWE121 directories: ['C:\\\\Users\\\\MartyNattakit\\\\Desktop\\\\Datasets\\\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\\\\cwe121_results.txt']\n",
61
- "Good files in C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\\cwe121_results.txt: []\n",
62
- "\n",
63
- "Looking for CWE78 directories: []\n",
64
- "\n",
65
- "Looking for CWE122 directories: []\n",
66
- "\n",
67
- "Looking for CWE190 directories: []\n",
68
- "\n",
69
- "Looking for CWE191 directories: []\n"
70
- ]
71
- }
72
- ],
73
- "source": [
74
- "import os\n",
75
- "import glob\n",
76
- "\n",
77
# Diagnostic cell: show which entries directly under the Juliet root match
# each CWE prefix, and which "*good*.c" files sit directly inside them.

# Updated Juliet path
juliet_dir = r"C:\Users\MartyNattakit\Desktop\Datasets\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support"

# Check if directory exists
print(f"Juliet directory exists: {os.path.exists(juliet_dir)}")

# Check for CWE directories
cwes = ['CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191']
for cwe in cwes:
    cwe_dirs = glob.glob(os.path.join(juliet_dir, f'{cwe}*'))
    print(f"\nLooking for {cwe} directories: {cwe_dirs}")
    for dir_path in cwe_dirs:
        good_pattern = os.path.join(dir_path, '*good*.c')
        good_files = glob.glob(good_pattern)
        print(f"Good files in {dir_path}: {good_files}")
92
- ]
93
- },
94
- {
95
- "cell_type": "code",
96
- "execution_count": null,
97
- "id": "b95cd6d2",
98
- "metadata": {},
99
- "outputs": [
100
- {
101
- "name": "stdout",
102
- "output_type": "stream",
103
- "text": [
104
- "Loaded original dataset with 2000 samples.\n",
105
- "Original CWE Distribution:\n",
106
- " cwe\n",
107
- "CWE121 400\n",
108
- "CWE78 400\n",
109
- "CWE190 400\n",
110
- "CWE191 400\n",
111
- "CWE122 400\n",
112
- "Name: count, dtype: int64\n",
113
- "Extracted 0 'none' samples from Juliet.\n",
114
- "No 'none' samples found in Juliet. Adding synthetic 'none' samples instead...\n",
115
- "Updated dataset saved as C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled_with_juliet_none.csv with 2400 samples.\n",
116
- "Final CWE Distribution:\n",
117
- " cwe\n",
118
- "CWE121 400\n",
119
- "CWE78 400\n",
120
- "CWE190 400\n",
121
- "CWE191 400\n",
122
- "CWE122 400\n",
123
- "none 400\n",
124
- "Name: count, dtype: int64\n",
125
- "Unique CWE labels: ['CWE121' 'CWE78' 'CWE190' 'CWE191' 'CWE122' 'none']\n"
126
- ]
127
- }
128
- ],
129
- "source": [
130
# Prepare and save dataset: add a 'none' class to the sampled top-5 CWE
# dataset, clean the code column, and write the result to a new CSV.
original_csv_path = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\cwe_top5_sampled.csv"
juliet_dir = r"C:\Users\MartyNattakit\Desktop\Datasets\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support"
output_csv_path = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\cwe_top5_sampled_with_juliet_none.csv"

# Load the original dataset
full_df = pd.read_csv(original_csv_path)
print(f"Loaded original dataset with {len(full_df)} samples.")
print("Original CWE Distribution:\n", full_df['cwe'].value_counts())

# Extract 'none' samples from Juliet
none_df = extract_none_samples_from_juliet(juliet_dir)
print(f"Extracted {len(none_df)} 'none' samples from Juliet.")

# Fallback: with no Juliet 'none' samples, use 400 synthetic rows built by
# repeating five trivial snippets 80 times each.
if not len(none_df):
    print("No 'none' samples found in Juliet. Adding synthetic 'none' samples instead...")
    synthetic_snippets = [
        'int main() { printf("Hello, World!"); return 0; }',
        'void func() { int x = 5; printf("%d", x); }',
        'int add(int a, int b) { return a + b; }',
        'void loop() { for(int i = 0; i < 10; i++) { printf("."); } }',
        'int main() { char str[] = "test"; puts(str); return 0; }',
    ]
    none_samples = pd.DataFrame({
        'cwe': ['none'] * 400,
        'code': synthetic_snippets * 80,
        'file': [f'synthetic_none_{idx}.c' for idx in range(400)],
    })
    none_df = none_samples

# Combine with original dataset
full_df = pd.concat([full_df, none_df], ignore_index=True)

# Clean the code
full_df['code'] = full_df['code'].apply(strip_comments_and_cwe)

# Save the updated dataset
full_df.to_csv(output_csv_path, index=False)
print(f"Updated dataset saved as {output_csv_path} with {len(full_df)} samples.")
print("Final CWE Distribution:\n", full_df['cwe'].value_counts())
print("Unique CWE labels:", full_df['cwe'].unique())
171
- ]
172
- }
173
- ],
174
- "metadata": {
175
- "kernelspec": {
176
- "display_name": "Python 3",
177
- "language": "python",
178
- "name": "python3"
179
- },
180
- "language_info": {
181
- "codemirror_mode": {
182
- "name": "ipython",
183
- "version": 3
184
- },
185
- "file_extension": ".py",
186
- "mimetype": "text/x-python",
187
- "name": "python",
188
- "nbconvert_exporter": "python",
189
- "pygments_lexer": "ipython3",
190
- "version": "3.12.6"
191
- }
192
- },
193
- "nbformat": 4,
194
- "nbformat_minor": 5
195
- }