File size: 7,301 Bytes
7e7d487
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca65c59e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "import glob\n",
    "import re\n",
    "\n",
    "def strip_comments_and_cwe(code):\n",
    "    \"\"\"Strip comments and CWE-related variable names from code.\"\"\"\n",
    "    code = re.sub(r'/\\*.*?\\*/', '', code, flags=re.DOTALL)\n",
    "    code = re.sub(r'//.*?\\n', '\\n', code)\n",
    "    code = re.sub(r'\\bCWE\\d{3}_\\w+', 'var', code)\n",
    "    code = re.sub(r'\\n\\s*\\n', '\\n', code).strip()\n",
    "    return code\n",
    "\n",
    "def extract_none_samples_from_juliet(juliet_dir):\n",
    "    \"\"\"Extract 'good' (non-vulnerable) samples from Juliet dataset and label as 'none'.\"\"\"\n",
    "    cwes = ['CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191']\n",
    "    good_samples = []\n",
    "    \n",
    "    for cwe in cwes:\n",
    "        cwe_dir = os.path.join(juliet_dir, f'{cwe}*')\n",
    "        cwe_dirs = glob.glob(cwe_dir)\n",
    "        for dir_path in cwe_dirs:\n",
    "            good_files = glob.glob(os.path.join(dir_path, '*good*.c'))\n",
    "            for file_path in good_files:\n",
    "                try:\n",
    "                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:\n",
    "                        code = f.read()\n",
    "                    good_samples.append({\n",
    "                        'cwe': 'none',\n",
    "                        'code': code,\n",
    "                        'file': os.path.basename(file_path)\n",
    "                    })\n",
    "                except Exception as e:\n",
    "                    print(f\"Error reading {file_path}: {e}\")\n",
    "    \n",
    "    return pd.DataFrame(good_samples)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef098685",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Juliet directory exists: True\n",
      "\n",
      "Looking for CWE121 directories: ['C:\\\\Users\\\\MartyNattakit\\\\Desktop\\\\Datasets\\\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\\\\cwe121_results.txt']\n",
      "Good files in C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\\cwe121_results.txt: []\n",
      "\n",
      "Looking for CWE78 directories: []\n",
      "\n",
      "Looking for CWE122 directories: []\n",
      "\n",
      "Looking for CWE190 directories: []\n",
      "\n",
      "Looking for CWE191 directories: []\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import glob\n",
    "\n",
    "# Updated Juliet path\n",
    "juliet_dir = r\"C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\"\n",
    "\n",
    "# Check if directory exists\n",
    "print(f\"Juliet directory exists: {os.path.exists(juliet_dir)}\")\n",
    "\n",
    "# Check for CWE directories\n",
    "cwes = ['CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191']\n",
    "for cwe in cwes:\n",
    "    cwe_dir = os.path.join(juliet_dir, f'{cwe}*')\n",
    "    cwe_dirs = glob.glob(cwe_dir)\n",
    "    print(f\"\\nLooking for {cwe} directories: {cwe_dirs}\")\n",
    "    for dir_path in cwe_dirs:\n",
    "        good_files = glob.glob(os.path.join(dir_path, '*good*.c'))\n",
    "        print(f\"Good files in {dir_path}: {good_files}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b95cd6d2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded original dataset with 2000 samples.\n",
      "Original CWE Distribution:\n",
      " cwe\n",
      "CWE121    400\n",
      "CWE78     400\n",
      "CWE190    400\n",
      "CWE191    400\n",
      "CWE122    400\n",
      "Name: count, dtype: int64\n",
      "Extracted 0 'none' samples from Juliet.\n",
      "No 'none' samples found in Juliet. Adding synthetic 'none' samples instead...\n",
      "Updated dataset saved as C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled_with_juliet_none.csv with 2400 samples.\n",
      "Final CWE Distribution:\n",
      " cwe\n",
      "CWE121    400\n",
      "CWE78     400\n",
      "CWE190    400\n",
      "CWE191    400\n",
      "CWE122    400\n",
      "none      400\n",
      "Name: count, dtype: int64\n",
      "Unique CWE labels: ['CWE121' 'CWE78' 'CWE190' 'CWE191' 'CWE122' 'none']\n"
     ]
    }
   ],
   "source": [
    "#Prepare and Save Dataset\n",
    "original_csv_path = r\"C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled.csv\"\n",
    "juliet_dir = r\"C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\"\n",
    "output_csv_path = r\"C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled_with_juliet_none.csv\"\n",
    "\n",
    "# Load the original dataset\n",
    "full_df = pd.read_csv(original_csv_path)\n",
    "print(f\"Loaded original dataset with {len(full_df)} samples.\")\n",
    "print(\"Original CWE Distribution:\\n\", full_df['cwe'].value_counts())\n",
    "\n",
    "# Extract 'none' samples from Juliet\n",
    "none_df = extract_none_samples_from_juliet(juliet_dir)\n",
    "print(f\"Extracted {len(none_df)} 'none' samples from Juliet.\")\n",
    "\n",
    "# Fallback: If no 'none' samples extracted, add synthetic ones\n",
    "if len(none_df) == 0:\n",
    "    print(\"No 'none' samples found in Juliet. Adding synthetic 'none' samples instead...\")\n",
    "    none_samples = pd.DataFrame({\n",
    "        'cwe': ['none'] * 400,\n",
    "        'code': [\n",
    "            'int main() { printf(\"Hello, World!\"); return 0; }',\n",
    "            'void func() { int x = 5; printf(\"%d\", x); }',\n",
    "            'int add(int a, int b) { return a + b; }',\n",
    "            'void loop() { for(int i = 0; i < 10; i++) { printf(\".\"); } }',\n",
    "            'int main() { char str[] = \"test\"; puts(str); return 0; }'\n",
    "        ] * 80,\n",
    "        'file': ['synthetic_none_' + str(i) + '.c' for i in range(400)]\n",
    "    })\n",
    "    none_df = none_samples\n",
    "\n",
    "# Combine with original dataset\n",
    "full_df = pd.concat([full_df, none_df], ignore_index=True)\n",
    "\n",
    "# Clean the code\n",
    "full_df['code'] = full_df['code'].apply(strip_comments_and_cwe)\n",
    "\n",
    "# Save the updated dataset\n",
    "full_df.to_csv(output_csv_path, index=False)\n",
    "print(f\"Updated dataset saved as {output_csv_path} with {len(full_df)} samples.\")\n",
    "print(\"Final CWE Distribution:\\n\", full_df['cwe'].value_counts())\n",
    "print(\"Unique CWE labels:\", full_df['cwe'].unique())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}