import pandas as pd
import os
import glob
import re


def strip_comments_and_cwe(code):
    """Strip comments and CWE-tagged identifiers from a C source string.

    Steps, in order:
      1. remove ``/* ... */`` block comments (may span lines),
      2. remove ``// ...`` line comments, including one on the final line
         that has no trailing newline,
      3. replace every identifier of the form ``CWE<digits>_<word chars>``
         with the literal ``var`` so the class label cannot leak through
         identifier names,
      4. collapse blank lines and trim surrounding whitespace.

    Args:
        code: Source text to clean (``str``).

    Returns:
        The cleaned source text.

    Note:
        The comment patterns are purely textual: ``//`` or ``/*`` inside a
        string literal is treated as a comment start as well.
    """
    code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
    # Match up to (not including) the newline so a comment at end-of-input
    # is removed too.
    code = re.sub(r'//[^\n]*', '', code)
    # \d+ rather than \d{3}: ids such as CWE78 have only two digits.
    code = re.sub(r'\bCWE\d+_\w+', 'var', code)
    code = re.sub(r'\n\s*\n', '\n', code).strip()
    return code


def extract_none_samples_from_juliet(juliet_dir):
    """Extract 'good' (non-vulnerable) samples from Juliet and label them 'none'.

    For each target CWE, every directory named ``<CWE>_*`` anywhere below
    ``juliet_dir`` is located, and every ``*good*.c`` file anywhere below such
    a directory is read.

    Args:
        juliet_dir: Root directory to search.

    Returns:
        A ``pandas.DataFrame`` with columns ``cwe`` (always ``'none'``),
        ``code`` (file contents) and ``file`` (base name). The columns are
        present even when no file was found, so callers can still select them.
    """
    cwes = ['CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191']
    good_samples = []
    seen_paths = set()
    # Escape glob metacharacters that may legitimately occur in the root path.
    root = glob.escape(juliet_dir)

    for cwe in cwes:
        # The trailing underscore keeps 'CWE78' from also matching other ids
        # that merely start with the same digits (e.g. 'CWE789_...').
        dir_pattern = os.path.join(root, '**', f'{cwe}_*')
        # Keep directories only: a plain file such as '<cwe>_results.txt' can
        # match the pattern as well.
        cwe_dirs = sorted(
            path for path in glob.glob(dir_pattern, recursive=True)
            if os.path.isdir(path)
        )
        for dir_path in cwe_dirs:
            file_pattern = os.path.join(glob.escape(dir_path), '**', '*good*.c')
            for file_path in sorted(glob.glob(file_pattern, recursive=True)):
                # Nested matching directories would otherwise yield the same
                # file more than once.
                key = os.path.normcase(os.path.abspath(file_path))
                if key in seen_paths or not os.path.isfile(file_path):
                    continue
                seen_paths.add(key)
                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        code = f.read()
                except OSError as e:
                    # Best effort: report the unreadable file and keep going.
                    print(f"Error reading {file_path}: {e}")
                    continue
                good_samples.append({
                    'cwe': 'none',
                    'code': code,
                    'file': os.path.basename(file_path)
                })

    return pd.DataFrame(good_samples, columns=['cwe', 'code', 'file'])
import os
import glob

# Updated Juliet path
juliet_dir = r"C:\Users\MartyNattakit\Desktop\Datasets\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support"

# Check if directory exists
print(f"Juliet directory exists: {os.path.exists(juliet_dir)}")

# Check for CWE directories directly under juliet_dir.
# NOTE(review): only the top level of juliet_dir is inspected here; if the
# archive keeps its test-case folders deeper in the tree, nothing will be
# listed — confirm against the extracted archive.
cwes = ['CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191']
for cwe in cwes:
    # The trailing underscore stops a short id such as 'CWE78' from matching
    # names of other ids that merely share the prefix.
    cwe_dir = os.path.join(juliet_dir, f'{cwe}_*')
    # Report real directories only; a plain file whose name matches the
    # pattern (e.g. a '*_results.txt') is not a test-case folder.
    cwe_dirs = [path for path in glob.glob(cwe_dir) if os.path.isdir(path)]
    print(f"\nLooking for {cwe} directories: {cwe_dirs}")
    for dir_path in cwe_dirs:
        good_files = glob.glob(os.path.join(dir_path, '*good*.c'))
        print(f"Good files in {dir_path}: {good_files}")
# Prepare and save the dataset
original_csv_path = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\cwe_top5_sampled.csv"
juliet_dir = r"C:\Users\MartyNattakit\Desktop\Datasets\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support"
output_csv_path = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\cwe_top5_sampled_with_juliet_none.csv"

# Read the sampled vulnerable snippets and report the starting class balance.
full_df = pd.read_csv(original_csv_path)
print(f"Loaded original dataset with {len(full_df)} samples.")
print("Original CWE Distribution:\n", full_df['cwe'].value_counts())

# Pull the non-vulnerable ('none') class out of the Juliet test suite.
none_df = extract_none_samples_from_juliet(juliet_dir)
print(f"Extracted {len(none_df)} 'none' samples from Juliet.")

# Fallback when Juliet yields nothing: cycle through five hand-written benign
# snippets until 400 rows exist.
if not len(none_df):
    print("No 'none' samples found in Juliet. Adding synthetic 'none' samples instead...")
    synthetic_snippets = (
        'int main() { printf("Hello, World!"); return 0; }',
        'void func() { int x = 5; printf("%d", x); }',
        'int add(int a, int b) { return a + b; }',
        'void loop() { for(int i = 0; i < 10; i++) { printf("."); } }',
        'int main() { char str[] = "test"; puts(str); return 0; }',
    )
    synthetic_total = 400
    none_samples = pd.DataFrame({
        'cwe': ['none' for _ in range(synthetic_total)],
        'code': [
            synthetic_snippets[idx % len(synthetic_snippets)]
            for idx in range(synthetic_total)
        ],
        'file': [f'synthetic_none_{idx}.c' for idx in range(synthetic_total)],
    })
    none_df = none_samples

# Append the 'none' rows after the original rows with a fresh 0..n-1 index.
full_df = pd.concat([full_df, none_df], ignore_index=True)

# Remove comments and CWE-tagged identifiers from every snippet.
cleaned_code = full_df['code'].apply(strip_comments_and_cwe)
full_df['code'] = cleaned_code

# Write the combined dataset and summarise the final label distribution.
full_df.to_csv(output_csv_path, index=False)
print(f"Updated dataset saved as {output_csv_path} with {len(full_df)} samples.")
print("Final CWE Distribution:\n", full_df['cwe'].value_counts())
print("Unique CWE labels:", full_df['cwe'].unique())