Spaces:

martynattakit
/

CodeSentinel-CWE_Classification

Sleeping

File size: 7,301 Bytes

7e7d487

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca65c59e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "import glob\n",
    "import re\n",
    "\n",
    "def strip_comments_and_cwe(code):\n",
    "    \"\"\"Strip comments and CWE-related variable names from code.\"\"\"\n",
    "    code = re.sub(r'/\\*.*?\\*/', '', code, flags=re.DOTALL)\n",
    "    code = re.sub(r'//.*?\\n', '\\n', code)\n",
    "    code = re.sub(r'\\bCWE\\d{3}_\\w+', 'var', code)\n",
    "    code = re.sub(r'\\n\\s*\\n', '\\n', code).strip()\n",
    "    return code\n",
    "\n",
    "def extract_none_samples_from_juliet(juliet_dir):\n",
    "    \"\"\"Extract 'good' (non-vulnerable) samples from Juliet dataset and label as 'none'.\"\"\"\n",
    "    cwes = ['CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191']\n",
    "    good_samples = []\n",
    "    \n",
    "    for cwe in cwes:\n",
    "        cwe_dir = os.path.join(juliet_dir, f'{cwe}*')\n",
    "        cwe_dirs = glob.glob(cwe_dir)\n",
    "        for dir_path in cwe_dirs:\n",
    "            good_files = glob.glob(os.path.join(dir_path, '*good*.c'))\n",
    "            for file_path in good_files:\n",
    "                try:\n",
    "                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:\n",
    "                        code = f.read()\n",
    "                    good_samples.append({\n",
    "                        'cwe': 'none',\n",
    "                        'code': code,\n",
    "                        'file': os.path.basename(file_path)\n",
    "                    })\n",
    "                except Exception as e:\n",
    "                    print(f\"Error reading {file_path}: {e}\")\n",
    "    \n",
    "    return pd.DataFrame(good_samples)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef098685",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Juliet directory exists: True\n",
      "\n",
      "Looking for CWE121 directories: ['C:\\\\Users\\\\MartyNattakit\\\\Desktop\\\\Datasets\\\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\\\\cwe121_results.txt']\n",
      "Good files in C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\\cwe121_results.txt: []\n",
      "\n",
      "Looking for CWE78 directories: []\n",
      "\n",
      "Looking for CWE122 directories: []\n",
      "\n",
      "Looking for CWE190 directories: []\n",
      "\n",
      "Looking for CWE191 directories: []\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import glob\n",
    "\n",
    "# Updated Juliet path\n",
    "juliet_dir = r\"C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\"\n",
    "\n",
    "# Check if directory exists\n",
    "print(f\"Juliet directory exists: {os.path.exists(juliet_dir)}\")\n",
    "\n",
    "# Check for CWE directories anywhere below juliet_dir. The glob pattern also\n",
    "# matches plain files whose name starts with the CWE id, so only real\n",
    "# directories are kept before looking inside them.\n",
    "cwes = ['CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191']\n",
    "for cwe in cwes:\n",
    "    # '**' with recursive=True matches zero or more directory levels.\n",
    "    cwe_dir = os.path.join(glob.escape(juliet_dir), '**', f'{cwe}*')\n",
    "    cwe_dirs = sorted(p for p in glob.glob(cwe_dir, recursive=True) if os.path.isdir(p))\n",
    "    print(f\"\\nLooking for {cwe} directories: {cwe_dirs}\")\n",
    "    for dir_path in cwe_dirs:\n",
    "        good_files = sorted(\n",
    "            glob.glob(os.path.join(glob.escape(dir_path), '**', '*good*.c'), recursive=True)\n",
    "        )\n",
    "        # A recursive search can return many files; show the count and a sample.\n",
    "        print(f\"Good files in {dir_path}: {len(good_files)} found, first few: {good_files[:5]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b95cd6d2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded original dataset with 2000 samples.\n",
      "Original CWE Distribution:\n",
      " cwe\n",
      "CWE121    400\n",
      "CWE78     400\n",
      "CWE190    400\n",
      "CWE191    400\n",
      "CWE122    400\n",
      "Name: count, dtype: int64\n",
      "Extracted 0 'none' samples from Juliet.\n",
      "No 'none' samples found in Juliet. Adding synthetic 'none' samples instead...\n",
      "Updated dataset saved as C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled_with_juliet_none.csv with 2400 samples.\n",
      "Final CWE Distribution:\n",
      " cwe\n",
      "CWE121    400\n",
      "CWE78     400\n",
      "CWE190    400\n",
      "CWE191    400\n",
      "CWE122    400\n",
      "none      400\n",
      "Name: count, dtype: int64\n",
      "Unique CWE labels: ['CWE121' 'CWE78' 'CWE190' 'CWE191' 'CWE122' 'none']\n"
     ]
    }
   ],
   "source": [
    "#Prepare and Save Dataset\n",
    "original_csv_path = r\"C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled.csv\"\n",
    "juliet_dir = r\"C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\"\n",
    "output_csv_path = r\"C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled_with_juliet_none.csv\"\n",
    "\n",
    "# Number of synthetic 'none' rows created by the fallback below.\n",
    "NONE_SAMPLE_COUNT = 400\n",
    "# Snippets the fallback cycles through to fill the 'code' column.\n",
    "SYNTHETIC_NONE_SNIPPETS = [\n",
    "    'int main() { printf(\"Hello, World!\"); return 0; }',\n",
    "    'void func() { int x = 5; printf(\"%d\", x); }',\n",
    "    'int add(int a, int b) { return a + b; }',\n",
    "    'void loop() { for(int i = 0; i < 10; i++) { printf(\".\"); } }',\n",
    "    'int main() { char str[] = \"test\"; puts(str); return 0; }'\n",
    "]\n",
    "\n",
    "# Load the original dataset\n",
    "full_df = pd.read_csv(original_csv_path)\n",
    "print(f\"Loaded original dataset with {len(full_df)} samples.\")\n",
    "print(\"Original CWE Distribution:\\n\", full_df['cwe'].value_counts())\n",
    "\n",
    "# Extract 'none' samples from Juliet\n",
    "none_df = extract_none_samples_from_juliet(juliet_dir)\n",
    "print(f\"Extracted {len(none_df)} 'none' samples from Juliet.\")\n",
    "\n",
    "# Fallback: If no 'none' samples extracted, add synthetic ones\n",
    "if len(none_df) == 0:\n",
    "    print(\"No 'none' samples found in Juliet. Adding synthetic 'none' samples instead...\")\n",
    "    # Cycle through the snippets by index so all three columns have exactly\n",
    "    # NONE_SAMPLE_COUNT entries, whatever value the constant is set to.\n",
    "    # NOTE(review): the synthetic rows are the same few snippets repeated,\n",
    "    # so the 'none' class then consists of exact duplicates.\n",
    "    none_samples = pd.DataFrame({\n",
    "        'cwe': ['none'] * NONE_SAMPLE_COUNT,\n",
    "        'code': [\n",
    "            SYNTHETIC_NONE_SNIPPETS[i % len(SYNTHETIC_NONE_SNIPPETS)]\n",
    "            for i in range(NONE_SAMPLE_COUNT)\n",
    "        ],\n",
    "        'file': ['synthetic_none_' + str(i) + '.c' for i in range(NONE_SAMPLE_COUNT)]\n",
    "    })\n",
    "    none_df = none_samples\n",
    "\n",
    "# Combine with original dataset\n",
    "full_df = pd.concat([full_df, none_df], ignore_index=True)\n",
    "\n",
    "# Clean the code\n",
    "full_df['code'] = full_df['code'].apply(strip_comments_and_cwe)\n",
    "\n",
    "# Save the updated dataset\n",
    "full_df.to_csv(output_csv_path, index=False)\n",
    "print(f\"Updated dataset saved as {output_csv_path} with {len(full_df)} samples.\")\n",
    "print(\"Final CWE Distribution:\\n\", full_df['cwe'].value_counts())\n",
    "print(\"Unique CWE labels:\", full_df['cwe'].unique())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}