Spaces:

martynattakit
/

CodeSentinel-CWE_Classification

Sleeping

App Files Files Community

MartyNattakit committed on Jun 10, 2025

Commit

6b0305e

unverified ·

0 Parent(s):

Add files via upload

Browse files

Files changed (9) hide show

cleaned/cwe_top5_sampled.csv +0 -0
cleaned/cwe_top5_sampled_with_juliet_none.csv +0 -0
codebertfinal1.ipynb +0 -0
data/check_file_cache.py +13 -0
data/dataleakanalysis.ipynb +0 -0
data/extract_all_cwes.py +58 -0
data/generate_cwe_top5_sampled.py +30 -0
data/list_all_juliet_files.py +15 -0
data/nonecategory.ipynb +195 -0

cleaned/cwe_top5_sampled.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

cleaned/cwe_top5_sampled_with_juliet_none.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

codebertfinal1.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

data/check_file_cache.py ADDED Viewed

	@@ -0,0 +1,13 @@

+import os
+import glob
+base_dir = "C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support"
+pattern = os.path.join(base_dir, "**", "CWE121_Stack_Based_Buffer_Overflow*.c")
+files = {os.path.basename(f): f for f in glob.glob(pattern, recursive=True)}
+print(f"Found {len(files)} .c files")
+for file in [
+    "CWE121_Stack_Based_Buffer_Overflow__CWE805_wchar_t_declare_memcpy_32.c",
+    "CWE121_Stack_Based_Buffer_Overflow__CWE131_memmove_18.c",
+    "CWE121_Stack_Based_Buffer_Overflow__CWE135_01.c"
+]:
+    print(f"{file}: {'Found' if file in files else 'Not found'}")

data/dataleakanalysis.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

data/extract_all_cwes.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import pandas as pd
+import os
+import re
+import xml.etree.ElementTree as ET
+input_file = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\all_juliet_files.csv"
+output_csv = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\all_cwes_dataset.csv"
+bad_paths = ["s01", "s03", "s05", "s07"]
+batch_size = 10000
+df = pd.read_csv(input_file)
+for i in range(0, len(df), batch_size):
+    data = {"file": [], "cwe": [], "label": [], "code": []}
+    batch = df[i:i+batch_size]
+    for file_path in batch["file_path"]:
+        try:
+            file_name = os.path.basename(file_path)
+            # Skip non-CWE files
+            if "testcasesupport" in file_path.lower() or not re.search(r"CWE\d+", file_name):
+                print(f"Skipped: {file_name} (non-CWE or testcasesupport)")
+                continue
+            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
+                code = f.read()
+            print(f"Processing: {file_name}")
+            # Extract CWE
+            cwe_match = re.search(r"CWE\d+", file_name)
+            cwe = cwe_match.group(0) if cwe_match else "Unknown"
+            # Path-based labeling
+            normalized_path = file_path.lower().replace("\\", "/")
+            is_bad_path = any(s in normalized_path for s in bad_paths) and re.search(r"_(0[13579]|[1-9][0-9])\.c$", file_name)
+            # XML-based labeling
+            xml_path = file_path.replace(".c", ".label.xml")
+            label = "good"
+            if os.path.exists(xml_path):
+                tree = ET.parse(xml_path)
+                if tree.find(".//flaw") is not None:
+                    label = "bad"
+            elif is_bad_path:
+                label = "bad"
+            print(f"File: {file_name}, CWE: {cwe}, Path: {is_bad_path}, Label: {label}")
+            data["file"].append(file_name)
+            data["cwe"].append(cwe)
+            data["label"].append(label)
+            data["code"].append(code)
+        except Exception as e:
+            print(f"Error: {file_path}: {e}")
+    batch_df = pd.DataFrame(data)
+    batch_df.to_csv(f"{output_csv}.{i//batch_size}.csv", index=False)
+    print(f"Saved batch {i//batch_size} with {len(batch_df)} rows")

data/generate_cwe_top5_sampled.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import pandas as pd
+# Paths
+all_cwes_csv = "C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\Demo\\all_cwes_dataset.csv"
+output_csv = "C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled.csv"
+# Load all CWEs dataset
+df = pd.read_csv(all_cwes_csv)
+# Top 5 CWEs (from all_cwes_dataset.csv)
+top_cwes = ["CWE121", "CWE78", "CWE190", "CWE191", "CWE122"]
+# Filter for top 5 CWEs (using 'cwe' column)
+df_top5 = df[df['cwe'].isin(top_cwes)]
+# Sample 400 files per CWE (or all if fewer)
+df_sampled = pd.DataFrame()
+for cwe in top_cwes:
+    cwe_df = df_top5[df_top5['cwe'] == cwe]
+    sample_size = min(400, len(cwe_df))
+    if sample_size > 0:
+        df_sampled = pd.concat([df_sampled, cwe_df.sample(n=sample_size, random_state=42)])
+# Save
+df_sampled.to_csv(output_csv, index=False, encoding='utf-8')
+print(f"Created {output_csv} with {len(df_sampled)} files")
+if not df_sampled.empty:
+    print(f"CWE counts:\n{df_sampled['cwe'].value_counts().to_string()}")
+else:
+    print("No data sampled. Check column name or top_cwes list.")

data/list_all_juliet_files.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import os
+import pandas as pd
+juliet_root = r"C:\Users\MartyNattakit\Desktop\Datasets\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support"
+output_file = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\all_juliet_files.csv"
+files = []
+for root, _, filenames in os.walk(juliet_root):
+    for fname in filenames:
+        if fname.endswith(".c"):
+            files.append(os.path.join(root, fname))
+df = pd.DataFrame(files, columns=["file_path"])
+df.to_csv(output_file, index=False)
+print(f"Saved {len(files)} files to {output_file}")

data/nonecategory.ipynb ADDED Viewed

	@@ -0,0 +1,195 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ca65c59e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import os\n",
+    "import glob\n",
+    "import re\n",
+    "\n",
+    "def strip_comments_and_cwe(code):\n",
+    "    \"\"\"Strip C comments and CWE-prefixed identifiers from code.\n",
+    "\n",
+    "    Args:\n",
+    "        code: Source text to clean.\n",
+    "\n",
+    "    Returns:\n",
+    "        The text with /* */ and // comments removed, every identifier of the\n",
+    "        form CWE<digits>_<word> replaced by 'var', whitespace-only gaps between\n",
+    "        lines collapsed, and surrounding whitespace trimmed.\n",
+    "\n",
+    "    Note:\n",
+    "        Matching is purely textual, so comment markers inside string literals\n",
+    "        are removed as well.\n",
+    "    \"\"\"\n",
+    "    code = re.sub(r'/\\*.*?\\*/', '', code, flags=re.DOTALL)\n",
+    "    # Stop at (not past) the line break so a comment on a final line without\n",
+    "    # a trailing newline is removed too.\n",
+    "    code = re.sub(r'//[^\\n]*', '', code)\n",
+    "    # \\d+ instead of \\d{3}: two-digit ids such as CWE78 must be masked as well,\n",
+    "    # otherwise the class label leaks through the identifier names.\n",
+    "    code = re.sub(r'\\bCWE\\d+_\\w+', 'var', code)\n",
+    "    code = re.sub(r'\\n\\s*\\n', '\\n', code).strip()\n",
+    "    return code\n",
+    "\n",
+    "def extract_none_samples_from_juliet(juliet_dir):\n",
+    "    \"\"\"Collect '*good*.c' files for the target CWEs and label them 'none'.\n",
+    "\n",
+    "    Args:\n",
+    "        juliet_dir: Root directory of the Juliet dataset.\n",
+    "\n",
+    "    Returns:\n",
+    "        DataFrame with one row per file read and the columns 'cwe' (always\n",
+    "        'none'), 'code' (file contents) and 'file' (base name). It is empty\n",
+    "        when no file matched.\n",
+    "    \"\"\"\n",
+    "    cwes = ['CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191']\n",
+    "    good_samples = []\n",
+    "    seen = set()  # nested matches can reach the same file more than once\n",
+    "\n",
+    "    for cwe in cwes:\n",
+    "        # '**' also finds CWE folders nested below juliet_dir, not only directly\n",
+    "        # in it; the '_' keeps e.g. CWE78 from matching a CWE780 folder as well.\n",
+    "        cwe_dirs = glob.glob(os.path.join(juliet_dir, '**', f'{cwe}_*'), recursive=True)\n",
+    "        for dir_path in cwe_dirs:\n",
+    "            # The pattern also matches plain files (result logs, test cases).\n",
+    "            if not os.path.isdir(dir_path):\n",
+    "                continue\n",
+    "            # NOTE(review): assumes non-vulnerable samples carry 'good' in the\n",
+    "            # file name - confirm against the dataset layout.\n",
+    "            good_files = glob.glob(os.path.join(dir_path, '**', '*good*.c'), recursive=True)\n",
+    "            for file_path in good_files:\n",
+    "                if file_path in seen:\n",
+    "                    continue\n",
+    "                seen.add(file_path)\n",
+    "                try:\n",
+    "                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:\n",
+    "                        code = f.read()\n",
+    "                    good_samples.append({\n",
+    "                        'cwe': 'none',\n",
+    "                        'code': code,\n",
+    "                        'file': os.path.basename(file_path)\n",
+    "                    })\n",
+    "                except Exception as e:\n",
+    "                    # Best effort: report the unreadable file and keep going.\n",
+    "                    print(f\"Error reading {file_path}: {e}\")\n",
+    "\n",
+    "    return pd.DataFrame(good_samples)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ef098685",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Juliet directory exists: True\n",
+      "\n",
+      "Looking for CWE121 directories: ['C:\\\\Users\\\\MartyNattakit\\\\Desktop\\\\Datasets\\\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\\\\cwe121_results.txt']\n",
+      "Good files in C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\\cwe121_results.txt: []\n",
+      "\n",
+      "Looking for CWE78 directories: []\n",
+      "\n",
+      "Looking for CWE122 directories: []\n",
+      "\n",
+      "Looking for CWE190 directories: []\n",
+      "\n",
+      "Looking for CWE191 directories: []\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import glob\n",
+    "\n",
+    "# Updated Juliet path\n",
+    "juliet_dir = r\"C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\"\n",
+    "\n",
+    "# Report whether the dataset root is present at all\n",
+    "juliet_dir_found = os.path.exists(juliet_dir)\n",
+    "print(f\"Juliet directory exists: {juliet_dir_found}\")\n",
+    "\n",
+    "# For each target CWE, list the entries directly under the root whose name\n",
+    "# starts with that id, then the '*good*.c' files directly inside each entry\n",
+    "cwes = ['CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191']\n",
+    "for cwe in cwes:\n",
+    "    cwe_dirs = glob.glob(os.path.join(juliet_dir, f'{cwe}*'))\n",
+    "    print(f\"\\nLooking for {cwe} directories: {cwe_dirs}\")\n",
+    "    for dir_path in cwe_dirs:\n",
+    "        good_glob = os.path.join(dir_path, '*good*.c')\n",
+    "        good_files = glob.glob(good_glob)\n",
+    "        print(f\"Good files in {dir_path}: {good_files}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b95cd6d2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded original dataset with 2000 samples.\n",
+      "Original CWE Distribution:\n",
+      " cwe\n",
+      "CWE121    400\n",
+      "CWE78     400\n",
+      "CWE190    400\n",
+      "CWE191    400\n",
+      "CWE122    400\n",
+      "Name: count, dtype: int64\n",
+      "Extracted 0 'none' samples from Juliet.\n",
+      "No 'none' samples found in Juliet. Adding synthetic 'none' samples instead...\n",
+      "Updated dataset saved as C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled_with_juliet_none.csv with 2400 samples.\n",
+      "Final CWE Distribution:\n",
+      " cwe\n",
+      "CWE121    400\n",
+      "CWE78     400\n",
+      "CWE190    400\n",
+      "CWE191    400\n",
+      "CWE122    400\n",
+      "none      400\n",
+      "Name: count, dtype: int64\n",
+      "Unique CWE labels: ['CWE121' 'CWE78' 'CWE190' 'CWE191' 'CWE122' 'none']\n"
+     ]
+    }
+   ],
+   "source": [
+    "#Prepare and Save Dataset\n",
+    "original_csv_path = r\"C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled.csv\"\n",
+    "juliet_dir = r\"C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\"\n",
+    "output_csv_path = r\"C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled_with_juliet_none.csv\"\n",
+    "\n",
+    "# Read the sampled CWE dataset and report its class balance\n",
+    "full_df = pd.read_csv(original_csv_path)\n",
+    "print(f\"Loaded original dataset with {len(full_df)} samples.\")\n",
+    "print(\"Original CWE Distribution:\\n\", full_df['cwe'].value_counts())\n",
+    "\n",
+    "# Try to obtain the 'none' rows from the Juliet tree first\n",
+    "none_df = extract_none_samples_from_juliet(juliet_dir)\n",
+    "print(f\"Extracted {len(none_df)} 'none' samples from Juliet.\")\n",
+    "\n",
+    "# Nothing extracted: substitute 400 rows that cycle through five fixed snippets\n",
+    "if not len(none_df):\n",
+    "    print(\"No 'none' samples found in Juliet. Adding synthetic 'none' samples instead...\")\n",
+    "    synthetic_snippets = [\n",
+    "        'int main() { printf(\"Hello, World!\"); return 0; }',\n",
+    "        'void func() { int x = 5; printf(\"%d\", x); }',\n",
+    "        'int add(int a, int b) { return a + b; }',\n",
+    "        'void loop() { for(int i = 0; i < 10; i++) { printf(\".\"); } }',\n",
+    "        'int main() { char str[] = \"test\"; puts(str); return 0; }',\n",
+    "    ]\n",
+    "    synthetic_count = 400\n",
+    "    none_df = pd.DataFrame({\n",
+    "        'cwe': ['none'] * synthetic_count,\n",
+    "        'code': synthetic_snippets * (synthetic_count // len(synthetic_snippets)),\n",
+    "        'file': ['synthetic_none_' + str(i) + '.c' for i in range(synthetic_count)],\n",
+    "    })\n",
+    "\n",
+    "# Append the 'none' rows to the original data\n",
+    "full_df = pd.concat([full_df, none_df], ignore_index=True)\n",
+    "\n",
+    "# Run every code string through the comment/identifier stripper\n",
+    "cleaned_code = full_df['code'].apply(strip_comments_and_cwe)\n",
+    "full_df['code'] = cleaned_code\n",
+    "\n",
+    "# Write the combined dataset and report the final class balance\n",
+    "full_df.to_csv(output_csv_path, index=False)\n",
+    "print(f\"Updated dataset saved as {output_csv_path} with {len(full_df)} samples.\")\n",
+    "print(\"Final CWE Distribution:\\n\", full_df['cwe'].value_counts())\n",
+    "print(\"Unique CWE labels:\", full_df['cwe'].unique())"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}