Spaces:

martynattakit
/

CodeSentinel-CWE_Classification

Sleeping

App Files Files Community

MartyNattakit committed on Jun 10, 2025

Commit

7e7d487

unverified ·

1 Parent(s): 74b2969

Add files via upload

Browse files

Files changed (6) hide show

scripts/check_file_cache.py +13 -0
scripts/dataleakanalysis.ipynb +0 -0
scripts/extract_all_cwes.py +58 -0
scripts/generate_cwe_top5_sampled.py +30 -0
scripts/list_all_juliet_files.py +15 -0
scripts/nonecategory.ipynb +195 -0

scripts/check_file_cache.py ADDED Viewed

	@@ -0,0 +1,13 @@

+import os
+import glob
+base_dir = "C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support"
+pattern = os.path.join(base_dir, "**", "CWE121_Stack_Based_Buffer_Overflow*.c")
+files = {os.path.basename(f): f for f in glob.glob(pattern, recursive=True)}
+print(f"Found {len(files)} .c files")
+for file in [
+    "CWE121_Stack_Based_Buffer_Overflow__CWE805_wchar_t_declare_memcpy_32.c",
+    "CWE121_Stack_Based_Buffer_Overflow__CWE131_memmove_18.c",
+    "CWE121_Stack_Based_Buffer_Overflow__CWE135_01.c"
+]:
+    print(f"{file}: {'Found' if file in files else 'Not found'}")

scripts/dataleakanalysis.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

scripts/extract_all_cwes.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import pandas as pd
+import os
+import re
+import xml.etree.ElementTree as ET
+input_file = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\all_juliet_files.csv"
+output_csv = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\all_cwes_dataset.csv"
+bad_paths = ["s01", "s03", "s05", "s07"]
+batch_size = 10000
+df = pd.read_csv(input_file)
+for i in range(0, len(df), batch_size):
+    data = {"file": [], "cwe": [], "label": [], "code": []}
+    batch = df[i:i+batch_size]
+    for file_path in batch["file_path"]:
+        try:
+            file_name = os.path.basename(file_path)
+            # Skip non-CWE files
+            if "testcasesupport" in file_path.lower() or not re.search(r"CWE\d+", file_name):
+                print(f"Skipped: {file_name} (non-CWE or testcasesupport)")
+                continue
+            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
+                code = f.read()
+            print(f"Processing: {file_name}")
+            # Extract CWE
+            cwe_match = re.search(r"CWE\d+", file_name)
+            cwe = cwe_match.group(0) if cwe_match else "Unknown"
+            # Path-based labeling
+            normalized_path = file_path.lower().replace("\\", "/")
+            is_bad_path = any(s in normalized_path for s in bad_paths) and re.search(r"_(0[13579]|[1-9][0-9])\.c$", file_name)
+            # XML-based labeling
+            xml_path = file_path.replace(".c", ".label.xml")
+            label = "good"
+            if os.path.exists(xml_path):
+                tree = ET.parse(xml_path)
+                if tree.find(".//flaw") is not None:
+                    label = "bad"
+            elif is_bad_path:
+                label = "bad"
+            print(f"File: {file_name}, CWE: {cwe}, Path: {is_bad_path}, Label: {label}")
+            data["file"].append(file_name)
+            data["cwe"].append(cwe)
+            data["label"].append(label)
+            data["code"].append(code)
+        except Exception as e:
+            print(f"Error: {file_path}: {e}")
+    batch_df = pd.DataFrame(data)
+    batch_df.to_csv(f"{output_csv}.{i//batch_size}.csv", index=False)
+    print(f"Saved batch {i//batch_size} with {len(batch_df)} rows")

scripts/generate_cwe_top5_sampled.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import pandas as pd
+# Paths
+all_cwes_csv = "C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\Demo\\all_cwes_dataset.csv"
+output_csv = "C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled.csv"
+# Load all CWEs dataset
+df = pd.read_csv(all_cwes_csv)
+# Top 5 CWEs (from all_cwes_dataset.csv)
+top_cwes = ["CWE121", "CWE78", "CWE190", "CWE191", "CWE122"]
+# Filter for top 5 CWEs (using 'cwe' column)
+df_top5 = df[df['cwe'].isin(top_cwes)]
+# Sample 400 files per CWE (or all if fewer)
+df_sampled = pd.DataFrame()
+for cwe in top_cwes:
+    cwe_df = df_top5[df_top5['cwe'] == cwe]
+    sample_size = min(400, len(cwe_df))
+    if sample_size > 0:
+        df_sampled = pd.concat([df_sampled, cwe_df.sample(n=sample_size, random_state=42)])
+# Save
+df_sampled.to_csv(output_csv, index=False, encoding='utf-8')
+print(f"Created {output_csv} with {len(df_sampled)} files")
+if not df_sampled.empty:
+    print(f"CWE counts:\n{df_sampled['cwe'].value_counts().to_string()}")
+else:
+    print("No data sampled. Check column name or top_cwes list.")

scripts/list_all_juliet_files.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import os
+import pandas as pd
+juliet_root = r"C:\Users\MartyNattakit\Desktop\Datasets\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support"
+output_file = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\all_juliet_files.csv"
+files = []
+for root, _, filenames in os.walk(juliet_root):
+    for fname in filenames:
+        if fname.endswith(".c"):
+            files.append(os.path.join(root, fname))
+df = pd.DataFrame(files, columns=["file_path"])
+df.to_csv(output_file, index=False)
+print(f"Saved {len(files)} files to {output_file}")

scripts/nonecategory.ipynb ADDED Viewed

	@@ -0,0 +1,195 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ca65c59e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import os\n",
+    "import glob\n",
+    "import re\n",
+    "\n",
+    "def strip_comments_and_cwe(code):\n",
+    "    \"\"\"Strip C comments and CWE-prefixed identifiers from source text.\n",
+    "\n",
+    "    Block comments and line comments are removed, every identifier of the\n",
+    "    form CWE<digits>_<name> is replaced with 'var', blank lines are\n",
+    "    collapsed, and the result is stripped of surrounding whitespace.\n",
+    "    \"\"\"\n",
+    "    code = re.sub(r'/\\*.*?\\*/', '', code, flags=re.DOTALL)\n",
+    "    # Match up to (not including) the line break, so a comment on a final\n",
+    "    # line that has no trailing newline is removed as well.\n",
+    "    code = re.sub(r'//[^\\n]*', '', code)\n",
+    "    # Any digit count, not exactly three: two-digit IDs such as CWE78 must\n",
+    "    # be masked too, otherwise the label stays visible in the code text.\n",
+    "    code = re.sub(r'\\bCWE\\d+_\\w+', 'var', code)\n",
+    "    code = re.sub(r'\\n\\s*\\n', '\\n', code).strip()\n",
+    "    return code\n",
+    "\n",
+    "def extract_none_samples_from_juliet(juliet_dir):\n",
+    "    \"\"\"Extract 'good' (non-vulnerable) samples from Juliet dataset and label as 'none'.\"\"\"\n",
+    "    cwes = ['CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191']\n",
+    "    good_samples = []\n",
+    "    \n",
+    "    for cwe in cwes:\n",
+    "        cwe_dir = os.path.join(juliet_dir, f'{cwe}*')\n",
+    "        cwe_dirs = glob.glob(cwe_dir)\n",
+    "        for dir_path in cwe_dirs:\n",
+    "            good_files = glob.glob(os.path.join(dir_path, '*good*.c'))\n",
+    "            for file_path in good_files:\n",
+    "                try:\n",
+    "                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:\n",
+    "                        code = f.read()\n",
+    "                    good_samples.append({\n",
+    "                        'cwe': 'none',\n",
+    "                        'code': code,\n",
+    "                        'file': os.path.basename(file_path)\n",
+    "                    })\n",
+    "                except Exception as e:\n",
+    "                    print(f\"Error reading {file_path}: {e}\")\n",
+    "    \n",
+    "    return pd.DataFrame(good_samples)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ef098685",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Juliet directory exists: True\n",
+      "\n",
+      "Looking for CWE121 directories: ['C:\\\\Users\\\\MartyNattakit\\\\Desktop\\\\Datasets\\\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\\\\cwe121_results.txt']\n",
+      "Good files in C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\\cwe121_results.txt: []\n",
+      "\n",
+      "Looking for CWE78 directories: []\n",
+      "\n",
+      "Looking for CWE122 directories: []\n",
+      "\n",
+      "Looking for CWE190 directories: []\n",
+      "\n",
+      "Looking for CWE191 directories: []\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import glob\n",
+    "\n",
+    "# Updated Juliet path\n",
+    "juliet_dir = r\"C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\"\n",
+    "\n",
+    "# Check if directory exists\n",
+    "print(f\"Juliet directory exists: {os.path.exists(juliet_dir)}\")\n",
+    "\n",
+    "# Check for CWE directories\n",
+    "cwes = ['CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191']\n",
+    "for cwe in cwes:\n",
+    "    cwe_dir = os.path.join(juliet_dir, f'{cwe}*')\n",
+    "    cwe_dirs = glob.glob(cwe_dir)\n",
+    "    print(f\"\\nLooking for {cwe} directories: {cwe_dirs}\")\n",
+    "    for dir_path in cwe_dirs:\n",
+    "        good_files = glob.glob(os.path.join(dir_path, '*good*.c'))\n",
+    "        print(f\"Good files in {dir_path}: {good_files}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b95cd6d2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded original dataset with 2000 samples.\n",
+      "Original CWE Distribution:\n",
+      " cwe\n",
+      "CWE121    400\n",
+      "CWE78     400\n",
+      "CWE190    400\n",
+      "CWE191    400\n",
+      "CWE122    400\n",
+      "Name: count, dtype: int64\n",
+      "Extracted 0 'none' samples from Juliet.\n",
+      "No 'none' samples found in Juliet. Adding synthetic 'none' samples instead...\n",
+      "Updated dataset saved as C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled_with_juliet_none.csv with 2400 samples.\n",
+      "Final CWE Distribution:\n",
+      " cwe\n",
+      "CWE121    400\n",
+      "CWE78     400\n",
+      "CWE190    400\n",
+      "CWE191    400\n",
+      "CWE122    400\n",
+      "none      400\n",
+      "Name: count, dtype: int64\n",
+      "Unique CWE labels: ['CWE121' 'CWE78' 'CWE190' 'CWE191' 'CWE122' 'none']\n"
+     ]
+    }
+   ],
+   "source": [
+    "#Prepare and Save Dataset\n",
+    "original_csv_path = r\"C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled.csv\"\n",
+    "juliet_dir = r\"C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\"\n",
+    "output_csv_path = r\"C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled_with_juliet_none.csv\"\n",
+    "\n",
+    "# Load the original dataset\n",
+    "full_df = pd.read_csv(original_csv_path)\n",
+    "print(f\"Loaded original dataset with {len(full_df)} samples.\")\n",
+    "print(\"Original CWE Distribution:\\n\", full_df['cwe'].value_counts())\n",
+    "\n",
+    "# Extract 'none' samples from Juliet\n",
+    "none_df = extract_none_samples_from_juliet(juliet_dir)\n",
+    "print(f\"Extracted {len(none_df)} 'none' samples from Juliet.\")\n",
+    "\n",
+    "# Fallback: If no 'none' samples extracted, add synthetic ones\n",
+    "if len(none_df) == 0:\n",
+    "    print(\"No 'none' samples found in Juliet. Adding synthetic 'none' samples instead...\")\n",
+    "    none_samples = pd.DataFrame({\n",
+    "        'cwe': ['none'] * 400,\n",
+    "        'code': [\n",
+    "            'int main() { printf(\"Hello, World!\"); return 0; }',\n",
+    "            'void func() { int x = 5; printf(\"%d\", x); }',\n",
+    "            'int add(int a, int b) { return a + b; }',\n",
+    "            'void loop() { for(int i = 0; i < 10; i++) { printf(\".\"); } }',\n",
+    "            'int main() { char str[] = \"test\"; puts(str); return 0; }'\n",
+    "        ] * 80,\n",
+    "        'file': ['synthetic_none_' + str(i) + '.c' for i in range(400)]\n",
+    "    })\n",
+    "    none_df = none_samples\n",
+    "\n",
+    "# Combine with original dataset\n",
+    "full_df = pd.concat([full_df, none_df], ignore_index=True)\n",
+    "\n",
+    "# Clean the code\n",
+    "full_df['code'] = full_df['code'].apply(strip_comments_and_cwe)\n",
+    "\n",
+    "# Save the updated dataset\n",
+    "full_df.to_csv(output_csv_path, index=False)\n",
+    "print(f\"Updated dataset saved as {output_csv_path} with {len(full_df)} samples.\")\n",
+    "print(\"Final CWE Distribution:\\n\", full_df['cwe'].value_counts())\n",
+    "print(\"Unique CWE labels:\", full_df['cwe'].unique())"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}