Spaces:

martynattakit
/

CodeSentinel-CWE_Classification

Sleeping

App Files Community

MartyNattakit committed on Jun 10, 2025

Commit

74b2969

unverified ·

1 Parent(s): 6b0305e

Delete data directory

Browse files

Files changed (6) hide show

data/check_file_cache.py +0 -13
data/dataleakanalysis.ipynb +0 -0
data/extract_all_cwes.py +0 -58
data/generate_cwe_top5_sampled.py +0 -30
data/list_all_juliet_files.py +0 -15
data/nonecategory.ipynb +0 -195

data/check_file_cache.py DELETED Viewed

@@ -1,13 +0,0 @@
-import os
-import glob
-base_dir = "C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support"
-pattern = os.path.join(base_dir, "**", "CWE121_Stack_Based_Buffer_Overflow*.c")
-files = {os.path.basename(f): f for f in glob.glob(pattern, recursive=True)}
-print(f"Found {len(files)} .c files")
-for file in [
-    "CWE121_Stack_Based_Buffer_Overflow__CWE805_wchar_t_declare_memcpy_32.c",
-    "CWE121_Stack_Based_Buffer_Overflow__CWE131_memmove_18.c",
-    "CWE121_Stack_Based_Buffer_Overflow__CWE135_01.c"
-]:
-    print(f"{file}: {'Found' if file in files else 'Not found'}")

data/dataleakanalysis.ipynb DELETED Viewed

The diff for this file is too large to render. See raw diff

data/extract_all_cwes.py DELETED Viewed

@@ -1,58 +0,0 @@
-import pandas as pd
-import os
-import re
-import xml.etree.ElementTree as ET
-input_file = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\all_juliet_files.csv"
-output_csv = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\all_cwes_dataset.csv"
-bad_paths = ["s01", "s03", "s05", "s07"]
-batch_size = 10000
-df = pd.read_csv(input_file)
-for i in range(0, len(df), batch_size):
-    data = {"file": [], "cwe": [], "label": [], "code": []}
-    batch = df[i:i+batch_size]
-    for file_path in batch["file_path"]:
-        try:
-            file_name = os.path.basename(file_path)
-            # Skip non-CWE files
-            if "testcasesupport" in file_path.lower() or not re.search(r"CWE\d+", file_name):
-                print(f"Skipped: {file_name} (non-CWE or testcasesupport)")
-                continue
-            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
-                code = f.read()
-            print(f"Processing: {file_name}")
-            # Extract CWE
-            cwe_match = re.search(r"CWE\d+", file_name)
-            cwe = cwe_match.group(0) if cwe_match else "Unknown"
-            # Path-based labeling
-            normalized_path = file_path.lower().replace("\\", "/")
-            is_bad_path = any(s in normalized_path for s in bad_paths) and re.search(r"_(0[13579]|[1-9][0-9])\.c$", file_name)
-            # XML-based labeling
-            xml_path = file_path.replace(".c", ".label.xml")
-            label = "good"
-            if os.path.exists(xml_path):
-                tree = ET.parse(xml_path)
-                if tree.find(".//flaw") is not None:
-                    label = "bad"
-            elif is_bad_path:
-                label = "bad"
-            print(f"File: {file_name}, CWE: {cwe}, Path: {is_bad_path}, Label: {label}")
-            data["file"].append(file_name)
-            data["cwe"].append(cwe)
-            data["label"].append(label)
-            data["code"].append(code)
-        except Exception as e:
-            print(f"Error: {file_path}: {e}")
-    batch_df = pd.DataFrame(data)
-    batch_df.to_csv(f"{output_csv}.{i//batch_size}.csv", index=False)
-    print(f"Saved batch {i//batch_size} with {len(batch_df)} rows")

data/generate_cwe_top5_sampled.py DELETED Viewed

@@ -1,30 +0,0 @@
-import pandas as pd
-# Paths
-all_cwes_csv = "C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\Demo\\all_cwes_dataset.csv"
-output_csv = "C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled.csv"
-# Load all CWEs dataset
-df = pd.read_csv(all_cwes_csv)
-# Top 5 CWEs (from all_cwes_dataset.csv)
-top_cwes = ["CWE121", "CWE78", "CWE190", "CWE191", "CWE122"]
-# Filter for top 5 CWEs (using 'cwe' column)
-df_top5 = df[df['cwe'].isin(top_cwes)]
-# Sample 400 files per CWE (or all if fewer)
-df_sampled = pd.DataFrame()
-for cwe in top_cwes:
-    cwe_df = df_top5[df_top5['cwe'] == cwe]
-    sample_size = min(400, len(cwe_df))
-    if sample_size > 0:
-        df_sampled = pd.concat([df_sampled, cwe_df.sample(n=sample_size, random_state=42)])
-# Save
-df_sampled.to_csv(output_csv, index=False, encoding='utf-8')
-print(f"Created {output_csv} with {len(df_sampled)} files")
-if not df_sampled.empty:
-    print(f"CWE counts:\n{df_sampled['cwe'].value_counts().to_string()}")
-else:
-    print("No data sampled. Check column name or top_cwes list.")

data/list_all_juliet_files.py DELETED Viewed

@@ -1,15 +0,0 @@
-import os
-import pandas as pd
-juliet_root = r"C:\Users\MartyNattakit\Desktop\Datasets\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support"
-output_file = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\all_juliet_files.csv"
-files = []
-for root, _, filenames in os.walk(juliet_root):
-    for fname in filenames:
-        if fname.endswith(".c"):
-            files.append(os.path.join(root, fname))
-df = pd.DataFrame(files, columns=["file_path"])
-df.to_csv(output_file, index=False)
-print(f"Saved {len(files)} files to {output_file}")

data/nonecategory.ipynb DELETED Viewed

@@ -1,195 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ca65c59e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "import os\n",
-    "import glob\n",
-    "import re\n",
-    "\n",
-    "def strip_comments_and_cwe(code):\n",
-    "    \"\"\"Strip comments and CWE-related variable names from code.\"\"\"\n",
-    "    code = re.sub(r'/\\*.*?\\*/', '', code, flags=re.DOTALL)\n",
-    "    code = re.sub(r'//.*?\\n', '\\n', code)\n",
-    "    code = re.sub(r'\\bCWE\\d{3}_\\w+', 'var', code)\n",
-    "    code = re.sub(r'\\n\\s*\\n', '\\n', code).strip()\n",
-    "    return code\n",
-    "\n",
-    "def extract_none_samples_from_juliet(juliet_dir):\n",
-    "    \"\"\"Extract 'good' (non-vulnerable) samples from Juliet dataset and label as 'none'.\"\"\"\n",
-    "    cwes = ['CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191']\n",
-    "    good_samples = []\n",
-    "    \n",
-    "    for cwe in cwes:\n",
-    "        cwe_dir = os.path.join(juliet_dir, f'{cwe}*')\n",
-    "        cwe_dirs = glob.glob(cwe_dir)\n",
-    "        for dir_path in cwe_dirs:\n",
-    "            good_files = glob.glob(os.path.join(dir_path, '*good*.c'))\n",
-    "            for file_path in good_files:\n",
-    "                try:\n",
-    "                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:\n",
-    "                        code = f.read()\n",
-    "                    good_samples.append({\n",
-    "                        'cwe': 'none',\n",
-    "                        'code': code,\n",
-    "                        'file': os.path.basename(file_path)\n",
-    "                    })\n",
-    "                except Exception as e:\n",
-    "                    print(f\"Error reading {file_path}: {e}\")\n",
-    "    \n",
-    "    return pd.DataFrame(good_samples)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ef098685",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Juliet directory exists: True\n",
-      "\n",
-      "Looking for CWE121 directories: ['C:\\\\Users\\\\MartyNattakit\\\\Desktop\\\\Datasets\\\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\\\\cwe121_results.txt']\n",
-      "Good files in C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\\cwe121_results.txt: []\n",
-      "\n",
-      "Looking for CWE78 directories: []\n",
-      "\n",
-      "Looking for CWE122 directories: []\n",
-      "\n",
-      "Looking for CWE190 directories: []\n",
-      "\n",
-      "Looking for CWE191 directories: []\n"
-     ]
-    }
-   ],
-   "source": [
-    "import os\n",
-    "import glob\n",
-    "\n",
-    "# Updated Juliet path\n",
-    "juliet_dir = r\"C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\"\n",
-    "\n",
-    "# Check if directory exists\n",
-    "print(f\"Juliet directory exists: {os.path.exists(juliet_dir)}\")\n",
-    "\n",
-    "# Check for CWE directories\n",
-    "cwes = ['CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191']\n",
-    "for cwe in cwes:\n",
-    "    cwe_dir = os.path.join(juliet_dir, f'{cwe}*')\n",
-    "    cwe_dirs = glob.glob(cwe_dir)\n",
-    "    print(f\"\\nLooking for {cwe} directories: {cwe_dirs}\")\n",
-    "    for dir_path in cwe_dirs:\n",
-    "        good_files = glob.glob(os.path.join(dir_path, '*good*.c'))\n",
-    "        print(f\"Good files in {dir_path}: {good_files}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b95cd6d2",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loaded original dataset with 2000 samples.\n",
-      "Original CWE Distribution:\n",
-      " cwe\n",
-      "CWE121    400\n",
-      "CWE78     400\n",
-      "CWE190    400\n",
-      "CWE191    400\n",
-      "CWE122    400\n",
-      "Name: count, dtype: int64\n",
-      "Extracted 0 'none' samples from Juliet.\n",
-      "No 'none' samples found in Juliet. Adding synthetic 'none' samples instead...\n",
-      "Updated dataset saved as C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled_with_juliet_none.csv with 2400 samples.\n",
-      "Final CWE Distribution:\n",
-      " cwe\n",
-      "CWE121    400\n",
-      "CWE78     400\n",
-      "CWE190    400\n",
-      "CWE191    400\n",
-      "CWE122    400\n",
-      "none      400\n",
-      "Name: count, dtype: int64\n",
-      "Unique CWE labels: ['CWE121' 'CWE78' 'CWE190' 'CWE191' 'CWE122' 'none']\n"
-     ]
-    }
-   ],
-   "source": [
-    "#Prepare and Save Dataset\n",
-    "original_csv_path = r\"C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled.csv\"\n",
-    "juliet_dir = r\"C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\"\n",
-    "output_csv_path = r\"C:\\Users\\MartyNattakit\\Desktop\\CodeSentinel\\cwe_top5_sampled_with_juliet_none.csv\"\n",
-    "\n",
-    "# Load the original dataset\n",
-    "full_df = pd.read_csv(original_csv_path)\n",
-    "print(f\"Loaded original dataset with {len(full_df)} samples.\")\n",
-    "print(\"Original CWE Distribution:\\n\", full_df['cwe'].value_counts())\n",
-    "\n",
-    "# Extract 'none' samples from Juliet\n",
-    "none_df = extract_none_samples_from_juliet(juliet_dir)\n",
-    "print(f\"Extracted {len(none_df)} 'none' samples from Juliet.\")\n",
-    "\n",
-    "# Fallback: If no 'none' samples extracted, add synthetic ones\n",
-    "if len(none_df) == 0:\n",
-    "    print(\"No 'none' samples found in Juliet. Adding synthetic 'none' samples instead...\")\n",
-    "    none_samples = pd.DataFrame({\n",
-    "        'cwe': ['none'] * 400,\n",
-    "        'code': [\n",
-    "            'int main() { printf(\"Hello, World!\"); return 0; }',\n",
-    "            'void func() { int x = 5; printf(\"%d\", x); }',\n",
-    "            'int add(int a, int b) { return a + b; }',\n",
-    "            'void loop() { for(int i = 0; i < 10; i++) { printf(\".\"); } }',\n",
-    "            'int main() { char str[] = \"test\"; puts(str); return 0; }'\n",
-    "        ] * 80,\n",
-    "        'file': ['synthetic_none_' + str(i) + '.c' for i in range(400)]\n",
-    "    })\n",
-    "    none_df = none_samples\n",
-    "\n",
-    "# Combine with original dataset\n",
-    "full_df = pd.concat([full_df, none_df], ignore_index=True)\n",
-    "\n",
-    "# Clean the code\n",
-    "full_df['code'] = full_df['code'].apply(strip_comments_and_cwe)\n",
-    "\n",
-    "# Save the updated dataset\n",
-    "full_df.to_csv(output_csv_path, index=False)\n",
-    "print(f\"Updated dataset saved as {output_csv_path} with {len(full_df)} samples.\")\n",
-    "print(\"Final CWE Distribution:\\n\", full_df['cwe'].value_counts())\n",
-    "print(\"Unique CWE labels:\", full_df['cwe'].unique())"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}