# Spaces: Nitishkumar-ai / commitguard
# Configuration error
# File size: 8,862 Bytes
# e4f3d12

import argparse
import json
import random
from collections import Counter
from pathlib import Path


def _read_jsonl(path: Path) -> list[dict]:
    """Load a JSON Lines file into a list of parsed records.

    Blank and whitespace-only lines are skipped. The file is decoded as UTF-8.

    Args:
        path: Location of the JSONL file to read.

    Returns:
        One parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    rows: list[dict] = []
    # Iterate the file handle instead of calling str.splitlines(): splitlines()
    # also breaks on U+2028, U+2029 and U+0085, which may legally appear
    # unescaped inside a JSON string (json.dumps(..., ensure_ascii=False) emits
    # them raw) and would cut a record in half. File iteration only splits on
    # real newlines (\n, \r, \r\n).
    with path.open("r", encoding="utf-8") as handle:
        for raw_line in handle:
            record = raw_line.strip()
            if record:
                rows.append(json.loads(record))
    return rows


def _write_jsonl(path: Path, rows: list[dict]) -> None:
    """Serialize *rows* to *path* as UTF-8 JSON Lines, one record per line.

    Missing parent directories are created. Lines always end with ``\\n``
    regardless of platform, and non-ASCII characters are written unescaped.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8", newline="\n") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in rows
        )


# ---------------------------------------------------------------------------
# Fix 2: CWE classification driven mainly by the vulnerable lines; keyword hits
# in the whole function count at half weight. Scored rules — the highest-scoring
# rule wins (ties go to the earlier rule). Falls back to CWE-OTHER.
# ---------------------------------------------------------------------------

# Each rule is a (CWE id, keywords, weight) tuple consumed by infer_cwe().
# infer_cwe() lowercases the text before matching, so keywords are written in
# lowercase, and they are matched as plain substrings (e.g. "alloc" also
# matches "malloc" and "realloc"). Each keyword counts at most once per text.
# Rule order matters: when two rules reach the same score, the earlier one wins.
_CWE_RULES: list[tuple[str, list[str], int]] = [
    ("CWE-119", ["memcpy", "strcpy", "strcat", "strncpy", "memmove", "sprintf",
                  "gets(", "buffer", "overflow", "oob", "av_malloc", "av_realloc",
                  "realloc", "malloc", "alloc", "g_malloc", "g_realloc",
                  "qemu_malloc", "len ", "length", "copy_from", "copy_to"], 5),
    ("CWE-476", ["null", "nullptr", "!= null", "== null", "if (!",
                  "dereference", "segfault", "!obj", "!ctx", "!s->", "!p"], 5),
    ("CWE-189", ["integer overflow", "signedness", "truncat", "wrap",
                  "size_t", "underflow", "narrowing", "(int)", "(uint",
                  "(unsigned)", ">> ", "<< ", "0xffff", "max_", "min_"], 5),
    ("CWE-78",  ["system(", "popen(", "exec(", "execve", "shell",
                  "command", "subprocess"], 8),
    ("CWE-22",  ["../", "..\\", "traversal", "chroot", "realpath",
                  "canonicalize", "symlink", "path"], 7),
    ("CWE-89",  ["sql", "query", "select ", "insert ", "union ",
                  "prepared", "sqlite", "mysql"], 7),
    ("CWE-79",  ["xss", "innerhtml", "script", "sanitize", "escape",
                  "htmlentit", "content-type"], 6),
    # Lowest weight: these generic validation words only win when nothing
    # more specific matches.
    ("CWE-20",  ["valid", "saniti", "untrusted", "input", "bounds",
                  "assert", "range", "check", "error", "return -1",
                  "goto fail", "goto err", "goto out"], 2),
]


def infer_cwe(vul_lines_code: list[str], func: str) -> str:
    """Guess a CWE id for a vulnerable function from keyword hits.

    Every rule in ``_CWE_RULES`` is scored as::

        hits_in_vulnerable_lines * weight + hits_in_whole_function * (weight // 2)

    Matching is case-insensitive substring search; each keyword counts at most
    once per text. The first rule with the highest score wins.

    Args:
        vul_lines_code: Source text of the lines flagged as vulnerable; may be
            empty or ``None``, in which case only the whole function is scored.
        func: Full source text of the function.

    Returns:
        The winning CWE id, or ``"CWE-OTHER"`` when the best score is below 2.
    """
    flagged = " ".join(vul_lines_code).lower() if vul_lines_code else ""
    whole = func.lower()

    def count_hits(haystack: str, needles: list[str]) -> int:
        # Number of distinct keywords occurring anywhere in the text.
        return sum(needle in haystack for needle in needles)

    scored = [
        (
            (count_hits(flagged, needles) if flagged else 0) * weight
            + count_hits(whole, needles) * (weight // 2),
            cwe_id,
        )
        for cwe_id, needles, weight in _CWE_RULES
    ]

    # max() keeps the first of several equal maxima, so earlier rules win ties.
    top_score, top_cwe = max(
        scored, key=lambda pair: pair[0], default=(0, "CWE-OTHER")
    )
    return top_cwe if top_score >= 2 else "CWE-OTHER"


# ---------------------------------------------------------------------------
# Fix 1: Real unified diffs from per-line vulnerability labels.
# ---------------------------------------------------------------------------

def _build_diff(func: str, label: list[int], rng: random.Random, is_vuln: bool) -> str:
    """Render *func* as a unified diff in which selected lines appear as added.

    The function text is treated as the post-commit file ``source.c``. The
    "added" lines are chosen as follows:

    * For a vulnerable sample with usable per-line labels (same length as the
      function, or containing at least one ``1``), every in-range line whose
      label is ``1`` is marked as added.
    * Otherwise a random contiguous block of 1-5 lines (a quarter of the
      function, capped at 5) is marked, using *rng*.
    * If that leaves nothing selected, line 0 is marked.

    Each added line is shown with up to three lines of context on either side;
    overlapping or adjacent context windows are merged into a single hunk.

    Args:
        func: Source text of the function.
        label: Per-line flags (``1`` = vulnerable line); may be empty or None.
        rng: Random generator used only for the random-block fallback.
        is_vuln: Whether the sample is labelled vulnerable.

    Returns:
        The diff text, lines joined with ``\\n`` and no trailing newline. When
        *func* has no lines only the two file-header lines are returned.
    """
    lines = func.splitlines()
    total = len(lines)

    if is_vuln and label and (len(label) == total or 1 in label):
        # Ignore flags that point past the last line (label longer than the
        # function); otherwise they would produce hunks without any "+" line,
        # or no hunk at all.
        changed = {i for i, flag in enumerate(label) if flag == 1 and i < total}
    else:
        block_size = max(1, min(5, total // 4))
        start = rng.randint(0, max(0, total - block_size))
        changed = set(range(start, min(start + block_size, total)))

    if not changed:
        changed = {0}

    ctx = 3
    visible = sorted(
        {
            idx
            for ci in changed
            for idx in range(max(0, ci - ctx), min(total, ci + ctx + 1))
        }
    )

    # Group consecutive visible indices into hunks.
    hunks: list[list[int]] = []
    for idx in visible:
        if hunks and idx == hunks[-1][-1] + 1:
            hunks[-1].append(idx)
        else:
            hunks.append([idx])

    diff_parts = ["--- a/source.c", "+++ b/source.c"]
    added_before = 0  # "+" lines emitted by earlier hunks; they shift old-side numbering
    for hunk in hunks:
        new_start = hunk[0] + 1
        new_count = len(hunk)
        added = sum(1 for idx in hunk if idx in changed)
        # Added lines do not exist in the old file, so the old side of the
        # header counts context lines only and starts earlier by the number of
        # lines already added above this hunk.
        old_count = new_count - added
        old_start = new_start - added_before
        if old_count == 0:
            # Unified-diff convention: an empty range is addressed by the line
            # that precedes it.
            old_start -= 1
        diff_parts.append(f"@@ -{old_start},{old_count} +{new_start},{new_count} @@")
        for idx in hunk:
            prefix = "+" if idx in changed else " "
            diff_parts.append(f"{prefix}{lines[idx]}")
        added_before += added

    return "\n".join(diff_parts)


# ---------------------------------------------------------------------------
# Fix 3: CWE rebalancing — cap dominant CWEs, merge tiny ones.
# ---------------------------------------------------------------------------

# Largest share of the requested sample limit that a single CWE group may
# contribute; _rebalance() turns it into a per-group cap of int(limit * frac).
_MAX_PER_CWE_FRAC = 0.25
# CWE groups with fewer samples than this are folded into "CWE-OTHER".
_MIN_CWE_SAMPLES = 20


def _rebalance(samples: list[dict], rng: random.Random, limit: int) -> list[dict]:
    """Down-sample *samples* so that no single CWE dominates the result.

    Samples are grouped by their ``"cwe"`` value (falsy values count as
    ``"CWE-OTHER"``). Groups smaller than ``_MIN_CWE_SAMPLES`` are merged into
    ``"CWE-OTHER"`` and the merged samples' ``"cwe"`` field is rewritten in
    place. Each remaining group is shuffled and truncated to
    ``int(limit * _MAX_PER_CWE_FRAC)`` entries; the survivors are shuffled
    together and at most *limit* of them are returned.

    Args:
        samples: Sample dicts, each carrying a ``"cwe"`` key.
        rng: Random generator driving every shuffle.
        limit: Maximum number of samples to return.

    Returns:
        The selected samples in random order.
    """
    groups: dict[str, list[dict]] = {}
    for sample in samples:
        key = sample["cwe"] or "CWE-OTHER"
        if key not in groups:
            groups[key] = []
        groups[key].append(sample)

    # Decide up front which groups are too small; only the CWE-OTHER bucket
    # grows while merging, and it is never a merge candidate itself.
    undersized = [
        name
        for name, members in groups.items()
        if name != "CWE-OTHER" and len(members) < _MIN_CWE_SAMPLES
    ]
    for name in undersized:
        members = groups.pop(name)
        for member in members:
            member["cwe"] = "CWE-OTHER"
        groups.setdefault("CWE-OTHER", []).extend(members)

    per_group_cap = int(limit * _MAX_PER_CWE_FRAC)
    selected: list[dict] = []
    for members in groups.values():
        rng.shuffle(members)
        selected += members[:per_group_cap]

    rng.shuffle(selected)
    return selected[:limit]


def main() -> None:
    """Command-line entry point: turn Devign rows into CommitGuard JSONL.

    Rows come from the ``--in`` JSONL file or, when that option is omitted,
    from the ``DetectVul/devign`` train split loaded through the ``datasets``
    package. Rows without a ``func`` value or with more than 80 lines are
    dropped. Each remaining row becomes one sample with a synthetic diff and,
    for vulnerable rows, an inferred CWE. The output holds up to ``--limit``
    samples: at most half CWE-rebalanced vulnerable samples and at most half
    randomly chosen safe samples, shuffled together and written to ``--out``.

    All randomness derives from ``--seed``.

    Raises:
        SystemExit: If the Hugging Face dataset cannot be loaded.
    """
    ap = argparse.ArgumentParser(description="Preprocess Devign-derived samples into CommitGuard JSONL.")
    ap.add_argument("--in", dest="inp", type=Path, default=None, help="Optional input JSONL.")
    ap.add_argument("--out", dest="out", type=Path, default=Path("data/devign_filtered.jsonl"))
    ap.add_argument("--limit", type=int, default=5000)
    ap.add_argument("--seed", type=int, default=42)
    args = ap.parse_args()

    rng = random.Random(args.seed)

    if args.inp is None:
        try:
            # Imported lazily so the package is only needed when no --in file
            # is supplied.
            from datasets import load_dataset

            print("Loading DetectVul/devign from Hugging Face...")
            ds = load_dataset('DetectVul/devign', split='train')
            raw_rows = list(ds)
        except Exception as e:
            # Exit with a non-zero status (message goes to stderr) so calling
            # scripts and CI notice that no output file was produced.
            raise SystemExit(f"Failed to load from HF: {e}") from e
        print(f"Loaded {len(raw_rows)} rows from HF.")
    else:
        raw_rows = _read_jsonl(args.inp)

    vuln_samples: list[dict] = []
    safe_samples: list[dict] = []
    cwe_counter: Counter[str] = Counter()

    # Rows are processed strictly in input order: _build_diff() draws from rng
    # for rows without usable labels, so the order affects the generated diffs.
    for r in raw_rows:
        func = r.get("func")
        if not func:
            continue
        if len(func.split("\n")) > 80:
            continue

        target = bool(r.get("target", False))
        label = r.get("label", [])
        vul_lines_code = []
        vl = r.get("vul_lines")
        if vl and isinstance(vl, dict):
            vul_lines_code = vl.get("code", [])

        # Only vulnerable rows get a CWE; safe rows carry None.
        cwe = infer_cwe(vul_lines_code, func) if target else None
        diff = _build_diff(func, label, rng, target)

        # Fall back to a running index over the rows kept so far when the row
        # has neither a commit id nor an id.
        sample_id = str(r.get("commit_id") or r.get("id") or f"row-{len(vuln_samples) + len(safe_samples)}")
        target_file = "source.c"

        sample = {
            "sample_id": sample_id,
            "diff": diff,
            "available_files": [target_file],
            "is_vulnerable": target,
            "cwe": cwe,
            "target_file": target_file,
            "files": {target_file: func},
        }

        if target:
            vuln_samples.append(sample)
            cwe_counter[cwe or "CWE-OTHER"] += 1
        else:
            safe_samples.append(sample)

    print(f"Filtered: {len(vuln_samples)} vulnerable, {len(safe_samples)} safe.")
    print(f"CWE distribution (pre-balance): {cwe_counter.most_common()}")

    # Aim for an even split between vulnerable and safe samples.
    target_each = args.limit // 2
    vuln_keep = _rebalance(vuln_samples, rng, target_each)
    safe_keep = rng.sample(safe_samples, min(target_each, len(safe_samples)))

    out_rows = vuln_keep + safe_keep
    rng.shuffle(out_rows)

    _write_jsonl(args.out, out_rows)

    final_cwes = Counter(r["cwe"] for r in out_rows if r["is_vulnerable"])
    print(f"Wrote {len(out_rows)} samples to {args.out}")
    print(f"Final CWE distribution: {final_cwes.most_common()}")

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()