File size: 3,087 Bytes
b144cb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import pandas as pd

# ===============================
# PATHS
# ===============================
# INPUT_PATH is read with pd.read_csv in main(); OUTPUT_PATH is where the
# length-normalized dataset is written with to_csv.
INPUT_PATH = "dataset/raw/raw_dataset.csv"
OUTPUT_PATH = "dataset/processed/dataset_step1_length_normalized.csv"

# ===============================
# PARAMETERS
# ===============================
# Used by normalize_code(): snippets with fewer than MIN_LINES cleaned lines
# are dropped (normalize_code returns None); snippets longer than MAX_LINES
# are middle-truncated via truncate_middle().
MIN_LINES = 4
MAX_LINES = 100

# ===============================
# UTILITY FUNCTIONS
# ===============================
def get_clean_lines(code):
    """Split *code* into lines, dropping leading and trailing blank lines.

    A line is considered blank when it is empty or whitespace-only.
    Non-string input (e.g. NaN from pandas) yields an empty list.
    """
    if not isinstance(code, str):
        return []

    raw = code.splitlines()

    # Advance the window past blank lines at both ends instead of
    # mutating the list in place.
    start, end = 0, len(raw)
    while start < end and not raw[start].strip():
        start += 1
    while end > start and not raw[end - 1].strip():
        end -= 1

    return raw[start:end]


def get_truncation_marker(language):
    """Return a comment-style marker for truncated code in *language*.

    Languages without a known comment syntax get a bare marker.
    """
    markers = {
        "Python": "# ... truncated ...",
        "Java": "// ... truncated ...",
    }
    return markers.get(language, "... truncated ...")


def truncate_middle(lines, max_lines, language):
    """Cap *lines* at *max_lines* entries by cutting out the middle.

    Keeps the head and tail of the snippet and inserts a language-aware
    truncation marker between them. Lists already within the limit are
    returned unchanged.

    Fixes two defects in the previous version:
    - head (max_lines//2) + tail (max_lines//2) + marker produced
      max_lines + 1 lines for even max_lines, exceeding the cap;
    - for max_lines <= 1 the tail slice was lines[-0:], i.e. the whole
      list, so nothing was truncated.
    """
    if len(lines) <= max_lines:
        return lines

    marker = get_truncation_marker(language)

    # Reserve one slot for the marker so the result never exceeds
    # max_lines (the marker itself is the floor for degenerate caps).
    budget = max(max_lines - 1, 0)
    tail_len = budget // 2
    head_len = budget - tail_len  # head gets the extra line when budget is odd

    head = lines[:head_len]
    # Guard the slice: lines[-0:] would be the whole list, not an empty tail.
    tail = lines[len(lines) - tail_len:] if tail_len else []

    return head + [marker] + tail


def normalize_code(code, language):
    """Clean and length-normalize one code snippet.

    Returns None when the cleaned snippet has fewer than MIN_LINES lines
    (the row is filtered out downstream); otherwise returns the snippet —
    middle-truncated to MAX_LINES when too long — as a single
    newline-joined string.
    """
    cleaned = get_clean_lines(code)

    if len(cleaned) < MIN_LINES:
        return None

    kept = (
        truncate_middle(cleaned, MAX_LINES, language)
        if len(cleaned) > MAX_LINES
        else cleaned
    )
    return "\n".join(kept)

# ===============================
# MAIN PIPELINE
# ===============================
def main():
    """Step 1 of the pipeline: language-aware code length normalization.

    Reads the raw CSV, drops rows whose code is too short, middle-truncates
    overly long code, records line-count metadata, and writes the result.
    """
    print("Loading raw dataset...")
    frame = pd.read_csv(INPUT_PATH)
    print(f"Initial dataset size: {len(frame)} rows")

    # -------------------------------
    # Column validation — fail fast on a bad schema.
    # -------------------------------
    missing = [col for col in ["Code_Text", "Language"] if col not in frame.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # -------------------------------
    # Normalize code length
    # -------------------------------
    print("Normalizing code length (language-aware)...")

    frame["normalized_code"] = frame.apply(
        lambda row: normalize_code(row["Code_Text"], row["Language"]),
        axis=1,
    )

    # normalize_code returns None for too-short snippets; drop those rows.
    frame = frame[frame["normalized_code"].notnull()].reset_index(drop=True)

    # -------------------------------
    # Metadata for analysis/debugging
    # -------------------------------
    def count_lines(snippet):
        # One shared counter for both the original and normalized columns.
        return len(get_clean_lines(snippet))

    frame["original_line_count"] = frame["Code_Text"].apply(count_lines)
    frame["normalized_line_count"] = frame["normalized_code"].apply(count_lines)

    print(f"Final dataset size after normalization: {len(frame)} rows")

    # -------------------------------
    # Save output
    # -------------------------------
    print("Saving processed dataset...")
    frame.to_csv(OUTPUT_PATH, index=False)

    print("Step 1 completed successfully ✅")
    print(f"Output file saved at: {OUTPUT_PATH}")


if __name__ == "__main__":
    main()