File size: 3,087 Bytes
b144cb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import pandas as pd

# ===============================
# PATHS
# ===============================
# INPUT_PATH is read with pd.read_csv in main(); OUTPUT_PATH is where the
# length-normalized dataset is written with to_csv.
INPUT_PATH = "dataset/raw/raw_dataset.csv"
OUTPUT_PATH = "dataset/processed/dataset_step1_length_normalized.csv"

# ===============================
# PARAMETERS
# ===============================
# Used by normalize_code(): snippets with fewer than MIN_LINES cleaned lines
# are dropped (normalize_code returns None); snippets longer than MAX_LINES
# are middle-truncated via truncate_middle().
MIN_LINES = 4
MAX_LINES = 100

# ===============================
# UTILITY FUNCTIONS
# ===============================
def get_clean_lines(code):
    """Split *code* into lines, dropping leading and trailing blank lines.

    A line is considered blank when it is empty or whitespace-only.
    Non-string input (e.g. NaN from pandas) yields an empty list.
    """
    if not isinstance(code, str):
        return []

    raw = code.splitlines()

    # Advance the window past blank lines at both ends instead of
    # mutating the list in place.
    start, end = 0, len(raw)
    while start < end and not raw[start].strip():
        start += 1
    while end > start and not raw[end - 1].strip():
        end -= 1

    return raw[start:end]


def get_truncation_marker(language):
    """Return a comment-style marker for truncated code in *language*.

    Languages without a known comment syntax get a bare marker.
    """
    markers = {
        "Python": "# ... truncated ...",
        "Java": "// ... truncated ...",
    }
    return markers.get(language, "... truncated ...")


def truncate_middle(lines, max_lines, language):
    """Cap *lines* at *max_lines* entries by cutting out the middle.

    Keeps the head and tail of the snippet and inserts a language-aware
    truncation marker between them. Lists already within the limit are
    returned unchanged.

    Fixes two defects in the previous version:
    - head (max_lines//2) + tail (max_lines//2) + marker produced
      max_lines + 1 lines for even max_lines, exceeding the cap;
    - for max_lines <= 1 the tail slice was lines[-0:], i.e. the whole
      list, so nothing was truncated.
    """
    if len(lines) <= max_lines:
        return lines

    marker = get_truncation_marker(language)

    # Reserve one slot for the marker so the result never exceeds
    # max_lines (the marker itself is the floor for degenerate caps).
    budget = max(max_lines - 1, 0)
    tail_len = budget // 2
    head_len = budget - tail_len  # head gets the extra line when budget is odd

    head = lines[:head_len]
    # Guard the slice: lines[-0:] would be the whole list, not an empty tail.
    tail = lines[len(lines) - tail_len:] if tail_len else []

    return head + [marker] + tail


def normalize_code(code, language):
    """Clean and length-normalize one code snippet.

    Returns None when the cleaned snippet has fewer than MIN_LINES lines
    (the row is filtered out downstream); otherwise returns the snippet —
    middle-truncated to MAX_LINES when too long — as a single
    newline-joined string.
    """
    cleaned = get_clean_lines(code)

    if len(cleaned) < MIN_LINES:
        return None

    kept = (
        truncate_middle(cleaned, MAX_LINES, language)
        if len(cleaned) > MAX_LINES
        else cleaned
    )
    return "\n".join(kept)

# ===============================
# MAIN PIPELINE
# ===============================
def main():
    """Step 1 of the pipeline: language-aware code length normalization.

    Reads the raw CSV, drops rows whose code is too short, middle-truncates
    overly long code, records line-count metadata, and writes the result.
    """
    print("Loading raw dataset...")
    frame = pd.read_csv(INPUT_PATH)
    print(f"Initial dataset size: {len(frame)} rows")

    # -------------------------------
    # Column validation — fail fast on a bad schema.
    # -------------------------------
    missing = [col for col in ["Code_Text", "Language"] if col not in frame.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # -------------------------------
    # Normalize code length
    # -------------------------------
    print("Normalizing code length (language-aware)...")

    frame["normalized_code"] = frame.apply(
        lambda row: normalize_code(row["Code_Text"], row["Language"]),
        axis=1,
    )

    # normalize_code returns None for too-short snippets; drop those rows.
    frame = frame[frame["normalized_code"].notnull()].reset_index(drop=True)

    # -------------------------------
    # Metadata for analysis/debugging
    # -------------------------------
    def count_lines(snippet):
        # One shared counter for both the original and normalized columns.
        return len(get_clean_lines(snippet))

    frame["original_line_count"] = frame["Code_Text"].apply(count_lines)
    frame["normalized_line_count"] = frame["normalized_code"].apply(count_lines)

    print(f"Final dataset size after normalization: {len(frame)} rows")

    # -------------------------------
    # Save output
    # -------------------------------
    print("Saving processed dataset...")
    frame.to_csv(OUTPUT_PATH, index=False)

    print("Step 1 completed successfully ✅")
    print(f"Output file saved at: {OUTPUT_PATH}")


if __name__ == "__main__":
    main()