Spaces:
Runtime error
Runtime error
File size: 3,087 Bytes
b144cb7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 | import pandas as pd
# ===============================
# PATHS
# ===============================
# Relative path of the raw CSV that main() loads with pd.read_csv.
INPUT_PATH = "dataset/raw/raw_dataset.csv"
# Relative path of the CSV that main() writes after normalization.
OUTPUT_PATH = "dataset/processed/dataset_step1_length_normalized.csv"
# ===============================
# PARAMETERS
# ===============================
# Snippets with fewer lines than this (after blank edge lines are trimmed)
# make normalize_code return None, and main() drops those rows.
MIN_LINES = 4
# Snippets with more lines than this are handed to truncate_middle.
MAX_LINES = 100
# ===============================
# UTILITY FUNCTIONS
# ===============================
def get_clean_lines(code):
    """Split *code* into lines and drop blank lines at both ends.

    A line counts as blank when it is empty or whitespace-only. Blank
    lines between non-blank lines are kept, and the kept lines are
    returned unmodified (no stripping of their own whitespace).

    Args:
        code: The source snippet. Any non-string value (for example a
            missing cell) is treated as having no lines.

    Returns:
        A list of the remaining lines; empty when *code* is not a string
        or contains only blank lines.
    """
    if not isinstance(code, str):
        return []

    rows = code.splitlines()

    # Move two cursors inward past the blank edges, then slice once.
    first = 0
    last = len(rows)
    while first < last and not rows[first].strip():
        first += 1
    while last > first and not rows[last - 1].strip():
        last -= 1

    return rows[first:last]
def get_truncation_marker(language):
    """Return the placeholder line that stands in for removed code.

    The marker is a comment in the snippet's language where one is known:
    ``#`` for "Python", ``//`` for "Java". Any other value (including
    non-string values) gets the bare text without a comment prefix.
    The comparison is case-sensitive.

    Args:
        language: Language label of the snippet.

    Returns:
        The marker string.
    """
    if language == "Python":
        return "# ... truncated ..."
    elif language == "Java":
        return "// ... truncated ..."
    else:
        return "... truncated ..."


def truncate_middle(lines, max_lines, language):
    """Shorten *lines* to at most *max_lines* by cutting out the middle.

    The start and end of the snippet are kept and the removed middle is
    replaced by a single marker line from ``get_truncation_marker``. The
    marker counts towards the limit, so the result never has more than
    *max_lines* entries. When the remaining budget is odd, the extra line
    goes to the head.

    Args:
        lines: List of source lines. It is not modified.
        max_lines: Maximum number of lines allowed in the result.
        language: Language label used to pick the marker.

    Returns:
        *lines* itself when it already fits; otherwise a new list made of
        the head, the marker, and the tail. If *max_lines* is zero or
        negative there is no room even for the marker and an empty list
        is returned.
    """
    if len(lines) <= max_lines:
        return lines
    if max_lines <= 0:
        return []

    # Reserve one slot for the marker so the cap is honoured exactly.
    keep = max_lines - 1
    tail_count = keep // 2
    head_count = keep - tail_count

    head = lines[:head_count]
    # Slice from an explicit start index: lines[-0:] would be the whole
    # list rather than an empty tail.
    tail = lines[len(lines) - tail_count:]
    marker = get_truncation_marker(language)
    return head + [marker] + tail
def normalize_code(code, language):
    """Bring one code snippet into the accepted line-count range.

    Blank lines at the start and end are removed first (via
    ``get_clean_lines``). Snippets left with fewer than ``MIN_LINES``
    lines are rejected; snippets with more than ``MAX_LINES`` lines are
    shortened by ``truncate_middle``.

    Args:
        code: The raw snippet text (non-strings yield no lines).
        language: Language label, forwarded to ``truncate_middle``.

    Returns:
        The resulting lines joined with ``"\\n"``, or ``None`` when the
        snippet is too short.
    """
    cleaned = get_clean_lines(code)
    line_total = len(cleaned)

    # Too short to keep: signal the caller to drop this row.
    if line_total < MIN_LINES:
        return None

    if line_total <= MAX_LINES:
        kept = cleaned
    else:
        kept = truncate_middle(cleaned, MAX_LINES, language)
    return "\n".join(kept)
# ===============================
# MAIN PIPELINE
# ===============================
def main():
    """Run step 1 of the pipeline: length-normalize the raw dataset.

    Reads ``INPUT_PATH`` as CSV, checks that the ``Code_Text`` and
    ``Language`` columns are present, normalizes every snippet with
    ``normalize_code``, drops the rows for which it returned ``None``,
    adds two line-count columns for analysis, and writes the result to
    ``OUTPUT_PATH`` (creating its parent directory if needed).

    Raises:
        ValueError: If a required column is missing from the input CSV.
    """
    # Local import keeps this function self-contained.
    from pathlib import Path

    print("Loading raw dataset...")
    df = pd.read_csv(INPUT_PATH)
    print(f"Initial dataset size: {len(df)} rows")

    # -------------------------------
    # Column validation
    # -------------------------------
    required_cols = ["Code_Text", "Language"]
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # -------------------------------
    # Normalize code length
    # -------------------------------
    print("Normalizing code length (language-aware)...")
    # Built as a plain list rather than df.apply(..., axis=1): row-wise
    # apply on an empty frame returns a DataFrame, which cannot be
    # assigned to a single column, so a header-only CSV would crash.
    df["normalized_code"] = [
        normalize_code(code, language)
        for code, language in zip(df["Code_Text"], df["Language"])
    ]

    # Drop rows filtered out by normalization
    df = df[df["normalized_code"].notnull()].reset_index(drop=True)

    # -------------------------------
    # Metadata for analysis/debugging
    # -------------------------------
    df["original_line_count"] = df["Code_Text"].apply(
        lambda c: len(get_clean_lines(c))
    )
    df["normalized_line_count"] = df["normalized_code"].apply(
        lambda c: len(get_clean_lines(c))
    )
    print(f"Final dataset size after normalization: {len(df)} rows")

    # -------------------------------
    # Save output
    # -------------------------------
    print("Saving processed dataset...")
    # to_csv does not create missing directories; make sure the target
    # folder exists so a fresh checkout does not fail at the last step.
    Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(OUTPUT_PATH, index=False)
    print("Step 1 completed successfully ✅")
    print(f"Output file saved at: {OUTPUT_PATH}")


if __name__ == "__main__":
    main()
|