"""Step 1 of dataset preparation: normalize the length of code samples.

Reads the raw CSV, drops samples that are too short, truncates the middle of
samples that are too long (inserting a language-appropriate marker line), and
writes the result together with line-count metadata.
"""

from pathlib import Path

import pandas as pd

# ===============================
# PATHS
# ===============================
INPUT_PATH = "dataset/raw/raw_dataset.csv"
OUTPUT_PATH = "dataset/processed/dataset_step1_length_normalized.csv"

# ===============================
# PARAMETERS
# ===============================
MIN_LINES = 4
MAX_LINES = 100

# Marker line inserted where the middle of a sample was removed, keyed by the
# lower-cased language name so the marker is a valid comment in that language.
_TRUNCATION_MARKERS = {
    "python": "# ... truncated ...",
    "java": "// ... truncated ...",
}
_DEFAULT_TRUNCATION_MARKER = "... truncated ..."


# ===============================
# UTILITY FUNCTIONS
# ===============================
def get_clean_lines(code):
    """Split *code* into lines, dropping leading and trailing blank lines.

    Args:
        code: The source text. Anything that is not a ``str`` (for example the
            NaN that pandas produces for an empty CSV cell) yields ``[]``.

    Returns:
        A new list of lines; blank lines in the interior are preserved.
    """
    if not isinstance(code, str):
        return []

    lines = code.splitlines()

    # Narrow a [start, end) window instead of popping from the front of the
    # list, which would cost O(n) per removed line.
    start, end = 0, len(lines)
    while start < end and not lines[start].strip():
        start += 1
    while end > start and not lines[end - 1].strip():
        end -= 1

    return lines[start:end]


def get_truncation_marker(language):
    """Return the marker line used to flag removed code for *language*.

    Matching ignores case and surrounding whitespace, so ``"python"`` and
    ``"Python "`` both get the ``#`` comment marker. Unknown or non-string
    languages (e.g. NaN) fall back to a plain-text marker.
    """
    if not isinstance(language, str):
        return _DEFAULT_TRUNCATION_MARKER
    key = language.strip().lower()
    return _TRUNCATION_MARKERS.get(key, _DEFAULT_TRUNCATION_MARKER)


def truncate_middle(lines, max_lines, language):
    """Shorten *lines* to at most *max_lines* by cutting out the middle.

    Args:
        lines: List of source lines.
        max_lines: Upper bound on the length of the returned list, including
            the marker line.
        language: Language name used to pick the marker comment syntax.

    Returns:
        *lines* itself when it already fits; otherwise a new list made of the
        head, one marker line, and the tail. When an odd number of original
        lines can be kept, the extra line goes to the head. For
        ``max_lines <= 1`` only the marker line is returned.
    """
    if len(lines) <= max_lines:
        return lines

    # One slot is reserved for the marker so the result never exceeds
    # max_lines.
    kept = max(max_lines - 1, 0)
    tail_count = kept // 2
    head_count = kept - tail_count

    head = lines[:head_count]
    # Index from the front: lines[-0:] would return the whole list when
    # tail_count is 0.
    tail = lines[len(lines) - tail_count:]

    return head + [get_truncation_marker(language)] + tail


def normalize_code(code, language, min_lines=None, max_lines=None):
    """Return *code* cleaned and length-normalized, or ``None`` if too short.

    Args:
        code: Raw source text (non-strings are treated as empty).
        language: Language name, used for the truncation marker.
        min_lines: Minimum number of lines required; defaults to ``MIN_LINES``.
        max_lines: Maximum number of lines kept; defaults to ``MAX_LINES``.

    Returns:
        The normalized code joined with ``"\\n"``, or ``None`` when the sample
        has fewer than *min_lines* lines after trimming blank edges.
    """
    # Resolve defaults at call time so changes to the module constants are
    # honoured.
    if min_lines is None:
        min_lines = MIN_LINES
    if max_lines is None:
        max_lines = MAX_LINES

    lines = get_clean_lines(code)
    if len(lines) < min_lines:
        return None

    return "\n".join(truncate_middle(lines, max_lines, language))


# ===============================
# MAIN PIPELINE
# ===============================
def main():
    """Run the length-normalization step from ``INPUT_PATH`` to ``OUTPUT_PATH``.

    Raises:
        ValueError: If the input CSV lacks the ``Code_Text`` or ``Language``
            column.
    """
    print("Loading raw dataset...")
    df = pd.read_csv(INPUT_PATH)

    print(f"Initial dataset size: {len(df)} rows")

    # -------------------------------
    # Column validation
    # -------------------------------
    required_cols = ["Code_Text", "Language"]
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # -------------------------------
    # Normalize code length
    # -------------------------------
    print("Normalizing code length (language-aware)...")

    # Build the column from a plain list: DataFrame.apply(axis=1) returns a
    # DataFrame instead of a Series on an empty frame, which breaks the
    # assignment.
    df["normalized_code"] = [
        normalize_code(code, language)
        for code, language in zip(df["Code_Text"], df["Language"])
    ]

    # Drop rows filtered out by normalization
    df = df[df["normalized_code"].notna()].reset_index(drop=True)

    # -------------------------------
    # Metadata for analysis/debugging
    # -------------------------------
    df["original_line_count"] = df["Code_Text"].apply(
        lambda c: len(get_clean_lines(c))
    )
    df["normalized_line_count"] = df["normalized_code"].apply(
        lambda c: len(get_clean_lines(c))
    )

    print(f"Final dataset size after normalization: {len(df)} rows")

    # -------------------------------
    # Save output
    # -------------------------------
    print("Saving processed dataset...")
    # Create the destination directory so a fresh checkout does not fail.
    Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(OUTPUT_PATH, index=False)

    print("Step 1 completed successfully ✅")
    print(f"Output file saved at: {OUTPUT_PATH}")


if __name__ == "__main__":
    main()