Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| # =============================== | |
| # PATHS | |
| # =============================== | |
# CSV read by main(); it must provide the "Code_Text" and "Language" columns.
INPUT_PATH = "dataset/raw/raw_dataset.csv"
# CSV written by main() once rows have been filtered and normalized.
OUTPUT_PATH = "dataset/processed/dataset_step1_length_normalized.csv"
| # =============================== | |
| # PARAMETERS | |
| # =============================== | |
# normalize_code() rejects (returns None for) snippets with fewer cleaned lines.
MIN_LINES = 4
# normalize_code() middle-truncates snippets with more cleaned lines than this.
MAX_LINES = 100
| # =============================== | |
| # UTILITY FUNCTIONS | |
| # =============================== | |
def get_clean_lines(code):
    """Split *code* into lines, dropping blank lines at both ends.

    A line counts as blank when it is empty or contains only whitespace.
    Blank lines in the interior of the snippet are kept, and the kept lines
    are returned unmodified (no per-line stripping).

    Args:
        code: The snippet text. Any non-string value (e.g. ``None`` or a
            float NaN from a missing CSV cell) is treated as "no code".

    Returns:
        A list of lines without line terminators; empty when *code* is not
        a string or holds nothing but blank lines.
    """
    if not isinstance(code, str):
        return []

    lines = code.splitlines()

    # Find the non-blank window by moving two indices and slice once.
    # Repeated list.pop(0) shifts the whole list on every call, which is
    # quadratic for snippets that start with many blank lines.
    start, end = 0, len(lines)
    while start < end and not lines[start].strip():
        start += 1
    while end > start and not lines[end - 1].strip():
        end -= 1
    return lines[start:end]
def get_truncation_marker(language):
    """Return the placeholder line that stands in for removed code.

    For ``"Python"`` the placeholder is prefixed with ``# `` and for
    ``"Java"`` with ``// ``, so it reads as a comment in that language.
    Any other value gets the bare placeholder text. Matching is exact and
    case-sensitive.
    """
    placeholder = "... truncated ..."
    comment_prefixes = (("Python", "# "), ("Java", "// "))
    for known_language, prefix in comment_prefixes:
        if language == known_language:
            return prefix + placeholder
    return placeholder
def truncate_middle(lines, max_lines, language):
    """Shorten *lines* to at most *max_lines* by cutting out the middle.

    The start and end of the snippet are kept and the removed middle is
    replaced by a single marker line obtained from
    ``get_truncation_marker(language)``. The marker counts towards
    *max_lines*, so the result is exactly *max_lines* long whenever
    truncation happens.

    Args:
        lines: List of code lines.
        max_lines: Maximum number of lines in the result, marker included.
        language: Language name used to choose the marker text.

    Returns:
        *lines* itself (not a copy) when it already fits; otherwise a new
        list made of the head, the marker, and the tail.

    Raises:
        ValueError: If truncation is required but *max_lines* is below 1,
            since not even the marker line would fit.
    """
    if len(lines) <= max_lines:
        return lines
    if max_lines < 1:
        raise ValueError(f"max_lines must be at least 1, got {max_lines}")

    # Reserve one slot for the marker so the output never exceeds max_lines.
    budget = max_lines - 1
    tail_count = budget // 2
    head_count = budget - tail_count  # head takes the extra line if budget is odd

    head = lines[:head_count]
    # Slice from an explicit start index: lines[-0:] would be the whole list
    # when tail_count is 0.
    tail = lines[len(lines) - tail_count:]
    marker = get_truncation_marker(language)
    return head + [marker] + tail
def normalize_code(code, language):
    """Apply the length filter and truncation to a single code snippet.

    The snippet is first cleaned with ``get_clean_lines``. Snippets with
    fewer than ``MIN_LINES`` cleaned lines are rejected; snippets with more
    than ``MAX_LINES`` cleaned lines are shortened with ``truncate_middle``.

    Returns:
        ``None`` for a rejected snippet, otherwise the kept lines joined
        with ``"\\n"`` (no trailing newline).
    """
    cleaned = get_clean_lines(code)
    line_total = len(cleaned)

    # Too short to be useful: signal the caller to drop this row.
    if line_total < MIN_LINES:
        return None

    kept = (
        truncate_middle(cleaned, MAX_LINES, language)
        if line_total > MAX_LINES
        else cleaned
    )
    return "\n".join(kept)
| # =============================== | |
| # MAIN PIPELINE | |
| # =============================== | |
def main():
    """Run step 1 of the pipeline: length-normalize the raw code dataset.

    Steps:
      1. Load the CSV at ``INPUT_PATH``.
      2. Check that the ``Code_Text`` and ``Language`` columns exist.
      3. Compute ``normalized_code`` for every row with ``normalize_code``
         and drop the rows for which it returned ``None``.
      4. Add ``original_line_count`` and ``normalized_line_count`` columns
         for analysis/debugging.
      5. Write the result to ``OUTPUT_PATH``, creating its parent directory
         if necessary.

    Raises:
        ValueError: If a required column is missing from the input CSV.
    """
    import os  # local import: only needed to prepare the output directory

    print("Loading raw dataset...")
    df = pd.read_csv(INPUT_PATH)
    print(f"Initial dataset size: {len(df)} rows")

    # -------------------------------
    # Column validation
    # -------------------------------
    required_cols = ["Code_Text", "Language"]
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # -------------------------------
    # Normalize code length
    # -------------------------------
    print("Normalizing code length (language-aware)...")
    # Build the column from a plain list rather than DataFrame.apply(axis=1):
    # on a frame with zero rows, apply() hands back a DataFrame instead of a
    # Series, which breaks the single-column assignment. Iterating the two
    # columns directly also avoids constructing a Series per row.
    df["normalized_code"] = [
        normalize_code(code, language)
        for code, language in zip(df["Code_Text"], df["Language"])
    ]

    # Drop rows filtered out by normalization
    df = df[df["normalized_code"].notnull()].reset_index(drop=True)

    # -------------------------------
    # Metadata for analysis/debugging
    # -------------------------------
    df["original_line_count"] = [
        len(get_clean_lines(code)) for code in df["Code_Text"]
    ]
    df["normalized_line_count"] = [
        len(get_clean_lines(code)) for code in df["normalized_code"]
    ]
    print(f"Final dataset size after normalization: {len(df)} rows")

    # -------------------------------
    # Save output
    # -------------------------------
    print("Saving processed dataset...")
    # to_csv does not create missing directories, so make sure the target
    # directory exists before writing.
    output_dir = os.path.dirname(OUTPUT_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(OUTPUT_PATH, index=False)
    print("Step 1 completed successfully ✅")
    print(f"Output file saved at: {OUTPUT_PATH}")
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()