Spaces:

joshnavip
/

ai-code-detection

Runtime error

App Files Files Community

ai-code-detection / preprocessing /step1_length_normalization.py

joshnavip

Initial commit: AI code detection project (without binary files)

b144cb7 11 days ago

raw

history blame contribute delete

3.09 kB

	import pandas as pd

	# ===============================
	# PATHS
	# ===============================
	# CSV loaded by main() with pd.read_csv; main() requires it to contain the
	# "Code_Text" and "Language" columns.
	INPUT_PATH = "dataset/raw/raw_dataset.csv"
	# CSV written by main() with DataFrame.to_csv (index column omitted).
	OUTPUT_PATH = "dataset/processed/dataset_step1_length_normalized.csv"

	# ===============================
	# PARAMETERS
	# ===============================
	# normalize_code() returns None for snippets with fewer than this many lines
	# (counted after blank leading/trailing lines are removed); main() drops them.
	MIN_LINES = 4
	# normalize_code() hands snippets with more than this many lines to
	# truncate_middle(), which replaces their middle with a marker line.
	MAX_LINES = 100

	# ===============================
	# UTILITY FUNCTIONS
	# ===============================
	def get_clean_lines(code):
	    """Split *code* into lines, dropping blank lines at both ends.

	    A line counts as blank when it is empty or whitespace-only. Blank lines
	    in the interior of the snippet are kept, and the surviving lines are
	    returned unmodified (their own leading/trailing whitespace is preserved).

	    Args:
	        code: The source text. Any non-string value (for example ``None`` or
	            a float NaN) is treated as "no code".

	    Returns:
	        A list of lines without line terminators; empty for non-string or
	        all-blank input.
	    """
	    if not isinstance(code, str):
	        return []

	    lines = code.splitlines()

	    # Find the first and last non-blank lines and slice once. Repeated
	    # ``lines.pop(0)`` shifts the whole list on every call, which is
	    # quadratic for inputs with many leading blank lines.
	    start = 0
	    end = len(lines)
	    while start < end and not lines[start].strip():
	        start += 1
	    while end > start and not lines[end - 1].strip():
	        end -= 1

	    return lines[start:end]


	def get_truncation_marker(language):
	    """Return the placeholder line that stands in for removed code.

	    The lookup is an exact, case-sensitive comparison: "Python" yields a
	    ``#`` comment and "Java" a ``//`` comment. Every other value falls back
	    to the bare marker text with no comment prefix.
	    """
	    known_markers = (
	        ("Python", "# ... truncated ..."),
	        ("Java", "// ... truncated ..."),
	    )
	    for known_language, marker_text in known_markers:
	        if language == known_language:
	            return marker_text
	    return "... truncated ..."


	def truncate_middle(lines, max_lines, language):
	    """Shorten *lines* by replacing its middle section with a marker line.

	    Args:
	        lines: List of source lines.
	        max_lines: Length threshold. Lists with at most this many entries
	            are returned as-is (the same list object, not a copy).
	        language: Passed to get_truncation_marker() to pick the marker text.

	    Returns:
	        ``lines`` itself when it is short enough. Otherwise a new list made
	        of the first ``max_lines // 2`` lines, one marker line, and the last
	        ``max_lines // 2`` lines. The marker is extra, so the result holds
	        ``2 * (max_lines // 2) + 1`` entries.
	    """
	    if len(lines) <= max_lines:
	        return lines

	    # Clamp so a zero or negative budget keeps no source lines at all.
	    half = max(max_lines // 2, 0)

	    head = lines[:half]
	    # Index the tail from the front: with half == 0 the expression
	    # ``lines[-half:]`` is ``lines[0:]``, i.e. the whole list, which would
	    # return the untruncated input after the marker.
	    tail = lines[len(lines) - half:]
	    marker = get_truncation_marker(language)

	    return head + [marker] + tail


	def normalize_code(code, language):
	    """Bring one snippet into the accepted line-count range.

	    The snippet is split with get_clean_lines(). Snippets with fewer than
	    MIN_LINES lines are rejected; snippets with more than MAX_LINES lines
	    are shortened with truncate_middle().

	    Returns:
	        The kept lines joined with "\\n", or None when the snippet is too
	        short.
	    """
	    cleaned = get_clean_lines(code)
	    line_total = len(cleaned)

	    # Too short to keep: signal the caller to drop this row.
	    if line_total < MIN_LINES:
	        return None

	    if line_total > MAX_LINES:
	        kept = truncate_middle(cleaned, MAX_LINES, language)
	    else:
	        kept = cleaned

	    return "\n".join(kept)

	# ===============================
	# MAIN PIPELINE
	# ===============================
	def main():
	    """Run step 1 of preprocessing: length-normalize every code snippet.

	    Reads INPUT_PATH, normalizes the "Code_Text" column with
	    normalize_code() (using "Language" to pick the truncation marker), drops
	    rows whose snippet was rejected, adds line-count metadata columns, and
	    writes the result to OUTPUT_PATH.

	    Raises:
	        ValueError: If the input CSV lacks "Code_Text" or "Language".
	    """
	    # Local import: only this function needs it.
	    from pathlib import Path

	    print("Loading raw dataset...")
	    df = pd.read_csv(INPUT_PATH)
	    print(f"Initial dataset size: {len(df)} rows")

	    # -------------------------------
	    # Column validation
	    # -------------------------------
	    required_cols = ["Code_Text", "Language"]
	    missing = [c for c in required_cols if c not in df.columns]
	    if missing:
	        raise ValueError(f"Missing required columns: {missing}")

	    # -------------------------------
	    # Normalize code length
	    # -------------------------------
	    print("Normalizing code length (language-aware)...")

	    # Iterate the two columns directly instead of df.apply(axis=1): row-wise
	    # apply builds a Series per row, and on an empty frame it can hand back
	    # a DataFrame rather than a Series, which breaks the column assignment.
	    df["normalized_code"] = [
	        normalize_code(code, language)
	        for code, language in zip(df["Code_Text"], df["Language"])
	    ]

	    # Drop rows filtered out by normalization (normalize_code returned None)
	    df = df[df["normalized_code"].notnull()].reset_index(drop=True)

	    # -------------------------------
	    # Metadata for analysis/debugging
	    # -------------------------------
	    df["original_line_count"] = df["Code_Text"].apply(
	        lambda c: len(get_clean_lines(c))
	    )

	    df["normalized_line_count"] = df["normalized_code"].apply(
	        lambda c: len(get_clean_lines(c))
	    )

	    print(f"Final dataset size after normalization: {len(df)} rows")

	    # -------------------------------
	    # Save output
	    # -------------------------------
	    print("Saving processed dataset...")
	    # to_csv does not create missing directories, so make sure the target
	    # folder exists before writing.
	    Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)
	    df.to_csv(OUTPUT_PATH, index=False)

	    print("Step 1 completed successfully ✅")
	    print(f"Output file saved at: {OUTPUT_PATH}")


	# Run the pipeline only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()