Spaces:

joshnavip
/

ai-code-detection

Runtime error

App Files Files Community

ai-code-detection / preprocessing /step4_sanity_check.py

joshnavip

Initial commit: AI code detection project (without binary files)

b144cb7 21 days ago

raw

history blame contribute delete

2.76 kB

	import pandas as pd

	# CSV file loaded by main(); a relative path, so it is resolved against the
	# current working directory of the process running this script.
	DATA_PATH = "dataset/processed/dataset_step2_cleaned.csv"

	def _code_as_text(df):
	    """Return ``normalized_code`` as a string Series, or ``None`` if absent.

	    Missing cells become empty strings so that every row has a well-defined
	    line count and the ``.str`` accessor never meets a non-string value.
	    """
	    if "normalized_code" not in df.columns:
	        return None
	    code = df["normalized_code"]
	    # Go through object dtype so the "" fill cannot clash with the column dtype.
	    return code.astype(object).where(code.notna(), "").astype(str)


	def _sorted_labels(labels):
	    """Return the distinct non-null labels as plain Python values, sorted."""
	    distinct = labels.dropna().drop_duplicates().tolist()
	    try:
	        return sorted(distinct)
	    except TypeError:
	        # Mixed types (e.g. ints and strings) have no natural ordering.
	        return sorted(distinct, key=str)


	def main(data_path=None):
	    """Print a sanity-check report for the dataset CSV.

	    The report covers: row/column totals, null counts for the critical
	    columns, empty and very short code samples, the set of labels, line-count
	    statistics (when those columns exist) and a per-language summary.
	    Sections whose input column is missing are reported as such instead of
	    aborting the whole report.

	    Args:
	        data_path: Path (or file-like object) passed to ``pandas.read_csv``.
	            When ``None`` (the default) the module-level ``DATA_PATH`` is used.

	    Returns:
	        None. All results are written to standard output.

	    Raises:
	        Whatever ``pandas.read_csv`` raises for an unreadable input
	        (for example ``FileNotFoundError``).
	    """
	    df = pd.read_csv(DATA_PATH if data_path is None else data_path)

	    # -------------------------------
	    # Handle label column safely
	    # -------------------------------
	    # The verbose header is accepted as an alias for "Label".
	    verbose_label = "Label (0- HUMAN, 1-AI)"
	    if "Label" not in df.columns:
	        if verbose_label in df.columns:
	            df = df.rename(columns={verbose_label: "Label"})
	        else:
	            print("⚠️ Label column not found (labels may be added later).")
	    has_label = "Label" in df.columns

	    print("\n🧪 SANITY CHECK REPORT\n")

	    print(f"Total samples: {len(df)}")
	    print(f"Total columns: {len(df.columns)}\n")

	    # -------------------------------
	    # Critical column null checks
	    # -------------------------------
	    print("🔍 Null value check (critical columns):")
	    critical_cols = ["normalized_code", "Language"]
	    if has_label:
	        critical_cols.append("Label")

	    for col in critical_cols:
	        if col in df.columns:
	            print(f" {col}: {df[col].isnull().sum()} null values")
	        else:
	            print(f" {col}: COLUMN NOT FOUND")

	    # -------------------------------
	    # Code quality checks
	    # -------------------------------
	    print("\n🧾 Code quality check:")
	    code_text = _code_as_text(df)
	    if code_text is None:
	        line_counts = None
	        print(" normalized_code column not found; skipping code quality checks.")
	    else:
	        # Computed once and reused by the per-language summary below.
	        line_counts = code_text.str.count("\n") + 1
	        # Null rows are already reported by the null check above, so only
	        # rows that hold a value consisting of whitespace count as empty.
	        blank = code_text.str.strip().eq("") & df["normalized_code"].notna()
	        print(f" Empty normalized_code rows: {blank.sum()}")
	        print(f" Very short code (<3 lines): {(line_counts < 3).sum()}")

	    # -------------------------------
	    # Label sanity
	    # -------------------------------
	    if has_label:
	        print("\n🏷️ Label check:")
	        print(f" Unique labels found: {_sorted_labels(df['Label'])}")

	    # -------------------------------
	    # Line count statistics
	    # -------------------------------
	    print("\n📏 Line count statistics:")
	    stat_cols = ["original_line_count", "normalized_line_count"]
	    if all(col in df.columns for col in stat_cols):
	        print(df[stat_cols].describe())
	    else:
	        print(" Line count columns not found.")

	    # -------------------------------
	    # Language-wise sanity summary
	    # -------------------------------
	    print("\n🌐 Language-wise summary:")
	    if "Language" not in df.columns:
	        print(" Language column not found.")
	    else:
	        languages = df["Language"]
	        for lang in languages.unique():
	            # NaN never compares equal to itself, so rows with a missing
	            # language have to be selected with isna() rather than ==.
	            selected = languages.isna() if pd.isna(lang) else languages == lang
	            print(f"\nLanguage: {lang}")
	            print(f" Samples: {int(selected.sum())}")
	            if line_counts is not None:
	                short = int((line_counts[selected] < 3).sum())
	                print(f" Very short code (<3 lines): {short}")

	    print("\nSanity check completed successfully ✅")

	if __name__ == "__main__":
	    # Produce the report only when run as a script, not when imported.
	    main()