# ai-code-detection/classifier/inference.py
# Author: joshnavip
# Initial commit: AI code detection project (without binary files)
# Commit: b144cb7
import sys
import os
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(PROJECT_ROOT)
import numpy as np
import pandas as pd
import xgboost as xgb
# -------------------------------
# IMPORT FEATURE EXTRACTORS
# -------------------------------
from featureextraction.step1_statistical_extraction.step1_statistical_extraction import extract_features as extract_stat
from featureextraction.step2_ast_extraction.step2_ast_extraction import extract_ast_features
from featureextraction.step3_stylometry_extraction.step3_stylometry_extraction import extract_stylometry_features
from featureextraction.semantic_features.unixcoder_embedding import get_unixcoder_embedding
# XAI modules
from xai.shaplayer import shap_explain
from xai.grouping import group_shap_explanations
from xai.text_explainer import generate_text_explanation
# -------------------------------
# LOAD MODEL
# -------------------------------
# Resolve the model file against PROJECT_ROOT instead of the current working
# directory, so the script works no matter where it is launched from.
# NOTE(review): assumes the model file sits at <PROJECT_ROOT>/classifier/,
# which matches the original relative path when run from the project root.
MODEL_PATH = os.path.join(PROJECT_ROOT, "classifier", "xgboost_final_model.json")

# Module-level classifier shared by the prediction and explanation code below.
model = xgb.XGBClassifier()
model.load_model(MODEL_PATH)
# -------------------------------
# LANGUAGE ONE-HOT
# -------------------------------
def encode_language(language):
    """Return the two-element one-hot vector for a supported language.

    The lookup is case-insensitive. ``"python"`` maps to ``[1, 0]`` and
    ``"java"`` maps to ``[0, 1]``.

    Args:
        language: Language name as a string.

    Returns:
        A fresh NumPy array of length 2.

    Raises:
        ValueError: If the language is neither python nor java.
    """
    one_hot_by_language = {
        "python": (1, 0),
        "java": (0, 1),
    }
    normalized = language.lower()
    if normalized not in one_hot_by_language:
        raise ValueError("Language must be python or java")
    # Build a new array on every call so callers can never share/mutate state.
    return np.array(one_hot_by_language[normalized])
# -------------------------------
# BUILD FEATURES FROM CODE
# -------------------------------
def build_features_from_code(code, language):
    """Assemble the single-row feature matrix for one code snippet.

    The snippet is wrapped in a one-row DataFrame and passed to the
    statistical, AST and stylometry extractors. Their flattened outputs are
    concatenated with the language one-hot vector and the UniXcoder embedding,
    in that order.

    Args:
        code: Source code text to featurize.
        language: Language name; must be accepted by ``encode_language``.

    Returns:
        A 2-D NumPy array with exactly one row.

    Raises:
        ValueError: Propagated from ``encode_language`` for an unsupported
            language.
    """
    sample = pd.DataFrame({"normalized_code": [code], "Language": [language]})

    # Keep the extractor call order (stat -> AST -> stylometry): they all
    # receive the same frame and may not be independent of one another.
    stat_frame = extract_stat(sample)
    ast_frame = extract_ast_features(sample)
    style_frame = extract_stylometry_features(sample)

    # The list order fixes the column layout of the final vector.
    # NOTE(review): presumably this must match the layout used at training
    # time -- confirm against the training pipeline.
    feature_parts = [
        # The statistical output carries a "language" column that is not a
        # numeric feature; the language is encoded separately below.
        stat_frame.drop(columns=["language"]).values.flatten(),
        ast_frame.values.flatten(),
        style_frame.values.flatten(),
        encode_language(language),
        get_unixcoder_embedding(code),
    ]
    return np.hstack(feature_parts).reshape(1, -1)
# -------------------------------
# BASIC PREDICT FUNCTION
# -------------------------------
def predict_from_features(X_final):
    """Classify one feature row with the module-level model.

    Args:
        X_final: Feature matrix whose first row is the sample to classify.

    Returns:
        A ``(label_name, probability)`` tuple. ``label_name`` is ``"AI"`` when
        the predicted class is 1 and ``"Human"`` otherwise. ``probability`` is
        the model's probability for class 1 (the "AI" class), regardless of
        which label was predicted.
    """
    predicted_class = model.predict(X_final)[0]
    ai_probability = model.predict_proba(X_final)[0][1]
    if predicted_class == 1:
        return "AI", ai_probability
    return "Human", ai_probability
# -------------------------------
# INTERACTIVE CLI
# -------------------------------
def _read_code_until_sentinel(sentinel="END"):
    """Read stdin lines until *sentinel* or end-of-input; return them joined.

    A line counts as the sentinel when it equals *sentinel* after stripping
    surrounding whitespace. The sentinel line itself is not included.
    """
    collected = []
    while True:
        try:
            line = input()
        except EOFError:
            # stdin was closed (piped input or Ctrl-D) before the sentinel:
            # treat that as the end of the paste instead of crashing.
            break
        if line.strip() == sentinel:
            break
        collected.append(line)
    return "\n".join(collected)


def main():
    """Run the interactive classify-and-explain session on the terminal.

    Prompts for a language and a pasted code snippet, classifies the snippet,
    computes SHAP-based explanations and prints the results.

    Raises:
        ValueError: If the chosen language is not supported.
    """
    print("\n======================================")
    print(" AI vs Human Code Classification")
    print("======================================")
    language = input("Choose language (python/java): ").strip().lower()
    # Fail fast: reject an unsupported language before the user pastes code.
    encode_language(language)

    print("\nPaste your code below.")
    print("Type 'END' on a new line when finished.\n")
    code_input = _read_code_until_sentinel("END")

    # build features
    X_final = build_features_from_code(code_input, language)
    # predict
    label, prob = predict_from_features(X_final)
    # shap
    shap_result = shap_explain(model, X_final)
    # grouping
    grouped = group_shap_explanations(shap_result)
    # text explanation
    text_reason = generate_text_explanation(grouped, label, prob)

    print("\n========== RESULT ==========")
    print("Prediction :", label)
    # NOTE(review): prob is the class-1 ("AI") probability, so for a "Human"
    # prediction this prints a low number -- confirm whether 1 - prob is wanted.
    print("Confidence :", prob)
    print("\nTop SHAP features:")
    for e in shap_result:
        # Separate the index from the impact; without it the two numbers ran
        # together (e.g. "Feature 120.0345").
        print(f"Feature {e['feature_index']}: {e['impact']} ({e['pushes_toward']})")
    print("\nGrouped SHAP importance:", grouped)
    print("\nExplanation:\n")
    print(text_reason)


if __name__ == "__main__":
    main()