---
language:
- code
tags:
- python
- java
- cpp
- ai-detection
- code-analysis
- temporal-cnn
- codet5
metrics:
- f1: 0.9921
---

# ai_code_detect

### Architecture
- **Semantic Engine:** `Salesforce/codet5-base` (encoder only; hidden states are mask-aware mean-pooled)
- **Statistical Extraction:** `microsoft/codebert-base-mlm`, which computes per-token entropy and log-rank statistics over a 256-token window
- **Fusion Network:** 1D CNN for temporal feature extraction over the per-token metrics, followed by a dense feed-forward classifier (sketched below)
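
A rough sketch of the tensor flow, inferred from the reference implementation in the Example section (768 is the hidden size of `codet5-base`; the seven metric channels are log-probability, log-rank, entropy, and four rank-bucket indicators):

```python
# Tensor flow through TemporalFusionClassifier (batch size 1, max_len 256):
#   CodeT5 encoder : input_ids (1, 256) -> hidden states (1, 256, 768)
#                    -> mask-aware mean pooling -> (1, 768)
#   Metric CNN     : per-token metrics (1, 256, 7) -> transpose -> (1, 7, 256)
#                    -> Conv1d/BatchNorm/pool stack -> (1, 64)
#   Classifier     : concat -> (1, 768 + 64) -> MLP -> (1, 1) logit -> sigmoid
```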

### Performance Metrics
Trained on a polyglot dataset (Python, Java, C++) to prevent single-language overfitting.
- **Training Validation F1:** 0.9861
- **Unseen SemEval-2026 Audit F1:** 0.9921
- **Overall Accuracy:** 99.20%

### Requirements
- `transformers==4.35.2`
- `torch` and `huggingface_hub` (no version pinned; both are imported by the examples below)

### How to use
To use this model in your own application, download the weights directly from this hub and load them into the custom `TemporalFusionClassifier` architecture, whose full definition appears in the Example section below.

```python
from huggingface_hub import hf_hub_download
from transformers import T5EncoderModel
import torch

# Download the fine-tuned classifier weights from this repo.
weights_path = hf_hub_download(repo_id="santh-cpu/ai_code_detect", filename="pytorch_model.bin")

# The base model is the CodeT5 encoder; TemporalFusionClassifier is
# defined in the Example section below.
base_model = T5EncoderModel.from_pretrained("Salesforce/codet5-base")
model = TemporalFusionClassifier(base_model)
model.load_state_dict(torch.load(weights_path, map_location="cpu"))
model.eval()
```
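
`map_location="cpu"` keeps the load from failing on CPU-only machines; move the model to a GPU afterwards with `model.to("cuda")` if one is available.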

### Example
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import T5EncoderModel, AutoTokenizer, AutoModelForMaskedLM
from huggingface_hub import hf_hub_download

class TemporalFusionClassifier(nn.Module):
    """Fuses the CodeT5 semantic embedding with temporal CNN features
    computed from seven per-token statistical metrics."""

    def __init__(self, base, metric_dim=7):
        super().__init__()
        self.base = base
        h = base.config.hidden_size

        # 1D CNN over the token axis of the metric channels.
        self.metric_cnn = nn.Sequential(
            nn.Conv1d(metric_dim, 32, 3, padding=1),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(32, 64, 3, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1)
        )

        # Dense head over the concatenated semantic (h) and CNN (64) features.
        self.classifier = nn.Sequential(
            nn.Linear(h + 64, 1024),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(1024, 1)
        )

    def forward(self, input_ids, attention_mask, metric_vector):
        out = self.base(input_ids=input_ids, attention_mask=attention_mask)
        hidden = out.last_hidden_state

        # Mask-aware mean pooling over the encoder hidden states.
        mask = attention_mask.unsqueeze(-1).float()
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-4)

        # (batch, seq, channels) -> (batch, channels, seq) for Conv1d.
        cnn_features = self.metric_cnn(metric_vector.transpose(1, 2)).squeeze(-1)
        return self.classifier(torch.cat([pooled, cnn_features], dim=1))

class AICodeDetector:
    def __init__(self, repo_id="santh-cpu/ai_code_detect"):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.max_len = 256

        # CodeBERT MLM produces the per-token statistical metrics.
        self.cb_tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base-mlm")
        self.cb_model = AutoModelForMaskedLM.from_pretrained("microsoft/codebert-base-mlm").to(self.device).eval()

        # CodeT5 encoder produces the semantic embedding.
        self.t5_tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-base")
        base_t5 = T5EncoderModel.from_pretrained("Salesforce/codet5-base")

        weights_path = hf_hub_download(repo_id=repo_id, filename="pytorch_model.bin")
        self.detector = TemporalFusionClassifier(base_t5).to(self.device)
        self.detector.load_state_dict(torch.load(weights_path, map_location=self.device))
        self.detector.eval()

    def analyze(self, code_snippet):
        with torch.no_grad():
            # Score the snippet with CodeBERT to get per-position logits.
            cb_in = self.cb_tokenizer(code_snippet, return_tensors="pt", padding="max_length", truncation=True, max_length=self.max_len).to(self.device)
            logits = self.cb_model(**cb_in).logits

            seq_len = cb_in["attention_mask"][0].sum().item()
            metrics = torch.zeros((1, self.max_len, 7), device=self.device)

            if seq_len > 1:
                # Align logits at position i with the token at position i + 1.
                seq_logits = logits[0:1, :seq_len-1, :]
                seq_labels = cb_in["input_ids"][0:1, 1:seq_len]
                probs = F.softmax(seq_logits, dim=-1)

                # Shannon entropy of the predictive distribution per token.
                entropy = -torch.sum(probs * torch.log(probs + 1e-9), dim=-1)
                # 1-based rank of the observed token in the sorted vocabulary.
                ranks = (torch.argsort(seq_logits, dim=-1, descending=True) == seq_labels.unsqueeze(-1)).nonzero(as_tuple=True)[2].view(1, -1) + 1

                # Seven channels: log-prob, log-rank, entropy, four rank buckets.
                token_metrics = torch.stack([
                    torch.log(probs.gather(2, seq_labels.unsqueeze(-1)).squeeze(-1) + 1e-9),
                    torch.log(ranks.float()),
                    entropy,
                    (ranks <= 10).float(),
                    ((ranks > 10) & (ranks <= 100)).float(),
                    ((ranks > 100) & (ranks <= 1000)).float(),
                    (ranks > 1000).float()
                ], dim=-1)
                metrics[0, :token_metrics.size(1), :] = token_metrics[0]

            clean_metrics = torch.nan_to_num(metrics, nan=0.0, posinf=10.0, neginf=-100.0)
            t5_in = self.t5_tokenizer(code_snippet, return_tensors="pt", padding="max_length", truncation=True, max_length=self.max_len).to(self.device)
            prob = torch.sigmoid(self.detector(t5_in["input_ids"], t5_in["attention_mask"], clean_metrics)).item()

        return {"prediction": "AI Generated" if prob > 0.5 else "Human Written", "ai_probability": round(prob * 100, 2)}

# Sample input: a C++ solution (raw string so the "\n" escape stays literal).
sample = r"""
#include <bits/stdc++.h>
using namespace std;

int main() {
    ios::sync_with_stdio(0);
    cin.tie(0);

    int n, k, w;
    string s;
    cin >> n >> k >> w >> s;

    vector<vector<long long>> pre(k, vector<long long>(n));

    for (int i = 0; i < k; ++i) {
        for (int j = 0; j < n; ++j) {
            if (j % k == i && s[j] == '0')
                pre[i][j]++;

            if (j % k != i && s[j] == '1')
                pre[i][j]++;

            if (j > 0)
                pre[i][j] += pre[i][j - 1];
        }
    }

    for (int i = 0; i < w; ++i) {
        int l, r;
        cin >> l >> r;
        l--, r--;

        int m = (l + k - 1) % k;

        cout << pre[m][r] - (l > 0 ? pre[m][l - 1] : 0) << "\n";
    }

    return 0;
}"""

if __name__ == "__main__":
    detector = AICodeDetector()
    print("\n", detector.analyze(sample))
```
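
`analyze` returns a dictionary with a binary `prediction` (thresholded at 0.5 on the sigmoid output) and `ai_probability`, the sigmoid score expressed as a percentage. Note that inputs longer than 256 tokens are truncated before scoring.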