---
language:
- code
tags:
- python
- java
- cpp
- ai-detection
- code-analysis
- temporal-cnn
- codet5
metrics:
- f1: 0.9921
---
# ai_code_detect
### Architecture
- **Semantic Engine:** `Salesforce/codet5-base`
- **Statistical Extraction:** `microsoft/codebert-base-mlm` (computes per-token log-probability, log-rank, entropy, and rank-bucket indicators over up to 256 tokens)
- **Fusion Network:** 1D CNN for temporal feature extraction + Dense Feed-Forward Classifier
### Performance Metrics
Trained on a polyglot dataset (Python, Java, C++) to prevent single-language overfitting.
- **Training Validation F1:** 0.9861
- **Unseen SemEval-2026 Audit (F1):** 0.9921
- **Overall Accuracy:** 99.20%
### Requires:
- `transformers==4.35.2`
- `torch` and `huggingface_hub` (imported by the snippets below)
### How to use
To use this model in your own application, download the weights from this Hub repository and load them into the custom `TemporalFusionClassifier` architecture (defined in the Example section below).
```python
from huggingface_hub import hf_hub_download
import torch
from transformers import T5EncoderModel

# `TemporalFusionClassifier` is the custom nn.Module defined in the Example
# section below; it must be defined (or imported) before this snippet runs.
# The classifier wraps a CodeT5 encoder, so build that backbone first.
base_model = T5EncoderModel.from_pretrained("Salesforce/codet5-base")
weights_path = hf_hub_download(repo_id="santh-cpu/ai_code_detect", filename="pytorch_model.bin")
model = TemporalFusionClassifier(base_model)
# map_location="cpu" keeps loading from failing on machines without a GPU.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
model.load_state_dict(torch.load(weights_path, map_location="cpu"))
model.eval()
```
### Example
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import T5EncoderModel, AutoTokenizer, AutoModelForMaskedLM
from huggingface_hub import hf_hub_download
class TemporalFusionClassifier(nn.Module):
    """Fuse a mean-pooled encoder embedding with CNN features of per-token metrics.

    Args:
        base: encoder module exposing ``config.hidden_size`` and returning an
            object with a ``last_hidden_state`` attribute.
        metric_dim: number of metric channels per token fed to the 1D CNN.
    """

    def __init__(self, base, metric_dim=7):
        super().__init__()
        self.base = base
        encoder_width = base.config.hidden_size

        # Two conv stages along the token axis, then global average pooling
        # down to one 64-d vector per sample. Keep the layer order as is:
        # nn.Sequential names its parameters by position (metric_cnn.0, ...).
        conv_stages = [
            nn.Conv1d(metric_dim, 32, kernel_size=3, padding=1),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),
        ]
        self.metric_cnn = nn.Sequential(*conv_stages)

        # Feed-forward head over [pooled encoder state | CNN features] -> 1 logit.
        head_layers = [
            nn.Linear(encoder_width + 64, 1024),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(1024, 1),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, metric_vector):
        """Return one raw logit per sample.

        ``metric_vector`` is laid out as (batch, tokens, metric_dim); it is
        transposed to channels-first before entering the CNN.
        """
        encoder_out = self.base(input_ids=input_ids, attention_mask=attention_mask)
        token_states = encoder_out.last_hidden_state

        # Mean over non-padding tokens; the clamp avoids dividing by zero
        # when a mask row is entirely zero.
        keep = attention_mask.unsqueeze(-1).float()
        summed = (token_states * keep).sum(dim=1)
        token_count = keep.sum(dim=1).clamp(min=1e-4)
        sentence_vec = summed / token_count

        channels_first = metric_vector.transpose(1, 2)
        metric_vec = self.metric_cnn(channels_first).squeeze(-1)

        fused = torch.cat([sentence_vec, metric_vec], dim=1)
        return self.classifier(fused)
class AICodeDetector:
    """End-to-end detector: CodeBERT-MLM token statistics + CodeT5 encoder + fusion head."""

    def __init__(self, repo_id="santh-cpu/ai_code_detect"):
        """Load both tokenizers, both backbones and the fusion weights.

        Args:
            repo_id: Hub repository from which ``pytorch_model.bin`` is downloaded.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.max_len = 256

        # Masked-LM used only to score tokens; kept in eval mode.
        mlm_name = "microsoft/codebert-base-mlm"
        self.cb_tokenizer = AutoTokenizer.from_pretrained(mlm_name)
        mlm = AutoModelForMaskedLM.from_pretrained(mlm_name)
        self.cb_model = mlm.to(self.device).eval()

        # CodeT5 encoder that the fusion classifier wraps.
        encoder_name = "Salesforce/codet5-base"
        self.t5_tokenizer = AutoTokenizer.from_pretrained(encoder_name)
        encoder = T5EncoderModel.from_pretrained(encoder_name)

        checkpoint = hf_hub_download(repo_id=repo_id, filename="pytorch_model.bin")
        self.detector = TemporalFusionClassifier(encoder).to(self.device)
        # NOTE(review): torch.load unpickles the downloaded file; only point
        # repo_id at repositories you trust.
        state = torch.load(checkpoint, map_location=self.device)
        self.detector.load_state_dict(state)
        self.detector.eval()

    def analyze(self, code_snippet):
        """Classify ``code_snippet``.

        Returns:
            dict with ``"prediction"`` (``"AI Generated"`` or ``"Human Written"``)
            and ``"ai_probability"`` (percentage rounded to two decimals).
        """
        # Both tokenizers pad/truncate to the same fixed length.
        encode_opts = {
            "return_tensors": "pt",
            "padding": "max_length",
            "truncation": True,
            "max_length": self.max_len,
        }

        with torch.no_grad():
            mlm_batch = self.cb_tokenizer(code_snippet, **encode_opts).to(self.device)
            vocab_logits = self.cb_model(**mlm_batch).logits
            n_real = mlm_batch["attention_mask"][0].sum().item()

            # Rows past the real tokens stay zero.
            features = torch.zeros((1, self.max_len, 7), device=self.device)
            if n_real > 1:
                # Logits at position i are scored against the token at i + 1.
                step_logits = vocab_logits[0:1, : n_real - 1, :]
                targets = mlm_batch["input_ids"][0:1, 1:n_real]
                target_col = targets.unsqueeze(-1)

                dist = F.softmax(step_logits, dim=-1)
                entropy = -torch.sum(dist * torch.log(dist + 1e-9), dim=-1)

                # 1-based rank of each target token in the sorted vocabulary.
                order = torch.argsort(step_logits, dim=-1, descending=True)
                hit_index = (order == target_col).nonzero(as_tuple=True)[2]
                ranks = hit_index.view(1, -1) + 1

                target_logprob = torch.log(dist.gather(2, target_col).squeeze(-1) + 1e-9)
                per_token = torch.stack(
                    [
                        target_logprob,
                        torch.log(ranks.float()),
                        entropy,
                        (ranks <= 10).float(),
                        ((ranks > 10) & (ranks <= 100)).float(),
                        ((ranks > 100) & (ranks <= 1000)).float(),
                        (ranks > 1000).float(),
                    ],
                    dim=-1,
                )
                features[0, : per_token.size(1), :] = per_token[0]

            # Replace NaN/inf before the features reach the classifier.
            features = torch.nan_to_num(features, nan=0.0, posinf=10.0, neginf=-100.0)

            t5_batch = self.t5_tokenizer(code_snippet, **encode_opts).to(self.device)
            logit = self.detector(t5_batch["input_ids"], t5_batch["attention_mask"], features)
            prob = torch.sigmoid(logit).item()

        label = "AI Generated" if prob > 0.5 else "Human Written"
        return {"prediction": label, "ai_probability": round(prob * 100, 2)}
# Demo input for the detector: a C++ program held as plain text and passed
# to AICodeDetector.analyze() by the __main__ block below.
sample = """
#include <bits/stdc++.h>
using namespace std;
int main() {
ios::sync_with_stdio(0);
cin.tie(0);
int n, k, w;
string s;
cin >> n >> k >> w >> s;
vector<vector<long long>> pre(k, vector<long long>(n));
for (int i = 0; i < k; ++i) {
for (int j = 0; j < n; ++j) {
if (j % k == i && s[j] == '0')
pre[i][j]++;
if (j % k != i && s[j] == '1')
pre[i][j]++;
if (j > 0)
pre[i][j] += pre[i][j - 1];
}
}
for (int i = 0; i < w; ++i) {
int l, r;
cin >> l >> r;
l--, r--;
int m = (l + k - 1) % k;
cout << pre[m][r] - (l > 0 ? pre[m][l - 1] : 0) << "\n";
}
return 0;
}"""
if __name__ == "__main__":
    # Script entry point: build the detector, score the bundled sample,
    # and print the resulting dict.
    report = AICodeDetector().analyze(sample)
    print("\n", report)
```