| """ |
| Inference script for UnixCoder-MIL |
| ===================================== |
| Usage: Simply run this script with your code samples |
| """ |
|
|
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
| from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForSequenceClassification |
| from safetensors.torch import load_file |
| import numpy as np |
|
|
| DEVICE = "cuda" if torch.cuda.is_available() else "cpu" |
| CLASS_NAMES = ["Human", "AI-Generated", "Hybrid", "Adversarial"] |
|
|
class MilUnixCoder(nn.Module):
    """UnixCoder backbone with multiple-instance-learning (MIL) classification.

    Long inputs are sliced into overlapping windows; each window is encoded
    independently and the per-window logits are max-pooled, so the strongest
    evidence in any window determines the prediction.
    """

    def __init__(self, model_name="microsoft/unixcoder-base", chunk_size=512, stride=256, max_chunks=16):
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_name)
        self.unixcoder = AutoModel.from_pretrained(model_name)
        self.chunk_size = chunk_size
        self.stride = stride
        self.max_chunks = max_chunks
        # 4-way head: must stay in sync with CLASS_NAMES.
        self.classifier = nn.Linear(self.config.hidden_size, 4)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask=None):
        """Return (batch, 4) logits, max-pooled over encoded windows."""
        batch, seq_len = input_ids.size()
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        if seq_len > self.chunk_size:
            # Overlapping windows (stride < chunk_size); unfold adds a
            # window dimension at axis 1. Windows beyond max_chunks are
            # discarded, as is any tail shorter than a full window.
            win_ids = input_ids.unfold(1, self.chunk_size, self.stride)
            win_mask = attention_mask.unfold(1, self.chunk_size, self.stride)
            n_windows = min(win_ids.size(1), self.max_chunks)
            flat_ids = win_ids[:, :n_windows, :].contiguous().view(-1, self.chunk_size)
            flat_mask = win_mask[:, :n_windows, :].contiguous().view(-1, self.chunk_size)
        else:
            n_windows = 1
            flat_ids = input_ids
            flat_mask = attention_mask

        encoded = self.unixcoder(input_ids=flat_ids, attention_mask=flat_mask)
        # Pool each window via its [CLS] (first-token) representation.
        cls_repr = encoded.last_hidden_state[:, 0, :]
        window_logits = self.classifier(self.dropout(cls_repr))
        # MIL aggregation: elementwise max over the window dimension.
        return window_logits.view(batch, n_windows, -1).max(dim=1).values
|
|
def load_model():
    """Download the UnixCoder-MIL checkpoint and build the classifier.

    Returns:
        tuple: (model, tokenizer) — the model is moved to DEVICE and
        switched to eval mode.
    """
    from huggingface_hub import hf_hub_download

    repo = "YoungDSMLKZ/UnixCoder-MIL"
    tokenizer = AutoTokenizer.from_pretrained(repo)
    model = MilUnixCoder("microsoft/unixcoder-base")
    checkpoint = hf_hub_download(repo_id=repo, filename="model.safetensors")
    model.load_state_dict(load_file(checkpoint))
    model.to(DEVICE).eval()
    return model, tokenizer
|
|
def predict(code: str, model, tokenizer) -> dict:
    """Classify a single code string.

    Returns:
        dict: {"class": predicted label name, "confidence": softmax probability}.
    """
    encoded = tokenizer(
        code, return_tensors="pt", truncation=True, max_length=4096, padding=True
    ).to(DEVICE)
    with torch.no_grad():
        logits = model(encoded["input_ids"], encoded["attention_mask"])
    probabilities = F.softmax(logits, dim=-1)[0]
    best = int(torch.argmax(probabilities))
    return {"class": CLASS_NAMES[best], "confidence": probabilities[best].item()}
|
|
| if __name__ == "__main__": |
| print("Loading model...") |
| model, tokenizer = load_model() |
| |
| |
| test_code = """ |
| def hello_world(): |
| print("Hello, World!") |
| """ |
| |
| result = predict(test_code, model, tokenizer) |
| print(f"Predicted: {result['class']} (confidence: {result['confidence']:.2%})") |
|
|