YoungDSMLKZ
/

UnixCoder-MIL

Model card Files Files and versions

Eraly-ml committed on Jan 9

Commit

5e03908

·

verified ·

1 Parent(s): ab29cac

Add inference script

Files changed (1) hide show

inference.py +68 -0

inference.py ADDED Viewed

	@@ -0,0 +1,68 @@

+"""
+Inference script for UnixCoder-MIL
+=====================================
+Usage: run this script directly to classify the built-in example, or import
+`load_model` and `predict` to classify your own code samples.
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForSequenceClassification
+from safetensors.torch import load_file
+import numpy as np
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+CLASS_NAMES = ["Human", "AI-Generated", "Hybrid", "Adversarial"]
+class MilUnixCoder(nn.Module):
+    def __init__(self, model_name="microsoft/unixcoder-base", chunk_size=512, stride=256, max_chunks=16):
+        super().__init__()
+        self.config = AutoConfig.from_pretrained(model_name)
+        self.unixcoder = AutoModel.from_pretrained(model_name)
+        self.chunk_size, self.stride, self.max_chunks = chunk_size, stride, max_chunks
+        self.classifier = nn.Linear(self.config.hidden_size, 4)
+        self.dropout = nn.Dropout(0.1)
+    def forward(self, input_ids, attention_mask=None):
+        B, L = input_ids.size()
+        if attention_mask is None: attention_mask = torch.ones_like(input_ids)
+        if L > self.chunk_size:
+            c_ids = input_ids.unfold(1, self.chunk_size, self.stride)
+            c_mask = attention_mask.unfold(1, self.chunk_size, self.stride)
+            nc = min(c_ids.size(1), self.max_chunks)
+            flat_ids = c_ids[:,:nc,:].contiguous().view(-1, self.chunk_size)
+            flat_mask = c_mask[:,:nc,:].contiguous().view(-1, self.chunk_size)
+        else:
+            nc, flat_ids, flat_mask = 1, input_ids, attention_mask
+        out = self.unixcoder(input_ids=flat_ids, attention_mask=flat_mask)
+        logits = self.classifier(self.dropout(out.last_hidden_state[:, 0, :]))
+        return torch.max(logits.view(B, nc, -1), dim=1)[0]
+def load_model():
+    """Load the model and tokenizer"""
+    tokenizer = AutoTokenizer.from_pretrained("YoungDSMLKZ/UnixCoder-MIL")
+    model = MilUnixCoder("microsoft/unixcoder-base")
+    model.load_state_dict(load_file("YoungDSMLKZ/UnixCoder-MIL/model.safetensors"))
+    model.to(DEVICE).eval()
+    return model, tokenizer
+def predict(code: str, model, tokenizer) -> dict:
+    """Predict class for a single code sample"""
+    inputs = tokenizer(code, return_tensors="pt", truncation=True, max_length=4096, padding=True).to(DEVICE)
+    with torch.no_grad():
+        logits = model(inputs["input_ids"], inputs["attention_mask"])
+    probs = F.softmax(logits, dim=-1)[0]
+    pred = torch.argmax(probs).item()
+    return {"class": CLASS_NAMES[pred], "confidence": probs[pred].item()}
+if __name__ == "__main__":
+    print("Loading model...")
+    model, tokenizer = load_model()
+    # Example usage
+    test_code = """
+def hello_world():
+    print("Hello, World!")
+"""
+    result = predict(test_code, model, tokenizer)
+    print(f"Predicted: {result['class']} (confidence: {result['confidence']:.2%})")