| """ |
| Inference script for UnixCoder-MIL |
| ===================================== |
| Usage: Simply run this script with your code samples |
| """ |
|
|
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
| from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForSequenceClassification |
| from safetensors.torch import load_file |
| import numpy as np |
|
|
| DEVICE = "cuda" if torch.cuda.is_available() else "cpu" |
| CLASS_NAMES = ["Human", "AI-Generated", "Hybrid", "Adversarial"] |
|
|
class MilUnixCoder(nn.Module):
    """UnixCoder backbone with multiple-instance-learning (MIL) classification.

    Long inputs are sliced into overlapping windows; each window is encoded
    independently and the per-window logits are max-pooled, so the strongest
    evidence in any window determines the prediction.
    """

    def __init__(self, model_name="microsoft/unixcoder-base", chunk_size=512, stride=256, max_chunks=16):
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_name)
        self.unixcoder = AutoModel.from_pretrained(model_name)
        self.chunk_size = chunk_size
        self.stride = stride
        self.max_chunks = max_chunks
        # 4-way head: must stay in sync with CLASS_NAMES.
        self.classifier = nn.Linear(self.config.hidden_size, 4)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask=None):
        """Return (batch, 4) logits, max-pooled over encoded windows."""
        batch, seq_len = input_ids.size()
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        if seq_len > self.chunk_size:
            # Overlapping windows (stride < chunk_size); unfold adds a
            # window dimension at axis 1. Windows beyond max_chunks are
            # discarded, as is any tail shorter than a full window.
            win_ids = input_ids.unfold(1, self.chunk_size, self.stride)
            win_mask = attention_mask.unfold(1, self.chunk_size, self.stride)
            n_windows = min(win_ids.size(1), self.max_chunks)
            flat_ids = win_ids[:, :n_windows, :].contiguous().view(-1, self.chunk_size)
            flat_mask = win_mask[:, :n_windows, :].contiguous().view(-1, self.chunk_size)
        else:
            n_windows = 1
            flat_ids = input_ids
            flat_mask = attention_mask

        encoded = self.unixcoder(input_ids=flat_ids, attention_mask=flat_mask)
        # Pool each window via its [CLS] (first-token) representation.
        cls_repr = encoded.last_hidden_state[:, 0, :]
        window_logits = self.classifier(self.dropout(cls_repr))
        # MIL aggregation: elementwise max over the window dimension.
        return window_logits.view(batch, n_windows, -1).max(dim=1).values
|
|
def load_model():
    """Download the UnixCoder-MIL checkpoint and build the classifier.

    Returns:
        tuple: (model, tokenizer) — the model is moved to DEVICE and
        switched to eval mode.
    """
    from huggingface_hub import hf_hub_download

    repo = "YoungDSMLKZ/UnixCoder-MIL"
    tokenizer = AutoTokenizer.from_pretrained(repo)
    model = MilUnixCoder("microsoft/unixcoder-base")
    checkpoint = hf_hub_download(repo_id=repo, filename="model.safetensors")
    model.load_state_dict(load_file(checkpoint))
    model.to(DEVICE).eval()
    return model, tokenizer
|
|
def predict(code: str, model, tokenizer) -> dict:
    """Classify a single code string.

    Returns:
        dict: {"class": predicted label name, "confidence": softmax probability}.
    """
    encoded = tokenizer(
        code, return_tensors="pt", truncation=True, max_length=4096, padding=True
    ).to(DEVICE)
    with torch.no_grad():
        logits = model(encoded["input_ids"], encoded["attention_mask"])
    probabilities = F.softmax(logits, dim=-1)[0]
    best = int(torch.argmax(probabilities))
    return {"class": CLASS_NAMES[best], "confidence": probabilities[best].item()}
|
|
| if __name__ == "__main__": |
| print("Loading model...") |
| model, tokenizer = load_model() |
| |
| |
| test_code = """ |
| def hello_world(): |
| print("Hello, World!") |
| """ |
| |
| result = predict(test_code, model, tokenizer) |
| print(f"Predicted: {result['class']} (confidence: {result['confidence']:.2%})") |
|
|