joshnavip's picture
Initial commit: AI code detection project (without binary files)
b144cb7
import torch
from transformers import AutoTokenizer, AutoModel
# Hugging Face checkpoint used for both the tokenizer and the encoder.
MODEL_NAME = "microsoft/unixcoder-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Use CUDA when torch reports it as available; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the encoder, place it on the selected device, and switch it to
# evaluation mode. ``Module.to`` returns the module itself, so the calls
# can be chained on the freshly loaded model.
unix_model = AutoModel.from_pretrained(MODEL_NAME).to(device)
unix_model.eval()
def get_unixcoder_embedding(code, max_length=512):
    """Embed source code with the module-level UniXcoder encoder.

    The input is tokenized (truncated to ``max_length`` tokens), run through
    ``unix_model`` without gradient tracking, and pooled in two ways: the
    hidden state at the first token position and the mean hidden state over
    the sequence. The two vectors are concatenated along the feature axis.

    Args:
        code: Source text to embed. It is handed to ``tokenizer`` unchanged.
            If several snippets are passed at once, the final ``flatten()``
            joins their vectors into a single one-dimensional array.
        max_length: Maximum number of tokens kept per input.

    Returns:
        A one-dimensional NumPy array holding, for each input in order, the
        first-token embedding followed by the mean-pooled embedding.
    """
    encoded = tokenizer(
        code,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    # Move every tokenizer output onto the same device as the model.
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        last_hidden = unix_model(**encoded).last_hidden_state
        cls_embedding = last_hidden[:, 0, :]

        attention_mask = encoded.get("attention_mask")
        if attention_mask is None:
            # No mask available: fall back to a plain mean over positions.
            mean_embedding = last_hidden.mean(dim=1)
        else:
            # Average only over real tokens. With ``padding=True`` shorter
            # inputs in a batch are padded, and including those positions
            # would dilute their mean. For an unpadded input the mask is all
            # ones, so this equals the plain mean.
            mask = attention_mask.unsqueeze(-1).to(last_hidden.dtype)
            # Guard against division by zero should a row have no tokens.
            token_counts = mask.sum(dim=1).clamp(min=1)
            mean_embedding = (last_hidden * mask).sum(dim=1) / token_counts

        combined = torch.cat((cls_embedding, mean_embedding), dim=1)

    return combined.cpu().numpy().flatten()