import torch
from transformers import AutoTokenizer, AutoModel

MODEL_NAME = "microsoft/unixcoder-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
unix_model = AutoModel.from_pretrained(MODEL_NAME)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
unix_model.to(device)
unix_model.eval()


def get_unixcoder_embedding(code, max_length=512):
    """Return a fixed-size UniXcoder embedding for *code*.

    The embedding concatenates the [CLS] token vector with the
    attention-mask-weighted mean of the last hidden states, then
    flattens the result to a 1-D numpy array.

    Args:
        code: Source-code string to embed (a list of strings also works,
            but note the final ``flatten()`` merges the batch dimension
            into one vector — presumably callers pass a single string;
            verify against call sites).
        max_length: Maximum token count before truncation (default 512).

    Returns:
        1-D ``numpy.ndarray`` of length ``2 * hidden_size`` for a single
        input string.
    """
    inputs = tokenizer(
        code,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = unix_model(**inputs)

    last_hidden = outputs.last_hidden_state  # (batch, seq_len, hidden)
    cls_embedding = last_hidden[:, 0, :]

    # Masked mean pooling: a plain .mean(dim=1) would average pad-token
    # vectors into the result whenever padding=True actually pads (i.e.
    # batched inputs of unequal length). For a single unpadded string the
    # mask is all ones, so this is numerically identical to the old code.
    mask = inputs["attention_mask"].unsqueeze(-1).to(last_hidden.dtype)
    summed = (last_hidden * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard divide-by-zero
    mean_embedding = summed / token_counts

    combined = torch.cat((cls_embedding, mean_embedding), dim=1)
    return combined.cpu().numpy().flatten()