# File size: 2,401 Bytes | 88da18c
from gensim.models import Doc2Vec
import numpy as np
import logging
from utils.text_processing import tokenize_text
from sklearn.metrics.pairwise import cosine_similarity
# Configure logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Default inference parameters; used as the keyword defaults of infer_vector().
DEFAULT_EPOCHS = 50  # Changed from steps to epochs
DEFAULT_ALPHA = 0.025  # forwarded as `alpha` to model.infer_vector
DEFAULT_MIN_ALPHA = 0.001  # forwarded as `min_alpha` to model.infer_vector
def load_model(model_path):
    """Load a Doc2Vec model from ``model_path``.

    On success the model's vector size and vocabulary size are logged at
    INFO level and the model is returned.  Any exception raised while
    loading or while logging those details is logged at ERROR level and
    ``None`` is returned instead of propagating.

    Args:
        model_path: Location passed straight to ``Doc2Vec.load``.

    Returns:
        The loaded model, or ``None`` if anything went wrong.
    """
    try:
        model = Doc2Vec.load(model_path)
        logger.info(f"Loaded Doc2Vec model from {model_path}")
        logger.info(f"Vector size: {model.vector_size}")
        logger.info(f"Vocabulary size: {len(model.wv)}")
    except Exception as e:
        logger.error(f"Error loading Doc2Vec model: {e}")
        return None
    return model
def infer_vector(model, text, epochs=DEFAULT_EPOCHS, alpha=DEFAULT_ALPHA, min_alpha=DEFAULT_MIN_ALPHA):
    """Tokenize ``text`` and infer its embedding with ``model``.

    Args:
        model: Object exposing ``vector_size`` and ``infer_vector``.
        text: Raw text handed to ``tokenize_text``.
        epochs: Forwarded to ``model.infer_vector`` as ``epochs``.
        alpha: Forwarded to ``model.infer_vector`` as ``alpha``.
        min_alpha: Forwarded to ``model.infer_vector`` as ``min_alpha``.

    Returns:
        The vector produced by ``model.infer_vector``.  If tokenization
        yields nothing, a zero vector of length ``model.vector_size`` is
        returned.  If any step raises, the error is logged and a zero
        vector is returned (length ``model.vector_size``, or 100 when
        ``model`` is falsy).
    """
    try:
        words = tokenize_text(text)
        if not words:
            logger.warning("No tokens after tokenization")
            return np.zeros(model.vector_size)

        # Repeat very short token lists so that at least 5 tokens are passed on.
        if len(words) < 5:
            repeats = 5 // len(words) + 1
            words = words * repeats

        return model.infer_vector(words, epochs=epochs, alpha=alpha, min_alpha=min_alpha)
    except Exception as e:
        logger.error(f"Vector inference error: {e}")
        fallback_size = model.vector_size if model else 100
        return np.zeros(fallback_size)
def calculate_similarity(model, text1, text2):
    """Return the similarity of two texts as a score between 0 and 100.

    Both texts are embedded with ``infer_vector`` and compared by cosine
    similarity; the cosine value in [-1, 1] is mapped linearly onto
    [0, 100].

    Args:
        model: Doc2Vec model handed to ``infer_vector``.
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        float: Score in ``[0.0, 100.0]``.  ``0.0`` is returned when either
        text could not be embedded (missing or all-zero vector) or when
        any step raises; the exception is logged, not propagated.
    """
    try:
        vec1 = infer_vector(model, text1)
        vec2 = infer_vector(model, text2)

        # infer_vector reports failure with an all-zero vector, not None.
        # A zero vector has cosine similarity 0 with everything, which the
        # mapping below would turn into a misleading score of 50, so treat
        # it as "no similarity available".
        for vec in (vec1, vec2):
            if vec is None or not np.any(vec):
                return 0.0

        # cosine_similarity yields a NumPy scalar; convert it to a built-in
        # float so every return path of this function has the same type.
        similarity = float(cosine_similarity([vec1], [vec2])[0][0])

        # Clamp against floating-point drift just outside [-1, 1].
        score = max(-1.0, min(1.0, similarity))
        return max(0.0, min(100.0, (score + 1.0) * 50.0))  # Convert to 0-100 range
    except Exception as e:
        logger.error(f"Doc2Vec similarity calculation error: {e}")
        return 0.0
def get_job_embeddings(model, job_taxonomy):
    """Infer one vector per entry of ``job_taxonomy`` and return them as an array.

    Args:
        model: Doc2Vec model handed to ``infer_vector``.
        job_taxonomy: Iterable of job texts, embedded in iteration order.

    Returns:
        ``np.ndarray`` built from the list of inferred vectors.
    """
    return np.array([infer_vector(model, entry) for entry in job_taxonomy])