| from gensim.models import Doc2Vec |
| import numpy as np |
| import logging |
| from utils.text_processing import tokenize_text |
| from sklearn.metrics.pairwise import cosine_similarity |
|
|
| logging.basicConfig(level=logging.INFO) |
| logger = logging.getLogger(__name__) |
|
|
| |
| DEFAULT_EPOCHS = 50 |
| DEFAULT_ALPHA = 0.025 |
| DEFAULT_MIN_ALPHA = 0.001 |
|
|
| def load_model(model_path): |
| try: |
| model = Doc2Vec.load(model_path) |
| logger.info(f"Loaded Doc2Vec model from {model_path}") |
| logger.info(f"Vector size: {model.vector_size}") |
| logger.info(f"Vocabulary size: {len(model.wv)}") |
| return model |
| except Exception as e: |
| logger.error(f"Error loading Doc2Vec model: {e}") |
| return None |
|
|
| def infer_vector(model, text, epochs=DEFAULT_EPOCHS, alpha=DEFAULT_ALPHA, min_alpha=DEFAULT_MIN_ALPHA): |
| """Infer vector with proper parameters""" |
| try: |
| tokens = tokenize_text(text) |
| if not tokens: |
| logger.warning("No tokens after tokenization") |
| return np.zeros(model.vector_size) |
| |
| |
| if len(tokens) < 5: |
| tokens = tokens * (5 // len(tokens) + 1) |
| |
| return model.infer_vector( |
| tokens, |
| epochs=epochs, |
| alpha=alpha, |
| min_alpha=min_alpha |
| ) |
| except Exception as e: |
| logger.error(f"Vector inference error: {e}") |
| return np.zeros(model.vector_size) if model else np.zeros(100) |
|
|
| def calculate_similarity(model, text1, text2): |
| try: |
| |
| vec1 = infer_vector(model, text1) |
| vec2 = infer_vector(model, text2) |
| |
| if vec1 is None or vec2 is None: |
| return 0.0 |
| |
| |
| similarity = cosine_similarity([vec1], [vec2])[0][0] |
| |
| |
| score = max(-1.0, min(1.0, similarity)) |
| return max(0, min(100, (score + 1) * 50)) |
| except Exception as e: |
| logger.error(f"Doc2Vec similarity calculation error: {e}") |
| return 0.0 |
|
|
| def get_job_embeddings(model, job_taxonomy): |
| embeddings = [] |
| for job in job_taxonomy: |
| embeddings.append(infer_vector(model, job)) |
| return np.array(embeddings) |