Spaces:
Paused
Paused
| from gensim.models import Doc2Vec | |
| import numpy as np | |
| import logging | |
| from utils.text_processing import tokenize_text | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
| # Default inference parameters | |
| DEFAULT_EPOCHS = 50 # Changed from steps to epochs | |
| DEFAULT_ALPHA = 0.025 | |
| DEFAULT_MIN_ALPHA = 0.001 | |
| def load_model(model_path): | |
| try: | |
| model = Doc2Vec.load(model_path) | |
| logger.info(f"Loaded Doc2Vec model from {model_path}") | |
| logger.info(f"Vector size: {model.vector_size}") | |
| logger.info(f"Vocabulary size: {len(model.wv)}") | |
| return model | |
| except Exception as e: | |
| logger.error(f"Error loading Doc2Vec model: {e}") | |
| return None | |
| def infer_vector(model, text, epochs=DEFAULT_EPOCHS, alpha=DEFAULT_ALPHA, min_alpha=DEFAULT_MIN_ALPHA): | |
| """Infer vector with proper parameters""" | |
| try: | |
| tokens = tokenize_text(text) | |
| if not tokens: | |
| logger.warning("No tokens after tokenization") | |
| return np.zeros(model.vector_size) | |
| # Ensure we have at least 5 tokens | |
| if len(tokens) < 5: | |
| tokens = tokens * (5 // len(tokens) + 1) | |
| return model.infer_vector( | |
| tokens, | |
| epochs=epochs, # Only accepts 'epochs' | |
| alpha=alpha, | |
| min_alpha=min_alpha | |
| ) | |
| except Exception as e: | |
| logger.error(f"Vector inference error: {e}") | |
| return np.zeros(model.vector_size) if model else np.zeros(100) | |
| def calculate_similarity(model, text1, text2): | |
| try: | |
| # Infer vectors with proper parameters | |
| vec1 = infer_vector(model, text1) | |
| vec2 = infer_vector(model, text2) | |
| if vec1 is None or vec2 is None: | |
| return 0.0 | |
| # Calculate cosine similarity | |
| similarity = cosine_similarity([vec1], [vec2])[0][0] | |
| # Ensure valid similarity score | |
| score = max(-1.0, min(1.0, similarity)) | |
| return max(0, min(100, (score + 1) * 50)) # Convert to 0-100 range | |
| except Exception as e: | |
| logger.error(f"Doc2Vec similarity calculation error: {e}") | |
| return 0.0 | |
| def get_job_embeddings(model, job_taxonomy): | |
| embeddings = [] | |
| for job in job_taxonomy: | |
| embeddings.append(infer_vector(model, job)) | |
| return np.array(embeddings) |