import argparse import logging import os import sys from gensim.models import Doc2Vec from gensim.models.doc2vec import TaggedDocument import numpy as np # Configure logging logging.basicConfig( format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO, handlers=[ logging.StreamHandler(sys.stdout), logging.FileHandler('doc2vec_training.log') ] ) def train_doc2vec_model(documents_path, output_path, vector_size=200, window=5, min_count=5, epochs=20): """Train a Doc2Vec model for document embeddings""" # Read documents try: with open(documents_path, 'r', encoding='utf-8') as f: documents = [line.strip() for line in f if line.strip()] except Exception as e: logging.error(f"Error reading documents: {str(e)}") return False # Prepare tagged documents tagged_data = [] for i, doc in enumerate(documents): try: tokens = tokenize_text(doc) tagged_data.append(TaggedDocument(words=tokens, tags=[str(i)])) except Exception as e: logging.warning(f"Skipping document {i}: {str(e)}") if not tagged_data: logging.error("No valid documents for training") return False # Train model try: model = Doc2Vec( documents=tagged_data, vector_size=vector_size, window=window, min_count=min_count, workers=os.cpu_count(), epochs=epochs, dm=1, # Use PV-DM (Distributed Memory) mode dbow_words=0 # Skip training word vectors in DBOW mode ) # Save model model.save(output_path) logging.info(f"Doc2Vec model saved to {output_path}") # Test the model test_phrases = [ "software engineer python java", "data scientist machine learning", "cloud engineer aws docker" ] for phrase in test_phrases: tokens = tokenize_text(phrase) vec = model.infer_vector(tokens) similar = model.dv.most_similar([vec], topn=3) logging.info(f"Similar to '{phrase}': {similar}") return True except Exception as e: logging.error(f"Training failed: {str(e)}") return False