# File size: 4,634 Bytes
# 56c7b6d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import argparse
import logging
import os
import sys
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

# Logging setup: every record is timestamped and written both to stdout
# and to the 'sbert_training.log' file.
_log_handlers = [
    logging.StreamHandler(sys.stdout),
    logging.FileHandler('sbert_training.log'),
]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=_log_handlers,
)

def create_default_training_data(output_path):
    """Create and persist a minimal fallback training set.

    Writes three hard-coded resume / job-description pairs, together with
    their match scores (85, 90, 95), to ``output_path`` as CSV and logs a
    warning that fallback data was generated.

    Args:
        output_path: Destination CSV path. The parent directory is created
            when the path has one; a bare filename such as ``'train.csv'``
            is written relative to the current working directory.

    Returns:
        The ``pandas.DataFrame`` that was written, with the columns
        ``resume``, ``job_description`` and ``match_score``.
    """
    data = {
        'resume': [
            'Experienced software engineer with Python and Java',
            'Data scientist with machine learning expertise',
            'Web developer specializing in React'
        ],
        'job_description': [
            'Seeking software engineer with Python experience',
            'Looking for data scientist with ML background',
            'Hiring frontend developer with React skills'
        ],
        'match_score': [85, 90, 95]
    }
    df = pd.DataFrame(data)
    # os.path.dirname() returns '' for a bare filename and os.makedirs('')
    # raises FileNotFoundError, so only create a directory when one is named.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    df.to_csv(output_path, index=False)
    logging.warning(f"Created minimal training data at {output_path}")
    return df

def _build_training_examples(df):
    """Convert rows of the training frame into ``InputExample`` objects.

    Rows with a missing resume, job description or score, or with a score
    that cannot be parsed as a number, are skipped with a warning. Valid
    scores are clamped to 0-100 and scaled to the 0-1 range.
    """
    examples = []
    rows = zip(df['resume'], df['job_description'], df['match_score'])
    for position, (resume, job_description, raw_score) in enumerate(rows):
        # Empty CSV cells load as NaN; str(NaN) would otherwise train the
        # model on the literal text 'nan'.
        if pd.isna(resume) or pd.isna(job_description):
            logging.warning(f"Skipping invalid row: missing text in row {position}")
            continue
        try:
            score = float(raw_score)
        except (TypeError, ValueError) as e:
            logging.warning(f"Skipping invalid row: {str(e)}")
            continue
        # NaN has to be rejected explicitly: min(100, nan) evaluates to 100,
        # which would silently label a missing score as a perfect match.
        if pd.isna(score):
            logging.warning(f"Skipping invalid row: missing match_score in row {position}")
            continue
        # Normalize score to 0-1 range
        score = max(0.0, min(100.0, score)) / 100
        examples.append(InputExample(
            texts=[str(resume), str(job_description)],
            label=score
        ))
    return examples


def train_sbert_model(train_data_path, output_dir,
                     model_name='all-MiniLM-L6-v2',
                     num_epochs=10, batch_size=32):
    """Train an SBERT model for semantic similarity.

    Loads resume / job-description pairs from a CSV file, converts them to
    training examples with scores scaled to 0-1, and fine-tunes the given
    SentenceTransformer model with a cosine-similarity loss.

    If the CSV does not exist, or cannot be read, a minimal default dataset
    is written to ``train_data_path`` and used instead. An existing file that
    cannot be read is first moved to ``<train_data_path>.bak`` so the
    fallback does not destroy it.

    Args:
        train_data_path: Path to a CSV with the columns ``resume``,
            ``job_description`` and ``match_score`` (0-100).
        output_dir: Directory passed to ``model.fit`` as ``output_path``.
        model_name: Name of the base SentenceTransformer model.
        num_epochs: Number of training epochs.
        batch_size: Batch size for the training ``DataLoader``.

    Returns:
        ``True`` if training completed, ``False`` if the data lacks the
        required columns, yields no valid examples, or ``model.fit`` raised.
    """

    # Handle missing training data
    if not os.path.exists(train_data_path):
        logging.warning(f"No training data found at {train_data_path}")
        df = create_default_training_data(train_data_path)
    else:
        try:
            df = pd.read_csv(train_data_path)
            logging.info(f"Loaded training data with {len(df)} records")
        except Exception as e:
            logging.error(f"Error loading training data: {str(e)}")
            # The fallback below rewrites train_data_path; keep a copy of the
            # unreadable file so the user's data is not lost.
            if os.path.isfile(train_data_path):
                backup_path = f"{train_data_path}.bak"
                try:
                    os.replace(train_data_path, backup_path)
                    logging.warning(f"Moved unreadable training data to {backup_path}")
                except OSError as backup_error:
                    logging.warning(f"Could not back up unreadable training data: {backup_error}")
            df = create_default_training_data(train_data_path)

    # Fail once with a clear message instead of warning on every row.
    required_columns = ('resume', 'job_description', 'match_score')
    missing_columns = [name for name in required_columns if name not in df.columns]
    if missing_columns:
        logging.error(f"Training data is missing required columns: {missing_columns}")
        return False

    # Prepare training examples
    train_examples = _build_training_examples(df)

    if not train_examples:
        logging.error("No valid training examples found!")
        return False

    # Initialize model
    model = SentenceTransformer(model_name)
    logging.info(f"Initialized model: {model_name}")

    # Data loader
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
    train_loss = losses.CosineSimilarityLoss(model)

    # Training
    logging.info(f"Starting training for {num_epochs} epochs with {len(train_examples)} examples")

    try:
        model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=num_epochs,
            show_progress_bar=True,
            output_path=output_dir
        )
        logging.info(f"Model successfully saved to {output_dir}")
        return True
    except Exception as e:
        logging.error(f"Training failed: {str(e)}")
        return False

if __name__ == '__main__':
    # Command-line options as (flag, type, default, help) rows, registered
    # in a single loop below.
    option_specs = (
        ('--train_data', str, 'data/training/train.csv',
         'Path to training data CSV'),
        ('--output_dir', str, 'trained_models/sbert',
         'Output directory for trained model'),
        ('--model_name', str, 'all-MiniLM-L6-v2',
         'Base SBERT model name'),
        ('--epochs', int, 10,
         'Number of training epochs'),
        ('--batch_size', int, 32,
         'Training batch size'),
    )

    cli = argparse.ArgumentParser(description='Train SBERT model for resume-JD matching')
    for flag, value_type, fallback, help_text in option_specs:
        cli.add_argument(flag, type=value_type, default=fallback, help=help_text)
    options = cli.parse_args()

    # Make sure the destination directory exists before training starts
    os.makedirs(options.output_dir, exist_ok=True)

    # Train, then report the outcome through the exit status
    # (0 on success, 1 on failure)
    succeeded = train_sbert_model(
        train_data_path=options.train_data,
        output_dir=options.output_dir,
        model_name=options.model_name,
        num_epochs=options.epochs,
        batch_size=options.batch_size,
    )
    if succeeded:
        sys.exit(0)
    sys.exit(1)