File size: 3,266 Bytes
c061ce5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# ==============================================================================
# PROJECT: DEPRESSION-DETECTION-USING-TWEETS
# AUTHORS: AMEY THAKUR & MEGA SATISH
# GITHUB (AMEY): https://github.com/Amey-Thakur
# GITHUB (MEGA): https://github.com/msatmod
# REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
# RELEASE DATE: June 5, 2022
# LICENSE: MIT License
# DESCRIPTION: Script for training machine learning models for tweet analysis.
# ==============================================================================

import argparse
import warnings
import train_utilities as TU

# Silence every warning for the whole run so the training output stays readable
warnings.filterwarnings(action="ignore")

def main(argv=None):
    """
    Primary execution routine for the model training utility.

    Parses the command line and dispatches to the matching routine in
    train_utilities:
        1. Classical models ('DT', 'LR', 'kNN', 'SVM', 'RF', 'NN'): the dataset
           is loaded and split with TU.load_prepare_split_df, and the training
           portion is passed to TU.classification.
        2. 'LSTM': the dataset path is handed directly to TU.LSTM.

    This function neither returns nor saves the trained model itself; any
    persistence is left to train_utilities.

    Args:
        argv: Optional list of command-line arguments, without the program
            name. When None (the default), argparse reads sys.argv[1:].

    Raises:
        SystemExit: Raised by argparse with status 2 when a positional
            argument is missing or the requested model is not supported.
    """
    # Models that share the load/split + TU.classification pipeline
    classical_models = ("DT", "LR", "kNN", "SVM", "RF", "NN")
    supported_models = classical_models + ("LSTM",)

    # Initialize the CLI argument parser
    parser = argparse.ArgumentParser(
        description="Twitter Depression Detection: Model Training Utility"
    )

    # Positional argument for the training dataset path
    parser.add_argument(
        'filename',
        help="Path to the training dataset (TSV/CSV format with 'label' and 'clean_text')"
    )

    # Positional argument for the classification model architecture.
    # Using 'choices' makes argparse reject an unknown model before any data is
    # loaded, with a usage message on stderr and a non-zero exit status, so a
    # typo can no longer look like a successful run to calling scripts.
    parser.add_argument(
        'model',
        choices=supported_models,
        help="Target model architecture for training"
    )

    args = parser.parse_args(argv)
    model_type = args.model
    dataset_path = args.filename

    if model_type == "LSTM":
        # The LSTM routine receives the dataset path and does its own loading
        print("Initializing LSTM deep learning pipeline...")
        TU.LSTM(dataset_path)
        return

    print(f"Initializing {model_type} training pipeline...")

    # Step 1: Data acquisition and splitting; the held-out portions are not
    # used by this script, so they are discarded here.
    X_train, _, Y_train, _ = TU.load_prepare_split_df(dataset_path)

    # Step 2: Training is delegated entirely to train_utilities
    TU.classification(X_train=X_train, Y_train=Y_train, model=model_type)

    print(f"Training for {model_type} successful.")

# Run the training CLI only when this file is executed as a script, not on import
if __name__ == "__main__":
    main()