File size: 8,874 Bytes
4d1cb0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
# ==============================================================================
# PROJECT: DEPRESSION-DETECTION-USING-TWEETS
# AUTHORS: AMEY THAKUR & MEGA SATISH
# GITHUB (AMEY): https://github.com/Amey-Thakur
# GITHUB (MEGA): https://github.com/msatmod
# REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
# RELEASE DATE: June 5, 2022
# LICENSE: MIT License
# DESCRIPTION: Utility module for the model training pipeline.
# ==============================================================================

# --- Standard library ---
import pickle
import warnings
# --- Third-party: numerics / data handling ---
import numpy as np
import pandas as pd
# --- NLP: spaCy and the large English model (300-d word vectors) ---
# NOTE(review): `spacy` itself is not referenced below (the model package
# is loaded directly via `en_core_web_lg.load()`), but it is kept because
# other parts of the file may rely on it.
import spacy
import en_core_web_lg
# --- scikit-learn: model selection, metrics, and classifier families ---
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

# Suppression of non-critical runtime warnings to maintain algorithmic output integrity
warnings.filterwarnings("ignore")

def load_prepare_split_df(filename: str, targets=None, validation_size=0.3, seed=7):
    """
    Ingests raw data, performs feature extraction via word embeddings,
    and partitions the dataset for model validation.

    Methodology:
        - TSV Ingestion: Data is loaded from the specified file.
        - Semantic Vectorization: Each document is represented by the
          centroid of spaCy's dense 300-dimensional token vectors.
        - Validation Partitioning: Random splitting of data into
          training and testing subsets (not stratified; pass
          ``stratify=`` to ``train_test_split`` if class balance matters).

    Args:
        filename (str): Path to the TSV dataset (tab-separated).
        targets (list): Column name(s) of the dependent variable.
            Defaults to ['label'].
        validation_size (float): Proportion of data reserved for testing.
        seed (int): Random seed for reproducibility.

    Returns:
        tuple: (X_train, X_test, Y_train, Y_test) feature and label sets.
    """
    # Avoid the mutable-default-argument pitfall: resolve the default here.
    if targets is None:
        targets = ['label']

    print(f"Acquiring dataset from: {filename}")
    df_all = pd.read_csv(filename, sep='\t', encoding='utf-8')

    # Step 1: Initialize the Linguistic Engine (large English model, 300-d vectors)
    nlp_engine = en_core_web_lg.load()

    # Step 2: Compute Dense Word Embeddings (Feature Extraction).
    # Doc.vector is the mean of the token vectors computed natively by spaCy --
    # equivalent to the manual per-token mean (the former `* np.ones((300))`
    # was a no-op), and it yields a zero vector instead of NaN for texts
    # that tokenize to nothing.
    print("Extracting semantic features via spaCy embeddings...")
    feature_vectors = np.array([nlp_engine(s).vector for s in df_all['clean_text']])

    # Step 3: Dataset Splitting
    y_labels = df_all.loc[:, targets]

    x_train, x_test, y_train, y_test = train_test_split(
        feature_vectors, y_labels, test_size=validation_size, random_state=seed
    )

    return x_train, x_test, y_train, y_test

def _persist_model(clf, save_path):
    """Serialize a trained model artifact to disk via pickle."""
    with open(save_path, 'wb') as file:
        pickle.dump(clf, file)

def classification(X_train, Y_train, model=""):
    """
    Facilitates the training and serialization of various classification
    architectures.

    Architectures Supported:
        - SVM: Support Vector Machine (Selected as the production primary).
        - LR: Logistic Regression.
        - DT: Decision Tree Classifier.
        - KNN: k-Nearest Neighbors (with automated k-optimization).
        - RF: Random Forest Classifier.
        - NN: Multi-layer Perceptron (MLP) Neural Network.

    Args:
        X_train: Training feature set.
        Y_train: Training label set.
        model (str): Target architecture identifier (one of the codes above).

    Returns:
        object: The trained Scikit-learn model instance.

    Raises:
        ValueError: If `model` is not a recognized architecture identifier
            (previously this fell through and silently returned None).
    """
    if model == "SVM":
        # Support Vector Machines are effective in high-dimensional semantic spaces
        print("Initializing SVM (Support Vector Machine) training...")
        clf = SVC(probability=True)
        clf.fit(X_train, Y_train)

        # Performance Evaluation (Accuracy Metric).
        # accuracy_score documents (y_true, y_pred); accuracy itself is
        # symmetric, but the canonical order is used for clarity.
        train_accuracy = accuracy_score(Y_train, clf.predict(X_train))
        print(f"Training Convergence Accuracy: {train_accuracy:.4f}")

        # Persistence: Serializing the model artifact
        _persist_model(clf, "../assets/models/model_svm_pc.pkl")
        return clf

    elif model == "LR":
        # Logistic Regression serves as a robust baseline for linear classification
        print("Initializing Logistic Regression training...")
        lr_model = LogisticRegression()
        lr_model.fit(X_train, Y_train)

        _persist_model(lr_model, "../assets/models/model_LogReg.pkl")
        return lr_model

    elif model == "DT":
        # Decision Trees provide hierarchical decision boundaries
        print("Initializing Decision Tree training...")
        dt_model = DecisionTreeClassifier()
        dt_model.fit(X_train, Y_train)

        _persist_model(dt_model, "../assets/models/model_DTC.pkl")
        return dt_model

    elif model == "KNN":
        # kNN requires hyperparameter tuning (k value) via cross-validation
        print("Initializing kNN training with automated k-optimization...")
        k_values = range(1, 32)

        # 10-Fold Cross-Validation for optimal k-neighbor selection
        k_scores = [
            np.mean(cross_val_score(KNeighborsClassifier(n_neighbors=k),
                                    X_train, Y_train, cv=10))
            for k in k_values
        ]

        optimal_k = k_values[np.argmax(k_scores)]
        print(f"Optimized Hyperparameter discovered: k = {optimal_k}")

        best_knn = KNeighborsClassifier(n_neighbors=optimal_k)
        best_knn.fit(X_train, Y_train)

        _persist_model(best_knn, "../assets/models/model_KNN.pkl")
        return best_knn

    elif model == "RF":
        # Random Forest: Ensemble bagged decision trees for variance reduction
        print("Initializing Random Forest training...")
        rf_model = RandomForestClassifier()
        rf_model.fit(X_train, Y_train)

        _persist_model(rf_model, "../assets/models/model_RF.pkl")
        return rf_model

    elif model == "NN":
        # MLP (Multi-layer Perceptron): Basic artificial neural network
        print("Initializing Neural Network (MLP) training...")
        nn_model = MLPClassifier()
        nn_model.fit(X_train, Y_train)

        _persist_model(nn_model, "../assets/models/model_NN.pkl")
        return nn_model

    # Fail loudly on typos rather than silently returning None.
    raise ValueError(f"Unknown model identifier: {model!r}")

def LSTM(filename: str):
    """
    Executes a Deep Learning pipeline using Long Short-Term Memory (LSTM)
    recurrent neural networks for capturing temporal linguistic patterns.

    Methodology:
        - Tokenization: Integer encoding of sequences (top 20,000 words).
        - Padding: Uniform sequence length normalization (50 tokens).
        - Architecture: Embedding layer followed by LSTM with Dropouts
          and a sigmoid output for binary classification.

    Args:
        filename (str): Path to the TSV dataset with 'clean_text' and
            'label' columns.

    Returns:
        object: The trained Keras Sequential model (also persisted to
            "model_LSTM.json" / "model_LSTM.h5" in the working directory).
    """
    # Keras is imported lazily so the module remains usable without a
    # deep-learning backend installed.
    from keras.models import Sequential
    from keras.layers import Dense, Embedding, LSTM
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences

    print(f"Acquiring data for Deep Learning (LSTM): {filename}")
    df_dl = pd.read_csv(filename, sep='\t', encoding='utf-8')

    # Step 1: Sequence Tokenization and Padding
    vocab_size = 20000
    max_len = 50
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(df_dl['clean_text'])
    seqs = tokenizer.texts_to_sequences(df_dl['clean_text'])
    x_lstm = pad_sequences(seqs, maxlen=max_len)
    y_lstm = df_dl["label"]

    # Step 2: Architecture Definition
    print("Constructing LSTM topology...")
    model = Sequential()
    model.add(Embedding(vocab_size, 300, input_length=max_len))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Step 3: Model Execution and Persistence
    print("Commencing Deep Learning Convergence (LSTM)...")
    model.fit(x_lstm, y_lstm, epochs=3, verbose=1, validation_split=0.3)

    # Persistence: JSON topology and H5 weights
    model_json = model.to_json()
    with open("model_LSTM.json", "w") as json_file:
        json_file.write(model_json)
    model.save_weights("model_LSTM.h5")
    print("Deep Learning model (LSTM) artifacts successfully persisted.")

    return model