# ==============================================================================
# PROJECT: DEPRESSION-DETECTION-USING-TWEETS
# AUTHORS: AMEY THAKUR & MEGA SATISH
# GITHUB (AMEY): https://github.com/Amey-Thakur
# GITHUB (MEGA): https://github.com/msatmod
# REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
# RELEASE DATE: June 5, 2022
# LICENSE: MIT License
# DESCRIPTION: Utility module for the model training pipeline.
# ==============================================================================
import pickle
import warnings
import numpy as np
import pandas as pd
import spacy
import en_core_web_lg
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
# Suppression of non-critical runtime warnings to maintain algorithmic output integrity.
# NOTE(review): this silences ALL warnings process-wide (sklearn convergence,
# deprecation, numpy runtime warnings alike) — consider scoping suppression with
# warnings.catch_warnings() around the noisy calls if diagnostics are ever needed.
warnings.filterwarnings("ignore")
def load_prepare_split_df(filename: str, targets=None, validation_size=0.3, seed=7):
    """
    Ingest raw data, extract features via word embeddings, and partition
    the dataset for model validation.

    Methodology:
        - TSV Ingestion: Data is loaded from the specified file.
        - Semantic Vectorization: Utilizing spaCy's dense 300-dimensional
          word embeddings (centroid of token vectors per document).
        - Validation Partitioning: Random splitting of data into
          training and testing subsets.

    Args:
        filename (str): Path to the TSV/CSV dataset. Must contain a
            'clean_text' column and the target column(s).
        targets (list | None): Column name(s) for the dependent variable.
            Defaults to ['label'] when None. (A None sentinel is used to
            avoid the shared-mutable-default-argument pitfall.)
        validation_size (float): Proportion of data reserved for testing.
        seed (int): Random seed for reproducibility.

    Returns:
        tuple: (X_train, X_test, Y_train, Y_test) feature and label sets.
    """
    if targets is None:
        targets = ['label']
    # Bug fix: the f-string previously contained the literal "(unknown)"
    # instead of interpolating the actual path.
    print(f"Acquiring dataset from: {filename}")
    df_all = pd.read_csv(filename, sep='\t', encoding='utf-8')
    # Step 1: Initialize the Linguistic Engine
    nlp_engine = en_core_web_lg.load()
    # Step 2: Compute Dense Word Embeddings (Feature Extraction)
    # Each document is represented by the mean of its token vectors.
    # (The original multiplied by np.ones((300)) — a value-level no-op — removed.)
    print("Extracting semantic features via spaCy embeddings...")
    feature_vectors = np.array([
        np.array([token.vector for token in nlp_engine(s)]).mean(axis=0)
        for s in df_all['clean_text']
    ])
    # Step 3: Dataset Splitting
    y_labels = df_all.loc[:, targets]
    x_features = feature_vectors
    x_train, x_test, y_train, y_test = train_test_split(
        x_features, y_labels, test_size=validation_size, random_state=seed
    )
    return x_train, x_test, y_train, y_test
def classification(X_train, Y_train, model=""):
    """
    Train and serialize one of several classification architectures.

    Architectures Supported:
        - SVM: Support Vector Machine (Selected as the production primary).
        - LR: Logistic Regression.
        - DT: Decision Tree Classifier.
        - KNN: k-Nearest Neighbors (with automated k-optimization).
        - RF: Random Forest Classifier.
        - NN: Multi-layer Perceptron (MLP) Neural Network.

    Args:
        X_train: Training feature set.
        Y_train: Training label set.
        model (str): Target architecture identifier (one of the keys above).

    Returns:
        object: The trained Scikit-learn model instance, or None when the
        identifier does not match any supported architecture (preserves the
        original implicit-None behavior for unknown inputs).
    """
    def _persist(clf, save_path):
        # Serialize the fitted estimator artifact to disk with pickle.
        with open(save_path, 'wb') as file:
            pickle.dump(clf, file)

    if model == "SVM":
        # Support Vector Machines are effective in high-dimensional semantic spaces
        print("Initializing SVM (Support Vector Machine) training...")
        clf = SVC(probability=True)
        clf.fit(X_train, Y_train)
        # Performance Evaluation (Accuracy Metric) — accuracy is symmetric in
        # its arguments, so the (pred, true) ordering yields the same score.
        train_accuracy = accuracy_score(clf.predict(X_train), Y_train)
        print(f"Training Convergence Accuracy: {train_accuracy:.4f}")
        _persist(clf, "../assets/models/model_svm_pc.pkl")
        return clf

    if model == "KNN":
        # kNN requires hyperparameter tuning (k value) via cross-validation
        print("Initializing kNN training with automated k-optimization...")
        k_values = range(1, 32, 1)
        # 10-Fold Cross-Validation for optimal k-neighbor selection
        k_scores = [
            np.mean(cross_val_score(KNeighborsClassifier(n_neighbors=k),
                                    X_train, Y_train, cv=10))
            for k in k_values
        ]
        optimal_k = k_values[np.argmax(k_scores)]
        print(f"Optimized Hyperparameter discovered: k = {optimal_k}")
        best_knn = KNeighborsClassifier(n_neighbors=optimal_k)
        best_knn.fit(X_train, Y_train)
        _persist(best_knn, "../assets/models/model_KNN.pkl")
        return best_knn

    # The remaining architectures share an identical train-and-persist flow;
    # a dispatch table removes the duplicated branches of the original.
    uniform_models = {
        "LR": (LogisticRegression, "../assets/models/model_LogReg.pkl",
               "Logistic Regression"),
        "DT": (DecisionTreeClassifier, "../assets/models/model_DTC.pkl",
               "Decision Tree"),
        "RF": (RandomForestClassifier, "../assets/models/model_RF.pkl",
               "Random Forest"),
        "NN": (MLPClassifier, "../assets/models/model_NN.pkl",
               "Neural Network (MLP)"),
    }
    if model in uniform_models:
        estimator_cls, save_path, label = uniform_models[model]
        print(f"Initializing {label} training...")
        clf = estimator_cls()
        clf.fit(X_train, Y_train)
        _persist(clf, save_path)
        return clf

    # Unknown identifier: fall through and return None (original behavior).
    return None
def LSTM(filename: str):
    """
    Execute a Deep Learning pipeline using a Long Short-Term Memory (LSTM)
    recurrent neural network for capturing sequential linguistic patterns.

    Methodology:
        - Tokenization: Integer encoding of sequences (top 20,000 words).
        - Padding: Uniform sequence length normalization (50 tokens).
        - Architecture: Embedding layer followed by LSTM with dropouts and
          a sigmoid output for binary classification.

    Args:
        filename (str): Path to the TSV dataset containing 'clean_text'
            and 'label' columns.

    Returns:
        The trained Keras Sequential model (also persisted to disk as
        JSON topology + H5 weights).
    """
    # Keras is imported lazily so the module stays usable without a deep
    # learning backend installed. (Removed the unused KerasClassifier import.)
    from keras.models import Sequential
    from keras.layers import Dense, Embedding, LSTM
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    # Bug fix: the f-string previously contained the literal "(unknown)"
    # instead of interpolating the actual path.
    print(f"Acquiring data for Deep Learning (LSTM): {filename}")
    df_dl = pd.read_csv(filename, sep='\t', encoding='utf-8')
    # Step 1: Sequence Tokenization and Padding
    vocab_size = 20000
    max_len = 50
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(df_dl['clean_text'])
    seqs = tokenizer.texts_to_sequences(df_dl['clean_text'])
    x_lstm = pad_sequences(seqs, maxlen=max_len)
    y_lstm = df_dl["label"]
    # Step 2: Architecture Definition
    print("Constructing LSTM topology...")
    model = Sequential()
    model.add(Embedding(vocab_size, 300, input_length=max_len))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # Step 3: Model Execution and Persistence
    print("Commencing Deep Learning Convergence (LSTM)...")
    model.fit(x_lstm, y_lstm, epochs=3, verbose=1, validation_split=0.3)
    # Persistence: JSON topology and H5 weights
    model_json = model.to_json()
    with open("model_LSTM.json", "w") as json_file:
        json_file.write(model_json)
    model.save_weights("model_LSTM.h5")
    print("Deep Learning model (LSTM) artifacts successfully persisted.")
    # Return the trained model so callers can evaluate/predict without
    # re-loading the artifacts (backward-compatible: original returned None).
    return model