# ==============================================================================
# PROJECT: DEPRESSION-DETECTION-USING-TWEETS
# AUTHORS: AMEY THAKUR & MEGA SATISH
# GITHUB (AMEY): https://github.com/Amey-Thakur
# GITHUB (MEGA): https://github.com/msatmod
# REPOSITORY: https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
# RELEASE DATE: June 5, 2022
# LICENSE: MIT License
# DESCRIPTION: Utility module for the model training pipeline.
# ==============================================================================
import pickle
import warnings
import numpy as np
import pandas as pd
import spacy
import en_core_web_lg
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
# Suppress non-critical runtime warnings so they do not clutter the training output
warnings.filterwarnings("ignore")
def load_prepare_split_df(filename: str, targets=['label'], validation_size=0.3, seed=7):
"""
Ingests raw data, performs feature extraction via word embeddings,
and partitions the dataset for model validation.
Methodology:
- TSV Ingestion: Data is loaded from the specified file.
- Semantic Vectorization: Utilizing spaCy's dense 300-dimensional
word embeddings (centroid of token vectors).
    - Validation Partitioning: Random splitting of the data into
      training and testing subsets.
    Args:
        filename (str): Path to the tab-separated (TSV) dataset.
        targets (list): Column name(s) of the dependent variable.
validation_size (float): Proportion of data reserved for testing.
seed (int): Random seed for reproducibility.
Returns:
tuple: (X_train, X_test, Y_train, Y_test) feature and label sets.
"""
print(f"Acquiring dataset from: {filename}")
df_all = pd.read_csv(filename, sep='\t', encoding='utf-8')
# Step 1: Initialize the Linguistic Engine
nlp_engine = en_core_web_lg.load()
# Step 2: Compute Dense Word Embeddings (Feature Extraction)
print("Extracting semantic features via spaCy embeddings...")
    # Each tweet is represented by the centroid (mean) of its token vectors;
    # for an empty document the multiplication by np.ones((300)) broadcasts the
    # resulting NaN mean back to a fixed 300-dimensional vector, keeping shapes consistent.
    feature_vectors = np.array([
        np.array([token.vector for token in nlp_engine(s)]).mean(axis=0) * np.ones((300))
        for s in df_all['clean_text']
    ])
# Step 3: Dataset Splitting
y_labels = df_all.loc[:, targets]
x_features = feature_vectors
x_train, x_test, y_train, y_test = train_test_split(
x_features, y_labels, test_size=validation_size, random_state=seed
)
return x_train, x_test, y_train, y_test
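# Example usage (illustrative sketch, not part of the pipeline): the dataset
# path below is an assumption; the file must provide the 'clean_text' and
# 'label' columns expected by load_prepare_split_df().
#
#     X_train, X_test, Y_train, Y_test = load_prepare_split_df(
#         "../assets/data/clean_tweets.tsv",  # hypothetical path
#         targets=['label'],
#         validation_size=0.3,
#         seed=7,
#     )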
def classification(X_train, Y_train, model=""):
"""
Facilitates the training and serialization of various classification
architectures.
Architectures Supported:
    - SVM: Support Vector Machine (used as the primary production model).
- LR: Logistic Regression.
- DT: Decision Tree Classifier.
- KNN: k-Nearest Neighbors (with automated k-optimization).
- RF: Random Forest Classifier.
- NN: Multi-layer Perceptron (MLP) Neural Network.
Args:
X_train: Training feature set.
Y_train: Training label set.
model (str): Target architecture identifier.
Returns:
object: The trained Scikit-learn model instance.
"""
if model == "SVM":
# Support Vector Machines are effective in high-dimensional semantic spaces
print("Initializing SVM (Support Vector Machine) training...")
clf = SVC(probability=True)
clf.fit(X_train, Y_train)
        # Performance evaluation on the training set (accuracy metric)
        train_accuracy = accuracy_score(Y_train, clf.predict(X_train))
        print(f"Training accuracy: {train_accuracy:.4f}")
# Persistence: Serializing the model artifact
save_path = "../assets/models/model_svm_pc.pkl"
with open(save_path, 'wb') as file:
pickle.dump(clf, file)
return clf
elif model == "LR":
# Logistic Regression serves as a robust baseline for linear classification
print("Initializing Logistic Regression training...")
lr_model = LogisticRegression()
lr_model.fit(X_train, Y_train)
save_path = "../assets/models/model_LogReg.pkl"
with open(save_path, 'wb') as file:
pickle.dump(lr_model, file)
return lr_model
elif model == "DT":
# Decision Trees provide hierarchical decision boundaries
print("Initializing Decision Tree training...")
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, Y_train)
save_path = "../assets/models/model_DTC.pkl"
with open(save_path, 'wb') as file:
pickle.dump(dt_model, file)
return dt_model
elif model == "KNN":
# kNN requires hyperparameter tuning (k value) via cross-validation
print("Initializing kNN training with automated k-optimization...")
k_values = range(1, 32, 1)
k_scores = []
# 10-Fold Cross-Validation for optimal k-neighbor selection
for k in k_values:
knn = KNeighborsClassifier(n_neighbors=k)
score = np.mean(cross_val_score(knn, X_train, Y_train, cv=10))
k_scores.append(score)
optimal_k = k_values[np.argmax(k_scores)]
print(f"Optimized Hyperparameter discovered: k = {optimal_k}")
best_knn = KNeighborsClassifier(n_neighbors=optimal_k)
best_knn.fit(X_train, Y_train)
save_path = "../assets/models/model_KNN.pkl"
with open(save_path, 'wb') as file:
pickle.dump(best_knn, file)
return best_knn
elif model == "RF":
# Random Forest: Ensemble bagged decision trees for variance reduction
print("Initializing Random Forest training...")
rf_model = RandomForestClassifier()
rf_model.fit(X_train, Y_train)
save_path = "../assets/models/model_RF.pkl"
with open(save_path, 'wb') as file:
pickle.dump(rf_model, file)
return rf_model
elif model == "NN":
# MLP (Multi-layer Perceptron): Basic artificial neural network
print("Initializing Neural Network (MLP) training...")
nn_model = MLPClassifier()
nn_model.fit(X_train, Y_train)
save_path = "../assets/models/model_NN.pkl"
with open(save_path, 'wb') as file:
pickle.dump(nn_model, file)
return nn_model
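# Example usage (illustrative sketch): training each supported architecture on
# the split produced by load_prepare_split_df(); the variable names assume the
# hypothetical call shown after that function. Each branch serializes its model
# under ../assets/models/, so that directory must exist beforehand.
#
#     svm_clf = classification(X_train, Y_train, model="SVM")
#     lr_clf = classification(X_train, Y_train, model="LR")
#     knn_clf = classification(X_train, Y_train, model="KNN")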
def LSTM(filename: str):
"""
    Executes a deep learning pipeline using a Long Short-Term Memory (LSTM)
    recurrent neural network to capture sequential linguistic patterns.
Methodology:
- Tokenization: Integer encoding of sequences.
- Padding: Uniform sequence length normalization.
- Architecture: Embedding layer followed by LSTM with Dropouts.
"""
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasClassifier
print(f"Acquiring data for Deep Learning (LSTM): {filename}")
df_dl = pd.read_csv(filename, sep='\t', encoding='utf-8')
# Step 1: Sequence Tokenization and Padding
vocab_size = 20000
max_len = 50
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(df_dl['clean_text'])
seqs = tokenizer.texts_to_sequences(df_dl['clean_text'])
x_lstm = pad_sequences(seqs, maxlen=max_len)
y_lstm = df_dl["label"]
# Step 2: Architecture Definition
print("Constructing LSTM topology...")
model = Sequential()
model.add(Embedding(vocab_size, 300, input_length=max_len))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # Step 3: Model Execution and Persistence
print("Commencing Deep Learning Convergence (LSTM)...")
    # In a production setting the model-building step could be wrapped in
    # KerasClassifier for scikit-learn compatibility; here the fundamental
    # fit operation is demonstrated directly.
model.fit(x_lstm, y_lstm, epochs=3, verbose=1, validation_split=0.3)
# Persistence: JSON topology and H5 weights
model_json = model.to_json()
with open("model_LSTM.json", "w") as json_file:
json_file.write(model_json)
model.save_weights("model_LSTM.h5")
print("Deep Learning model (LSTM) artifacts successfully persisted.")