# ==============================================================================
# PROJECT:      DEPRESSION-DETECTION-USING-TWEETS
# AUTHORS:      AMEY THAKUR & MEGA SATISH
# GITHUB (AMEY): https://github.com/Amey-Thakur
# GITHUB (MEGA): https://github.com/msatmod
# REPOSITORY:   https://github.com/Amey-Thakur/DEPRESSION-DETECTION-USING-TWEETS
# RELEASE DATE: June 5, 2022
# LICENSE:      MIT License
# DESCRIPTION:  Utility module for the model training pipeline.
# ==============================================================================

import pickle
import warnings

import numpy as np
import pandas as pd
import en_core_web_lg

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

# Suppress non-critical runtime warnings so they do not clutter training output
warnings.filterwarnings("ignore")


def load_prepare_split_df(filename: str, targets=['label'], validation_size=0.3, seed=7):
    """
    Ingests raw data, performs feature extraction via word embeddings, and
    partitions the dataset for model validation.

    Methodology:
        - TSV Ingestion: Data is loaded from the specified file.
        - Semantic Vectorization: Each text is represented by the centroid of
          spaCy's dense 300-dimensional token vectors.
        - Validation Partitioning: Random splitting of the data into training
          and testing subsets.

    Args:
        filename (str): Path to the tab-separated (TSV) dataset.
        targets (list): Column name(s) of the dependent variable.
        validation_size (float): Proportion of data reserved for testing.
        seed (int): Random seed for reproducibility.

    Returns:
        tuple: (X_train, X_test, Y_train, Y_test) feature and label sets.
    """
    print(f"Acquiring dataset from: {filename}")
    df_all = pd.read_csv(filename, sep='\t', encoding='utf-8')

    # Step 1: Initialize the linguistic engine (spaCy large English model)
    nlp_engine = en_core_web_lg.load()

    # Step 2: Compute dense word embeddings (feature extraction).
    # Each tweet is mapped to the mean of its 300-dimensional token vectors.
    print("Extracting semantic features via spaCy embeddings...")
    feature_vectors = np.array([
        np.array([token.vector for token in nlp_engine(s)]).mean(axis=0)
        for s in df_all['clean_text']
    ])

    # Step 3: Dataset splitting
    y_labels = df_all.loc[:, targets]
    x_features = feature_vectors

    x_train, x_test, y_train, y_test = train_test_split(
        x_features, y_labels, test_size=validation_size, random_state=seed
    )
    return x_train, x_test, y_train, y_test
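
# Illustrative usage sketch (not part of the original pipeline): it assumes a
# tab-separated file with 'clean_text' and 'label' columns at the path below;
# the path itself is a placeholder, not the repository's confirmed file layout.
def _demo_split(dataset_path="../assets/data/tweets.tsv"):
    """Sketch: load a dataset and inspect the resulting split shapes."""
    x_train, x_test, y_train, y_test = load_prepare_split_df(
        dataset_path, targets=['label'], validation_size=0.3, seed=7
    )
    # Each feature row is a 300-dimensional spaCy embedding centroid
    print(f"Train features: {x_train.shape}, Test features: {x_test.shape}")
    print(f"Train labels:   {y_train.shape}, Test labels:   {y_test.shape}")
    return x_train, x_test, y_train, y_test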
""" if model == "SVM": # Support Vector Machines are effective in high-dimensional semantic spaces print("Initializing SVM (Support Vector Machine) training...") clf = SVC(probability=True) clf.fit(X_train, Y_train) # Performance Evaluation (Accuracy Metric) train_accuracy = accuracy_score(clf.predict(X_train), Y_train) print(f"Training Convergence Accuracy: {train_accuracy:.4f}") # Persistence: Serializing the model artifact save_path = "../assets/models/model_svm_pc.pkl" with open(save_path, 'wb') as file: pickle.dump(clf, file) return clf elif model == "LR": # Logistic Regression serves as a robust baseline for linear classification print("Initializing Logistic Regression training...") lr_model = LogisticRegression() lr_model.fit(X_train, Y_train) save_path = "../assets/models/model_LogReg.pkl" with open(save_path, 'wb') as file: pickle.dump(lr_model, file) return lr_model elif model == "DT": # Decision Trees provide hierarchical decision boundaries print("Initializing Decision Tree training...") dt_model = DecisionTreeClassifier() dt_model.fit(X_train, Y_train) save_path = "../assets/models/model_DTC.pkl" with open(save_path, 'wb') as file: pickle.dump(dt_model, file) return dt_model elif model == "KNN": # kNN requires hyperparameter tuning (k value) via cross-validation print("Initializing kNN training with automated k-optimization...") k_values = range(1, 32, 1) k_scores = [] # 10-Fold Cross-Validation for optimal k-neighbor selection for k in k_values: knn = KNeighborsClassifier(n_neighbors=k) score = np.mean(cross_val_score(knn, X_train, Y_train, cv=10)) k_scores.append(score) optimal_k = k_values[np.argmax(k_scores)] print(f"Optimized Hyperparameter discovered: k = {optimal_k}") best_knn = KNeighborsClassifier(n_neighbors=optimal_k) best_knn.fit(X_train, Y_train) save_path = "../assets/models/model_KNN.pkl" with open(save_path, 'wb') as file: pickle.dump(best_knn, file) return best_knn elif model == "RF": # Random Forest: Ensemble bagged decision trees for variance reduction print("Initializing Random Forest training...") rf_model = RandomForestClassifier() rf_model.fit(X_train, Y_train) save_path = "../assets/models/model_RF.pkl" with open(save_path, 'wb') as file: pickle.dump(rf_model, file) return rf_model elif model == "NN": # MLP (Multi-layer Perceptron): Basic artificial neural network print("Initializing Neural Network (MLP) training...") nn_model = MLPClassifier() nn_model.fit(X_train, Y_train) save_path = "../assets/models/model_NN.pkl" with open(save_path, 'wb') as file: pickle.dump(nn_model, file) return nn_model def LSTM(filename: str): """ Executes a Deep Learning pipeline using Long Short-Term Memory (LSTM) recurrent neural networks for capturing temporal lingustical patterns. Methodology: - Tokenization: Integer encoding of sequences. - Padding: Uniform sequence length normalization. - Architecture: Embedding layer followed by LSTM with Dropouts. 
""" from keras.models import Sequential from keras.layers import Dense, Embedding, LSTM from keras.preprocessing.text import Tokenizer from keras.preprocessing.sequence import pad_sequences from keras.wrappers.scikit_learn import KerasClassifier print(f"Acquiring data for Deep Learning (LSTM): {filename}") df_dl = pd.read_csv(filename, sep='\t', encoding='utf-8') # Step 1: Sequence Tokenization and Padding vocab_size = 20000 max_len = 50 tokenizer = Tokenizer(num_words=vocab_size) tokenizer.fit_on_texts(df_dl['clean_text']) seqs = tokenizer.texts_to_sequences(df_dl['clean_text']) x_lstm = pad_sequences(seqs, maxlen=max_len) y_lstm = df_dl["label"] # Step 2: Architecture Definition print("Constructing LSTM topology...") model = Sequential() model.add(Embedding(vocab_size, 300, input_length=max_len)) model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2)) model.add(Dense(1, activation='sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) # Step 3: Model Execution and Persistance print("Commencing Deep Learning Convergence (LSTM)...") # In a professional context, create_model should be passed to KerasClassifier # Here we demonstrate the fundamental fit operation model.fit(x_lstm, y_lstm, epochs=3, verbose=1, validation_split=0.3) # Persistence: JSON topology and H5 weights model_json = model.to_json() with open("model_LSTM.json", "w") as json_file: json_file.write(model_json) model.save_weights("model_LSTM.h5") print("Deep Learning model (LSTM) artifacts successfully persisted.")