# -*- coding: utf-8 -*-
"""modeling.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1x78fRDZAuK5FaSTKHPGy8eSbZ_gYAFr6
"""

from google.colab import drive
drive.mount('/content/drive')

# Notebook shell commands. They are kept as comments because "!cmd" is IPython
# syntax, not Python; run them in a notebook cell (with the leading "!")
# before executing the rest of this file.
#!pip install -qqq h5py
#!pip install --upgrade -qqq gensim
#!python -m spacy download en_core_web_lg
#!pip install -U SpaCy==2.2.0

## Import required libraries

## warnings
import warnings
warnings.filterwarnings("ignore")

## for data
import numpy as np
import pandas as pd

## for plotting
import matplotlib.pyplot as plt
import seaborn as sns

## TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

## T-Sne
from yellowbrick.text import TSNEVisualizer
from sklearn import manifold

## Train-Test Split
from sklearn.model_selection import train_test_split

## Feature selection
from sklearn import feature_selection

## libraries for classification
from sklearn.pipeline import Pipeline
import sklearn.metrics as skm
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

## for saving model
import pickle

## for explainer
#from lime import lime_text

## detokenization
from nltk.tokenize.treebank import TreebankWordDetokenizer

## for word embedding with gensim
import gensim
import gensim.downloader as gensim_api
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

## for word embedding with Spacy
import spacy
import en_core_web_lg

## for deep learning
from keras.models import load_model
from keras.models import Model, Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Conv1D, Dense, Input, LSTM, Embedding, Dropout, Activation, MaxPooling1D
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K
import tensorflow as tf
import keras
from keras.layers import Lambda
import tensorflow as tf
from keras.models import model_from_json

## for bert language model
#import transformers

"""## Loading the dataset:"""

df_all = pd.read_csv("/content/drive/MyDrive/NLP/Depression_Detection/data_cleaning/processed_data/processed_data.csv", sep='\t', encoding='utf-8')
df_all

"""## Classification models as well as LSTM with a pretrained model (spaCy):

In order to run a supervised learning model, we first need to convert the clean_text into a feature representation.
"""

nlp = en_core_web_lg.load()

## word-embedding: one row per document, the mean of its token vectors.
## NumPy is used directly because the `pd.np` alias no longer exists in
## current pandas releases.
## Multiplying by np.ones(300) broadcasts the mean to a length-300 row even
## when a document has no tokens (the mean is then a scalar NaN), which keeps
## the outer array rectangular.
all_vectors = np.array([
    np.array([token.vector for token in nlp(s)]).mean(axis=0) * np.ones(300)
    for s in df_all['clean_text']
])

# split out validation dataset for the end
Y = df_all["label"]
X = all_vectors

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV

validation_size = 0.3
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=validation_size, random_state=seed)

# test options for classification
num_folds = 10
scoring = 'accuracy'

## spot check the algorithms
## (named `classifiers` so the list does not shadow the `models` module
## imported from tensorflow.keras above)
classifiers = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('SVM', SVC()),
    ## Neural Network
    ('NN', MLPClassifier()),
    ## Ensemble Models
    ('RF', RandomForestClassifier()),
]

## Running the classification models
results = []        # per-model arrays of cross-validation scores
names = []          # model labels, in the same order as the score lists
test_results = []   # accuracy on the held-out test split
train_results = []  # accuracy on the training split
for name, model in classifiers:
    # random_state only has an effect when shuffling is enabled; recent
    # scikit-learn versions raise if it is set while shuffle is False.
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)

    # Full Training period
    res = model.fit(X_train, Y_train)
    train_result = accuracy_score(Y_train, res.predict(X_train))
    train_results.append(train_result)

    # Test results (predict once, reuse for both metrics)
    test_pred = res.predict(X_test)
    test_result = accuracy_score(Y_test, test_pred)
    test_results.append(test_result)

    msg = "%s: %f (%f) %f %f" % (name, cv_results.mean(), cv_results.std(), train_result, test_result)
    print(msg)
    # scikit-learn expects (y_true, y_pred): rows are true labels, columns predictions.
    print(confusion_matrix(Y_test, test_pred))
    #print(classification_report(Y_test, test_pred))

# compare algorithms
from matplotlib import pyplot


def plot_accuracy_comparison(model_names, train_scores, test_scores):
    """Draw grouped train/test accuracy bars, one group per model.

    Args:
        model_names: labels placed under each pair of bars.
        train_scores: training accuracies, aligned with ``model_names``.
        test_scores: test accuracies, aligned with ``model_names``.

    Returns:
        The matplotlib Figure, so the caller can show it or save it later.
    """
    fig = pyplot.figure()
    ind = np.arange(len(model_names))  # the x locations for the groups
    width = 0.35  # the width of the bars
    fig.suptitle('Algorithm Comparison')
    ax = fig.add_subplot(111)
    # The plotted values are accuracy scores, so they are labelled as such.
    ax.bar(ind - width / 2, train_scores, width=width, label='Train Accuracy')
    ax.bar(ind + width / 2, test_scores, width=width, label='Test Accuracy')
    fig.set_size_inches(15, 8)
    ax.legend()
    ax.set_xticks(ind)
    ax.set_xticklabels(model_names)
    return fig


fig = plot_accuracy_comparison(names, train_results, test_results)
pyplot.show()

"""The model with the highest accuracy is **Support Vector Machine (SVM)**, with **85.79**% accuracy on the test dataset. Logistic Regression performed well too, but we see an overfitting problem with CART, NN and RF.

### LSTM model:
"""

### Create sequence
vocabulary_size = 20000
max_sequence_length = 50
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(df_all['clean_text'])
sequences = tokenizer.texts_to_sequences(df_all['clean_text'])
X_LSTM = pad_sequences(sequences, maxlen=max_sequence_length)

## Split the data into train and test
Y_LSTM = df_all["label"]
X_train_LSTM, X_test_LSTM, Y_train_LSTM, Y_test_LSTM = train_test_split(
    X_LSTM, Y_LSTM, test_size=validation_size, random_state=seed)

from keras.wrappers.scikit_learn import KerasClassifier


def create_model(input_length=50, vocabulary_size=20000, embedding_dim=300):
    """Build and compile the binary LSTM classifier.

    Args:
        input_length: length of the padded input sequences.
        vocabulary_size: number of rows in the embedding table; should match
            the tokenizer's ``num_words``.
        embedding_dim: size of each learned word embedding.

    Returns:
        A compiled Sequential model: Embedding -> LSTM(100) -> Dense(1, sigmoid),
        trained with binary cross-entropy and the Adam optimizer.
    """
    model = Sequential()
    model.add(Embedding(vocabulary_size, embedding_dim, input_length=input_length))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# input_length / vocabulary_size are forwarded to create_model so the network
# always matches the tokenizer and padding settings above.
model_LSTM = KerasClassifier(build_fn=create_model, epochs=3, verbose=1, validation_split=0.4,
                             input_length=max_sequence_length, vocabulary_size=vocabulary_size)
model_LSTM.fit(X_train_LSTM, Y_train_LSTM)

lstm_train_pred = model_LSTM.predict(X_train_LSTM)
train_result_LSTM = accuracy_score(Y_train_LSTM, lstm_train_pred)
# Test results (predict once, reuse for accuracy and confusion matrix)
lstm_test_pred = model_LSTM.predict(X_test_LSTM)
test_result_LSTM = accuracy_score(Y_test_LSTM, lstm_test_pred)
print("train result:", train_result_LSTM)
print("test result:", test_result_LSTM)
print(confusion_matrix(Y_test_LSTM, lstm_test_pred))

"""### Compare all the models:"""

train_results.append(train_result_LSTM)
test_results.append(test_result_LSTM)
names.append("LSTM")

# compare algorithms
from matplotlib import pyplot
fig = plot_accuracy_comparison(names, train_results, test_results)
pyplot.show()
# Save the comparison chart through its Figure handle. pyplot.show() has
# already run at this point, so pyplot's "current figure" may be a fresh,
# empty canvas and plt.savefig() would then write a blank image.
fig.savefig('/content/drive/MyDrive/NLP/Depression_Detection/modeling/classification_comparision.png')

"""## Evaluate the performance:

*   **Accuracy:** the fraction of predictions the model got right.
*   **Confusion Matrix:** a summary table that breaks down the number of correct and incorrect predictions by each class.
*   **ROC:** a plot that illustrates the true positive rate against the false positive rate at various threshold settings. The area under the curve (AUC) indicates the probability that the classifier will rank a randomly chosen positive observation higher than a randomly chosen negative one.
*   **Precision:** the fraction of relevant instances among the retrieved instances.
*   **Recall:** the fraction of the total amount of relevant instances that were actually retrieved.
"""


def conf_matrix_acc(y_true, y_pred, labels=None):
    """Plot the confusion matrix and print accuracy plus a per-class report.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, aligned with ``y_true``.
        labels: class labels used for the matrix rows/columns and the axis
            ticks. Defaults to the sorted union of the labels found in
            ``y_true`` and ``y_pred``, so ticks always line up with the matrix.
    """
    if labels is None:
        labels = np.union1d(y_true, y_pred)

    ## Plot confusion matrix
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap=plt.cm.Blues, cbar=False)
    ax.set(xlabel="Pred", ylabel="True", xticklabels=labels, yticklabels=labels, title="Confusion matrix")
    plt.yticks(rotation=0)

    print("=========================================")
    print(f'Accuracy score is : {accuracy_score(y_true, y_pred)}')
    print("=========================================")
    print("Detail:")
    print(skm.classification_report(y_true, y_pred))


## Plot ROC and precision-recall curve
def roc_precision_auc(y_true=None, y_score=None, labels=None, save_path=None):
    """Plot one-vs-rest ROC and precision-recall curves and print the AUC.

    Called without arguments it reads the module-level ``Y_test``, ``probs``
    and ``classes``, which keeps existing ``roc_precision_auc()`` calls working.

    Args:
        y_true: ground-truth labels (default: global ``Y_test``).
        y_score: per-class probabilities, one column per entry of ``labels``
            (default: global ``probs``).
        labels: class labels in the column order of ``y_score``
            (default: global ``classes``).
        save_path: optional file path; when given, the figure is saved before
            it is shown.
    """
    if y_true is None:
        y_true = Y_test
    if y_score is None:
        y_score = probs
    if labels is None:
        labels = classes
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)
    labels = np.asarray(labels)
    # One indicator column per class, in the same order as `labels`.
    y_onehot = (y_true[:, None] == labels[None, :]).astype(int)

    fig, ax = plt.subplots(nrows=1, ncols=2)

    ## Plot roc
    for i, label in enumerate(labels):
        fpr, tpr, _ = skm.roc_curve(y_onehot[:, i], y_score[:, i])
        ax[0].plot(fpr, tpr, lw=3,
                   label='{0} (area={1:0.2f})'.format(label, skm.auc(fpr, tpr)))
    ax[0].plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')
    ax[0].set(xlim=[-0.05, 1.0], ylim=[0.0, 1.05],
              xlabel='False Positive Rate',
              ylabel="True Positive Rate (Recall)",
              title="Receiver operating characteristic")
    ax[0].legend(loc="lower right")
    ax[0].grid(True)

    ## Plot precision-recall curve
    for i, label in enumerate(labels):
        precision, recall, _ = skm.precision_recall_curve(y_onehot[:, i], y_score[:, i])
        ax[1].plot(recall, precision, lw=3,
                   label='{0} (area={1:0.2f})'.format(label, skm.auc(recall, precision)))
    ax[1].set(xlim=[0.0, 1.05], ylim=[0.0, 1.05], xlabel='Recall',
              ylabel="Precision", title="Precision-Recall curve")
    ax[1].legend(loc="best")
    ax[1].grid(True)

    # Save before showing so the written image contains the curves, e.g.
    # save_path='/content/drive/MyDrive/NLP/Depression_Detection/modeling/ROC_Precision_SVM.png'
    if save_path is not None:
        fig.savefig(save_path)
    plt.show()

    ## AUC score (uses the second probability column, i.e. assumes binary labels)
    print(f'AUC score is : {skm.roc_auc_score(y_true, y_score[:,1])}')


"""## Support Vector Machine (SVM) with word embedding:"""

nlp = en_core_web_lg.load()

## word-embedding: one row per document, the mean of its token vectors.
## NumPy is used directly because the `pd.np` alias no longer exists in
## current pandas releases.
all_vectors = np.array([
    np.array([token.vector for token in nlp(s)]).mean(axis=0) * np.ones(300)
    for s in df_all['clean_text']
])

# split out validation dataset for the end
Y = df_all["label"]
X = all_vectors

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV

validation_size = 0.3
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=validation_size, random_state=seed)

# Create a svm Classifier (probability=True enables predict_proba for the ROC curves)
clf = SVC(probability=True)

## Running the svm Classifier
# Full Training period
res = clf.fit(X_train, Y_train)
train_result = accuracy_score(Y_train, res.predict(X_train))
test_result = accuracy_score(Y_test, res.predict(X_test))
print("train_result:", train_result, "test_result:", test_result)

## Save the model to Google Drive
SVM = "/content/drive/MyDrive/NLP/Depression_Detection/modeling/model_svm1.pkl"
with open(SVM, 'wb') as file:
    pickle.dump(clf, file)

## Load the model back from file
## NOTE: pickle.load executes code stored in the file; only load pickles that
## were written by this notebook.
with open(SVM, 'rb') as file:
    clf = pickle.load(file)
clf

## Test results ##
# Evaluate with the reloaded model so the saved artifact itself is verified.
y_pred_svm = clf.predict(X_test)
classes = np.unique(Y_test.to_list())
probs = clf.predict_proba(X_test)

conf_matrix_acc(Y_test.to_list(), y_pred_svm, labels=classes)
roc_precision_auc(Y_test, probs, classes)

"""## Exploring false positives and false negatives:"""

## Positions (within the test split) where the SVM prediction is wrong
y_test_1 = np.asarray(Y_test)
y_pred_svm_1 = np.asarray(y_pred_svm)
idx = np.flatnonzero(y_test_1 != y_pred_svm_1)
print(f"There are {len(idx)} wrong predictions")

## Y_test keeps the row index of df_all, so the misclassified texts can be
## looked up directly. (The earlier version rebuilt them from `cv` and
## `X_test_tfidf`, which are not defined anywhere in this file.)
wrong_texts = df_all.loc[Y_test.index[idx], 'clean_text'].tolist()
wrong_texts[:50]

"""There are no specific patterns distinguishing the false positive and false negative predictions."""