| | |
| | """modeling.ipynb |
| | |
| | Automatically generated by Colaboratory. |
| | |
| | Original file is located at |
| | https://colab.research.google.com/drive/1x78fRDZAuK5FaSTKHPGy8eSbZ_gYAFr6 |
| | """ |
| |
|
# Mount Google Drive so the dataset and model artifacts under
# /content/drive are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
| |
|
| | |
| |
|
| | |
| |
|
# Colab shell magics: fetch the large English spaCy model, then pin spaCy.
# NOTE(review): the model is downloaded *before* spaCy is re-pinned to
# 2.2.0 — if the pinned version is incompatible with the downloaded model,
# swap the order of these two cells and restart the runtime.
!python -m spacy download en_core_web_lg

!pip install -U SpaCy==2.2.0
| |
|
| | |
| |
|
| | |
| | import warnings |
| | warnings.filterwarnings("ignore") |
| |
|
| | |
| | import numpy as np |
| | import pandas as pd |
| |
|
| | |
| | import matplotlib.pyplot as plt |
| | import seaborn as sns |
| |
|
| | |
| | from sklearn.feature_extraction.text import TfidfVectorizer |
| |
|
| | |
| | from yellowbrick.text import TSNEVisualizer |
| | from sklearn import manifold |
| |
|
| | |
| | from sklearn.model_selection import train_test_split |
| |
|
| | |
| | from sklearn import feature_selection |
| |
|
| | |
| | from sklearn.pipeline import Pipeline |
| | import sklearn.metrics as skm |
| | from sklearn.metrics import confusion_matrix, accuracy_score |
| | from sklearn.linear_model import LogisticRegression |
| | from sklearn.neighbors import KNeighborsClassifier |
| | from sklearn.svm import SVC |
| | from sklearn.tree import DecisionTreeClassifier |
| | from sklearn.neural_network import MLPClassifier |
| | from sklearn.ensemble import RandomForestClassifier |
| |
|
| | |
| | import pickle |
| |
|
| | |
| | |
| |
|
| | |
| | from nltk.tokenize.treebank import TreebankWordDetokenizer |
| |
|
| | |
| | import gensim |
| | import gensim.downloader as gensim_api |
| | from gensim.models import Word2Vec |
| | from gensim.models import KeyedVectors |
| | from keras.preprocessing.text import Tokenizer |
| | from keras.preprocessing.sequence import pad_sequences |
| |
|
| | |
| | import spacy |
| | import en_core_web_lg |
| |
|
| | |
| | from keras.models import load_model |
| | from keras.models import Model, Sequential |
| | from keras.callbacks import EarlyStopping, ModelCheckpoint |
| | from keras.layers import Conv1D, Dense, Input, LSTM, Embedding, Dropout, Activation, MaxPooling1D |
| | from tensorflow.keras import models, layers, preprocessing as kprocessing |
| | from tensorflow.keras import backend as K |
| | import tensorflow as tf |
| | import keras |
| | from keras.layers import Lambda |
| | import tensorflow as tf |
| | from keras.models import model_from_json |
| |
|
| | |
| | |
| |
|
"""## Loading the dataset:"""

# Read the pre-processed, tab-separated corpus produced by the cleaning stage.
DATA_PATH = ("/content/drive/MyDrive/NLP/Depression_Detection/"
             "data_cleaning/processed_data/processed_data.csv")
df_all = pd.read_csv(DATA_PATH, sep='\t', encoding='utf-8')

# Bare expression: Colab renders the DataFrame for inspection.
df_all
| |
|
"""## Classification models as well as LSTM with pretrained model(Spacy):

In order to run a supervised learning model, we first need to convert the clean_text into feature representation.
"""

nlp = en_core_web_lg.load()

# Document embedding: the mean of the 300-d spaCy token vectors of each text.
# FIX: the `pd.np` alias was deprecated and removed in pandas 2.0 — use
# numpy directly. The `* np.ones((300))` broadcast is kept for parity with
# the original, although mean(axis=0) already yields a 300-d vector.
all_vectors = np.array([np.array([token.vector for token in nlp(s)]).mean(axis=0) * np.ones((300))
                        for s in df_all['clean_text']])

# Features / labels for the classical classifiers.
Y = df_all["label"]
X = all_vectors
| |
|
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV

# Reproducible 70/30 train/test split over the document embeddings.
validation_size = 0.3
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

# Shared cross-validation settings for the model comparison below.
num_folds = 10
seed = 7
scoring = 'accuracy'
| |
|
| | |
# Candidate classifiers, all evaluated under identical CV settings.
models = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('SVM', SVC()),
    ('NN', MLPClassifier()),
    ('RF', RandomForestClassifier()),
]
| |
|
| | |
# Cross-validate every candidate model, then refit on the full training
# split and record train/test accuracy so over-fitting is visible.
results = []
names = []
kfold_results = []
test_results = []
train_results = []
for name, model in models:
    # FIX: KFold with a random_state requires shuffle=True — modern
    # scikit-learn raises a ValueError otherwise.
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)

    res = model.fit(X_train, Y_train)
    # FIX: y_true goes first (sklearn convention); accuracy is symmetric,
    # but the confusion matrix below was transposed with the old order.
    train_result = accuracy_score(Y_train, res.predict(X_train))
    train_results.append(train_result)

    test_result = accuracy_score(Y_test, res.predict(X_test))
    test_results.append(test_result)

    msg = "%s: %f (%f) %f %f" % (name, cv_results.mean(), cv_results.std(), train_result, test_result)
    print(msg)
    print(confusion_matrix(Y_test, res.predict(X_test)))
| | |
| |
|
| | |
# Bar chart comparing train vs test accuracy per model.
from matplotlib import pyplot
fig = pyplot.figure()
ind = np.arange(len(names))
width = 0.35
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
# FIX: the plotted values are accuracy_score results, not errors —
# label the bars accordingly.
pyplot.bar(ind - width/2, train_results, width=width, label='Train Accuracy')
pyplot.bar(ind + width/2, test_results, width=width, label='Test Accuracy')
fig.set_size_inches(15, 8)
pyplot.legend()
ax.set_xticks(ind)
ax.set_xticklabels(names)
pyplot.show()
| |
|
"""The best model with the highest accuracy is **Support Vector Machine (SVM)**, with **85.79%** accuracy on the test dataset. Logistic Regression performed well too, but we see an overfitting problem with CART, NN and RF.
| | |
| | ### LSTM model: |
| | """ |
| |
|
| | |
# --- Text -> padded integer sequences for the LSTM ---
vocabulary_size = 20000

tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(df_all['clean_text'])

sequences = tokenizer.texts_to_sequences(df_all['clean_text'])
X_LSTM = pad_sequences(sequences, maxlen=50)

# Same 70/30 split (and seed) as the classical models, for comparability.
Y_LSTM = df_all["label"]
X_train_LSTM, X_test_LSTM, Y_train_LSTM, Y_test_LSTM = train_test_split(
    X_LSTM, Y_LSTM, test_size=validation_size, random_state=seed)
| |
|
from keras.wrappers.scikit_learn import KerasClassifier

def create_model(input_length=50, vocab_size=20000, embedding_dim=300):
    """Build a single-layer LSTM binary classifier.

    FIX: the original accepted `input_length` but ignored it, hard-coding
    50/20000/300 inside the body. The constants are now real,
    backward-compatible parameters whose defaults reproduce the original
    architecture exactly.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=input_length))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# NOTE(review): keras.wrappers.scikit_learn is deprecated in recent Keras;
# consider scikeras.wrappers.KerasClassifier when upgrading.
model_LSTM = KerasClassifier(build_fn=create_model, epochs=3, verbose=1, validation_split=0.4)
model_LSTM.fit(X_train_LSTM, Y_train_LSTM)
| |
|
# FIX: pass y_true first so the confusion matrix rows are true labels
# (accuracy itself is symmetric, so the scores are unchanged).
train_result_LSTM = accuracy_score(Y_train_LSTM, model_LSTM.predict(X_train_LSTM))
test_result_LSTM = accuracy_score(Y_test_LSTM, model_LSTM.predict(X_test_LSTM))

print("train result:", train_result_LSTM)
print("test result:", test_result_LSTM)

# Bare expression: Colab displays the matrix.
confusion_matrix(Y_test_LSTM, model_LSTM.predict(X_test_LSTM))
| |
|
"""### Compare all the models:"""

# Fold the LSTM scores into the comparison alongside the classical models.
train_results.append(train_result_LSTM)
test_results.append(test_result_LSTM)
names.append("LSTM")
| |
|
| | |
# Final comparison chart including the LSTM, persisted to Drive.
from matplotlib import pyplot
fig = pyplot.figure()
ind = np.arange(len(names))
width = 0.35
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
# FIX: these are accuracies, not errors — label the bars truthfully.
pyplot.bar(ind - width/2, train_results, width=width, label='Train Accuracy')
pyplot.bar(ind + width/2, test_results, width=width, label='Test Accuracy')
fig.set_size_inches(15, 8)
pyplot.legend()
ax.set_xticks(ind)
ax.set_xticklabels(names)
# FIX: save *before* show() — pyplot.show() clears the current figure,
# so the original call saved a blank image.
plt.savefig('/content/drive/MyDrive/NLP/Depression_Detection/modeling/classification_comparision.png')
pyplot.show()
| |
|
| | """## Evaluate the performance: |
| | |
| | * **Accuracy:** the fraction of predictions the model got right. |
| | * **Confusion Matrix:** a summary table that breaks down the number of correct and incorrect predictions by each class. |
| | * **ROC:** a plot that illustrates the true positive rate against the false positive rate at various threshold settings. The area under the curve (AUC) indicates the probability that the classifier will rank a randomly chosen positive observation higher than a randomly chosen negative one. |
| | * **Precision:** the fraction of relevant instances among the retrieved instances. |
| | * **Recall:** the fraction of the total amount of relevant instances that were actually retrieved. |
| | """ |
| |
|
def conf_matrix_acc(y_true, y_pred, classes=None):
    """Plot a confusion-matrix heatmap and print accuracy plus a per-class report.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted labels.
    classes : sequence, optional
        Class names for the axis ticks. FIX: made an explicit
        (backward-compatible) parameter instead of silently reading the
        module-level `classes` global; defaults to the sorted unique
        labels of `y_true`, which matches how the global is built.
    """
    if classes is None:
        classes = np.unique(y_true)
    cm = confusion_matrix(y_true, y_pred)
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap=plt.cm.Blues,
                cbar=False)
    ax.set(xlabel="Pred", ylabel="True", xticklabels=classes,
           yticklabels=classes, title="Confusion matrix")
    plt.yticks(rotation=0)
    print("=========================================")
    print(f'Accuracy score is : {accuracy_score(y_true, y_pred)}')
    print("=========================================")
    print("Detail:")
    print(skm.classification_report(y_true, y_pred))
| |
|
| | |
def roc_precision_auc(class_names=None, y_true_onehot=None, y_probs=None, y_true=None):
    """Plot one-vs-rest ROC and precision-recall curves and print ROC-AUC.

    FIX: the globals the original read implicitly (`classes`,
    `y_test_array`, `probs`, `Y_test`) can now be passed explicitly; the
    existing no-argument call in this notebook keeps working because each
    argument falls back to the corresponding global.

    Parameters
    ----------
    class_names : sequence of class labels (one curve per class).
    y_true_onehot : ndarray, one-hot encoded true labels, shape (n, n_classes).
    y_probs : ndarray, predicted class probabilities, shape (n, n_classes).
    y_true : array-like of true labels, used for the binary AUC printout.
    """
    class_names = classes if class_names is None else class_names
    y_true_onehot = y_test_array if y_true_onehot is None else y_true_onehot
    y_probs = probs if y_probs is None else y_probs
    y_true = Y_test if y_true is None else y_true

    fig, ax = plt.subplots(nrows=1, ncols=2)

    # ROC: one curve per class (one-vs-rest).
    for i in range(len(class_names)):
        fpr, tpr, _ = skm.roc_curve(y_true_onehot[:, i], y_probs[:, i])
        ax[0].plot(fpr, tpr, lw=3,
                   label='{0} (area={1:0.2f})'.format(class_names[i],
                                                      skm.auc(fpr, tpr))
                   )
    ax[0].plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')
    ax[0].set(xlim=[-0.05, 1.0], ylim=[0.0, 1.05],
              xlabel='False Positive Rate',
              ylabel="True Positive Rate (Recall)",
              title="Receiver operating characteristic")
    ax[0].legend(loc="lower right")
    ax[0].grid(True)

    # Precision-recall: one curve per class.
    for i in range(len(class_names)):
        precision, recall, _ = skm.precision_recall_curve(
            y_true_onehot[:, i], y_probs[:, i])
        ax[1].plot(recall, precision, lw=3,
                   label='{0} (area={1:0.2f})'.format(class_names[i],
                                                      skm.auc(recall, precision))
                   )
    ax[1].set(xlim=[0.0, 1.05], ylim=[0.0, 1.05], xlabel='Recall',
              ylabel="Precision", title="Precision-Recall curve")
    ax[1].legend(loc="best")
    ax[1].grid(True)
    plt.show()

    # Binary task: score against the positive-class probability column.
    print(f'AUC score is : {skm.roc_auc_score(y_true, y_probs[:, 1])}')
| |
|
"""## Support Vector Machine(SVM) with word embedding:"""

nlp = en_core_web_lg.load()

# Re-embed the corpus (mean spaCy vector per document), as above.
# FIX: the `pd.np` alias was removed in pandas 2.0 — use numpy directly.
all_vectors = np.array([np.array([token.vector for token in nlp(s)]).mean(axis=0) * np.ones((300))
                        for s in df_all['clean_text']])

Y = df_all["label"]
X = all_vectors
| |
|
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV

# Same reproducible 70/30 split as the model-comparison section.
validation_size = 0.3
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

num_folds = 10
seed = 7
scoring = 'accuracy'

# probability=True so predict_proba is available for the ROC/PR plots.
clf = SVC(probability=True)
| |
|
| | |
| | |
| | |
# Fit the final SVM and report accuracy on both splits.
res = clf.fit(X_train, Y_train)
# FIX: y_true first (sklearn convention; accuracy itself is symmetric).
train_result = accuracy_score(Y_train, res.predict(X_train))
test_result = accuracy_score(Y_test, res.predict(X_test))

# FIX: pair each label with its value — the original printed both labels,
# then both numbers, and misspelled "result" as "resuld".
print("train_result:", train_result, "test_result:", test_result)
| |
|
| | |
# Path on Drive where the fitted SVM is persisted.
SVM = "/content/drive/MyDrive/NLP/Depression_Detection/modeling/model_svm1.pkl"

# Serialize the fitted classifier.
with open(SVM, 'wb') as file:
    pickle.dump(clf, file)

# Reload it immediately to confirm the pickle round-trips.
with open(SVM, 'rb') as file:
    clf = pickle.load(file)

# Bare expression: Colab displays the repr of the reloaded model.
clf
| |
|
| | |
| | |
| | y_pred_svm = res.predict(X_test) |
| | classes = np.unique(Y_test.to_list()) |
| | y_test_array = pd.get_dummies(Y_test, drop_first=False).values |
| | probs = res.predict_proba(X_test) |
| | conf_matrix_acc(Y_test.to_list(),y_pred_svm) |
| | roc_precision_auc() |
| |
|
"""## Exploring False positive and False negative:"""

# NOTE(review): `y_test`, `y_pred_lr`, `cv` and `X_test_tfidf` are not
# defined anywhere in this notebook — they look like leftovers from a
# TF-IDF / logistic-regression notebook. Confirm their source before
# running this cell.
y_test_1 = [x for x in y_test]
y_pred_lr_1 = [x for x in y_pred_lr]

# Indices where the prediction disagrees with the ground truth.
# FIX: dropped the no-op `i += 1` that sat inside the original for loop.
idx = [i for i in range(len(y_test_1)) if y_test_1[i] != y_pred_lr_1[i]]

# FIX: the original print mixed quote styles and never formatted its `{}`
# placeholder; it also misspelled "predictions".
print("There are {} wrong predictions".format(len(idx)))

# Map the misclassified rows back to their vocabulary tokens.
wrong_arr = cv.inverse_transform(X_test_tfidf[idx])

# Detokenize the wrong array back into readable sentences.
detokenized = [TreebankWordDetokenizer().detokenize(x) for x in wrong_arr]

# Bare expression: inspect the first 50 misclassified texts.
detokenized[:50]
| | |
"""There are no specific patterns distinguishing the false positive and false negative predictions."""