| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | import pickle |
| | import warnings |
| | import numpy as np |
| | import pandas as pd |
| | import spacy |
| | import en_core_web_lg |
| | from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV |
| | from sklearn.metrics import confusion_matrix, accuracy_score |
| | from sklearn.linear_model import LogisticRegression |
| | from sklearn.neighbors import KNeighborsClassifier |
| | from sklearn.svm import SVC |
| | from sklearn.tree import DecisionTreeClassifier |
| | from sklearn.neural_network import MLPClassifier |
| | from sklearn.ensemble import RandomForestClassifier |
| |
|
| | |
# Install a blanket "ignore" filter: no category or module is given, so every
# warning raised after this point is suppressed for the whole process.
warnings.filterwarnings("ignore")
| |
|
def _mean_token_vector(doc, vector_dim):
    """Return the float64 centroid of the token vectors in *doc* (zeros if *doc* has no tokens)."""
    if len(doc) == 0:
        # An empty document has nothing to average; the mean of zero tokens is
        # NaN, which would poison the whole feature row.
        return np.zeros(vector_dim, dtype=np.float64)
    return np.array([token.vector for token in doc]).mean(axis=0).astype(np.float64)


def load_prepare_split_df(filename: str, targets=None, validation_size=0.3, seed=7):
    """
    Load a tab-separated dataset, embed its text column and split it.

    Methodology:
        - TSV ingestion: the file is read with pandas using a tab separator
          and UTF-8 encoding.
        - Semantic vectorization: every entry of the ``clean_text`` column is
          processed by the spaCy ``en_core_web_lg`` pipeline and represented
          by the centroid (mean) of its token vectors. Missing or empty text
          is represented by an all-zero vector.
        - Validation partitioning: a seeded random split into training and
          testing subsets. ``train_test_split`` is called without
          ``stratify``, so class proportions are not enforced.

    Args:
        filename (str): Path to the tab-separated dataset. It must provide a
            ``clean_text`` column and the column(s) named in ``targets``.
        targets (list | None): Column name(s) of the dependent variable.
            ``None`` (the default) selects ``['label']``.
        validation_size (float): Proportion of data reserved for testing.
        seed (int): Random seed for reproducibility.

    Returns:
        tuple: (X_train, X_test, Y_train, Y_test) feature and label sets.
    """
    if targets is None:
        # Resolved per call so no list object is shared between calls.
        targets = ['label']

    print(f"Acquiring dataset from: {filename}")
    df_all = pd.read_csv(filename, sep='\t', encoding='utf-8')

    nlp_engine = en_core_web_lg.load()
    # Take the embedding width from the loaded model instead of hard-coding 300.
    vector_dim = nlp_engine.vocab.vectors_length

    # pandas reads empty cells as float NaN, which spaCy cannot process.
    texts = df_all['clean_text'].fillna('').astype(str)

    print("Extracting semantic features via spaCy embeddings...")
    feature_vectors = np.array([
        _mean_token_vector(doc, vector_dim) for doc in nlp_engine.pipe(texts)
    ])

    y_labels = df_all.loc[:, targets]

    x_train, x_test, y_train, y_test = train_test_split(
        feature_vectors, y_labels, test_size=validation_size, random_state=seed
    )

    return x_train, x_test, y_train, y_test
| |
|
# Pickle destination for every supported architecture identifier.
_MODEL_ARTIFACT_PATHS = {
    "SVM": "../assets/models/model_svm_pc.pkl",
    "LR": "../assets/models/model_LogReg.pkl",
    "DT": "../assets/models/model_DTC.pkl",
    "KNN": "../assets/models/model_KNN.pkl",
    "RF": "../assets/models/model_RF.pkl",
    "NN": "../assets/models/model_NN.pkl",
}


def _persist_model(estimator, save_path):
    """Pickle *estimator* to *save_path*, creating the target directory when it is missing."""
    import os

    directory = os.path.dirname(save_path)
    if directory:
        # Without this, a missing folder would discard an already trained model.
        os.makedirs(directory, exist_ok=True)
    with open(save_path, 'wb') as file:
        pickle.dump(estimator, file)


def _fit_best_knn(X_train, Y_train):
    """Pick k in 1..31 by mean 10-fold cross-validation score and fit kNN with that k."""
    k_values = range(1, 32)
    k_scores = [
        np.mean(cross_val_score(KNeighborsClassifier(n_neighbors=k), X_train, Y_train, cv=10))
        for k in k_values
    ]

    # np.argmax treats NaN as the maximum, so a candidate whose score is NaN
    # would be selected; nanargmax skips such candidates.
    optimal_k = k_values[int(np.nanargmax(k_scores))]
    print(f"Optimized Hyperparameter discovered: k = {optimal_k}")

    best_knn = KNeighborsClassifier(n_neighbors=optimal_k)
    best_knn.fit(X_train, Y_train)
    return best_knn


def classification(X_train, Y_train, model=""):
    """
    Train one of the supported classifiers and pickle it to disk.

    Architectures supported:
        - SVM: Support Vector Machine (``SVC(probability=True)``); the
          accuracy on the training data is printed after fitting.
        - LR: Logistic Regression.
        - DT: Decision Tree Classifier.
        - KNN: k-Nearest Neighbors; k is chosen from 1..31 by 10-fold
          cross-validation on the training data.
        - RF: Random Forest Classifier.
        - NN: Multi-layer Perceptron (MLP) Neural Network.

    Side effects:
        The fitted estimator is pickled under ``../assets/models/`` (relative
        to the current working directory); the directory is created if it
        does not exist. Progress messages are printed.

    Args:
        X_train: Training feature set.
        Y_train: Training label set.
        model (str): Target architecture identifier (case-sensitive).

    Returns:
        object: The trained scikit-learn estimator, or ``None`` when
        ``model`` is not one of the supported identifiers (nothing is
        trained or written in that case).

    Raises:
        ValueError: For "KNN", when every cross-validation score is NaN, so
            no k can be selected.
    """
    if not isinstance(model, str) or model not in _MODEL_ARTIFACT_PATHS:
        # Callers still get None as before, but the reason is now reported.
        supported = ", ".join(_MODEL_ARTIFACT_PATHS)
        print(f"Unsupported model identifier {model!r}; expected one of: {supported}. Nothing trained.")
        return None

    if model == "KNN":
        print("Initializing kNN training with automated k-optimization...")
        estimator = _fit_best_knn(X_train, Y_train)
    else:
        # (start-up message, estimator factory) for the fit-once architectures.
        recipes = {
            "SVM": ("Initializing SVM (Support Vector Machine) training...",
                    lambda: SVC(probability=True)),
            "LR": ("Initializing Logistic Regression training...", LogisticRegression),
            "DT": ("Initializing Decision Tree training...", DecisionTreeClassifier),
            "RF": ("Initializing Random Forest training...", RandomForestClassifier),
            "NN": ("Initializing Neural Network (MLP) training...", MLPClassifier),
        }
        banner, build_estimator = recipes[model]
        print(banner)
        estimator = build_estimator()
        estimator.fit(X_train, Y_train)

        if model == "SVM":
            # accuracy_score expects (y_true, y_pred).
            train_accuracy = accuracy_score(Y_train, estimator.predict(X_train))
            print(f"Training Convergence Accuracy: {train_accuracy:.4f}")

    _persist_model(estimator, _MODEL_ARTIFACT_PATHS[model])
    return estimator
| |
|
def LSTM(filename: str):
    """
    Train a Long Short-Term Memory (LSTM) recurrent network on the text
    dataset and persist its architecture and weights.

    Methodology:
        - Tokenization: integer encoding of the ``clean_text`` column, with
          the tokenizer limited to 20000 words.
        - Padding: sequences are normalized to a length of 50.
        - Architecture: 300-dimensional Embedding layer, an LSTM layer of
          100 units with dropout and recurrent dropout of 0.2, and a single
          sigmoid output unit trained with binary cross-entropy.

    Side effects:
        Writes ``model_LSTM.json`` (architecture) and ``model_LSTM.h5``
        (weights) using relative paths, i.e. into the current working
        directory, and prints progress messages.

    Args:
        filename (str): Path to the tab-separated dataset. It must provide
            the ``clean_text`` and ``label`` columns.

    Returns:
        None
    """
    from keras.models import Sequential
    # Aliased so the layer class does not shadow this function's own name.
    from keras.layers import Dense, Embedding, LSTM as LSTMLayer
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    # The former KerasClassifier import was unused and has been dropped so a
    # Keras installation without that module cannot abort this function.

    print(f"Acquiring data for Deep Learning (LSTM): {filename}")
    df_dl = pd.read_csv(filename, sep='\t', encoding='utf-8')

    # pandas reads empty cells as float NaN; the tokenizer needs strings.
    texts = df_dl['clean_text'].fillna('').astype(str)

    vocab_size = 20000
    max_len = 50
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(texts)
    seqs = tokenizer.texts_to_sequences(texts)
    x_lstm = pad_sequences(seqs, maxlen=max_len)
    y_lstm = df_dl["label"]

    print("Constructing LSTM topology...")
    model = Sequential()
    model.add(Embedding(vocab_size, 300, input_length=max_len))
    model.add(LSTMLayer(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    print("Commencing Deep Learning Convergence (LSTM)...")
    model.fit(x_lstm, y_lstm, epochs=3, verbose=1, validation_split=0.3)

    # Architecture and weights are stored as two separate artifacts.
    model_json = model.to_json()
    with open("model_LSTM.json", "w") as json_file:
        json_file.write(model_json)
    model.save_weights("model_LSTM.h5")
    print("Deep Learning model (LSTM) artifacts successfully persisted.")
| |
|
| |
|