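"""Fine-tune and evaluate a BERT classifier on the transactify transaction dataset.

The script reads and preprocesses the data, trains BertClassifier, reports
validation metrics after each epoch, and prints the final test accuracy.
"""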
import torch
import torch.nn as nn
from torch.optim import AdamW  # use torch's AdamW; the transformers version is deprecated
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertModel
from sklearn.metrics import accuracy_score
import numpy as np
|
|
from transactify.data_preprocessing import preprocessing_data, split_data, read_data
|
|
class BertClassifier(nn.Module):
    """BERT encoder with a dropout and linear classification head on the pooled output."""

    def __init__(self, num_labels, dropout_rate=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the pooled [CLS] representation as the sentence-level feature.
        pooled_output = outputs.pooler_output
        output = self.dropout(pooled_output)
        logits = self.classifier(output)
        return logits
|
|
def train_model(model, train_dataloader, val_dataloader, device, epochs=3, lr=2e-5):
    """Fine-tune the classifier and report validation loss/accuracy after each epoch."""
    optimizer = AdamW(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0
        for batch in train_dataloader:
            b_input_ids, b_input_mask, b_labels = batch

            # Move the batch to the training device; labels must be long for CrossEntropyLoss.
            b_input_ids = b_input_ids.to(device)
            b_input_mask = b_input_mask.to(device)
            b_labels = b_labels.to(device).long()

            optimizer.zero_grad()
            outputs = model(b_input_ids, b_input_mask)

            loss = loss_fn(outputs, b_labels)
            total_train_loss += loss.item()
            loss.backward()
            optimizer.step()

        avg_train_loss = total_train_loss / len(train_dataloader)
        print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss:.4f}")

        # Evaluate on the validation split after every epoch.
        model.eval()
        total_val_accuracy = 0
        total_val_loss = 0

        with torch.no_grad():
            for batch in val_dataloader:
                b_input_ids, b_input_mask, b_labels = batch
                b_input_ids = b_input_ids.to(device)
                b_input_mask = b_input_mask.to(device)
                b_labels = b_labels.to(device).long()

                outputs = model(b_input_ids, b_input_mask)
                loss = loss_fn(outputs, b_labels)
                total_val_loss += loss.item()

                preds = torch.argmax(outputs, dim=1)
                total_val_accuracy += (preds == b_labels).sum().item()

        avg_val_accuracy = total_val_accuracy / len(val_dataloader.dataset)
        avg_val_loss = total_val_loss / len(val_dataloader)
        print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {avg_val_accuracy:.4f}")
|
|
def test_model(model, test_dataloader, device):
    """Run the trained model over the test split and print overall accuracy."""
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_dataloader:
            b_input_ids, b_input_mask, b_labels = batch
            b_input_ids = b_input_ids.to(device)
            b_input_mask = b_input_mask.to(device)
            b_labels = b_labels.to(device)

            outputs = model(b_input_ids, b_input_mask)
            preds = torch.argmax(outputs, dim=1)

            all_preds.append(preds.cpu().numpy())
            all_labels.append(b_labels.cpu().numpy())

    all_preds = np.concatenate(all_preds)
    all_labels = np.concatenate(all_labels)
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Test Accuracy: {accuracy:.4f}")
|
|
def main(data_path, epochs=3, batch_size=16):
    # Load and preprocess the raw transaction data.
    data = read_data(data_path)
    if data is None:
        return

    input_ids, attention_masks, labels, labelencoder = preprocessing_data(data)
    X_train_ids, X_test_ids, X_train_masks, X_test_masks, y_train, y_test = split_data(input_ids, attention_masks, labels)

    # Build the classifier with one output per encoded label class.
    num_labels = len(labelencoder.classes_)
    model = BertClassifier(num_labels)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Wrap the tensors in DataLoaders; shuffle only the training data.
    train_dataset = TensorDataset(X_train_ids, X_train_masks, y_train)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    val_dataset = TensorDataset(X_test_ids, X_test_masks, y_test)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

    train_model(model, train_dataloader, val_dataloader, device, epochs=epochs)

    # No separate held-out split is created, so the validation data doubles as the test set.
    test_dataloader = DataLoader(val_dataset, batch_size=batch_size)
    test_model(model, test_dataloader, device)
|
|
if __name__ == "__main__":
    data_path = r"E:\transactify\transactify\Dataset\transaction_data.csv"
    main(data_path)
|
|