""" Attribution: https://github.com/AIPI540/AIPI540-Deep-Learning-Applications/ Jon Reifschneider Brinnae Bent """ import os import urllib.request import zipfile import json import pandas as pd import time import torch import numpy as np import pandas as pd import torch.nn as nn import torch.nn.functional as F import torch.optim as optim from torch.utils.data import DataLoader, TensorDataset from sklearn.model_selection import train_test_split import matplotlib.pyplot as plt from sklearn.preprocessing import LabelEncoder def prep_dataloaders(X_train,y_train,X_val,y_val,batch_size): # Convert training and test data to TensorDatasets trainset = TensorDataset(torch.from_numpy(np.array(X_train)).long(), torch.from_numpy(np.array(y_train)).float()) valset = TensorDataset(torch.from_numpy(np.array(X_val)).long(), torch.from_numpy(np.array(y_val)).float()) # Create Dataloaders for our training and test data to allow us to iterate over minibatches trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True) valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False) return trainloader, valloader class NNColabFiltering(nn.Module): def __init__(self, n_playlists, n_artists, embedding_dim_users, embedding_dim_items, n_activations, rating_range): super().__init__() self.user_embeddings = nn.Embedding(num_embeddings=n_playlists,embedding_dim=embedding_dim_users) self.item_embeddings = nn.Embedding(num_embeddings=n_artists,embedding_dim=embedding_dim_items) self.fc1 = nn.Linear(embedding_dim_users+embedding_dim_items,n_activations) self.fc2 = nn.Linear(n_activations,1) self.rating_range = rating_range def forward(self, X): # Get embeddings for minibatch embedded_users = self.user_embeddings(X[:,0]) embedded_items = self.item_embeddings(X[:,1]) # Concatenate user and item embeddings embeddings = torch.cat([embedded_users,embedded_items],dim=1) # Pass embeddings through network preds = self.fc1(embeddings) preds = F.relu(preds) preds = self.fc2(preds) # Scale predicted ratings to target-range [low,high] preds = torch.sigmoid(preds) * (self.rating_range[1]-self.rating_range[0]) + self.rating_range[0] return preds def train_model(model, criterion, optimizer, dataloaders, device, num_epochs=5, scheduler=None): model = model.to(device) # Send model to GPU if available since = time.time() costpaths = {'train':[],'val':[]} for epoch in range(num_epochs): print('Epoch {}/{}'.format(epoch, num_epochs - 1)) print('-' * 10) # Each epoch has a training and validation phase for phase in ['train', 'val']: if phase == 'train': model.train() # Set model to training mode else: model.eval() # Set model to evaluate mode running_loss = 0.0 # Get the inputs and labels, and send to GPU if available index = 0 for (inputs,labels) in dataloaders[phase]: inputs = inputs.to(device) labels = labels.to(device) # Zero the weight gradients optimizer.zero_grad() # Forward pass to get outputs and calculate loss # Track gradient only for training data with torch.set_grad_enabled(phase == 'train'): outputs = model.forward(inputs).view(-1) loss = criterion(outputs, labels) # Backpropagation to get the gradients with respect to each weight # Only if in train if phase == 'train': loss.backward() # Update the weights optimizer.step() # Convert loss into a scalar and add it to running_loss running_loss += np.sqrt(loss.item()) * labels.size(0) print(f'\r{running_loss} {index} {(index / len(dataloaders[phase]))*100:.2f}%', end='') index +=1 # Step along learning rate scheduler when in train 
if (phase == 'train') and (scheduler is not None): scheduler.step() # Calculate and display average loss and accuracy for the epoch epoch_loss = running_loss / len(dataloaders[phase].dataset) costpaths[phase].append(epoch_loss) print('\n{} loss: {:.4f}'.format(phase, epoch_loss)) time_elapsed = time.time() - since print('Training complete in {:.0f}m {:.0f}s'.format( time_elapsed // 60, time_elapsed % 60)) return costpaths if __name__ == '__main__': artists = pd.read_csv(os.getcwd() + '/data/processed/playlists.csv') X = artists.loc[:,['playlist_id','artist_album_id',]] y = artists.loc[:,'song_percent'] # Split our data into training and test sets X_train, X_val, y_train, y_val = train_test_split(X,y,random_state=0, test_size=0.2) batchsize = 64 trainloader,valloader = prep_dataloaders(X_train,y_train,X_val,y_val,batchsize) dataloaders = {'train':trainloader, 'val':valloader} n_users = X.loc[:,'playlist_id'].max()+1 n_items = X.loc[:,'artist_album_id'].max()+1 model = NNColabFiltering(n_users,n_items,embedding_dim_users=50, embedding_dim_items=50, n_activations = 100,rating_range=[0.,1.]) criterion = nn.MSELoss() lr=0.001 n_epochs=10 wd=1e-3 optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") cost_paths = train_model(model,criterion,optimizer,dataloaders, device,n_epochs, scheduler=None) # Save the entire model torch.save(model, os.getcwd() + '/models/recommender.pt')
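

# A minimal inference sketch showing how the trained model might be used to
# produce recommendations: score every artist/album id for one playlist and
# take the top-N by predicted song_percent. The helper name
# `recommend_artists` and the example playlist id below are illustrative
# assumptions, not part of the original course repo.
def recommend_artists(model, playlist_id, n_items, device, top_n=10):
    model.eval()
    with torch.no_grad():
        # Build (playlist, item) pairs for every candidate item
        items = torch.arange(n_items, dtype=torch.long)
        users = torch.full((n_items,), playlist_id, dtype=torch.long)
        pairs = torch.stack([users, items], dim=1).to(device)
        # Predict a score for each pair and keep the top-N items
        scores = model(pairs).view(-1)
        top_scores, top_items = torch.topk(scores, top_n)
    return top_items.cpu().numpy(), top_scores.cpu().numpy()

# Example call, assuming the training script above has just run (hypothetical playlist id 0):
# top_items, top_scores = recommend_artists(model, playlist_id=0, n_items=n_items, device=device)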