|
|
import numpy as np
|
|
|
import os
|
|
|
import urllib.request
|
|
|
import zipfile
|
|
|
import json
|
|
|
import pandas as pd
|
|
|
import time
|
|
|
import torch
|
|
|
import numpy as np
|
|
|
import pandas as pd
|
|
|
import torch.nn as nn
|
|
|
import torch.nn.functional as F
|
|
|
import torch.optim as optim
|
|
|
from torch.utils.data import DataLoader, TensorDataset
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
import matplotlib.pyplot as plt
|
|
|
from sklearn.preprocessing import LabelEncoder
|
|
|
import shutil
|
|
|
import os
|
|
|
import pyarrow.parquet as pq
|
|
|
|
|
|
def make_dir(directory):
    """Create an empty directory, replacing any existing one.

    If *directory* already exists it is deleted first (including all of
    its contents), so the caller always gets a fresh, empty directory.

    Args:
        directory: Path of the directory to (re)create.
    """
    # Remove any previous run's output so stale files never linger.
    if os.path.exists(directory):
        shutil.rmtree(directory)
    # A single makedirs call covers both the fresh and the replaced case
    # (the original duplicated this call in both branches).
    os.makedirs(directory)
|
|
|
|
|
|
|
|
|
def read_parquet_folder(folder_path):
    """Read every ``.parquet`` file in *folder_path* into one DataFrame.

    Files are read in sorted name order so the concatenated result is
    deterministic regardless of filesystem listing order.

    Args:
        folder_path: Directory containing the parquet files.

    Returns:
        A single DataFrame with all files concatenated and a reset index.

    Raises:
        ValueError: If the folder contains no ``.parquet`` files.
            (The original also raised ValueError here, via pd.concat on
            an empty list, but with a less helpful message.)
    """
    parquet_files = sorted(
        f for f in os.listdir(folder_path) if f.endswith('.parquet')
    )
    if not parquet_files:
        # Fail with a clear message instead of pd.concat's generic
        # "No objects to concatenate".
        raise ValueError(f'No .parquet files found in {folder_path!r}')
    dataframes = [
        pd.read_parquet(os.path.join(folder_path, f)) for f in parquet_files
    ]
    return pd.concat(dataframes, ignore_index=True)
|
|
|
|
|
|
|
|
|
def create_ids(df, col, name):
    """Assign a dense integer id to each distinct value of *col*.

    Adds a ``{name}_id`` column to *df* (ids assigned in first-seen
    order), persists the id/value lookup table to
    ``data/processed/{name}.csv`` under the current working directory,
    and returns the (mutated) DataFrame.
    """
    unique_values = df[col].unique()
    value_to_id = dict(zip(unique_values, range(len(unique_values))))

    id_col = f'{name}_id'
    df[id_col] = df[col].map(value_to_id)

    # Persist the lookup table for later joins / debugging.
    mapping = df[[id_col, col]].drop_duplicates()
    mapping.to_csv(os.getcwd() + f'/data/processed/{name}.csv')

    return df
|
|
|
|
|
|
if __name__ == '__main__':
    # Load the raw playlist data (one row per track occurrence).
    folder_path = os.getcwd() + '/data/raw/data'
    df = read_parquet_folder(folder_path)

    # Fresh output directory: any previously processed files are removed.
    directory = os.getcwd() + '/data/processed'
    make_dir(directory)

    # Dense integer ids for each entity; each call also writes its
    # id/value lookup CSV into data/processed/.
    df = create_ids(df, 'artist_name', 'artist')
    df = create_ids(df, 'pid', 'playlist')
    df = create_ids(df, 'album_name', 'album')

    # Distinct track count per (playlist, artist, album) triple.
    df['song_count'] = df.groupby(['pid','artist_name','album_name'])['track_name'].transform('nunique')
    # 'pos' is presumably a 0-based track position within the playlist,
    # so max(pos) + 1 is the playlist length — TODO confirm against the
    # raw data schema.
    df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max')
    df['playlist_songs'] += 1

    # Composite "artist::album" key with its own dense id (same
    # first-seen-order scheme as create_ids).
    df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)
    value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}
    df['artist_album_id'] = df['artist_album'].map(value_to_id)

    # Persist the artist/album lookup table, including track names.
    df[[f'artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/artist_album.csv')

    # Roll the per-triple track counts up to (playlist, artist_album)
    # pairs; this overwrites the earlier song_count column.
    df['song_count'] = df.groupby(['playlist_id','artist_album_id'])['song_count'].transform('sum')

    # Label-encode track names into integer track ids.
    encoder = LabelEncoder()
    encoder.fit(df['track_name'])
    df['track_id'] = encoder.transform(df['track_name'])

    # Share of the playlist taken by this artist/album, then squashed
    # through a sigmoid. NOTE(review): a sigmoid of a value in [0, 1]
    # lands in roughly [0.5, 0.73] — confirm this narrowing is intended.
    df['song_percent'] = df['song_count'] / df['playlist_songs']
    df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))

    # Final distinct (playlist, artist_album) pairs. song_percent is
    # carried through the dedup but not written to the output CSV.
    artists = df.loc[:,['playlist_id','artist_album_id','song_percent']].drop_duplicates()
    artists.loc[:,['playlist_id','artist_album_id',]].to_csv(os.getcwd() + '/data/processed/playlists.csv')
|
|
|
|