import os
import shutil

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder


def make_dir(directory):
    """Create an empty directory, removing any existing contents first."""
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)


def read_parquet_folder(folder_path):
    """Read every .parquet file in a folder and concatenate them into one DataFrame."""
    dataframes = []
    for file in os.listdir(folder_path):
        if file.endswith('.parquet'):
            file_path = os.path.join(folder_path, file)
            dataframes.append(pd.read_parquet(file_path))
    return pd.concat(dataframes, ignore_index=True)


def create_ids(df, col, name):
    """Map each unique value of `col` to an integer ID and save the lookup table."""
    # Create a dictionary mapping unique values to IDs
    value_to_id = {val: i for i, val in enumerate(df[col].unique())}
    # Create a new column with the IDs
    df[f'{name}_id'] = df[col].map(value_to_id)
    df[[f'{name}_id', col]].drop_duplicates().to_csv(
        os.path.join(os.getcwd(), 'data', 'processed', f'{name}.csv'))
    return df


if __name__ == '__main__':
    # Load the raw playlist data and rebuild the processed-data directory.
    folder_path = os.path.join(os.getcwd(), 'data', 'raw', 'data')
    df = read_parquet_folder(folder_path)
    directory = os.path.join(os.getcwd(), 'data', 'processed')
    make_dir(directory)

    # Assign integer IDs to artists, playlists, and albums, saving each lookup table.
    df = create_ids(df, 'artist_name', 'artist')
    df = create_ids(df, 'pid', 'playlist')
    df = create_ids(df, 'album_name', 'album')

    # Distinct tracks per (playlist, artist, album), and total tracks per playlist
    # (max position is zero-based, so add 1).
    df['song_count'] = df.groupby(['pid', 'artist_name', 'album_name'])['track_name'].transform('nunique')
    df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max') + 1

    # Build a combined artist::album key, assign it an integer ID, and save the lookup table.
    df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)
    value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}
    df['artist_album_id'] = df['artist_album'].map(value_to_id)
    df[['artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(
        os.path.join(directory, 'artist_album.csv'))

    # Aggregate song counts per (playlist, artist_album) pair.
    df['song_count'] = df.groupby(['playlist_id', 'artist_album_id'])['song_count'].transform('sum')

    # Encode track names as integer IDs.
    encoder = LabelEncoder()
    df['track_id'] = encoder.fit_transform(df['track_name'])

    # Fraction of the playlist occupied by each artist/album pair, squashed through a sigmoid.
    df['song_percent'] = df['song_count'] / df['playlist_songs']
    df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))

    # Save the playlist / artist-album interaction pairs.
    artists = df.loc[:, ['playlist_id', 'artist_album_id', 'song_percent']].drop_duplicates()
    artists.loc[:, ['playlist_id', 'artist_album_id']].to_csv(
        os.path.join(directory, 'playlists.csv'))