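"""
Preprocess the raw playlist parquet files in data/raw/data into lookup tables
and a playlist / artist-album pair table under data/processed. Integer IDs are
assigned to artists, playlists, albums, tracks, and artist-album pairs, and each
pair's share of its playlist is computed as a sigmoid-squashed fraction.
"""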
import os
import shutil

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def make_dir(directory):
    # Recreate the directory from scratch, wiping any previous contents
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)

def read_parquet_folder(folder_path):
    # Load every parquet file in the folder and concatenate into one dataframe
    dataframes = []
    for file in os.listdir(folder_path):
        if file.endswith('.parquet'):
            file_path = os.path.join(folder_path, file)
            df = pd.read_parquet(file_path)
            dataframes.append(df)
    return pd.concat(dataframes, ignore_index=True)

def create_ids(df, col, name):
    # Map each unique value in `col` to an integer ID
    value_to_id = {val: i for i, val in enumerate(df[col].unique())}
    # Add the ID column and persist the ID-to-value lookup table
    df[f'{name}_id'] = df[col].map(value_to_id)
    df[[f'{name}_id', col]].drop_duplicates().to_csv(os.getcwd() + f'/data/processed/{name}.csv')
    return df

if __name__ == '__main__':
    # Combine the raw playlist parquet files into a single dataframe
    folder_path = os.getcwd() + '/data/raw/data'
    df = read_parquet_folder(folder_path)

    # Recreate the processed-data directory
    directory = os.getcwd() + '/data/processed'
    make_dir(directory)

    # Assign integer IDs to artists, playlists, and albums and save the lookup tables
    df = create_ids(df, 'artist_name', 'artist')
    df = create_ids(df, 'pid', 'playlist')
    df = create_ids(df, 'album_name', 'album')

    # Unique tracks per (playlist, artist, album) and total track count per playlist
    df['song_count'] = df.groupby(['pid', 'artist_name', 'album_name'])['track_name'].transform('nunique')
    df['playlist_songs'] = df.groupby(['pid'])['pos'].transform('max') + 1

    # Build a combined artist::album key, assign it an integer ID, and save the lookup
    df['artist_album'] = df[['artist_name', 'album_name']].agg('::'.join, axis=1)
    value_to_id = {val: i for i, val in enumerate(df['artist_album'].unique())}
    df['artist_album_id'] = df['artist_album'].map(value_to_id)
    df[['artist_album_id', 'artist_album', 'artist_name', 'album_name', 'track_name']].drop_duplicates().to_csv(os.getcwd() + '/data/processed/artist_album.csv')

    # Total track count per (playlist, artist-album) pair
    df['song_count'] = df.groupby(['playlist_id', 'artist_album_id'])['song_count'].transform('sum')

    # Encode track names as integer IDs
    encoder = LabelEncoder()
    encoder.fit(df['track_name'])
    df['track_id'] = encoder.transform(df['track_name'])

    # Fraction of the playlist taken up by the artist-album pair, squashed with a sigmoid
    df['song_percent'] = df['song_count'] / df['playlist_songs']
    df['song_percent'] = 1 / (1 + np.exp(-df['song_percent']))

    # Save the deduplicated playlist / artist-album pairs
    artists = df.loc[:, ['playlist_id', 'artist_album_id', 'song_percent']].drop_duplicates()
    artists.loc[:, ['playlist_id', 'artist_album_id']].to_csv(os.getcwd() + '/data/processed/playlists.csv')
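
# The processed outputs can then be read back downstream. Illustrative only; the
# file names simply follow the paths written above (index_col=0 because to_csv
# was called with the default index):
#
#   playlists = pd.read_csv(os.getcwd() + '/data/processed/playlists.csv', index_col=0)
#   artist_album = pd.read_csv(os.getcwd() + '/data/processed/artist_album.csv', index_col=0)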