"""Build a track-level dataset from the Spotify Million Playlist Dataset.

Extracts the raw zip archive, flattens each playlist JSON slice into one
row per track, and writes the result to parquet files in batches.
"""

import json
import os
import shutil
import zipfile

# pandas.DataFrame.to_parquet requires a parquet engine (pyarrow or
# fastparquet) to be installed.
import pandas as pd

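# Keep the playlist-level fields ('name', 'pid', 'num_followers') plus the
# per-track fields produced by pd.json_normalize ('pos', 'artist_name',
# 'track_name', 'album_name').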
cols = [
    'name',
    'pid',
    'num_followers',
    'pos',
    'artist_name',
    'track_name',
    'album_name',
]


def copy_file(src, dst):
    """Copy src to dst, creating the destination directory if needed."""
    dst_dir = os.path.dirname(dst)
    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)
    shutil.copy2(src, dst)


def unzip_archive(filepath, dir_path):
    """Extract a zip archive into dir_path."""
    with zipfile.ZipFile(filepath, 'r') as zip_ref:
        zip_ref.extractall(dir_path)


def make_dir(directory):
    """Create an empty directory, replacing it if it already exists."""
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)


def make_dataset():
    """Flatten the playlist JSON slices into per-track parquet batches."""
    directory = os.getcwd() + '/data/raw/playlists/data'
    df = pd.DataFrame()
    index = 0

    for filename in os.listdir(directory):
        if os.path.isfile(os.path.join(directory, filename)):
            if filename.endswith('.json'):
                index += 1
                # The MPD ships as 1000 JSON slice files of 1000 playlists each.
                print(f'\r{filename}\t{index}/1000\t{((index / 1000) * 100):.1f}%', end='')

                full_path = os.path.join(directory, filename)
                with open(full_path, 'r') as file:
                    json_data = json.load(file)

                # One row per playlist, then one row per track.
                temp = pd.DataFrame(json_data['playlists'])
                expanded_df = temp.explode('tracks').reset_index(drop=True)

                # Flatten each track dict into its own columns and rejoin
                # with the playlist-level fields.
                json_normalized = pd.json_normalize(expanded_df['tracks'])
                result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)
                result = result[cols]

                df = pd.concat([df, result], axis=0, ignore_index=True)

                # Flush every 50 slices to keep memory bounded.
                if index % 50 == 0:
                    df.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index}.parquet')
                    df = pd.DataFrame()

                # Only process the first 200 slices.
                if index == 200:
                    break


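# Expected layout: spotify_million_playlist_dataset.zip sits in ./data/raw,
# its JSON slices extract to ./data/raw/playlists/data, and the flattened
# parquet batches are written to ./data/raw/data.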
if __name__ == '__main__':
    unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip',
                  os.getcwd() + '/data/raw/playlists')
    make_dir(os.getcwd() + '/data/raw/data')
    make_dir(os.getcwd() + '/data/processed')
    make_dataset()