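"""
Data preparation script for the Spotify Million Playlist Dataset: extracts the raw
archive and flattens the playlist JSON slices into track-level parquet shards under
data/raw/data.
"""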
import json
import os
import shutil
import zipfile

import pandas as pd

# Playlist- and track-level columns kept after flattening each JSON slice
cols = [
    'name',
    'pid',
    'num_followers',
    'pos',
    'artist_name',
    'track_name',
    'album_name'
]


def copy_file(src, dst):
    """Copy a file, creating the destination directory if it does not exist."""
    dst_dir = os.path.dirname(dst)
    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)
    shutil.copy2(src, dst)


def unzip_archive(filepath, dir_path):
    """Extract a zip archive into the given directory."""
    with zipfile.ZipFile(filepath, 'r') as zip_ref:
        zip_ref.extractall(dir_path)


def make_dir(directory):
    """Create an empty directory, removing any existing contents first."""
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)


def make_dataset():
    """Flatten the playlist JSON slices into track-level parquet shards."""
    directory = os.getcwd() + '/data/raw/playlists/data'
    df = pd.DataFrame()
    index = 0
    # Loop through all files in the extracted dataset directory
    for filename in os.listdir(directory):
        # Skip subdirectories and anything that is not a JSON slice file
        if os.path.isfile(os.path.join(directory, filename)):
            if filename.endswith('.json'):
                index += 1
                # Report progress against the 1000 slice files in the full dataset
                print(f'\r{filename}\t{index}/1000\t{((index/1000)*100):.1f}%', end='')
                full_path = os.path.join(directory, filename)
                with open(full_path, 'r') as file:
                    json_data = json.load(file)
                temp = pd.DataFrame(json_data['playlists'])
                # One row per track: explode each playlist's track list
                expanded_df = temp.explode('tracks').reset_index(drop=True)
                # Normalize the nested track dicts into flat columns
                json_normalized = pd.json_normalize(expanded_df['tracks'])
                # Concatenate the playlist metadata with the normalized track columns
                result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)
                result = result[cols]
                df = pd.concat([df, result], axis=0, ignore_index=True)
                # Write a parquet shard every 50 slice files, then start a fresh frame
                if index % 50 == 0:
                    df.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index}.parquet')
                    df = pd.DataFrame()
                # Stop after the first 200 slice files
                if index % 200 == 0:
                    break


if __name__ == '__main__':
    # Extract the raw archive into data/raw/playlists
    unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')
    # Reset the output directories before building the dataset
    make_dir(os.getcwd() + '/data/raw/data')
    make_dir(os.getcwd() + '/data/processed')
    make_dataset()
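
# Downstream usage sketch (an assumption, not part of this script): the parquet shards
# written under data/raw/data can be read back into a single DataFrame, e.g.
#
#   import glob
#   shards = sorted(glob.glob(os.getcwd() + '/data/raw/data/playlists_*.parquet'))
#   playlists = pd.concat((pd.read_parquet(p) for p in shards), ignore_index=True)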