"""Build a flattened playlist/track table from the Spotify Million Playlist dataset.

Unzips the raw archive, then converts each slice JSON file under
``data/raw/playlists/data`` into parquet shards of one-row-per-track data
written to ``data/raw/data/playlists_<n>.parquet``.
"""
import json
import os
import shutil
import zipfile

import pandas as pd

# Columns kept in the flattened playlist/track table: playlist-level fields
# (name, pid, num_followers) plus per-track fields from the exploded track list.
cols = [
    'name',
    'pid',
    'num_followers',
    'pos',
    'artist_name',
    'track_name',
    'album_name'
]


def copy_file(src, dst):
    """Copy *src* to *dst* (metadata preserved), creating dst's parent dirs.

    Guarding on a non-empty dirname avoids ``os.makedirs('')`` blowing up when
    *dst* is a bare filename; ``exist_ok=True`` removes the check-then-create race.
    """
    dst_dir = os.path.dirname(dst)
    if dst_dir:
        os.makedirs(dst_dir, exist_ok=True)
    shutil.copy2(src, dst)


def unzip_archive(filepath, dir_path):
    """Extract the zip archive at *filepath* into directory *dir_path*."""
    with zipfile.ZipFile(filepath, 'r') as zip_ref:
        zip_ref.extractall(dir_path)


def make_dir(directory):
    """Create *directory*, wiping any previous contents if it already exists."""
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)


def make_dataset():
    """Flatten playlist slice JSON files into parquet shards.

    For each ``*.json`` slice file, explodes each playlist's ``tracks`` list
    into one row per track, normalizes the track dicts into columns, keeps
    only ``cols``, and writes a parquet shard every 50 files. Currently
    capped at the first 200 files (debug limit carried over from the
    original script).
    """
    directory = os.getcwd() + '/data/raw/playlists/data'
    # Accumulate per-file frames and concat once per shard: repeatedly doing
    # df = pd.concat([df, result]) re-copies all rows and is quadratic.
    frames = []
    index = 0

    def _flush(shard_idx):
        # Write buffered rows as one shard and reset the buffer.
        # NOTE: the original used `index % 1000` in the filename, which would
        # silently overwrite shard 0 once index reached 1000; the raw index is
        # identical for every value reachable under the 200-file cap.
        if frames:
            shard = pd.concat(frames, axis=0, ignore_index=True)
            shard.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{shard_idx}.parquet')
            frames.clear()

    for filename in os.listdir(directory):
        full_path = os.path.join(directory, filename)
        # Skip subdirectories and non-JSON entries.
        if not os.path.isfile(full_path) or filename.find('.json') == -1:
            continue
        index += 1
        # In-place progress line (assumes ~1000 slice files in the full dump).
        print(f'\r(unknown)\t{index}/1000\t{((index/1000)*100):.1f}%', end='')
        with open(full_path, 'r') as file:
            json_data = json.load(file)
        temp = pd.DataFrame(json_data['playlists'])
        # One row per (playlist, track) pair.
        expanded_df = temp.explode('tracks').reset_index(drop=True)
        # Turn each track dict into flat columns aligned with the exploded rows.
        json_normalized = pd.json_normalize(expanded_df['tracks'])
        result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)
        frames.append(result[cols])
        if index % 50 == 0:
            _flush(index)
        if index % 200 == 0:
            # Debug cap: stop after the first 200 slice files.
            break

    # Write any tail rows left over when the file count is not a multiple of 50
    # (no-op under the current cap, but correct in general).
    _flush(index)


if __name__ == '__main__':
    unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip',
                  os.getcwd() + '/data/raw/playlists')
    make_dir(os.getcwd() + '/data/raw/data')
    make_dir(os.getcwd() + '/data/processed')
    make_dataset()