# File size: 3,071 Bytes

# 35702e9

import os
import urllib.request
import zipfile
import json
import pandas as pd
import time
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import shutil
import os
import pyarrow.parquet as pq


# Columns selected, in this order, from each flattened playlist/track frame.
cols = [
    'name', 'pid', 'num_followers', 'pos',
    'artist_name', 'track_name', 'album_name',
]


def copy_file(src, dst):
    """Copy ``src`` to ``dst``, creating the destination directory if needed.

    Args:
        src: Path of the file to copy.
        dst: Path of the copy. May be a bare filename, in which case the
            file is written relative to the current working directory.

    The copy is made with ``shutil.copy2``.
    """
    dst_dir = os.path.dirname(dst)
    # dirname() is '' for a bare filename; os.makedirs('') would raise, so a
    # directory is only created when the destination actually names one.
    if dst_dir:
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(dst_dir, exist_ok=True)

    shutil.copy2(src, dst)

def unzip_archive(filepath, dir_path):
    """Extract every member of the zip archive at ``filepath`` into ``dir_path``."""
    archive = zipfile.ZipFile(str(filepath), mode='r')
    with archive:
        archive.extractall(path=dir_path)


def make_dir(directory):
    """Create ``directory`` fresh, discarding anything already at that path.

    An existing path is removed with ``shutil.rmtree`` before the directory
    (and any missing parents) is created with ``os.makedirs``.
    """
    already_there = os.path.exists(directory)
    if already_there:
        shutil.rmtree(directory)
    os.makedirs(directory)


def make_dataset():
    """Convert playlist JSON files into parquet batches.

    Reads ``*.json`` files from ``<cwd>/data/raw/playlists/data``, flattens
    each into one row per (playlist, track) restricted to ``cols``, and writes
    a parquet file to ``<cwd>/data/raw/data`` after every 50 input files.
    Processing stops once 200 JSON files have been handled. If the directory
    runs out of files before a batch is full, the partial batch is still
    written so no rows are lost.

    The output directory must already exist. Progress is printed to stdout on
    a single, carriage-return-updated line.
    """
    directory = os.getcwd() + '/data/raw/playlists/data'
    frames = []  # per-file frames of the current batch; concatenated once on flush
    index = 0

    # os.listdir order is arbitrary; sorting makes the processed subset and
    # the batch contents reproducible across runs and machines.
    for filename in sorted(os.listdir(directory)):
        full_path = os.path.join(directory, filename)
        # Skip subdirectories and anything that is not a .json file
        # (endswith also rejects names like 'x.json.bak').
        if not os.path.isfile(full_path) or not filename.endswith('.json'):
            continue

        index += 1
        print(f'\r{filename}\t{index}/1000\t{((index/1000)*100):.1f}%', end='')

        frames.append(_load_playlist_tracks(full_path))

        if index % 50 == 0:
            _write_batch(frames, index)
            frames = []
            if index % 200 == 0:
                break

    # Flush a trailing partial batch (fewer than 50 files since the last write).
    if frames:
        _write_batch(frames, index)


def _load_playlist_tracks(json_path):
    """Return one row per (playlist, track) from ``json_path``, limited to ``cols``."""
    with open(json_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    playlists = pd.DataFrame(json_data['playlists'])
    # explode() yields one row per element of each 'tracks' list; the index is
    # reset so it lines up with the positional index json_normalize produces.
    exploded = playlists.explode('tracks').reset_index(drop=True)
    track_fields = pd.json_normalize(exploded['tracks'])
    combined = pd.concat([exploded.drop(columns=['tracks']), track_fields], axis=1)
    return combined[cols]


def _write_batch(frames, index):
    """Concatenate ``frames`` and write them to the parquet file numbered by ``index``."""
    batch = pd.concat(frames, axis=0, ignore_index=True)
    batch.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index % 1000}.parquet')
                    

if __name__ == '__main__':
    # Extract the archive, recreate both output folders empty, then build the
    # parquet batches from the extracted JSON files.
    base = os.getcwd()
    unzip_archive(base + '/data/raw/spotify_million_playlist_dataset.zip', base + '/data/raw/playlists')
    for directory in (base + '/data/raw/data', base + '/data/processed'):
        make_dir(directory)
    make_dataset()