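"""
Data preparation script for the Spotify Million Playlist Dataset: extracts the raw
archive and flattens the playlist JSON slices into track-level parquet shards under
data/raw/data.
"""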
import json
import os
import shutil
import zipfile

import pandas as pd

# Playlist- and track-level columns kept after flattening each JSON slice
cols = [
    'name',
    'pid',
    'num_followers',
    'pos',
    'artist_name',
    'track_name',
    'album_name'
]


def copy_file(src, dst):
    """Copy a file, creating the destination directory if it does not exist."""
    dst_dir = os.path.dirname(dst)
    if not os.path.exists(dst_dir):
        os.makedirs(dst_dir)
    shutil.copy2(src, dst)


def unzip_archive(filepath, dir_path):
    """Extract a zip archive into the given directory."""
    with zipfile.ZipFile(filepath, 'r') as zip_ref:
        zip_ref.extractall(dir_path)


def make_dir(directory):
    """Create an empty directory, removing any existing contents first."""
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)


def make_dataset():
    """Flatten the playlist JSON slices into track-level parquet shards."""
    directory = os.getcwd() + '/data/raw/playlists/data'
    df = pd.DataFrame()
    index = 0
    # Loop through all files in the extracted dataset directory
    for filename in os.listdir(directory):
        # Skip subdirectories and anything that is not a JSON slice file
        if os.path.isfile(os.path.join(directory, filename)):
            if filename.endswith('.json'):
                index += 1
                # Report progress against the 1000 slice files in the full dataset
                print(f'\r{filename}\t{index}/1000\t{((index/1000)*100):.1f}%', end='')
                full_path = os.path.join(directory, filename)
                with open(full_path, 'r') as file:
                    json_data = json.load(file)
                temp = pd.DataFrame(json_data['playlists'])
                # One row per track: explode each playlist's track list
                expanded_df = temp.explode('tracks').reset_index(drop=True)
                # Normalize the nested track dicts into flat columns
                json_normalized = pd.json_normalize(expanded_df['tracks'])
                # Concatenate the playlist metadata with the normalized track columns
                result = pd.concat([expanded_df.drop(columns=['tracks']), json_normalized], axis=1)
                result = result[cols]
                df = pd.concat([df, result], axis=0, ignore_index=True)
                # Write a parquet shard every 50 slice files, then start a fresh frame
                if index % 50 == 0:
                    df.to_parquet(f'{os.getcwd()}/data/raw/data/playlists_{index}.parquet')
                    df = pd.DataFrame()
                # Stop after the first 200 slice files
                if index % 200 == 0:
                    break


if __name__ == '__main__':
    # Extract the raw archive into data/raw/playlists
    unzip_archive(os.getcwd() + '/data/raw/spotify_million_playlist_dataset.zip', os.getcwd() + '/data/raw/playlists')
    # Reset the output directories before building the dataset
    make_dir(os.getcwd() + '/data/raw/data')
    make_dir(os.getcwd() + '/data/processed')
    make_dataset()
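
# Downstream usage sketch (an assumption, not part of this script): the parquet shards
# written under data/raw/data can be read back into a single DataFrame, e.g.
#
#   import glob
#   shards = sorted(glob.glob(os.getcwd() + '/data/raw/data/playlists_*.parquet'))
#   playlists = pd.concat((pd.read_parquet(p) for p in shards), ignore_index=True)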