"""Download, extract and merge the PubLayNet archives under ``./data``.

All paths are resolved against the current working directory at call time.
Running the file as a script wipes and recreates ``data/raw``,
``data/processed``, ``data/outputs`` and ``models`` before downloading.
"""
import os
import urllib.request
import tarfile
from pathlib import Path
import shutil
import zipfile

_BASE_URL = 'https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0'

# (remote archive stem, local name used under data/raw), in download order.
_ARCHIVES = (
    ('labels', 'label'),
    ('train-0', 'train0'),
    ('train-1', 'train1'),
    ('train-2', 'train2'),
    ('train-3', 'train3'),
    ('train-4', 'train4'),
    ('train-5', 'train5'),
    ('train-6', 'train6'),
    ('val', 'val'),
    ('test', 'test'),
)

_TRAIN_SPLITS = ['train0', 'train1', 'train2', 'train3',
                 'train4', 'train5', 'train6']


def _raw_path(name):
    """Return ``<cwd>/data/raw/<name>`` as a string."""
    return f'{os.getcwd()}/data/raw/{name}'


def get_archive(path, url, Set):
    """Download ``url`` to ``<path>/<Set>.tar``.

    Args:
        path: Destination directory; created (with parents) if missing.
        url: Location of the archive to fetch.
        Set: Base name for the saved file (``.tar`` is appended).

    Raises:
        OSError: If the directory cannot be created or the download fails
            (``urllib.error.URLError`` is an ``OSError`` subclass).
    """
    # The original swallowed every mkdir error with a bare ``except``;
    # ``exist_ok`` tolerates only the "already there" case.
    os.makedirs(path, exist_ok=True)
    # The file keeps a ``.tar`` suffix even for gzip-compressed downloads
    # because ``extract_tar`` looks it up under that name.
    urllib.request.urlretrieve(url, f"{path}/{Set}.tar")


def extract_tar(tar_file):
    """Extract ``data/raw/<tar_file>.tar`` into ``data/raw/<tar_file>``.

    The archive is deleted after a successful extraction. If extraction
    fails the archive is left in place and the exception propagates.

    Args:
        tar_file: Archive base name (without the ``.tar`` suffix).
    """
    archive_path = _raw_path(f'{tar_file}.tar')
    destination = _raw_path(tar_file)
    print(archive_path, end='\r')
    # ``tarfile.open`` in its default mode detects compression itself, and
    # the context manager closes the handle even when extraction raises.
    with tarfile.open(archive_path) as archive:
        if hasattr(tarfile, 'data_filter'):
            # Downloaded content: refuse members that escape the
            # destination (absolute paths, ``..``, unsafe links).
            archive.extractall(destination, filter='data')
        else:
            archive.extractall(destination)
    os.remove(archive_path)


def make_dir(target_dir):
    """Create ``target_dir`` empty, deleting any existing directory there.

    Args:
        target_dir: Directory to (re)create, parents included.
    """
    if Path(target_dir).is_dir():
        shutil.rmtree(Path(target_dir))
    os.makedirs(target_dir, exist_ok=True)


def combine_dirs(source_dirs, target_dir=None):
    """Copy ``.jpg`` files from extracted archives into one directory.

    Each ``data/raw/<source_dir>`` tree is walked, every file whose path
    contains ``.jpg`` is copied into ``target_dir``, and the source tree is
    then removed.

    Args:
        source_dirs: Directory names located under ``data/raw``.
        target_dir: Existing directory receiving the copies. When omitted,
            a module-level ``target_dir`` variable is used, which is what
            this function relied on implicitly before the parameter existed.

    Raises:
        ValueError: If no target directory is given or defined globally.
    """
    if target_dir is None:
        target_dir = globals().get('target_dir')
        if target_dir is None:
            raise ValueError(
                'combine_dirs() needs a target_dir argument '
                '(or a module-level target_dir variable)'
            )

    for source_dir in source_dirs:
        source_root = _raw_path(source_dir)
        for subdir, _dirs, files in os.walk(source_root):
            for file_name in files:
                filepath = subdir + os.sep + file_name
                if '.jpg' in filepath:
                    shutil.copy(filepath, target_dir)
        if Path(source_root).exists():
            shutil.rmtree(Path(source_root))


def unzip_file(zip_file_path, extract_to):
    """Extract every member of a zip archive into ``extract_to``.

    Args:
        zip_file_path: Path of the zip archive to read.
        extract_to: Output directory; created (with parents) if missing.
    """
    os.makedirs(extract_to, exist_ok=True)
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)


def main():
    """Rebuild the data folders, then fetch, unpack and merge the dataset."""
    for folder in ('/data/raw', '/data/processed', '/data/outputs', '/models'):
        make_dir(os.getcwd() + folder)

    raw_dir = os.getcwd() + '/data/raw'
    for remote_name, local_name in _ARCHIVES:
        get_archive(raw_dir, f'{_BASE_URL}/{remote_name}.tar.gz', local_name)

    # Same extraction order as before: train splits first, then the rest.
    for name in _TRAIN_SPLITS + ['label', 'val', 'test']:
        extract_tar(name)

    train_dir = os.getcwd() + '/data/raw/train/publaynet/train/'
    make_dir(train_dir)
    combine_dirs(_TRAIN_SPLITS, train_dir)
    # NOTE(review): val and test images are merged into the train directory
    # too, exactly as the script did before - confirm this is intended.
    combine_dirs(['val', 'test'], train_dir)

    unzip_file('hand_labeled_tables.zip',
               os.getcwd() + '/data/processed/hand_labeled_tables')


if __name__ == '__main__':
    main()