|
|
import os
|
|
|
import urllib.request
|
|
|
import tarfile
|
|
|
from pathlib import Path
|
|
|
import shutil
|
|
|
import zipfile
|
|
|
import os
|
|
|
|
|
|
|
|
|
def get_archive(path, url, Set):
    """Download ``url`` into ``path`` and save it as ``<Set>.tar``.

    The destination directory is created first, including any missing
    parent directories; an already existing directory is reused.

    Args:
        path: Directory that receives the downloaded file.
        url: Location of the archive, handed to
            ``urllib.request.urlretrieve`` unchanged.
        Set: Base name of the saved file, without the ``.tar`` suffix.

    Raises:
        OSError: If the directory cannot be created or the download fails
            (urllib's ``URLError`` is an ``OSError`` subclass).
    """
    # makedirs(exist_ok=True) tolerates an existing directory on its own, so
    # the former bare ``except:`` -- which also hid permission errors and
    # even KeyboardInterrupt -- is no longer needed.
    os.makedirs(path, exist_ok=True)
    urllib.request.urlretrieve(url, f"{path}/{Set}.tar")
|
|
|
|
|
|
|
|
|
def extract_tar(tar_file):
    """Unpack ``<cwd>/data/raw/<tar_file>.tar`` and delete the archive.

    The members are extracted into ``<cwd>/data/raw/<tar_file>``. The
    archive file is removed only after extraction has finished without an
    error.

    Args:
        tar_file: Base name of the archive, without the ``.tar`` suffix.

    Raises:
        OSError: If the archive is missing or cannot be removed.
        tarfile.TarError: If the file cannot be read as a tar archive.
    """
    raw_dir = f'{os.getcwd()}/data/raw'
    archive_path = f'{raw_dir}/{tar_file}.tar'

    # Progress line; the carriage return lets the next print overwrite it.
    print(archive_path, end='\r')

    # The context manager closes the handle even when extraction fails.
    # tarfile.open() defaults to mode 'r', which detects compression
    # transparently, so a gzip-compressed download saved as ``.tar`` opens.
    with tarfile.open(archive_path) as archive:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive -- confirm the download source is trusted.
        archive.extractall(f'{raw_dir}/{tar_file}')

    os.remove(archive_path)
|
|
|
|
|
|
def make_dir(target_dir):
    """Leave ``target_dir`` behind as an existing, empty directory.

    A directory already present at that location is deleted together with
    everything inside it, then the directory (and any missing parents) is
    created again.
    """
    location = Path(target_dir)

    # is_dir() is already False for a path that does not exist.
    if location.is_dir():
        shutil.rmtree(location)

    os.makedirs(target_dir, exist_ok=True)
|
|
|
|
|
|
def combine_dirs(source_dirs, dest_dir=None):
    """Collect the ``.jpg`` files of several raw folders into one directory.

    Each name in ``source_dirs`` is resolved to ``<cwd>/data/raw/<name>``.
    That tree is walked recursively, every file whose name ends in ``.jpg``
    is copied into the destination, and the source tree is deleted
    afterwards. Names that do not exist on disk are skipped silently.

    Files are copied flat into the destination, so two files with the same
    name in different sub-folders overwrite one another.

    Args:
        source_dirs: Iterable of directory names below ``<cwd>/data/raw``.
        dest_dir: Directory receiving the images. When omitted, the
            module-level ``target_dir`` set by the script entry point is
            used, which keeps existing ``combine_dirs(source_dirs)`` calls
            working.

    Raises:
        NameError: If ``dest_dir`` is omitted, a ``.jpg`` file is found and
            no module-level ``target_dir`` has been defined.
    """
    raw_root = os.getcwd() + '/data/raw/'

    for source_dir in source_dirs:
        source_root = raw_root + source_dir

        for subdir, _dirs, files in os.walk(source_root):
            for name in files:
                # Test the file name's suffix: a substring search on the
                # whole path also matched things like ``x.jpg.bak`` or any
                # file below a folder whose name contains ``.jpg``.
                if not name.endswith('.jpg'):
                    continue
                # The global is looked up lazily, only when a copy actually
                # happens and no explicit destination was given.
                destination = target_dir if dest_dir is None else dest_dir  # noqa: F821
                shutil.copy(os.path.join(subdir, name), destination)

        # The extracted tree is no longer needed once its images are copied.
        if Path(source_root).exists():
            shutil.rmtree(Path(source_root))
|
|
|
|
|
|
|
|
|
def unzip_file(zip_file_path, extract_to):
    """Unpack the zip archive at ``zip_file_path`` into ``extract_to``.

    The output directory is created beforehand (parents included) when it
    does not exist yet; the archive itself is left in place.
    """
    os.makedirs(extract_to, exist_ok=True)

    with zipfile.ZipFile(zip_file_path, mode='r') as archive:
        archive.extractall(path=extract_to)
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    cwd = os.getcwd()

    # Start from empty working folders: make_dir() wipes a folder that
    # already exists before creating it again.
    for folder in ('/data/raw', '/data/processed', '/data/outputs', '/models'):
        make_dir(cwd + folder)

    # (url, local archive name) pairs, downloaded in this order.
    downloads = (
        ('https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/labels.tar.gz', "label"),
        ('https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/train-0.tar.gz', "train0"),
        ('https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/train-1.tar.gz', "train1"),
        ('https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/train-2.tar.gz', "train2"),
        ('https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/train-3.tar.gz', "train3"),
        ('https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/train-4.tar.gz', "train4"),
        ('https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/train-5.tar.gz', "train5"),
        ('https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/train-6.tar.gz', "train6"),
        ('https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/val.tar.gz', "val"),
        ('https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/test.tar.gz', "test"),
    )
    for url, name in downloads:
        get_archive(cwd + '/data/raw', url, name)

    # Extraction order: training shards first, then labels, val and test.
    for name in ("train0", "train1", "train2", "train3", "train4",
                 "train5", "train6", "label", "val", "test"):
        extract_tar(name)

    # target_dir has to remain a module-level name: combine_dirs() reads it
    # as its copy destination.
    target_dir = cwd + '/data/raw/train/publaynet/train/'
    make_dir(target_dir)

    # NOTE(review): the second group (val/test) is copied into the same
    # target_dir as the training shards -- confirm this is intended.
    for source_dirs in (
        ['train0', 'train1', 'train2', 'train3', 'train4', 'train5', 'train6'],
        ['val', 'test'],
    ):
        combine_dirs(source_dirs)

    unzip_file('hand_labeled_tables.zip', cwd + '/data/processed/hand_labeled_tables')