# table_extraction/scripts/make_dataset.py
# Uploaded by keesephillips -- "initial commit" (b3fc8d0, verified), 3.67 kB.
import os
import urllib.request
import tarfile
from pathlib import Path
import shutil
import zipfile
import os
def get_archive(path: str, url: str, Set: str) -> None:
    """Download ``url`` and store it as ``<path>/<Set>.tar``.

    Args:
        path: Directory that receives the download. It is created, together
            with any missing parents, when it does not exist yet.
        url: Location of the archive to fetch.
        Set: Base name for the saved file; ``.tar`` is always appended, no
            matter what extension the URL itself carries.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
        urllib.error.URLError: If the download fails.
    """
    # makedirs(exist_ok=True) tolerates an existing directory without hiding
    # real failures (permissions, a regular file in the way, ...), which the
    # previous bare ``except`` around os.mkdir silently discarded.
    os.makedirs(path, exist_ok=True)
    urllib.request.urlretrieve(url, os.path.join(path, f"{Set}.tar"))
def extract_tar(tar_file: str) -> None:
    """Unpack ``data/raw/<tar_file>.tar`` and delete the archive afterwards.

    The archive is looked up under the current working directory and is
    extracted into ``data/raw/<tar_file>``. The archive file is removed only
    after a successful extraction, so a failed run leaves it in place.

    Args:
        tar_file: Archive name without the ``.tar`` suffix.

    Raises:
        OSError: If the archive is missing or cannot be removed.
        tarfile.TarError: If the archive is unreadable or a member is
            rejected by the extraction filter.
    """
    archive_path = f'{os.getcwd()}/data/raw/{tar_file}.tar'
    extract_dir = f'{os.getcwd()}/data/raw/{tar_file}'

    # Progress line; '\r' lets the next message overwrite it.
    print(archive_path, end='\r')

    # The context manager closes the handle even when extraction fails.
    with tarfile.open(archive_path) as archive:
        if hasattr(tarfile, 'data_filter'):
            # The archive was downloaded, so refuse members that would
            # escape the destination (absolute paths, '..', unsafe links).
            archive.extractall(extract_dir, filter='data')
        else:
            # Older interpreters without extraction filters.
            archive.extractall(extract_dir)

    os.remove(archive_path)
def make_dir(target_dir):
    """Leave ``target_dir`` as a freshly created, empty directory.

    An existing directory at that location is deleted with all of its
    contents before being recreated; missing parent directories are created
    as needed.
    """
    directory = Path(target_dir)
    # is_dir() is already False for a path that does not exist.
    if directory.is_dir():
        shutil.rmtree(directory)
    os.makedirs(target_dir, exist_ok=True)
def combine_dirs(source_dirs, target_dir=None):
    """Gather the ``.jpg`` files of several extracted sets into one folder.

    Each entry of ``source_dirs`` names a directory under ``data/raw`` in the
    current working directory. Every file whose name ends in ``.jpg`` found
    anywhere below it is copied (flattened) into ``target_dir``; afterwards
    the source directory is removed. Files sharing a name overwrite each
    other in the destination.

    Args:
        source_dirs: Iterable of directory names relative to ``data/raw``.
        target_dir: Destination folder. Defaults to
            ``<cwd>/data/raw/train/publaynet/train/``, the location this
            script uses, so the function no longer depends on a module-level
            ``target_dir`` variable being defined by the caller.
    """
    if target_dir is None:
        target_dir = os.getcwd() + '/data/raw/train/publaynet/train/'
    # shutil.copy needs an existing directory to copy *into*.
    os.makedirs(target_dir, exist_ok=True)

    for source_dir in source_dirs:
        source_root = os.getcwd() + '/data/raw/' + source_dir
        for subdir, _dirs, files in os.walk(source_root):
            for file_name in files:
                # Test the file name's suffix, not the whole path, so that a
                # '.jpg' fragment in a directory name or in the middle of a
                # file name does not count as an image.
                if file_name.endswith('.jpg'):
                    shutil.copy(os.path.join(subdir, file_name), target_dir)
        # The images now live in target_dir; drop the extracted tree.
        if os.path.exists(source_root):
            shutil.rmtree(source_root)
def unzip_file(zip_file_path, extract_to):
    """Unpack every member of the zip archive into ``extract_to``.

    The destination directory, including missing parents, is created first;
    an already existing directory is reused as it is.
    """
    # Make sure there is somewhere to extract into.
    os.makedirs(extract_to, exist_ok=True)

    # Read-only access is enough; the handle is closed on leaving the block.
    with zipfile.ZipFile(zip_file_path, 'r') as archive:
        archive.extractall(path=extract_to)
if __name__ == '__main__':
    # Every path below is anchored at the directory the script is run from.
    cwd = os.getcwd()
    raw_dir = cwd + '/data/raw'

    # Start from clean, empty working directories.
    for fresh_dir in (raw_dir, cwd + '/data/processed', cwd + '/data/outputs', cwd + '/models'):
        make_dir(fresh_dir)

    # (remote archive stem, local name) pairs, in download order.
    base_url = 'https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/'
    train_sets = [f'train{i}' for i in range(7)]
    downloads = [('labels', 'label')]
    downloads += [(f'train-{i}', f'train{i}') for i in range(7)]
    downloads += [('val', 'val'), ('test', 'test')]
    for remote_stem, local_name in downloads:
        get_archive(raw_dir, f'{base_url}{remote_stem}.tar.gz', local_name)

    # Unpack each archive; the training shards go first, then the rest.
    for archive_name in train_sets + ['label', 'val', 'test']:
        extract_tar(archive_name)

    # combine_dirs() is called without an explicit destination below, so this
    # module-level `target_dir` name and its value must stay as they are.
    target_dir = cwd + '/data/raw/train/publaynet/train/'
    make_dir(target_dir)

    # Flatten the images of all sets into the single target folder.
    source_dirs = train_sets
    combine_dirs(source_dirs)
    source_dirs = ['val', 'test']
    combine_dirs(source_dirs)

    # The hand-labelled tables archive is expected in the working directory.
    unzip_file('hand_labeled_tables.zip', cwd + '/data/processed/hand_labeled_tables')