File size: 3,674 Bytes
b3fc8d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os
import urllib.request
import tarfile
from pathlib import Path
import shutil
import zipfile
import os


def get_archive(path,url,Set):
  """Download ``url`` and store it as ``<path>/<Set>.tar``.

  Args:
    path: Directory that receives the download. It is created, together
      with any missing parent directories, when it does not exist yet.
    url: Location of the archive to fetch.
    Set: Base name (without extension) used for the saved file.

  Raises:
    OSError: If ``path`` cannot be created (for example it already exists
      as a regular file or permissions are insufficient).
    urllib.error.URLError: If the download itself fails.
  """
  # exist_ok=True tolerates an already existing directory without hiding
  # unrelated failures the way a bare ``except`` around os.mkdir would.
  os.makedirs(path, exist_ok=True)

  urllib.request.urlretrieve(url,f"{path}/{Set}.tar")


def extract_tar(tar_file):
  """Unpack ``data/raw/<tar_file>.tar`` and delete the archive afterwards.

  Paths are resolved against the current working directory. The contents
  are extracted into ``data/raw/<tar_file>/``; the ``.tar`` file is removed
  only after extraction finished without raising.

  Args:
    tar_file: Base name of the archive, without the ``.tar`` extension.

  Raises:
    OSError: If the archive is missing or cannot be read or removed.
    tarfile.TarError: If the archive is malformed or a member is rejected
      by the extraction filter.
  """
  raw_dir = f'{os.getcwd()}/data/raw'
  archive_path = f'{raw_dir}/{tar_file}.tar'
  print(archive_path, end='\r')

  # The context manager closes the handle even when extraction fails.
  with tarfile.open(archive_path) as archive:
    if hasattr(tarfile, 'data_filter'):
      # Archives come from the network: where the interpreter supports
      # extraction filters, refuse members that would escape the target
      # directory (absolute paths, "..", links pointing outside).
      archive.extractall(f'{raw_dir}/{tar_file}', filter='data')
    else:
      archive.extractall(f'{raw_dir}/{tar_file}')

  os.remove(archive_path)
  
def make_dir(target_dir):
    """Ensure ``target_dir`` exists as a directory with no prior contents.

    An existing directory at that location is deleted recursively first;
    afterwards the directory (and any missing parents) is created.
    """
    location = Path(target_dir)

    # is_dir() is already False for a missing path, so one check suffices.
    if location.is_dir():
        shutil.rmtree(location)

    os.makedirs(target_dir, exist_ok=True)
    
def combine_dirs(source_dirs, dest_dir=None):
    """Copy every ``.jpg`` below the given raw folders into one directory.

    Each entry of ``source_dirs`` names a folder under ``data/raw`` in the
    current working directory. The folder is walked recursively, files
    whose name ends in ``.jpg`` are copied (flattened) into ``dest_dir``,
    and the source folder is deleted afterwards.

    Args:
        source_dirs: Iterable of folder names located under ``data/raw``.
        dest_dir: Directory that receives the images. When omitted, the
            module-level ``target_dir`` variable is used, which keeps
            existing ``combine_dirs(source_dirs)`` calls working.

    Raises:
        NameError: If ``dest_dir`` is omitted and no module-level
            ``target_dir`` has been defined.
    """
    if dest_dir is None:
        # Backward compatibility with callers that set a global target_dir.
        dest_dir = target_dir

    # Without an existing directory shutil.copy would treat dest_dir as a
    # file name instead of a folder.
    os.makedirs(dest_dir, exist_ok=True)

    for source_dir in source_dirs:
        source_root = os.getcwd() + '/data/raw/' + source_dir

        for subdir, _dirs, files in os.walk(source_root):
            for file in files:
                # Test the file name's suffix only: a substring search on
                # the full path would also match directory names that
                # happen to contain ".jpg".
                if file.endswith('.jpg'):
                    shutil.copy(subdir + os.sep + file, dest_dir)

        if Path(source_root).exists():
            shutil.rmtree(Path(source_root))


def unzip_file(zip_file_path, extract_to):
    """Extract every member of a zip archive into ``extract_to``.

    The destination directory is created first (including missing
    parents) when it is not there yet.
    """
    os.makedirs(extract_to, exist_ok=True)

    # ZipFile opens in read mode by default; the handle is closed on exit.
    with zipfile.ZipFile(zip_file_path) as archive:
        archive.extractall(path=extract_to)


if __name__ == '__main__':
    # Recreate the working folders; make_dir deletes a folder that already
    # exists before creating it again.
    for folder in ('/data/raw', '/data/processed', '/data/outputs', '/models'):
        make_dir(os.getcwd() + folder)

    # (remote archive stem, local archive name) pairs in download order.
    base_url = 'https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/'
    downloads = [('labels', 'label')]
    downloads += [(f'train-{index}', f'train{index}') for index in range(7)]
    downloads += [('val', 'val'), ('test', 'test')]
    for remote_stem, local_name in downloads:
        get_archive(os.getcwd() + '/data/raw', f'{base_url}{remote_stem}.tar.gz', local_name)

    # Unpack the training shards first, then labels, validation and test.
    train_parts = [f'train{index}' for index in range(7)]
    for archive_name in train_parts + ['label', 'val', 'test']:
        extract_tar(archive_name)

    # combine_dirs reads this module-level name as its copy destination.
    target_dir = os.getcwd() + '/data/raw/train/publaynet/train/'
    make_dir(target_dir)

    source_dirs = list(train_parts)
    combine_dirs(source_dirs)

    # NOTE(review): val and test images are copied into the same target_dir
    # as the training images - confirm this is intended.
    source_dirs = ['val', 'test']
    combine_dirs(source_dirs)

    # The zip path is relative, so it is looked up in the current directory.
    unzip_file('hand_labeled_tables.zip', os.getcwd() + '/data/processed/hand_labeled_tables')