File size: 2,642 Bytes
5c36ec7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env bash

# Directory that contains one subdirectory per dataset listed in DATASETS
BASE_DIR="data/datasets/synthesized_datasets"
# Number of files sampled from each non-root subdirectory
N=5

# List of dataset subdirectories to include
DATASETS=(
    "docvqa_alpha=1.0"
    "cord_alpha=1.0"
    "doclaynet4k_alpha=1.0_CLS"
    "doclaynet4k_alpha=1.0_DLA"
    "funsd_alpha=1.0"
    "icdar2019_alpha=1.0"
    "kleister_alpha=1.0"
    "publaynet_correct-sampling_alpha=1.0"
    "rvlcdip_alpha=1.0"
    "sroie_alpha=1.0"
    "tobacco3482_alpha=1.0"
    "wtq_alpha=1.0"
)

# Subdirectories to exclude (relative to each dataset root)
EXCLUDE_DIRS=(
    "visual_elements/visual_elements_images"
    "handwriting/handwriting_raw_tokens"
)

# Name of the archive produced at the end of the run
OUTPUT_ZIP="output.zip"

echo "[INFO] Base directory: $BASE_DIR"
echo "[INFO] Including all root-level files, first $N files per subdirectory"
echo "[INFO] Excluding subdirectories: ${EXCLUDE_DIRS[*]}"
echo "[INFO] Will include datasets: ${DATASETS[*]}"

# Scratch file that accumulates the newline-separated list of paths to archive.
# Abort early if it cannot be created: every later step appends to or reads it.
if ! tmpfile=$(mktemp); then
    echo "[ERROR] Could not create temporary file list" >&2
    exit 1
fi
# Remove the scratch file whenever the script exits, not only on the success
# path; `rm -f` keeps this harmless if the file has already been deleted.
trap 'rm -f -- "$tmpfile"' EXIT
echo "[INFO] Using temporary file list: $tmpfile"
echo

# Helper function to check if a directory should be excluded
#
# Globals:   EXCLUDE_DIRS (read) - relative subdirectory paths to leave out
# Arguments: $1 - directory path to test
# Returns:   0 if the path is an excluded subdirectory or lies beneath one,
#            1 otherwise
should_exclude() {
    local dir="$1"
    local ex
    for ex in "${EXCLUDE_DIRS[@]}"; do
        ex="${ex%/}"  # tolerate a trailing slash in the configured entry
        # Compare whole path components: the entry must end the path or be
        # followed by "/", so a sibling that only shares the prefix
        # (e.g. "<entry>_v2") is not excluded by accident.
        if [[ "$dir" == */"$ex" || "$dir" == */"$ex"/* ]]; then
            return 0  # exclude
        fi
    done
    return 1  # include
}

# Iterate over all datasets, appending the directories and files to archive
# to the list in $tmpfile (directories first, then files, per dataset).
for dataset in "${DATASETS[@]}"; do
    ROOT="$BASE_DIR/$dataset"
    echo "[INFO] Processing dataset: $dataset"

    # A dataset that is missing on disk would otherwise only show up as raw
    # `find` errors; report it explicitly and move on to the next one.
    if [[ ! -d "$ROOT" ]]; then
        echo "  [WARN] Dataset directory not found, skipping: $ROOT" >&2
        echo
        continue
    fi

    # Collect directories: walk the tree once and remember every directory
    # that is kept, so the file pass below sees exactly the same set.
    dirs=()
    while IFS= read -r d; do
        if should_exclude "$d"; then
            echo "  [SKIP DIR] $d"
            continue
        fi
        echo "  [DIR] $d"
        printf '%s\n' "$d" >> "$tmpfile"
        dirs+=("$d")
    done < <(find "$ROOT" -type d)

    # Collect files
    for d in "${dirs[@]}"; do
        # Check if this is the root-level directory
        if [[ "$d" == "$ROOT" ]]; then
            # Include ALL files in root
            files=$(find "$d" -maxdepth 1 -type f | sort)
        else
            # Include only first N files (in sorted order) in subdirectories
            files=$(find "$d" -maxdepth 1 -type f | sort | head -n "$N")
        fi

        if [[ -z "$files" ]]; then
            echo "    (no files)"
        else
            # tee appends the paths to the list while sed formats them for the log
            printf '%s\n' "$files" | tee -a "$tmpfile" | sed 's/^/      [FILE] /'
        fi
    done

    echo
done

echo "[INFO] Summary:"
echo "  Total paths in list: $(wc -l < "$tmpfile")"
echo "  Output zip: $OUTPUT_ZIP"
echo

echo "[INFO] Creating archive..."
# `zip -@` reads the paths to add from stdin, one per line. Its exit status is
# captured so that a failure is reported instead of being hidden behind the
# final "Done!" message.
# NOTE(review): if $OUTPUT_ZIP already exists, zip's default mode adds to the
# existing archive rather than starting fresh — confirm that is intended.
zip_status=0
zip -@ "$OUTPUT_ZIP" < "$tmpfile" || zip_status=$?

echo "[INFO] Cleaning up temp file: $tmpfile"
rm -f -- "$tmpfile"

# Propagate the archiver's failure only after the temp file has been removed
if (( zip_status != 0 )); then
    echo "[ERROR] zip failed with exit status $zip_status" >&2
    exit "$zip_status"
fi

echo "[INFO] Done!"