#!/usr/bin/env bash
#
# Build a small sample archive from the synthesized datasets.
#
# For every dataset named in DATASETS (looked up under BASE_DIR) the archive
# receives:
#   * an entry for every directory, so the folder layout is preserved,
#   * every regular file located directly in the dataset root,
#   * the first N regular files (in `sort` order) of every subdirectory.
# Directories matching an entry of EXCLUDE_DIRS, and everything below them,
# are left out.
#
# Usage: run from the directory that contains BASE_DIR.
# Requires: find, sort, wc, zip.
#
# Limitation: the path list handed to `zip -@` is newline-delimited, so file
# names containing a newline are not supported.

set -euo pipefail

BASE_DIR="data/datasets/synthesized_datasets"
N=5

# List of dataset subdirectories to include
DATASETS=(
  "docvqa_alpha=1.0"
  "cord_alpha=1.0"
  "doclaynet4k_alpha=1.0_CLS"
  "doclaynet4k_alpha=1.0_DLA"
  "funsd_alpha=1.0"
  "icdar2019_alpha=1.0"
  "kleister_alpha=1.0"
  "publaynet_correct-sampling_alpha=1.0"
  "rvlcdip_alpha=1.0"
  "sroie_alpha=1.0"
  "tobacco3482_alpha=1.0"
  "wtq_alpha=1.0"
)

# Subdirectories to exclude. Each entry is a path suffix that is matched on
# whole path components at any depth below a dataset root.
EXCLUDE_DIRS=(
  "visual_elements/visual_elements_images"
  "handwriting/handwriting_raw_tokens"
)

OUTPUT_ZIP="output.zip"

readonly BASE_DIR N DATASETS EXCLUDE_DIRS OUTPUT_ZIP

# Print an error message to stderr and abort the script.
die() {
  printf '[ERROR] %s\n' "$*" >&2
  exit 1
}

#######################################
# Decide whether a directory must be left out of the archive.
# Globals:   EXCLUDE_DIRS (read)
# Arguments: $1 - directory path as printed by find
# Returns:   0 if the path is an excluded directory or lies inside one,
#            1 otherwise
#######################################
should_exclude() {
  local dir="$1"
  local ex
  for ex in "${EXCLUDE_DIRS[@]}"; do
    # Match on component boundaries only, so a sibling that merely shares a
    # prefix with an excluded name (e.g. "..._raw_tokens_v2") is still kept.
    if [[ "$dir" == */"$ex" || "$dir" == */"$ex"/* ]]; then
      return 0 # exclude
    fi
  done
  return 1 # include
}

command -v zip >/dev/null || die "zip is not installed or not on PATH"

echo "[INFO] Base directory: $BASE_DIR"
echo "[INFO] Including all root-level files, first $N files per subdirectory"
echo "[INFO] Excluding subdirectories: ${EXCLUDE_DIRS[*]}"
echo "[INFO] Will include datasets: ${DATASETS[*]}"

tmpfile=$(mktemp) || die "could not create temporary file"
# Remove the path list on every exit path, including errors and interrupts.
trap 'rm -f -- "$tmpfile"' EXIT
echo "[INFO] Using temporary file list: $tmpfile"
echo

missing=0

# Iterate over all datasets
for dataset in "${DATASETS[@]}"; do
  root="$BASE_DIR/$dataset"
  echo "[INFO] Processing dataset: $dataset"

  if [[ ! -d "$root" ]]; then
    echo " [WARN] Dataset directory not found, skipping: $root" >&2
    missing=$((missing + 1))
    echo
    continue
  fi

  # Walk the tree once; both passes below reuse this list.
  mapfile -t dirs < <(find "$root" -type d | sort)
  if ((${#dirs[@]} == 0)); then
    echo " [WARN] Could not list directories under: $root" >&2
    missing=$((missing + 1))
    echo
    continue
  fi

  # Pass 1: directory entries (keeps the folder layout in the archive).
  for d in "${dirs[@]}"; do
    if should_exclude "$d"; then
      echo " [SKIP DIR] $d"
      continue
    fi
    echo " [DIR] $d"
    printf '%s\n' "$d" >>"$tmpfile"
  done

  # Pass 2: files of every directory that was not excluded.
  for d in "${dirs[@]}"; do
    if should_exclude "$d"; then
      continue
    fi

    mapfile -t files < <(find "$d" -maxdepth 1 -type f | sort)
    if ((${#files[@]} == 0)); then
      echo " (no files)"
      continue
    fi

    # The dataset root keeps ALL of its files; subdirectories keep only the
    # first N. An array slice is used instead of `sort | head` so that no
    # pipeline stage can die from SIGPIPE under `pipefail`.
    if [[ "$d" != "$root" ]] && ((${#files[@]} > N)); then
      files=("${files[@]:0:N}")
    fi

    printf '%s\n' "${files[@]}" >>"$tmpfile"
    printf ' [FILE] %s\n' "${files[@]}"
  done
  echo
done

# Arithmetic expansion strips the padding some wc implementations emit.
total=$(($(wc -l <"$tmpfile")))

echo "[INFO] Summary:"
echo " Total paths in list: $total"
echo " Output zip: $OUTPUT_ZIP"
if ((missing > 0)); then
  echo " Datasets skipped (not found): $missing" >&2
fi
echo

if ((total == 0)); then
  die "no paths were collected; nothing to archive"
fi

echo "[INFO] Creating archive..."
# NOTE: if $OUTPUT_ZIP already exists, zip updates it in place, so entries
# from an earlier run remain in the archive. Delete it first for a clean one.
zip -@ "$OUTPUT_ZIP" <"$tmpfile" || die "zip failed while writing $OUTPUT_ZIP"

echo "[INFO] Cleaning up temp file: $tmpfile"
rm -f -- "$tmpfile"
echo "[INFO] Done!"