#!/usr/bin/env bash
#
# Build a zip archive containing a small sample of each synthesized dataset.
#
# For every dataset named in DATASETS (a directory below BASE_DIR) the script
# queues every file that sits directly in the dataset root plus the first N
# files, in sorted order, of each subdirectory. Directories matching an entry
# of EXCLUDE_DIRS are left out. The collected path list is handed to `zip`.
#
# BASE_DIR and OUTPUT_ZIP are relative paths, so they are resolved against the
# directory the script is started from.

# Directory that holds one subdirectory per dataset.
readonly BASE_DIR="data/datasets/synthesized_datasets"

# Maximum number of files taken from each subdirectory of a dataset.
readonly N=5

# Dataset directory names, relative to BASE_DIR, that go into the archive.
readonly -a DATASETS=(
  "docvqa_alpha=1.0"
  "cord_alpha=1.0"
  "doclaynet4k_alpha=1.0_CLS"
  "doclaynet4k_alpha=1.0_DLA"
  "funsd_alpha=1.0"
  "icdar2019_alpha=1.0"
  "kleister_alpha=1.0"
  "publaynet_correct-sampling_alpha=1.0"
  "rvlcdip_alpha=1.0"
  "sroie_alpha=1.0"
  "tobacco3482_alpha=1.0"
  "wtq_alpha=1.0"
)

# Directory paths (relative to a dataset root) that are never archived.
readonly -a EXCLUDE_DIRS=(
  "visual_elements/visual_elements_images"
  "handwriting/handwriting_raw_tokens"
)

# Archive written by the final zip step.
readonly OUTPUT_ZIP="output.zip"
|
|
|
# Announce the effective configuration before any work is done.
echo "[INFO] Base directory: $BASE_DIR"
echo "[INFO] Including all root-level files, first $N files per subdirectory"
echo "[INFO] Excluding subdirectories: ${EXCLUDE_DIRS[*]}"
echo "[INFO] Will include datasets: ${DATASETS[*]}"

# Scratch file that accumulates the newline-separated list of paths to
# archive. Every later step appends to or reads from it, so stop right away
# if it cannot be created.
if ! tmpfile=$(mktemp); then
  echo "[ERROR] Could not create temporary file list" >&2
  exit 1
fi
# Remove the list whenever the script exits, including early exits, so an
# aborted run does not leave it behind.
trap 'rm -f -- "$tmpfile"' EXIT
echo "[INFO] Using temporary file list: $tmpfile"
echo
|
|
|
|
|
#######################################
# Decide whether a directory lies inside an excluded subtree.
# Globals:
#   EXCLUDE_DIRS (read) - directory paths that must not be archived
# Arguments:
#   $1 - directory path to test
# Returns:
#   0 if the path is an excluded directory or is nested below one,
#   1 otherwise
#######################################
should_exclude() {
  local dir="$1"
  local ex # loop variable; kept local so it does not leak to the caller

  for ex in "${EXCLUDE_DIRS[@]}"; do
    ex="${ex%/}" # tolerate an entry written with a trailing slash
    # An empty entry would match every path; ignore it.
    [[ -n "$ex" ]] || continue
    # Compare whole path components: either the path ends in "/$ex" or it
    # continues below it. A plain substring test would also catch sibling
    # directories that merely start with the same name.
    if [[ "$dir" == *"/$ex" || "$dir" == *"/$ex/"* ]]; then
      return 0
    fi
  done

  return 1
}
|
|
|
|
|
#######################################
# Queue one dataset tree for archiving.
# Appends every non-excluded directory below the given root to the path
# list, then the files to take from each of those directories: all files
# for the root itself, at most N (first in sorted order) for any other
# directory. Progress lines are written to stdout.
# Globals:
#   N (read)       - per-subdirectory file limit
#   tmpfile (read) - path list file that is appended to
#   should_exclude - predicate deciding which directories are skipped
# Arguments:
#   $1 - dataset root directory
# Returns:
#   0 on success, 1 if the root directory does not exist
#######################################
collect_dataset() {
  local root="$1"
  local dir files
  local -a kept=()

  [[ -d "$root" ]] || return 1

  # Pass 1: record the directories themselves. The surviving ones are
  # remembered so the tree is walked, and the exclusion test run, only once.
  # Sorting makes the log and the list order repeatable.
  while IFS= read -r dir; do
    if should_exclude "$dir"; then
      echo " [SKIP DIR] $dir"
      continue
    fi
    echo " [DIR] $dir"
    printf '%s\n' "$dir" >> "$tmpfile"
    kept+=("$dir")
  done < <(find "$root" -type d | sort)

  # Pass 2: pick the files of every kept directory.
  for dir in "${kept[@]}"; do
    if [[ "$dir" == "$root" ]]; then
      # Root-level files are always taken in full.
      files=$(find "$dir" -maxdepth 1 -type f | sort)
    else
      # Subdirectories contribute only their first N files.
      files=$(find "$dir" -maxdepth 1 -type f | sort | head -n "$N")
    fi

    if [[ -z "$files" ]]; then
      echo " (no files)"
    else
      # tee appends the raw paths to the list; sed only decorates the log.
      printf '%s\n' "$files" | tee -a "$tmpfile" | sed 's/^/ [FILE] /'
    fi
  done
}

for dataset in "${DATASETS[@]}"; do
  ROOT="$BASE_DIR/$dataset"
  echo "[INFO] Processing dataset: $dataset"

  # A missing dataset must not stop the remaining ones from being packaged.
  if ! collect_dataset "$ROOT"; then
    echo "[WARN] Dataset directory not found, skipping: $ROOT" >&2
  fi

  echo
done
|
|
|
# Report what was collected before touching the archive.
echo "[INFO] Summary:"
echo " Total paths in list: $(wc -l < "$tmpfile")"
echo " Output zip: $OUTPUT_ZIP"
echo

# Exit status of the archive step; stays 0 only if zip succeeded.
rc=0
if [[ ! -s "$tmpfile" ]]; then
  # Nothing was collected (e.g. every dataset directory was missing); zip
  # would have nothing to do, so report that clearly instead.
  echo "[ERROR] No paths were collected; not creating $OUTPUT_ZIP" >&2
  rc=1
else
  echo "[INFO] Creating archive..."
  # -@ makes zip read the paths to add from stdin, one per line.
  # NOTE(review): zip adds to an existing archive rather than replacing it,
  # so entries from an earlier run stay in $OUTPUT_ZIP. Delete the file first
  # if a fresh archive is required.
  zip -@ "$OUTPUT_ZIP" < "$tmpfile" || rc=$?
  if ((rc != 0)); then
    echo "[ERROR] zip failed with status $rc; $OUTPUT_ZIP may be missing or incomplete" >&2
  fi
fi

echo "[INFO] Cleaning up temp file: $tmpfile"
# -f: do not complain if the list is already gone.
rm -f -- "$tmpfile"

# Propagate a failed archive step instead of reporting success.
if ((rc != 0)); then
  exit "$rc"
fi

echo "[INFO] Done!"
|
|
|