Docgenie-API / scripts /pack_example.sh
Ahadhassan-2003
deploy: update HF Space
5c36ec7
#!/usr/bin/env bash
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Directory that holds one subdirectory per dataset.
BASE_DIR="data/datasets/synthesized_datasets"
# Maximum number of files taken from each non-root subdirectory.
N=5
# List of dataset subdirectories to include
DATASETS=(
  "docvqa_alpha=1.0"
  "cord_alpha=1.0"
  "doclaynet4k_alpha=1.0_CLS"
  "doclaynet4k_alpha=1.0_DLA"
  "funsd_alpha=1.0"
  "icdar2019_alpha=1.0"
  "kleister_alpha=1.0"
  "publaynet_correct-sampling_alpha=1.0"
  "rvlcdip_alpha=1.0"
  "sroie_alpha=1.0"
  "tobacco3482_alpha=1.0"
  "wtq_alpha=1.0"
)
# Subdirectories to exclude (relative to each dataset root)
EXCLUDE_DIRS=(
  "visual_elements/visual_elements_images"
  "handwriting/handwriting_raw_tokens"
)
# Archive produced by the final zip step.
OUTPUT_ZIP="output.zip"

echo "[INFO] Base directory: $BASE_DIR"
echo "[INFO] Including all root-level files, first $N files per subdirectory"
echo "[INFO] Excluding subdirectories: ${EXCLUDE_DIRS[*]}"
echo "[INFO] Will include datasets: ${DATASETS[*]}"

# Scratch file that accumulates the newline-separated path list for zip.
tmpfile=$(mktemp) || {
  echo "[ERROR] Could not create temporary file list" >&2
  exit 1
}
# Remove the list on every exit path (error, Ctrl-C, normal end) so an
# aborted run does not leave it behind. `rm -f` keeps this harmless when the
# file has already been deleted by the time the trap fires.
trap 'rm -f -- "$tmpfile"' EXIT
echo "[INFO] Using temporary file list: $tmpfile"
echo
#######################################
# Helper function to check if a directory should be excluded.
# A path is excluded when it IS one of the EXCLUDE_DIRS entries (as a path
# suffix) or lies somewhere beneath one.
# Globals:   EXCLUDE_DIRS (read)
# Arguments: $1 - directory path to test
# Returns:   0 if the directory must be excluded, 1 if it should be kept
#######################################
should_exclude() {
  local dir="$1"
  local ex
  for ex in "${EXCLUDE_DIRS[@]}"; do
    # Match on whole path components: either the path ends with the entry or
    # the entry is followed by "/". A bare substring test would also drop
    # siblings that merely share the prefix (e.g. "<entry>_v2").
    if [[ "$dir" == *"/$ex" || "$dir" == *"/$ex/"* ]]; then
      return 0 # exclude
    fi
  done
  return 1 # include
}
#######################################
# Record one dataset's directories and selected files in the path list.
# All non-excluded directories are listed first, followed by every file that
# sits directly in the dataset root and the first N (sorted) files of each
# other non-excluded directory.
# Globals:   N (read), tmpfile (path of the list file; appended to)
# Calls:     should_exclude
# Arguments: $1 - dataset root directory
# Outputs:   progress lines on stdout
#######################################
collect_dataset_paths() {
  local root="$1"
  local d files
  local -a dir_list=()

  # Walk the tree once and reuse the listing for both passes below.
  while IFS= read -r d; do
    dir_list+=("$d")
  done < <(find "$root" -type d)

  # Collect directories
  for d in "${dir_list[@]}"; do
    if should_exclude "$d"; then
      echo " [SKIP DIR] $d"
      continue
    fi
    echo " [DIR] $d"
    printf '%s\n' "$d" >> "$tmpfile"
  done

  # Collect files
  for d in "${dir_list[@]}"; do
    if should_exclude "$d"; then
      continue
    fi
    if [[ "$d" == "$root" ]]; then
      # Root-level directory: include ALL files
      files=$(find "$d" -maxdepth 1 -type f | sort)
    else
      # Subdirectory: include only the first N files
      files=$(find "$d" -maxdepth 1 -type f | sort | head -n "$N")
    fi
    if [[ -z "$files" ]]; then
      echo " (no files)"
    else
      # tee appends the raw paths to the list; sed only decorates the log.
      printf '%s\n' "$files" | tee -a "$tmpfile" | sed 's/^/ [FILE] /'
    fi
  done
}

# Iterate over all datasets
for dataset in "${DATASETS[@]}"; do
  ROOT="$BASE_DIR/$dataset"
  echo "[INFO] Processing dataset: $dataset"
  # A dataset that is not present on disk is reported and skipped instead of
  # letting find print raw errors.
  if [[ ! -d "$ROOT" ]]; then
    echo "[WARN] Dataset directory not found, skipping: $ROOT" >&2
    echo
    continue
  fi
  collect_dataset_paths "$ROOT"
  echo
done
# ---------------------------------------------------------------------------
# Summary, archive creation and cleanup
# ---------------------------------------------------------------------------
echo "[INFO] Summary:"
echo " Total paths in list: $(wc -l < "$tmpfile")"
echo " Output zip: $OUTPUT_ZIP"
echo
echo "[INFO] Creating archive..."
# zip -@ reads the paths to store from stdin, one per line.
# NOTE(review): if $OUTPUT_ZIP already exists zip updates it rather than
# starting fresh, so entries from an earlier run may remain - confirm whether
# that is intended.
zip_status=0
zip -@ "$OUTPUT_ZIP" < "$tmpfile" || zip_status=$?
echo "[INFO] Cleaning up temp file: $tmpfile"
rm -f -- "$tmpfile"
# Report a failed archive step instead of announcing success regardless.
if (( zip_status != 0 )); then
  echo "[ERROR] zip failed with exit status $zip_status" >&2
  exit "$zip_status"
fi
echo "[INFO] Done!"