Spaces:

JackIsNotInTheBox
/

watermark_remover

Paused

File size: 4,608 Bytes

ebe8a5c

# /// script
# requires-python = ">=3.10"
# dependencies = [
#   "huggingface_hub>=0.26",
#   "requests>=2.31",
# ]
# ///
"""
mirror_checkpoints.py
---------------------
One-off mirror job: copies the three model dependencies for the
Video Watermark Remover Space into JackIsNotInTheBox/Video_Watermark_Remover_Checkpoints
so the Space is insulated from upstream deletion.

Sources mirrored:
  1. Wan-AI/Wan2.1-VACE-14B-diffusers     (~75 GB, Apache-2.0) → vace-14b/
  2. lightx2v/Wan2.1-Distill-Loras        (single LoRA file)   → loras/
  3. big-lama.pt from GitHub releases     (~196 MB, Apache-2.0) → lama/

Strategy
--------
Per-file streaming: download → upload → delete. Disk usage at any moment
is ~one file (max ~5 GB for a single VACE transformer shard), so this fits
on cpu-basic / cpu-upgrade Jobs without ever holding the full 75 GB locally.
"""

import os
import sys
from pathlib import Path

import requests
from huggingface_hub import HfApi, hf_hub_download, list_repo_files

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
# Destination model repo that receives every mirrored file.
DEST_REPO = "JackIsNotInTheBox/Video_Watermark_Remover_Checkpoints"

# Hugging Face token, read from the environment. The script aborts right away
# when the variable is missing or empty.
TOKEN = os.getenv("HF_TOKEN")
if TOKEN is None or TOKEN == "":
    sys.exit("HF_TOKEN env var not set; pass via `--secrets HF_TOKEN=...`")

# Scratch directory holding the file that is currently being transferred.
WORK = Path("/tmp/mirror")
WORK.mkdir(exist_ok=True, parents=True)

# Shared client, authenticated with TOKEN, used for every upload.
api = HfApi(token=TOKEN)


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def stream_repo(
    src_repo: str,
    dest_prefix: str,
    src_type: str = "model",
    exclude_globs: list[str] | None = None,
) -> None:
    """Mirror every non-excluded file of ``src_repo`` under ``dest_prefix`` in DEST_REPO.

    Files are handled one at a time (download, upload, delete) so the scratch
    directory never has to hold more than the file currently in flight.

    Args:
        src_repo: Source repo id, e.g. ``"org/name"``.
        dest_prefix: Folder inside DEST_REPO that receives the files; each file
            is uploaded to ``f"{dest_prefix}/{fname}"``.
        src_type: Repo type of the source, passed to the hub calls as
            ``repo_type``.
        exclude_globs: Patterns tested against each repo-relative file name
            with ``Path.match``. Relative patterns are matched from the right
            end of the path and ``*`` does not span directory separators, so
            ``"assets/*"`` skips files directly inside an ``assets`` folder but
            not files in deeper sub-folders. ``None`` excludes nothing.

    Raises:
        Any exception raised by listing, downloading or uploading propagates
        unchanged; the mirror stops at the first failing file.
    """
    files = list_repo_files(src_repo, repo_type=src_type, token=TOKEN)
    exclude = exclude_globs or []
    files = [f for f in files if not any(Path(f).match(g) for g in exclude)]
    total = len(files)
    print(f"\n=== {src_repo} → {dest_prefix}/ ({total} files) ===", flush=True)

    for i, fname in enumerate(files, 1):
        print(f"  [{i:>3}/{total}] {fname}", flush=True)
        local = hf_hub_download(
            repo_id=src_repo,
            repo_type=src_type,
            filename=fname,
            local_dir=str(WORK),
            token=TOKEN,
        )
        try:
            api.upload_file(
                path_or_fileobj=local,
                path_in_repo=f"{dest_prefix}/{fname}",
                repo_id=DEST_REPO,
                repo_type="model",
                commit_message=f"Mirror {src_repo}: {fname}",
            )
        finally:
            # Remove the local copy even when the upload fails, so a failed
            # run does not leave a multi-GB shard behind in the scratch dir.
            Path(local).unlink(missing_ok=True)


def stream_url(url: str, dest_path_in_repo: str, commit_message: str) -> None:
    """Download one file from ``url``, upload it to DEST_REPO, then delete it.

    Args:
        url: HTTP(S) location of the file to mirror.
        dest_path_in_repo: Target path inside DEST_REPO. Its final component is
            also used as the temporary local file name under WORK.
        commit_message: Commit message for the upload.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        Other network, I/O or upload exceptions propagate unchanged.
    """
    fname = Path(dest_path_in_repo).name
    print(f"\n=== {url} → {dest_path_in_repo} ===", flush=True)
    local = WORK / fname
    try:
        # `timeout` bounds the connect and each individual read, not the whole
        # transfer, so a large but steadily streaming download is not cut off.
        with requests.get(url, stream=True, timeout=300) as r:
            r.raise_for_status()
            with open(local, "wb") as fp:
                for chunk in r.iter_content(chunk_size=1 << 20):  # 1 MB chunks
                    fp.write(chunk)
        api.upload_file(
            path_or_fileobj=str(local),
            path_in_repo=dest_path_in_repo,
            repo_id=DEST_REPO,
            repo_type="model",
            commit_message=commit_message,
        )
    finally:
        # Clean up on every exit path: a failed or partial download, or a
        # failed upload, must not leave the file behind in the scratch dir.
        local.unlink(missing_ok=True)


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main() -> None:
    """Run the three mirror jobs in order: VACE-14B, the distill LoRA repo, LaMa."""
    # Job 1: the VACE-14B diffusers repo, minus its assets folder and git metadata.
    vace_skip = ["assets/*", ".gitattributes"]
    stream_repo(
        "Wan-AI/Wan2.1-VACE-14B-diffusers",
        dest_prefix="vace-14b",
        exclude_globs=vace_skip,
    )

    # Job 2: the distill LoRA repo. The exclusions drop markdown docs, git
    # metadata, every i2v variant and the rank-32 / rank-128 files. The intent
    # is to keep only the rank-64 t2v LoRA (verify against the upstream listing).
    lora_skip = [
        "*.md",
        ".gitattributes",
        "*i2v*",
        "*rank32*",
        "*rank128*",
    ]
    stream_repo(
        "lightx2v/Wan2.1-Distill-Loras",
        dest_prefix="loras",
        exclude_globs=lora_skip,
    )

    # Job 3: the LaMa checkpoint, fetched from a GitHub release asset.
    lama_url = "https://github.com/enesmsahin/simple-lama-inpainting/releases/download/v0.1.0/big-lama.pt"
    stream_url(
        url=lama_url,
        dest_path_in_repo="lama/big-lama.pt",
        commit_message="Mirror big-lama.pt from simple-lama-inpainting v0.1.0 GitHub release",
    )

    print("\n✅ All mirrors complete.")


# Run the mirror jobs only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()