Spaces:

joemartis
/

Video2Guide

Running

File size: 10,632 Bytes

"""`vgm` command-line interface.

Three subcommands:
- `build` — full pipeline: video (+ optional transcript) → HTML / zip / review HTML.
- `export-metadata` — run the pipeline and dump the unified metadata JSON.
- `render-from-metadata` — re-render HTML from a previously exported metadata JSON.
"""
from __future__ import annotations

import contextlib
import logging
from pathlib import Path
from typing import Optional

import typer
from rich.progress import (
    BarColumn,
    Progress,
    TaskProgressColumn,
    TextColumn,
    TimeElapsedColumn,
)

from .pipeline import bundle_zip, dump_metadata, load_metadata, render_guide, render_review
from .pipeline.html_gen import metadata_to_segment, render_from_metadata
from .pipeline.orchestrator import (
    PipelineError,
    PipelineInputs,
    ProgressEvent,
    run_pipeline,
)


@contextlib.contextmanager
def _progress_bar():
    """Yield a progress callback backed by a live rich ``Progress`` display.

    The yielded callable takes a single ``ProgressEvent`` and copies its
    ``percent``, ``stage`` and ``message`` onto the one task shown by the bar
    (its signature matches orchestrator.ProgressCallback). The display is
    stopped when the ``with`` block exits, whether normally or through an
    exception.
    """
    progress = Progress(
        TextColumn("[bold]{task.fields[stage]:<13}", justify="left"),
        BarColumn(bar_width=None),
        TaskProgressColumn(),
        TextColumn("{task.fields[msg]}"),
        TimeElapsedColumn(),
        transient=False,
    )
    progress.start()
    try:
        # Everything after start() lives inside the try so the live display
        # is always torn down, even if task setup itself fails.
        task_id = progress.add_task("vgm", total=100, stage="starting", msg="")

        def cb(ev: ProgressEvent) -> None:
            progress.update(task_id, completed=ev.percent, stage=ev.stage, msg=ev.message)

        yield cb
    finally:
        progress.stop()

# Root Typer application; the @app.command() functions in this module register
# themselves on it. Completion-install options are disabled, and invoking the
# CLI with no arguments prints the help text.
app = typer.Typer(
    add_completion=False,
    help="VideoGuideMaker — generate WCAG-ready study guides from video + transcript.",
    no_args_is_help=True,
)

# Module-level logger for CLI diagnostics.
log = logging.getLogger("videoguidemaker.cli")


def _setup_logging(verbose: bool) -> None:
    """Configure root logging for one CLI invocation.

    The level is WARNING unless ``verbose`` is set, so that INFO records do
    not disrupt the rich progress bar; ``--verbose`` opts into DEBUG.
    """
    if verbose:
        level = logging.DEBUG
    else:
        level = logging.WARNING
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
        level=level,
    )


def _resolve_format(fmt: str, output: Path) -> str:
    """Normalise and validate the ``--format`` value.

    Returns the lower-cased format when it is one of ``review``, ``single``,
    ``zip`` or ``guide``; otherwise raises ``typer.BadParameter``. ``output``
    is accepted but not consulted.
    """
    normalised = fmt.lower()
    if normalised in ("review", "single", "zip", "guide"):
        return normalised
    raise typer.BadParameter("format must be one of: review, single, zip, guide")


def _safe_filename(title: str) -> str:
    """Turn a free-form title into a filename-friendly slug.

    Characters other than alphanumerics, ``-``, ``_`` and space become ``-``;
    surrounding whitespace is stripped and remaining spaces become ``-``.
    An empty result falls back to ``"study-guide"``.
    """
    pieces = []
    for ch in title:
        keep = ch.isalnum() or ch in "-_ "
        pieces.append(ch if keep else "-")
    slug = "".join(pieces).strip().replace(" ", "-")
    if not slug:
        return "study-guide"
    return slug


@app.command()
def build(
    video: Path = typer.Argument(..., exists=True, dir_okay=False, readable=True),
    transcript: Optional[Path] = typer.Argument(None, exists=False, dir_okay=False),
    title: str = typer.Option("Untitled Study Guide", "--title"),
    subtitle: Optional[str] = typer.Option(None, "--subtitle"),
    module: Optional[str] = typer.Option(None, "--module"),
    output: Path = typer.Option(Path("study_guide.html"), "--output", "-o"),
    frames_dir: Path = typer.Option(Path("static"), "--frames-dir"),
    auto_transcribe: bool = typer.Option(False, "--auto-transcribe"),
    whisper_model: str = typer.Option("small", "--whisper-model"),
    threshold: float = typer.Option(27.0, "--threshold"),
    min_gap: float = typer.Option(0.0, "--min-gap", help="Drop frames closer than N seconds to the previous one."),
    max_frames: Optional[int] = typer.Option(None, "--max-frames", help="Cap total frames; uniformly downsamples preserving first + last."),
    skip_ocr: bool = typer.Option(False, "--skip-ocr", help="Skip OCR pass; on-screen text fields will be empty."),
    skip_inverted_ocr: bool = typer.Option(False, "--skip-inverted-ocr", help="Skip the inverted-binarization OCR pass. Halves OCR time and avoids inverted-pass garbling on slides without coloured callouts."),
    face_threshold: float = typer.Option(0.12, "--face-threshold"),
    lang: str = typer.Option("en", "--lang"),
    fmt: str = typer.Option("single", "--format", help="review | single | zip | guide"),
    export_metadata: Optional[Path] = typer.Option(None, "--export-metadata"),
    verbose: bool = typer.Option(False, "--verbose", "-v"),
) -> None:
    """Run the full pipeline: video + transcript → HTML."""
    # NOTE: the docstring above doubles as the command's --help text, so
    # implementation notes live in comments instead.
    #
    # Flow: validate --format, run the pipeline behind a progress bar,
    # optionally dump metadata, render HTML, then write either the HTML or a
    # zip bundle. A PipelineError is reported on stderr and exits with code 2.
    _setup_logging(verbose)
    fmt = _resolve_format(fmt, output)

    # A transcript path that does not exist is tolerated (the run continues
    # without it), but tell the user instead of dropping it silently.
    transcript_path = transcript if transcript and transcript.exists() else None
    if transcript is not None and transcript_path is None:
        typer.secho(
            f"warning: transcript {transcript} not found; continuing without it",
            fg=typer.colors.YELLOW,
            err=True,
        )

    inputs = PipelineInputs(
        video_path=video,
        transcript_path=transcript_path,
        frames_dir=frames_dir,
        title=title,
        subtitle=subtitle,
        module=module,
        lang=lang,
        threshold=threshold,
        min_gap_seconds=min_gap,
        max_frames=max_frames,
        skip_ocr=skip_ocr,
        skip_inverted_ocr=skip_inverted_ocr,
        face_threshold=face_threshold,
        auto_transcribe=auto_transcribe,
        whisper_model=whisper_model,
        inline_images=(fmt in ("single", "review")),
    )
    try:
        with _progress_bar() as cb:
            result = run_pipeline(inputs, progress=cb)
    except PipelineError as exc:
        typer.secho(f"error: {exc}", fg=typer.colors.RED, err=True)
        raise typer.Exit(2) from exc

    # Dump metadata BEFORE rendering: a render failure (template bug,
    # disk full mid-write) would otherwise discard the LLM/OCR work
    # the user just paid for.
    if export_metadata:
        # Make sure the destination directory exists so the dump cannot fail
        # on a missing folder.
        export_metadata.parent.mkdir(parents=True, exist_ok=True)
        dump_metadata(export_metadata, result.page)
        typer.echo(f"wrote {export_metadata}")

    # Keyword arguments shared by both renderers.
    common = dict(
        title=title,
        segments=result.segments,
        lang=lang,
        subtitle=subtitle,
        module=module,
        meta_lines=result.page.meta_lines or None,
        eyebrow=result.page.eyebrow,
    )

    if fmt == "review":
        html = render_review(**common)
    else:
        inline = fmt == "single"
        if inline:
            # Inline audio data URIs alongside images so the single HTML
            # stays self-contained (no broken audio/foo.mp3 references).
            import base64
            for seg, ap in zip(result.segments, result.audio_paths):
                if ap and ap.exists():
                    seg.audio_data_uri = (
                        "data:audio/mpeg;base64,"
                        + base64.b64encode(ap.read_bytes()).decode("ascii")
                    )
        html = render_guide(inline_images=inline, **common)

    if fmt == "zip":
        audio_disk_paths = [p for p in result.audio_paths if p is not None]
        zip_bytes = bundle_zip(
            html,
            [f.image_path for f in result.kept_frames],
            audio_paths=audio_disk_paths,
        )
        # Force a .zip extension so the default "study_guide.html" output
        # name does not end up holding archive bytes.
        if output.suffix.lower() != ".zip":
            output = output.with_suffix(".zip")
        # Create missing parent directories: the pipeline result is expensive,
        # so do not lose it to a FileNotFoundError at the final write.
        output.parent.mkdir(parents=True, exist_ok=True)
        output.write_bytes(zip_bytes)
    else:
        output.parent.mkdir(parents=True, exist_ok=True)
        output.write_text(html, encoding="utf-8")

    typer.echo(f"wrote {output}")


@app.command("export-metadata")
def export_metadata_cmd(
    video: Path = typer.Argument(..., exists=True, dir_okay=False, readable=True),
    transcript: Optional[Path] = typer.Argument(None, exists=False, dir_okay=False),
    title: str = typer.Option("Untitled Study Guide", "--title"),
    subtitle: Optional[str] = typer.Option(None, "--subtitle"),
    module: Optional[str] = typer.Option(None, "--module"),
    output: Path = typer.Option(Path("study_guide_metadata.json"), "--output", "-o"),
    frames_dir: Path = typer.Option(Path("static"), "--frames-dir"),
    auto_transcribe: bool = typer.Option(False, "--auto-transcribe"),
    whisper_model: str = typer.Option("small", "--whisper-model"),
    threshold: float = typer.Option(27.0, "--threshold"),
    min_gap: float = typer.Option(0.0, "--min-gap", help="Drop frames closer than N seconds to the previous one."),
    max_frames: Optional[int] = typer.Option(None, "--max-frames", help="Cap total frames; uniformly downsamples preserving first + last."),
    skip_ocr: bool = typer.Option(False, "--skip-ocr", help="Skip OCR pass; on-screen text fields will be empty."),
    skip_inverted_ocr: bool = typer.Option(False, "--skip-inverted-ocr", help="Skip the inverted-binarization OCR pass. Halves OCR time and avoids inverted-pass garbling on slides without coloured callouts."),
    face_threshold: float = typer.Option(0.12, "--face-threshold"),
    lang: str = typer.Option("en", "--lang"),
    verbose: bool = typer.Option(False, "--verbose", "-v"),
) -> None:
    """Run the pipeline and dump the metadata JSON only (no HTML)."""
    # NOTE: the docstring above doubles as the command's --help text, so
    # implementation notes live in comments instead.
    #
    # Flow: run the pipeline (images are never inlined here) behind a
    # progress bar, then write result.page as JSON to `output`. A
    # PipelineError is reported on stderr and exits with code 2.
    _setup_logging(verbose)

    # A transcript path that does not exist is tolerated (the run continues
    # without it), but tell the user instead of dropping it silently.
    transcript_path = transcript if transcript and transcript.exists() else None
    if transcript is not None and transcript_path is None:
        typer.secho(
            f"warning: transcript {transcript} not found; continuing without it",
            fg=typer.colors.YELLOW,
            err=True,
        )

    inputs = PipelineInputs(
        video_path=video,
        transcript_path=transcript_path,
        frames_dir=frames_dir,
        title=title,
        subtitle=subtitle,
        module=module,
        lang=lang,
        threshold=threshold,
        min_gap_seconds=min_gap,
        max_frames=max_frames,
        skip_ocr=skip_ocr,
        skip_inverted_ocr=skip_inverted_ocr,
        face_threshold=face_threshold,
        auto_transcribe=auto_transcribe,
        whisper_model=whisper_model,
        inline_images=False,
    )
    try:
        with _progress_bar() as cb:
            result = run_pipeline(inputs, progress=cb)
    except PipelineError as exc:
        typer.secho(f"error: {exc}", fg=typer.colors.RED, err=True)
        raise typer.Exit(2) from exc

    # Create missing parent directories so the finished pipeline run is not
    # lost to a missing output folder.
    output.parent.mkdir(parents=True, exist_ok=True)
    dump_metadata(output, result.page)
    typer.echo(f"wrote {output} ({len(result.page.segments)} segments, frames in {frames_dir})")


@app.command("render-from-metadata")
def render_from_metadata_cmd(
    metadata_json: Path = typer.Argument(..., exists=True, dir_okay=False, readable=True),
    output: Path = typer.Option(Path("study_guide.html"), "--output", "-o"),
    frames_dir: Optional[Path] = typer.Option(
        None,
        "--frames-dir",
        help="Override the frames_dir recorded in the metadata JSON.",
    ),
    fmt: str = typer.Option("single", "--format", help="review | single | guide"),
    verbose: bool = typer.Option(False, "--verbose", "-v"),
) -> None:
    """Re-render HTML from a previously exported metadata JSON."""
    _setup_logging(verbose)

    # "zip" passes the generic format validation but is rejected for this
    # command before any metadata is loaded.
    render_mode = _resolve_format(fmt, output)
    if render_mode == "zip":
        raise typer.BadParameter("zip format requires source frames; use 'build' instead.")

    page = load_metadata(metadata_json)

    # An explicit --frames-dir wins; otherwise use the directory recorded in
    # the metadata, joined onto the JSON file's parent and resolved.
    if frames_dir is None:
        frames_root = (metadata_json.parent / page.frames_dir).resolve()
    else:
        frames_root = frames_dir

    rendered = render_from_metadata(page, Path(frames_root), mode=render_mode)
    output.write_text(rendered, encoding="utf-8")
    typer.echo(f"wrote {output}")


# Script entry point: hand control to the Typer app.
# NOTE(review): this module uses relative imports, so it presumably has to be
# run as part of its package (e.g. via `python -m`) rather than as a bare file.
if __name__ == "__main__":  # pragma: no cover
    app()