Spaces:

twangodev
/

compare-codec

Sleeping

App Files Files Community

twangodev committed on Apr 12

Commit

1df078a

verified ·

1 Parent(s): 58090b5

feat: add DAC codec implementation and codec registry

Browse files

Files changed (6) hide show

app.py +125 -0
compare_codec/__init__.py +49 -0
compare_codec/dac.py +81 -0
pyproject.toml +17 -0
requirements.txt +431 -0
uv.lock +0 -0

app.py ADDED Viewed

	@@ -0,0 +1,125 @@

+"""Codecomp — upload audio, pick codecs, hear reconstructions side by side."""
+from __future__ import annotations
+import time
+from pathlib import Path
+import gradio as gr
+from compare_codec import CodecConfig, get_all
# Upper bound, in seconds, on each reconstruction handed to the UI; compare()
# truncates longer codec outputs to this duration.
MAX_DURATION_S = 30.0
def _codec_choices() -> list[str]:
    """Build one display label per registered (codec, config) pair.

    Labels have the form "<codec name> — <config name>" and are listed in
    registry order, then in the order each codec reports its configs.
    """
    return [
        f"{codec.name} — {cfg.name}"
        for codec in get_all().values()
        for cfg in codec.configs()
    ]
def _resolve_selection(label: str) -> tuple[str, CodecConfig, int]:
    """Translate a display label into (codec_name, config, output_sample_rate).

    The label is split once on " — " into a codec name and a config name.
    The sample rate comes from the config's "sample_rate" param when it is
    present, otherwise from the codec's own ``sample_rate``.

    Raises:
        ValueError: if the label has no " — " separator, or the codec has no
            config with the requested name.
        KeyError: if the codec name is not in the registry.
    """
    codec_name, cfg_name = label.split(" — ", 1)
    codec = get_all()[codec_name]

    not_found = object()
    match = next(
        (candidate for candidate in codec.configs() if candidate.name == cfg_name),
        not_found,
    )
    if match is not_found:
        raise ValueError(f"Unknown config: {label}")

    return codec_name, match, match.params.get("sample_rate", codec.sample_rate)
def compare(audio_path: str | None, selected: list[str]) -> list[dict]:
    """Round-trip the input through every selected codec configuration.

    Args:
        audio_path: Path of the uploaded/recorded audio file, or None when
            nothing has been provided yet.
        selected: Display labels as produced by ``_codec_choices``.

    Returns:
        One dict per label, in selection order, with keys "label", "audio"
        (a ``(sample_rate, samples)`` tuple) and "time" (seconds spent inside
        ``encode_decode``). Returns an empty list when there is no audio or
        no selection.
    """
    if not selected or audio_path is None:
        return []

    source = Path(audio_path)
    outcomes: list[dict] = []
    for label in selected:
        codec_name, cfg, sr = _resolve_selection(label)
        codec = get_all()[codec_name]

        # Only the codec round trip itself is timed.
        started = time.perf_counter()
        decoded = codec.encode_decode(source, cfg)
        elapsed = time.perf_counter() - started

        # Truncate over-long reconstructions to MAX_DURATION_S of samples.
        sample_cap = int(MAX_DURATION_S * sr)
        if len(decoded) > sample_cap:
            decoded = decoded[:sample_cap]

        outcomes.append({"label": label, "audio": (sr, decoded), "time": elapsed})
    return outcomes
def build_ui() -> gr.Blocks:
    """Assemble the Gradio interface and wire up the Compare button.

    The left column holds the audio input, the codec checkbox group and the
    run button. The right column pre-creates one hidden result slot (group,
    caption, audio player) per available codec configuration; the click
    handler then shows and fills as many slots as there are results.

    Returns:
        The constructed ``gr.Blocks`` app (not launched).
    """
    labels = _codec_choices()
    slot_count = len(labels)

    with gr.Blocks(title="Codecomp") as demo:
        gr.Markdown(
            "# Codecomp\n\n"
            "Upload audio, select one or more codec configurations, "
            "and listen to the reconstructions side by side."
        )

        with gr.Row():
            with gr.Column(scale=1):
                audio_in = gr.Audio(
                    sources=["upload", "microphone"],
                    type="filepath",
                    label="Input audio",
                )
                codec_select = gr.CheckboxGroup(
                    choices=labels,
                    label="Codec configurations",
                )
                run_btn = gr.Button("Compare", variant="primary")

            with gr.Column(scale=2):
                # One (group, caption, player) triple per possible selection.
                slots: list[tuple[gr.Group, gr.Markdown, gr.Audio]] = []
                for slot_no in range(1, slot_count + 1):
                    with gr.Group(visible=False) as group:
                        caption = gr.Markdown()
                        player = gr.Audio(
                            label=f"Result {slot_no}",
                            type="numpy",
                            interactive=False,
                        )
                    slots.append((group, caption, player))

        def on_compare(audio_path: str | None, selected: list[str]) -> list:
            """Return three updates per slot: group, caption, player."""
            results = compare(audio_path, selected)
            updates: list = []
            for slot_index in range(slot_count):
                if slot_index >= len(results):
                    # Unused slot: hide it and clear any previous content.
                    updates += [
                        gr.update(visible=False),
                        gr.update(value=""),
                        gr.update(value=None),
                    ]
                    continue
                result = results[slot_index]
                updates += [
                    gr.update(visible=True),
                    gr.update(value=f"**{result['label']}** — {result['time']:.2f}s"),
                    gr.update(value=result["audio"], label=result["label"]),
                ]
            return updates

        # Flatten the slots in the same group/caption/player order that
        # on_compare emits its updates.
        flat_outputs: list[gr.Component] = [
            component for slot in slots for component in slot
        ]
        run_btn.click(
            on_compare,
            inputs=[audio_in, codec_select],
            outputs=flat_outputs,
        )
    return demo
# Build the interface at import time so `demo` exists as a module-level
# attribute (presumably what the hosting runtime looks up — TODO confirm).
demo = build_ui()

if __name__ == "__main__":
    # Running this file directly launches the Gradio app.
    demo.launch()

compare_codec/__init__.py ADDED Viewed

	@@ -0,0 +1,49 @@

+"""Codec registry — protocol, config dataclass, and discovery."""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Protocol, runtime_checkable
+import numpy as np
@dataclass(frozen=True)
class CodecConfig:
    """A single configuration offered by a codec (e.g. '44kHz / 9 quantizers').

    Instances are frozen, so attributes cannot be reassigned after
    construction. The ``params`` dict itself is still mutable; treat it as
    read-only.

    Equality compares both ``name`` and ``params``. The hash is derived from
    ``name`` only, because a ``dict`` is unhashable and including it would
    make ``hash(config)`` raise ``TypeError``. Equal configs always share a
    name, so the hash stays consistent with equality.
    """

    # Human-readable config label; combined with the codec name in the UI.
    name: str
    # Codec-specific settings. Left out of __hash__ (hash=False) so configs
    # can be used as dict keys / set members; still part of __eq__.
    params: dict = field(default_factory=dict, hash=False)
@runtime_checkable
class AudioCodec(Protocol):
    """Structural interface that every registered codec implements."""

    @property
    def name(self) -> str:
        """Codec name; used as the registry key and in UI labels."""

    @property
    def sample_rate(self) -> int:
        """Fallback output sample rate for configs without a "sample_rate" param."""

    def configs(self) -> list[CodecConfig]:
        """List the configurations this codec offers."""

    def encode_decode(self, audio_path: Path, config: CodecConfig) -> np.ndarray:
        """Round-trip: raw file in -> mono float32 numpy array out.

        The app reads the output rate from the config's "sample_rate" param
        when it is present and falls back to ``self.sample_rate`` otherwise,
        so the returned samples should be at that rate.
        """
# Registered codecs keyed by their ``name``; filled by register().
_REGISTRY: dict[str, AudioCodec] = {}


def register(codec: AudioCodec) -> None:
    """Add *codec* to the registry under ``codec.name``.

    A codec registered later under the same name replaces the earlier one.
    """
    _REGISTRY.update({codec.name: codec})


def get_all() -> dict[str, AudioCodec]:
    """Return a shallow copy of the registry.

    Mutating the returned dict does not affect the registry itself.
    """
    return _REGISTRY.copy()
+# Import codec modules so they self-register on startup.
+from compare_codec import dac as _dac  # noqa: E402, F401

compare_codec/dac.py ADDED Viewed

	@@ -0,0 +1,81 @@

+"""DAC (Descript Audio Codec) — wraps the descript-audio-codec package."""
+from __future__ import annotations
+from pathlib import Path
+import numpy as np
+import torch
+from compare_codec import CodecConfig, register
class DACCodec:
    """DAC codec with lazy model loading.

    Each DAC model type ("44khz", "24khz", "16khz") is downloaded and loaded
    the first time a config that needs it is used, then cached on the
    instance for later calls.
    """

    def __init__(self) -> None:
        # Loaded models keyed by DAC model type; populated by _get_model().
        self._models: dict[str, object] = {}

    @property
    def name(self) -> str:
        """Registry key and display name of this codec."""
        return "DAC"

    @property
    def sample_rate(self) -> int:
        """Default sample rate; every config also carries its own "sample_rate"."""
        return 44_100

    def configs(self) -> list[CodecConfig]:
        """Return one config per (model type, quantizer count) combination.

        For each model type the quantizer counts are offered in the order
        ``max_nq, 6, 4, 2``.
        """
        configs = []
        for model_type, sr, max_nq in [
            ("44khz", 44_100, 9),
            ("24khz", 24_000, 9),
            ("16khz", 16_000, 9),
        ]:
            for nq in (max_nq, 6, 4, 2):
                configs.append(
                    CodecConfig(
                        name=f"{model_type} / {nq} quantizers",
                        params={
                            "model_type": model_type,
                            "n_quantizers": nq,
                            "sample_rate": sr,
                        },
                    )
                )
        return configs

    def _get_model(self, model_type: str) -> object:
        """Return the cached model for *model_type*, loading it on first use."""
        if model_type not in self._models:
            # Imported lazily so the dac package is only needed once a model
            # is actually requested.
            import dac as _dac

            model_path = _dac.utils.download(model_type=model_type)
            model = _dac.DAC.load(model_path)
            model.eval()
            self._models[model_type] = model
        return self._models[model_type]

    @torch.no_grad()
    def encode_decode(self, audio_path: Path, config: CodecConfig) -> np.ndarray:
        """Encode and decode *audio_path* with the model selected by *config*.

        Args:
            audio_path: Audio file to load.
            config: Must provide the "model_type", "n_quantizers" and
                "sample_rate" params (as produced by ``configs()``).

        Returns:
            The reconstruction as a numpy array at the config's sample rate,
            trimmed to the length of the (resampled) input.

        Raises:
            KeyError: if *config* lacks one of the required params.
        """
        from audiotools import AudioSignal

        model_type: str = config.params["model_type"]
        n_quantizers: int = config.params["n_quantizers"]
        target_sr: int = config.params["sample_rate"]

        model = self._get_model(model_type)

        signal = AudioSignal(str(audio_path))
        # Dim 1 is treated as the channel axis: average it down to mono.
        if signal.audio_data.shape[1] > 1:
            signal.audio_data = signal.audio_data.mean(dim=1, keepdim=True)
        if signal.sample_rate != target_sr:
            signal = signal.resample(target_sr)
        signal = signal.to(model.device)

        # Remember the true length before preprocess(): in the DAC
        # implementation it right-pads the audio to a multiple of the model's
        # hop length, and the decoder output keeps that padding.
        n_samples = signal.audio_data.shape[-1]
        x = model.preprocess(signal.audio_data, signal.sample_rate)

        # Only the quantized latent is needed for decoding.
        z, *_ = model.encode(x, n_quantizers=n_quantizers)
        y = model.decode(z)

        # Drop the padded tail so the reconstruction lines up with the input.
        y = y[..., :n_samples]
        return y.squeeze(0).squeeze(0).cpu().numpy()
+register(DACCodec())

pyproject.toml ADDED Viewed

	@@ -0,0 +1,17 @@

+[project]
+name = "compare-codec"
+version = "0.1.0"
+description = "Compare audio codecs side by side in a web UI"
+readme = "README.md"
+license = "Apache-2.0"
+requires-python = ">=3.12"
+dependencies = [
+    "descript-audio-codec>=1.0.0",
+    "gradio",
+    "numpy",
+]
+[dependency-groups]
+dev = [
+    "ruff>=0.15.10",
+]

requirements.txt ADDED Viewed

	@@ -0,0 +1,431 @@

+--extra-index-url https://download.pytorch.org/whl/cpu
+# This file was autogenerated by uv via the following command:
+#    uv export --format requirements-txt --no-hashes --no-emit-project
+absl-py==2.4.0
+    # via tensorboard
+annotated-doc==0.0.4
+    # via
+    #   fastapi
+    #   typer
+annotated-types==0.7.0
+    # via pydantic
+anyio==4.13.0
+    # via
+    #   gradio
+    #   httpx
+    #   starlette
+argbind==0.3.9
+    # via
+    #   descript-audio-codec
+    #   descript-audiotools
+asttokens==3.0.1
+    # via stack-data
+audioop-lts==0.2.2 ; python_full_version >= '3.13'
+    # via
+    #   gradio
+    #   standard-aifc
+    #   standard-sunau
+audioread==3.1.0
+    # via librosa
+brotli==1.2.0
+    # via gradio
+certifi==2026.2.25
+    # via
+    #   httpcore
+    #   httpx
+    #   requests
+cffi==2.0.0
+    # via soundfile
+charset-normalizer==3.4.7
+    # via requests
+click==8.3.2
+    # via
+    #   typer
+    #   uvicorn
+colorama==0.4.6 ; sys_platform == 'win32'
+    # via
+    #   click
+    #   ipython
+    #   tqdm
+contourpy==1.3.3
+    # via matplotlib
+cuda-bindings==13.2.0 ; sys_platform == 'linux'
+    # via torch
+cuda-pathfinder==1.5.2 ; sys_platform == 'linux'
+    # via cuda-bindings
+cuda-toolkit==13.0.2 ; sys_platform == 'linux'
+    # via torch
+cycler==0.12.1
+    # via matplotlib
+decorator==5.2.1
+    # via
+    #   ipython
+    #   librosa
+descript-audio-codec==1.0.0
+    # via codec-arena
+descript-audiotools==0.7.2
+    # via descript-audio-codec
+docstring-parser==0.17.0
+    # via argbind
+einops==0.8.2
+    # via descript-audio-codec
+executing==2.2.1
+    # via stack-data
+fastapi==0.135.3
+    # via gradio
+ffmpy==1.0.0
+    # via descript-audiotools
+filelock==3.25.2
+    # via
+    #   huggingface-hub
+    #   torch
+fire==0.7.1
+    # via randomname
+flatten-dict==0.4.2
+    # via descript-audiotools
+fonttools==4.62.1
+    # via matplotlib
+fsspec==2026.3.0
+    # via
+    #   gradio-client
+    #   huggingface-hub
+    #   torch
+gradio==6.12.0
+    # via codec-arena
+gradio-client==2.4.1
+    # via
+    #   gradio
+    #   hf-gradio
+groovy==0.1.2
+    # via gradio
+grpcio==1.80.0
+    # via tensorboard
+h11==0.16.0
+    # via
+    #   httpcore
+    #   uvicorn
+hf-gradio==0.3.0
+    # via gradio
+hf-xet==1.4.3 ; platform_machine == 'AMD64' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'
+    # via huggingface-hub
+httpcore==1.0.9
+    # via httpx
+httpx==0.28.1
+    # via
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   safehttpx
+huggingface-hub==1.10.1
+    # via
+    #   gradio
+    #   gradio-client
+idna==3.11
+    # via
+    #   anyio
+    #   httpx
+    #   requests
+importlib-resources==7.1.0
+    # via descript-audiotools
+ipython==9.12.0
+    # via descript-audiotools
+ipython-pygments-lexers==1.1.1
+    # via ipython
+jedi==0.19.2
+    # via ipython
+jinja2==3.1.6
+    # via
+    #   gradio
+    #   torch
+joblib==1.5.3
+    # via
+    #   librosa
+    #   scikit-learn
+julius==0.2.7
+    # via descript-audiotools
+kiwisolver==1.5.0
+    # via matplotlib
+lazy-loader==0.5
+    # via librosa
+librosa==0.11.0
+    # via descript-audiotools
+llvmlite==0.47.0
+    # via numba
+markdown==3.10.2
+    # via tensorboard
+markdown-it-py==4.0.0
+    # via rich
+markdown2==2.5.5
+    # via descript-audiotools
+markupsafe==3.0.3
+    # via
+    #   gradio
+    #   jinja2
+    #   werkzeug
+matplotlib==3.10.8
+    # via descript-audiotools
+matplotlib-inline==0.2.1
+    # via ipython
+mdurl==0.1.2
+    # via markdown-it-py
+mpmath==1.3.0
+    # via sympy
+msgpack==1.1.2
+    # via librosa
+networkx==3.6.1
+    # via torch
+numba==0.65.0
+    # via librosa
+numpy==2.4.4
+    # via
+    #   codec-arena
+    #   contourpy
+    #   descript-audio-codec
+    #   descript-audiotools
+    #   gradio
+    #   librosa
+    #   matplotlib
+    #   numba
+    #   pandas
+    #   pyloudnorm
+    #   pystoi
+    #   scikit-learn
+    #   scipy
+    #   soundfile
+    #   soxr
+    #   tensorboard
+    #   torch-stoi
+nvidia-cublas==13.1.0.3 ; sys_platform == 'linux'
+    # via
+    #   cuda-toolkit
+    #   nvidia-cudnn-cu13
+    #   nvidia-cusolver
+nvidia-cuda-cupti==13.0.85 ; sys_platform == 'linux'
+    # via cuda-toolkit
+nvidia-cuda-nvrtc==13.0.88 ; sys_platform == 'linux'
+    # via cuda-toolkit
+nvidia-cuda-runtime==13.0.96 ; sys_platform == 'linux'
+    # via cuda-toolkit
+nvidia-cudnn-cu13==9.19.0.56 ; sys_platform == 'linux'
+    # via torch
+nvidia-cufft==12.0.0.61 ; sys_platform == 'linux'
+    # via cuda-toolkit
+nvidia-cufile==1.15.1.6 ; sys_platform == 'linux'
+    # via cuda-toolkit
+nvidia-curand==10.4.0.35 ; sys_platform == 'linux'
+    # via cuda-toolkit
+nvidia-cusolver==12.0.4.66 ; sys_platform == 'linux'
+    # via cuda-toolkit
+nvidia-cusparse==12.6.3.3 ; sys_platform == 'linux'
+    # via
+    #   cuda-toolkit
+    #   nvidia-cusolver
+nvidia-cusparselt-cu13==0.8.0 ; sys_platform == 'linux'
+    # via torch
+nvidia-nccl-cu13==2.28.9 ; sys_platform == 'linux'
+    # via torch
+nvidia-nvjitlink==13.0.88 ; sys_platform == 'linux'
+    # via
+    #   cuda-toolkit
+    #   nvidia-cufft
+    #   nvidia-cusolver
+    #   nvidia-cusparse
+nvidia-nvshmem-cu13==3.4.5 ; sys_platform == 'linux'
+    # via torch
+nvidia-nvtx==13.0.85 ; sys_platform == 'linux'
+    # via cuda-toolkit
+orjson==3.11.8
+    # via gradio
+packaging==26.0
+    # via
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   lazy-loader
+    #   matplotlib
+    #   pooch
+    #   tensorboard
+pandas==3.0.2
+    # via gradio
+parso==0.8.6
+    # via jedi
+pexpect==4.9.0 ; sys_platform != 'emscripten' and sys_platform != 'win32'
+    # via ipython
+pillow==12.2.0
+    # via
+    #   gradio
+    #   matplotlib
+    #   tensorboard
+platformdirs==4.9.6
+    # via pooch
+pooch==1.9.0
+    # via librosa
+prompt-toolkit==3.0.52
+    # via ipython
+protobuf==3.19.6
+    # via
+    #   descript-audiotools
+    #   tensorboard
+ptyprocess==0.7.0 ; sys_platform != 'emscripten' and sys_platform != 'win32'
+    # via pexpect
+pure-eval==0.2.3
+    # via stack-data
+pycparser==3.0 ; implementation_name != 'PyPy'
+    # via cffi
+pydantic==2.12.5
+    # via
+    #   fastapi
+    #   gradio
+pydantic-core==2.41.5
+    # via pydantic
+pydub==0.25.1
+    # via gradio
+pygments==2.20.0
+    # via
+    #   ipython
+    #   ipython-pygments-lexers
+    #   rich
+pyloudnorm==0.2.0
+    # via descript-audiotools
+pyparsing==3.3.2
+    # via matplotlib
+pystoi==0.4.1
+    # via
+    #   descript-audiotools
+    #   torch-stoi
+python-dateutil==2.9.0.post0
+    # via
+    #   matplotlib
+    #   pandas
+python-multipart==0.0.26
+    # via gradio
+pytz==2026.1.post1
+    # via gradio
+pyyaml==6.0.3
+    # via
+    #   argbind
+    #   gradio
+    #   huggingface-hub
+randomname==0.2.1
+    # via descript-audiotools
+requests==2.33.1
+    # via pooch
+rich==14.3.4
+    # via
+    #   descript-audiotools
+    #   typer
+safehttpx==0.1.7
+    # via gradio
+scikit-learn==1.8.0
+    # via librosa
+scipy==1.17.1
+    # via
+    #   descript-audiotools
+    #   librosa
+    #   pyloudnorm
+    #   pystoi
+    #   scikit-learn
+semantic-version==2.10.0
+    # via gradio
+setuptools==81.0.0
+    # via
+    #   tensorboard
+    #   torch
+shellingham==1.5.4
+    # via typer
+six==1.17.0
+    # via
+    #   flatten-dict
+    #   python-dateutil
+soundfile==0.13.1
+    # via
+    #   descript-audiotools
+    #   librosa
+soxr==1.0.0
+    # via librosa
+stack-data==0.6.3
+    # via ipython
+standard-aifc==3.13.0 ; python_full_version >= '3.13'
+    # via
+    #   audioread
+    #   librosa
+standard-chunk==3.13.0 ; python_full_version >= '3.13'
+    # via standard-aifc
+standard-sunau==3.13.0 ; python_full_version >= '3.13'
+    # via
+    #   audioread
+    #   librosa
+starlette==1.0.0
+    # via
+    #   fastapi
+    #   gradio
+sympy==1.14.0
+    # via torch
+tensorboard==2.20.0
+    # via descript-audiotools
+tensorboard-data-server==0.7.2
+    # via tensorboard
+termcolor==3.3.0
+    # via fire
+threadpoolctl==3.6.0
+    # via scikit-learn
+tomlkit==0.14.0
+    # via gradio
+torch==2.11.0
+    # via
+    #   descript-audio-codec
+    #   descript-audiotools
+    #   julius
+    #   torch-stoi
+torch-stoi==0.2.3
+    # via descript-audiotools
+torchaudio==2.11.0
+    # via
+    #   descript-audio-codec
+    #   descript-audiotools
+    #   torch-stoi
+tqdm==4.67.3
+    # via
+    #   descript-audio-codec
+    #   descript-audiotools
+    #   huggingface-hub
+traitlets==5.14.3
+    # via
+    #   ipython
+    #   matplotlib-inline
+triton==3.6.0 ; sys_platform == 'linux'
+    # via torch
+typer==0.24.1
+    # via
+    #   gradio
+    #   hf-gradio
+    #   huggingface-hub
+typing-extensions==4.15.0
+    # via
+    #   anyio
+    #   fastapi
+    #   gradio
+    #   gradio-client
+    #   grpcio
+    #   huggingface-hub
+    #   librosa
+    #   pydantic
+    #   pydantic-core
+    #   starlette
+    #   torch
+    #   typing-inspection
+typing-inspection==0.4.2
+    # via
+    #   fastapi
+    #   pydantic
+tzdata==2026.1 ; sys_platform == 'emscripten' or sys_platform == 'win32'
+    # via pandas
+urllib3==2.6.3
+    # via requests
+uvicorn==0.44.0
+    # via gradio
+wcwidth==0.6.0
+    # via prompt-toolkit
+werkzeug==3.1.8
+    # via tensorboard

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff