ltuncay commited on
Commit
eca55dc
·
verified ·
1 Parent(s): 673d263

Submission to the Interspeech 2026 Audio Encoder Capability Challenge

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. BEST-RQ-2.safetensors +3 -0
  2. BEST-RQ-2_encoder.py +233 -0
  3. README.md +35 -3
  4. audio-embeddings/.githooks/post-checkout +13 -0
  5. audio-embeddings/.githooks/post-merge +13 -0
  6. audio-embeddings/.githooks/post-rewrite +13 -0
  7. audio-embeddings/.gitignore +29 -0
  8. audio-embeddings/.pre-commit-config.yaml +39 -0
  9. audio-embeddings/.project-root +0 -0
  10. audio-embeddings/.python-version +1 -0
  11. audio-embeddings/AGENTS.md +150 -0
  12. audio-embeddings/README.md +142 -0
  13. audio-embeddings/THIRD_PARTY_LICENSES.md +13 -0
  14. audio-embeddings/configs/__init__.py +1 -0
  15. audio-embeddings/configs/callbacks/default.yaml +34 -0
  16. audio-embeddings/configs/callbacks/early_stopping.yaml +15 -0
  17. audio-embeddings/configs/callbacks/ema_weight_averaging.yaml +9 -0
  18. audio-embeddings/configs/callbacks/model_checkpoint.yaml +17 -0
  19. audio-embeddings/configs/callbacks/model_summary.yaml +5 -0
  20. audio-embeddings/configs/callbacks/none.yaml +0 -0
  21. audio-embeddings/configs/callbacks/rich_progress_bar.yaml +4 -0
  22. audio-embeddings/configs/callbacks/wandb_offline.yaml +2 -0
  23. audio-embeddings/configs/data/audioset.yaml +12 -0
  24. audio-embeddings/configs/data/mock_audioset.yaml +7 -0
  25. audio-embeddings/configs/data/yt1b.yaml +13 -0
  26. audio-embeddings/configs/experiment/audio_jepa/baseline.yaml +34 -0
  27. audio-embeddings/configs/experiment/audio_jepa/large.yaml +44 -0
  28. audio-embeddings/configs/experiment/audio_jepa/rope.yaml +41 -0
  29. audio-embeddings/configs/experiment/audio_jepa/time_res2x.yaml +54 -0
  30. audio-embeddings/configs/experiment/audio_jepa/time_res4x.yaml +54 -0
  31. audio-embeddings/configs/experiment/best_rq/audioset.yaml +33 -0
  32. audio-embeddings/configs/experiment/best_rq/yt1b.yaml +31 -0
  33. audio-embeddings/configs/experiment/best_rq_2/audioset.yaml +33 -0
  34. audio-embeddings/configs/experiment/best_rq_2/audioset_100k_512bs.yaml +30 -0
  35. audio-embeddings/configs/experiment/best_rq_2/audioset_1m_128bs_4gpu.yaml +30 -0
  36. audio-embeddings/configs/experiment/best_rq_2/audioset_200k_256bs_4gpu.yaml +30 -0
  37. audio-embeddings/configs/experiment/best_rq_2/audioset_400k_128bs.yaml +30 -0
  38. audio-embeddings/configs/experiment/best_rq_2/audioset_400k_128bs_4gpu.yaml +30 -0
  39. audio-embeddings/configs/experiment/best_rq_2/audioset_800k_64bs_4gpu.yaml +30 -0
  40. audio-embeddings/configs/experiment/best_rq_2/audioset_ema.yaml +35 -0
  41. audio-embeddings/configs/experiment/best_rq_2/audioset_ema_600k.yaml +36 -0
  42. audio-embeddings/configs/experiment/best_rq_2/yt1b_ema.yaml +37 -0
  43. audio-embeddings/configs/experiment/local/audio_jepa.yaml +29 -0
  44. audio-embeddings/configs/experiment/local/audio_jepa_rope.yaml +36 -0
  45. audio-embeddings/configs/experiment/local/best_rq.yaml +29 -0
  46. audio-embeddings/configs/experiment/local/best_rq2.yaml +38 -0
  47. audio-embeddings/configs/experiment/local/m4_mock_jepa.yaml +37 -0
  48. audio-embeddings/configs/experiment/local/rqa_jepa.yaml +29 -0
  49. audio-embeddings/configs/experiment/rqa_jepa/audioset.yaml +33 -0
  50. audio-embeddings/configs/experiment/rqa_jepa/yt1b.yaml +31 -0
BEST-RQ-2.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7111465e6c868e3d0b55c5fe9a23dc5069ac80beba8444c5ee9db4691d796899
3
+ size 483870856
BEST-RQ-2_encoder.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import os
3
+ import sys
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ from omegaconf import OmegaConf
8
+ from safetensors.torch import load_file
9
+
10
# Add audio-embeddings to path dynamically
# We assume audio-embeddings is a sibling directory to xares-llm or provided via env var
# Prioritize absolute path if known, otherwise relative
POSSIBLE_PATHS = [
    # "/media/ltuncay/Shared-4TB/dev/audio-embeddings",
    os.path.abspath(os.path.join(os.path.dirname(__file__), "audio-embeddings")),
    # os.path.abspath(os.path.join(os.getcwd(), "../audio-embeddings")),
]

# First existing candidate wins; None means we rely on an installed package.
AUDIO_EMBEDDINGS_PATH = next((p for p in POSSIBLE_PATHS if os.path.exists(p)), None)

if AUDIO_EMBEDDINGS_PATH:
    if AUDIO_EMBEDDINGS_PATH not in sys.path:
        sys.path.append(AUDIO_EMBEDDINGS_PATH)
        print(f"Added {AUDIO_EMBEDDINGS_PATH} to sys.path")
else:
    print(
        "Warning: audio-embeddings path not found. Imports may fail if not installed in environment."
    )

try:
    from src.models.best_rq2_module import BestRQ2Module
except ImportError as e:
    # Chain the original exception so the root cause stays in the traceback.
    raise ImportError(
        f"Could not import src.models.best_rq2_module. Ensure audio-embeddings is correctly located or installed. Error: {e}"
    ) from e
40
+
41
+
42
class BestRQ2Encoder(nn.Module):
    """Inference wrapper around a pretrained BEST-RQ-2 ``BestRQ2Module``.

    Loads the OmegaConf config and safetensors checkpoint shipped next to
    this file (or caller-supplied paths), instantiates the module, and
    exposes a ``forward`` mapping raw audio [B, T] to encoder patch
    embeddings. Short inputs are zero-padded and long inputs are processed
    in sequential chunks bounded by the model's positional-embedding span.
    """

    def __init__(self, checkpoint_path=None, model_config_path=None, **kwargs):
        """Build the encoder and load pretrained weights.

        Args:
            checkpoint_path: Path to the model weights. Defaults to
                ``BEST-RQ-2.safetensors`` located next to this file.
            model_config_path: Path to the training config. Defaults to
                ``config.yaml`` located next to this file.
            **kwargs: Ignored; accepted for loader-interface compatibility.

        Raises:
            FileNotFoundError: If the config or checkpoint cannot be found.
        """
        super().__init__()

        base_path = os.path.dirname(__file__)
        # Honor caller-provided paths; previously these arguments were
        # silently overwritten by the bundled defaults.
        if model_config_path is None:
            model_config_path = os.path.join(base_path, "config.yaml")
        if checkpoint_path is None:
            checkpoint_path = os.path.join(base_path, "BEST-RQ-2.safetensors")

        if not os.path.exists(model_config_path):
            raise FileNotFoundError(f"Config not found at {model_config_path}")

        if not checkpoint_path or not os.path.exists(checkpoint_path):
            raise FileNotFoundError(f"Checkpoint not found at {checkpoint_path}")

        print(f"Loading BestRQ2 config from {model_config_path}")
        cfg = OmegaConf.load(model_config_path)

        print(f"Loading BestRQ2 checkpoint from {checkpoint_path}")

        # Reconstruct model args from config
        model_cfg = cfg.model
        net_cfg = model_cfg.net

        # Instantiate model
        # Note: BestRQ2Module inherits from LightningModule
        self.module = BestRQ2Module(
            optimizer=None,  # Not needed for inference
            net=net_cfg,
            warmup_pct=model_cfg.get("warmup_pct", 0.1),
            final_lr_ratio=model_cfg.get("final_lr_ratio", 0.001),
            spectrogram_adjustment_mode=model_cfg.get(
                "spectrogram_adjustment_mode", "pad"
            ),
            codebook_dim=model_cfg.get("codebook_dim", 16),
            vocab_size=model_cfg.get("vocab_size", 8192),
            criterion=None,
        )

        # Load weights: prefer safetensors, fall back to a torch pickle.
        try:
            state_dict = load_file(checkpoint_path)
        except Exception as e:
            print(f"Error loading safetensors: {e}. Trying torch.load...")
            state_dict = torch.load(checkpoint_path, map_location="cpu")
            if "state_dict" in state_dict:
                state_dict = state_dict["state_dict"]

        # Load non-strictly: checkpoint key prefixes may not match the module
        # exactly, so surface mismatches as warnings instead of hard failures.
        missing, unexpected = self.module.load_state_dict(state_dict, strict=False)
        if missing:
            print(f"Warning: {len(missing)} keys missing during loading.")
        if unexpected:
            print(f"Warning: {len(unexpected)} keys unexpected during loading.")

        self.module.eval()
        self.output_dim = net_cfg.encoder.embed_dim

        # Extract dynamic parameters for length handling
        try:
            # 1. Sample rate & hop length from the spectrogram frontend.
            self.sample_rate = self.module.spectrogram.mel_spec.sample_rate
            self.hop_length = self.module.spectrogram.mel_spec.hop_length

            # 2. Patch size along the time dimension (patch_size is (H, W)).
            self.patch_size_time = self.module.patch_embed.patch_size[1]

            # 3. Maximum spectrogram frames the positional embeddings cover.
            self.max_frames = self.module.patch_embed.img_size[1]

            # Minimum samples needed so the spectrogram yields >= 1 patch:
            # T_spec ~= T_samples // hop_length must reach patch_size_time.
            self.min_samples = self.patch_size_time * self.hop_length

            # Maximum audio samples a single chunk may contain.
            self.chunk_samples = self.max_frames * self.hop_length

            print(
                f"BestRQ2Encoder constraints: Min Samples={self.min_samples}, Chunk Samples={self.chunk_samples}"
            )

        except Exception as e:
            print(f"Warning: Could not extract dynamic length constraints: {e}")
            print("Falling back to safe defaults (1s min, 10s chunk)")
            self.min_samples = 16000
            self.chunk_samples = 16000 * 10

    def _forward_chunk(self, audio_chunk: torch.Tensor) -> torch.Tensor:
        """Encode a single time-chunk of audio into patch embeddings."""
        # Determine target device from the spectrogram window (safest for STFT)
        try:
            target_device = self.module.spectrogram.mel_spec.spectrogram.window.device
        except AttributeError:
            if hasattr(self.module.spectrogram.mel_spec, "window"):
                target_device = self.module.spectrogram.mel_spec.window.device
            else:
                target_device = self.module.device

        if audio_chunk.device != target_device:
            audio_chunk = audio_chunk.to(target_device)

        # BestRQ2Module expects [B, C, T]
        if audio_chunk.ndim == 2:
            audio_chunk = audio_chunk.unsqueeze(1)  # [B, 1, T]

        # _process_audio returns (patches, grid_size)
        patches, grid_size = self.module._process_audio(audio_chunk)

        # Dummy mask (all False = keep every patch).
        B, N, D = patches.shape
        mask = torch.zeros((B, N), dtype=torch.bool, device=patches.device)

        return self.module.compute_encoder(patches, mask, grid_size)

    def forward(
        self, audio: torch.Tensor, audio_attention_mask=None
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Encode raw audio into patch embeddings.

        Args:
            audio: Waveform tensor of shape [B, T] (a 1-D [T] tensor is
                promoted to a batch of one).
            audio_attention_mask: Ignored; present for interface parity.

        Returns:
            Tuple of ``(embeddings, None)``. Inputs longer than the model's
            positional span are split into chunks whose outputs are
            concatenated along the sequence dimension.
        """
        if audio.ndim == 1:
            audio = audio.unsqueeze(0)

        B, T = audio.shape

        # 1. Pad the whole batch if shorter than one patch's worth of audio.
        if T < self.min_samples:
            pad_amt = self.min_samples - T
            audio = torch.nn.functional.pad(audio, (0, pad_amt))
            T = self.min_samples  # Update T

        # 2. Single chunk fits within the positional-embedding span.
        if T <= self.chunk_samples:
            return self._forward_chunk(audio), None

        # Otherwise split along time and process sequentially.
        chunks = torch.split(audio, self.chunk_samples, dim=1)
        outputs = []

        for chunk in chunks:
            # The last chunk may be shorter than one patch; pad it up.
            chunk_len = chunk.shape[1]
            if chunk_len < self.min_samples:
                pad_amt = self.min_samples - chunk_len
                chunk = torch.nn.functional.pad(chunk, (0, pad_amt))

            # NOTE: padding added here cannot be sliced off afterwards —
            # one patch is the smallest output unit, so a padded tail still
            # yields (at least) one patch.
            outputs.append(self._forward_chunk(chunk))

        # Concatenate chunk outputs along the sequence dimension.
        return torch.cat(outputs, dim=1), None
218
+
219
+
220
if __name__ == "__main__":
    # Smoke test: build the encoder and push 10 s of random audio through it.
    try:
        encoder = BestRQ2Encoder()
        print("Model initialized successfully")
        run_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        encoder.module.to(run_device)
        dummy_audio = torch.randn(1, 160000).to(run_device)
        embeddings, _ = encoder(dummy_audio)
        print(f"Output shape: {embeddings.shape}")
    except Exception as e:
        print(f"Error testing model: {e}")
        import traceback

        traceback.print_exc()
README.md CHANGED
@@ -1,3 +1,35 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # BEST-RQ-2 (xares-llm encoder)
2
+
3
+ This folder contains the BEST-RQ-2 audio encoder integration for `xares-llm`.
4
+
5
+ Benchmark repository: [xiaomi-research/xares-llm](https://github.com/xiaomi-research/xares-llm)
6
+
7
+ ## Setup
8
+
9
+ 1. Make sure the environment to run `xares-llm` is set up properly (virtual environment initialized and the `xares-llm` package downloaded/installed).
10
+
11
+ 2. Add the `BEST-RQ-2` folder to the `xares-llm` (current) directory so it is available at `./BEST-RQ-2`.
12
+
13
+ 3. Before running a `xares-llm` evaluation, you must install the required packages for BEST-RQ-2:
14
+
15
+ ```bash
16
+ uv pip install -r BEST-RQ-2/audio-embeddings/pyproject.toml
17
+ ```
18
+
19
+ ## Run an evaluation
20
+
21
+ Single task (e.g. to test that everything works):
22
+
23
+ ```bash
24
+ uv run -m xares_llm.run BEST-RQ-2.BEST-RQ-2_encoder.BestRQ2Encoder <task> <task> # e.g. replace <task> with esc-50, score should be around 0.48
25
+ ```
26
+
27
+ All tasks:
28
+
29
+ ```bash
30
+ uv run -m xares_llm.run BEST-RQ-2.BEST-RQ-2_encoder.BestRQ2Encoder all
31
+ ```
32
+
33
+ ## Help
34
+
35
+ If you encounter any problems, contact: [ludovic.tuncay@irit.fr](mailto:ludovic.tuncay@irit.fr)
audio-embeddings/.githooks/post-checkout ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Git post-checkout hook: keep the uv environment in sync with the lockfile.
set -euo pipefail

# Always operate from the repository root.
cd "$(git rev-parse --show-toplevel)"

# Skip gracefully on machines without uv installed.
if ! command -v uv >/dev/null 2>&1; then
    echo "[post-checkout] uv not found; skipping uv sync" >&2
    exit 0
fi

echo "[post-checkout] Running uv sync..."
uv sync
audio-embeddings/.githooks/post-merge ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Git post-merge hook: keep the uv environment in sync with the lockfile.
set -euo pipefail

# Always operate from the repository root.
cd "$(git rev-parse --show-toplevel)"

# Skip gracefully on machines without uv installed.
if ! command -v uv >/dev/null 2>&1; then
    echo "[post-merge] uv not found; skipping uv sync" >&2
    exit 0
fi

echo "[post-merge] Running uv sync..."
uv sync
audio-embeddings/.githooks/post-rewrite ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Git post-rewrite hook: keep the uv environment in sync with the lockfile.
set -euo pipefail

# Always operate from the repository root.
cd "$(git rev-parse --show-toplevel)"

# Skip gracefully on machines without uv installed.
if ! command -v uv >/dev/null 2>&1; then
    echo "[post-rewrite] uv not found; skipping uv sync" >&2
    exit 0
fi

echo "[post-rewrite] Running uv sync..."
uv sync
audio-embeddings/.gitignore ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+
12
+ # Project logs
13
+ logs/
14
+
15
+ # Large data files
16
+ *.h5
17
+
18
+ # Data
19
+ data/
20
+
21
+ # Jupyter
22
+ .ipynb_checkpoints/
23
+
24
+ # macOS Finder metadata
25
+ .DS_Store
26
+
27
+ # Explicitly untracked large docs
28
+ documents/LeJEPA.pdf
29
+ documents/Audio-LeJEPA.pdf
audio-embeddings/.pre-commit-config.yaml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v5.0.0
4
+ hooks:
5
+ # Trim whitespace at end of lines.
6
+ - id: trailing-whitespace
7
+ # Ensure files end with a single newline.
8
+ - id: end-of-file-fixer
9
+ # Catch unresolved merge conflict markers.
10
+ - id: check-merge-conflict
11
+ # Validate YAML syntax (Hydra configs, etc.).
12
+ - id: check-yaml
13
+ # Validate TOML syntax (e.g., pyproject.toml).
14
+ - id: check-toml
15
+ # Detect mixed CRLF/LF endings without auto-rewriting.
16
+ - id: mixed-line-ending
17
+ args: ["--fix=no"]
18
+ # Catch accidental debug leftovers (breakpoint/pdb).
19
+ - id: debug-statements
20
+ # Block accidental private key commits.
21
+ - id: detect-private-key
22
+ # Prevent case-colliding paths across filesystems.
23
+ - id: check-case-conflict
24
+ # Block newly added large files unless explicitly allowlisted below.
25
+ - id: check-added-large-files
26
+ args: ["--maxkb=5000"]
27
+ exclude: ^(documents/LeJEPA\.pdf|documents/Audio-LeJEPA\.pdf)$
28
+
29
+ - repo: https://github.com/astral-sh/ruff-pre-commit
30
+ # Ruff version.
31
+ rev: v0.15.0
32
+ hooks:
33
+ # Run the linter.
34
+ - id: ruff-check
35
+ types_or: [python, pyi]
36
+ args: [--fix]
37
+ # Run the formatter.
38
+ - id: ruff-format
39
+ types_or: [python, pyi]
audio-embeddings/.project-root ADDED
File without changes
audio-embeddings/.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.12
audio-embeddings/AGENTS.md ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AGENTS Guide - audio-embeddings
2
+
3
+ This file is for coding agents working in this repository.
4
+ Follow these repo-specific rules over generic defaults.
5
+
6
+ ## 1) Environment Snapshot
7
+ - Python: `>=3.12` (from `pyproject.toml`).
8
+ - Dependency manager: `uv`.
9
+ - Main stack: PyTorch, PyTorch Lightning, Hydra, OmegaConf.
10
+ - Project root marker: `.project-root`.
11
+ - Main entrypoint: `src/train.py`.
12
+
13
+ ## 2) Cursor / Copilot Rule Files
14
+ - Checked `.cursor/rules/`: not present.
15
+ - Checked `.cursorrules`: not present.
16
+ - Checked `.github/copilot-instructions.md`: not present.
17
+ - Therefore, no additional Cursor/Copilot rule files are currently enforced.
18
+
19
+ ## 3) Install / Setup Commands
20
+ ```bash
21
+ uv sync
22
+ uv run <command>
23
+ uv add <package>
24
+ ```
25
+
26
+ ## 4) Build / Train / Eval Commands
27
+ There is no separate "build" step (this is a training codebase).
28
+ Use quick-run training as the integration sanity check.
29
+ ```bash
30
+ uv run src/train.py
31
+ uv run src/train.py trainer.fast_dev_run=True
32
+ uv run src/train.py trainer=cpu trainer.fast_dev_run=True
33
+ uv run src/train.py experiment=local/audio_jepa
34
+ uv run src/train.py trainer.max_epochs=10 data.batch_size=32 model.optimizer.lr=1e-4
35
+ ```
36
+ Cluster-style execution (existing project pattern):
37
+ ```bash
38
+ srun .venv/bin/python -u -O src/train.py experiment=cluster_jepa_audioset_rope +trainer.max_time="00:19:50:00"
39
+ ```
40
+
41
+ ## 5) Lint / Formatting / Static Checks
42
+ Use the commands below as pragmatic checks:
43
+ ```bash
44
+ uv run pre-commit run --all-files
45
+ uv run pre-commit run ruff --all-files
46
+ uv run pre-commit run ruff-format --all-files
47
+ uv run python -m compileall src
48
+ ```
49
+ Ruff is configured via `.pre-commit-config.yaml` and runs both lint fixes and formatting.
50
+
51
+ ## 6) Test Commands (Including Single Test)
52
+ Primary validation in this repo is script-based verification under `tests/`.
53
+ Run test files directly as native Python files:
54
+ ```bash
55
+ uv run tests/verify_rope.py
56
+ uv run tests/verify_custom_rope.py
57
+ uv run tests/verify_data.py
58
+ ```
59
+ Useful single-file checks (native execution):
60
+ ```bash
61
+ uv run src/train.py trainer.fast_dev_run=True
62
+ uv run src/train.py trainer=cpu trainer.fast_dev_run=True
63
+ uv run scripts/verify_shapes.py
64
+ uv run scripts/verify_scheduler.py
65
+ ```
66
+ Notes:
67
+ - `tests/test_*.py` are pytest-style and are not part of the default native-file workflow.
68
+ - Prefer `tests/verify_*.py` and `scripts/verify_*.py` for lightweight checks.
69
+
70
+ ## 7) Repository Architecture Expectations
71
+ - `configs/`: Hydra composition (trainer/data/model/logger/callbacks/experiment).
72
+ - `src/train.py`: orchestration only (instantiate and run).
73
+ - `src/models/`: LightningModules (high-level training logic).
74
+ - `src/models/components/`: reusable `nn.Module` building blocks.
75
+ - `src/data/`: DataModules/Datasets and collate logic.
76
+ - `src/utils/`: logging, instantiation, wrappers, scheduler helpers.
77
+ When possible, prefer config changes over hardcoded Python changes.
78
+
79
+ ## 8) Code Style Guidelines
80
+ ### Imports
81
+ - Group imports as: standard library -> third-party -> local `src.*`.
82
+ - Keep one import per line unless importing multiple names from same module.
83
+ - Avoid wildcard imports.
84
+ - Prefer absolute imports from `src...`.
85
+
86
+ ### Formatting
87
+ - Use 4-space indentation and readable line lengths.
88
+ - Keep functions small; extract helpers for complex logic.
89
+ - Do not introduce unrelated reformatting in touched files.
90
+ - Keep comments for non-obvious intent, not obvious mechanics.
91
+
92
+ ### Typing
93
+ - Type hints are expected for function arguments and return values.
94
+ - Use concrete tensor/container types when practical.
95
+ - Use `Optional[T]` / `T | None` consistently within a file.
96
+ - For dict-like configs, type as `DictConfig` when passing Hydra config objects.
97
+
98
+ ### Naming
99
+ - `snake_case`: functions, variables, module filenames.
100
+ - `PascalCase`: classes (`AudioJEPAModule`, `AudioSetDataModule`).
101
+ - `UPPER_SNAKE_CASE`: constants.
102
+ - Prefer descriptive names (`mask_indices`) over short names (`m2`) except local math temporaries.
103
+
104
+ ### PyTorch / Lightning / Hydra Conventions
105
+ - Keep heavy compute out of `__init__` where possible.
106
+ - `forward()` for inference logic; training behavior in `training_step()`.
107
+ - Use `self.log(...)` with explicit flags (`on_step`, `on_epoch`, `prog_bar`, `batch_size`).
108
+ - Instantiate components through Hydra (`hydra.utils.instantiate`).
109
+ - Expose tunable parameters in config files, not hardcoded literals.
110
+
111
+ ### Error Handling and Validation
112
+ - Raise informative `ValueError` / `RuntimeError` for invalid config/state.
113
+ - Validate critical tensor assumptions with assertions or explicit checks.
114
+ - Prefer logger/warnings over bare `print()` in new code.
115
+ - For file I/O, prefer `pathlib.Path` and existence checks.
116
+
117
+ ### Data and Paths
118
+ - Do not hardcode absolute machine paths.
119
+ - Use `rootutils.setup_root(..., indicator=".project-root", pythonpath=True)` in entrypoints/scripts when needed.
120
+ - Respect `cfg.paths.*` outputs for logs/checkpoints/artifacts.
121
+
122
+ ## 9) Agent Workflow Rules
123
+ - Reuse existing components before adding new abstractions.
124
+ - Keep `src/train.py` generic; place model/data logic in dedicated modules.
125
+ - Prefer minimal, focused diffs.
126
+ - Update configs and docs when behavior changes.
127
+ - Validate with the smallest meaningful command first (`fast_dev_run`, single test), then broader checks.
128
+
129
+ ## 10) Git / Change Hygiene
130
+ - Do not revert unrelated local changes.
131
+ - Keep commits scoped to one concern.
132
+ - Write clear commit messages describing intent.
133
+ - Prefer Conventional Commit-like format: `type(scope): intent`.
134
+ - Common types in this repo: `feat`, `fix`, `conf`, `build`, `docs`, `style`, `chore`.
135
+ - Never commit secrets, credentials, or environment-specific absolute paths.
136
+
137
+ ## 11) Practical Agent Defaults
138
+ - Prefer reusing existing modules over creating new abstractions.
139
+ - Keep edits local to the requested change; avoid drive-by refactors.
140
+ - Run the smallest useful verification command after changes.
141
+ - If you touch training logic, run at least one fast training sanity check.
142
+ - If you touch model components, run relevant verify script(s) in `tests/`.
143
+ - If you touch Hydra config wiring, run a config-backed entry command via `uv run src/train.py ...`.
144
+
145
+ ## 12) Common Pitfalls
146
+ - Avoid hardcoding data paths; use config (`cfg.paths`, data config fields).
147
+ - Avoid printing in new code paths; use ranked loggers/warnings.
148
+ - Avoid putting heavy tensor compute in constructors.
149
+ - Avoid bypassing Hydra by manually instantiating configurable components.
150
+ - Avoid changing unrelated formatting in files you touch.
audio-embeddings/README.md ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Audio Embeddings with Lightning & Hydra
2
+
3
+ This project is a clean, modular, and scalable implementation of audio embedding models using **PyTorch Lightning** and **Hydra**. It is designed to be easily extensible and runnable on local or cluster environments. It is based on the [Audio-JEPA](https://github.com/LudovicTuncay/Audio-JEPA) implementation and therefore implements the Audio-JEPA architecture. Other architectures can and will be added in the future.
4
+
5
+ ## 🎯 Goal
6
+
7
+ The goal of this project is to provide a robust codebase for training and experimenting with audio embedding models. Key features include:
8
+ - **Modular Architecture**: Components like Spectrogram, Masking, and ViT are decoupled.
9
+ - **Configurable Positional Embeddings**: Support for **RoPE** (2D Rotary Embeddings), **SinCos** (2D Sinusoidal), and **Learnable** embeddings.
10
+ - **Hydra Configuration**: Flexible experiment management via hierarchical config files.
11
+ - **Lightning Trainer**: Simplified training loop, logging, and checkpointing.
12
+ - **Modern Tooling**: Uses `uv` for fast and reliable dependency management.
13
+
14
+ ## 🚀 Installation
15
+
16
+ This project uses [`uv`](https://github.com/astral-sh/uv) for dependency management.
17
+
18
+ 1. **Install `uv`** (if not already installed):
19
+ ```bash
20
+ curl -LsSf https://astral.sh/uv/install.sh | sh
21
+ ```
22
+
23
+ 2. **Clone the repository**:
24
+ ```bash
25
+ git clone <repository_url>
26
+ cd audio-embeddings
27
+ ```
28
+
29
+ 3. **Install dependencies**:
30
+ ```bash
31
+ uv sync
32
+ ```
33
+
34
+ 4. **Enable shared git hooks** (runs `uv sync` after merge/checkout/rewrite):
35
+ ```bash
36
+ git config core.hooksPath .githooks
37
+ ```
38
+
39
+ ## 🏃 Usage
40
+
41
+ ### Basic Training
42
+ To start training with the default configuration:
43
+ ```bash
44
+ uv run src/train.py
45
+ ```
46
+
47
+ ### Common Commands
48
+ Run on GPU with Weights & Biases logging:
49
+ ```bash
50
+ uv run src/train.py trainer=gpu logger=wandb
51
+ ```
52
+
53
+ Override hyperparameters on the command line:
54
+ ```bash
55
+ uv run src/train.py data.batch_size=64 trainer.max_epochs=50
56
+ ```
57
+
58
+ ### Configurable Positional Embeddings
59
+ You can switch between different positional embedding strategies easily:
60
+
61
+ **RoPE**:
62
+ ```bash
63
+ uv run src/train.py model.net.encoder.pos_embed_type=rope
64
+ ```
65
+
66
+ ### Offline WandB Logging with Model Checkpoints
67
+ To run training offline but still have model checkpoints staged for upload (which standard WandB restricts):
68
+
69
+ ```bash
70
+ uv run src/train.py \
71
+ logger=wandb \
72
+ logger.wandb.offline=True \
73
+ logger.wandb.log_model=False \
74
+ +callbacks.wandb_offline_checkpoint._target_=src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback \
75
+ trainer=gpu trainer.devices=1 \
76
+ data.batch_size=128 trainer.max_epochs=100
77
+ ```
78
+ These checkpoints will be uploaded when you run `wandb sync`.
79
+
80
+
81
+ **2D SinCos**:
82
+ ```bash
83
+ uv run src/train.py ++model.net.encoder.pos_embed_type=sincos ++model.net.predictor.pos_embed_type=sincos
84
+ ```
85
+
86
+ **Learnable**:
87
+ ```bash
88
+ uv run src/train.py ++model.net.encoder.pos_embed_type=learnable ++model.net.predictor.pos_embed_type=learnable
89
+ ```
90
+
91
+ ## 📂 Project Structure
92
+
93
+ ```text
94
+ ├── configs/ # Hydra configuration files
95
+ │ ├── callbacks/ # Callback configs (checkpoints, early stopping)
96
+ │ ├── data/ # Data configs (AudioSet, etc.)
97
+ │ ├── logger/ # Logger configs (WandB, Tensorboard)
98
+ │ ├── model/ # Model configs (AudioJEPA parameters)
99
+ │ ├── trainer/ # Trainer configs (CPU, GPU, strategies)
100
+ │ └── train.yaml # Main configuration entry point
101
+ ├── src/
102
+ │ ├── data/ # Data loading logic
103
+ │ │ └── audioset_datamodule.py # AudioSet DataModule & Dataset
104
+ │ ├── models/ # Model architectures
105
+ │ │ ├── components/ # Reusable blocks
106
+ │ │ │ ├── masking.py # Masking generators
107
+ │ │ │ ├── patch_embed.py # Patchification
108
+ │ │ │ ├── rope.py # 2D Rotary Embeddings
109
+ │ │ │ ├── spectrogram.py # Audio preprocessing
110
+ │ │ │ └── vit.py # Vision Transformer (Student/Teacher/Predictor)
111
+ │ │ └── audio_jepa_module.py # Main LightningModule
112
+ │ ├── utils/ # Utility functions
113
+ │ └── train.py # Training entry point
114
+ ├── scripts/ # Helper scripts
115
+ ├── tests/ # Verification tests
116
+ ├── pyproject.toml # Project dependencies
117
+ └── README.md # This file
118
+ ```
119
+
120
+ ## 🛠️ Extensibility
121
+
122
+ ### Adding a New Model
123
+ 1. Create your model components in `src/models/components/`.
124
+ 2. Create a new LightningModule in `src/models/` (or update `AudioJEPAModule`).
125
+ 3. Create a new config file in `configs/model/my_new_model.yaml`.
126
+ 4. Run with `uv run src/train.py model=my_new_model`.
127
+
128
+ ### Adding a New Dataset
129
+ 1. Create a new DataModule in `src/data/`.
130
+ 2. Create a new config file in `configs/data/my_dataset.yaml`.
131
+ 3. Run with `uv run src/train.py data=my_dataset`.
132
+
133
+ ### Adding Functionalities
134
+ - **Callbacks**: Add custom callbacks in `src/callbacks/` (if needed) or use existing Lightning callbacks, and configure them in `configs/callbacks/`.
135
+ - **Metrics**: Add metrics logging in `training_step` or `validation_step` inside `src/models/audio_jepa_module.py`.
136
+
137
+ ## 🧪 Testing
138
+ Run verification scripts to ensure components are working:
139
+ ```bash
140
+ uv run tests/verify_rope.py
141
+ uv run tests/verify_custom_rope.py
142
+ ```
audio-embeddings/THIRD_PARTY_LICENSES.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Third-Party Licenses
2
+
3
+ This project vendors third-party source code listed below.
4
+
5
+ ## Lightning-AI / pytorch-lightning
6
+
7
+ - Component: `src/callbacks/lightning_weight_averaging.py`
8
+ - Source repository: `https://github.com/Lightning-AI/pytorch-lightning`
9
+ - Source file: `src/lightning/pytorch/callbacks/weight_averaging.py`
10
+ - Pinned commit: `9bcba1c1e82b45e10f948dc28fc12f4cf04ab736`
11
+ - License: Apache License 2.0
12
+
13
+ See `licenses/APACHE-2.0-LIGHTNING.txt` for the full license text.
audio-embeddings/configs/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # this file is needed here to include configs when building project as a package
audio-embeddings/configs/callbacks/default.yaml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ defaults:
2
+ - model_checkpoint
3
+ - model_summary
4
+ - rich_progress_bar
5
+ - _self_
6
+
7
+ model_checkpoint:
8
+ dirpath: ${paths.output_dir}/checkpoints
9
+ filename: "best-step-{step:06d}"
10
+ monitor: "val/loss"
11
+ mode: "min"
12
+ save_last: True
13
+ save_weights_only: False
14
+ save_top_k: 0
15
+ auto_insert_metric_name: False
16
+
17
+ safetensors:
18
+ _target_: src.callbacks.safetensors_callback.SafetensorsCallback
19
+ cleanup_orphan_safetensors: True
20
+
21
+ # early_stopping:
22
+ # monitor: "val/loss"
23
+ # patience: 100
24
+ # mode: "min"
25
+
26
+ model_summary:
27
+ max_depth: 1
28
+
29
+ device_stats:
30
+ _target_: lightning.pytorch.callbacks.DeviceStatsMonitor
31
+
32
+ visualization:
33
+ _target_: src.callbacks.visualization_callback.VisualizationCallback
34
+ num_samples: 4
audio-embeddings/configs/callbacks/early_stopping.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.EarlyStopping.html
2
+
3
+ early_stopping:
4
+ _target_: lightning.pytorch.callbacks.EarlyStopping
5
+ monitor: ??? # quantity to be monitored, must be specified !!!
6
+ min_delta: 0. # minimum change in the monitored quantity to qualify as an improvement
7
+ patience: 3 # number of checks with no improvement after which training will be stopped
8
+ verbose: False # verbosity mode
9
+ mode: "min" # "max" means higher metric value is better, can be also "min"
10
+ strict: True # whether to crash the training if monitor is not found in the validation metrics
11
+ check_finite: True # when set True, stops training when the monitor becomes NaN or infinite
12
+ stopping_threshold: null # stop training immediately once the monitored quantity reaches this threshold
13
+ divergence_threshold: null # stop training as soon as the monitored quantity becomes worse than this threshold
14
+ check_on_train_epoch_end: null # whether to run early stopping at the end of the training epoch
15
+ # log_rank_zero_only: False # this keyword argument isn't available in stable version
audio-embeddings/configs/callbacks/ema_weight_averaging.yaml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ ema_weight_averaging:
2
+ _target_: src.callbacks.ema_weight_averaging.WarmupEMAWeightAveraging
3
+ warmup_pct: ${model.warmup_pct}
4
+ enabled: ${model.ema.enabled}
5
+ decay: ${model.ema.decay} # decay rate is 1 - decay_numerator / ( total_steps - warmup_steps )
6
+ decay_numerator: ${model.ema.decay_numerator} # decay rate is 1 - decay_numerator / ( total_steps - warmup_steps )
7
+ update_every_n_steps: ${model.ema.update_every_n_steps}
8
+ use_buffers: ${model.ema.use_buffers}
9
+ update_starting_at_step: null
audio-embeddings/configs/callbacks/model_checkpoint.yaml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html
2
+
3
+ model_checkpoint:
4
+ _target_: lightning.pytorch.callbacks.ModelCheckpoint
5
+ dirpath: null # directory to save the model file
6
+ filename: null # checkpoint filename
7
+ monitor: null # name of the logged metric which determines when model is improving
8
+ verbose: False # verbosity mode
9
+ save_last: null # additionally always save an exact copy of the last checkpoint to a file last.ckpt
10
+ save_top_k: 1 # save k best models (determined by above metric)
11
+ mode: "min" # "max" means higher metric value is better, can be also "min"
12
+ auto_insert_metric_name: True # when True, the checkpoints filenames will contain the metric name
13
+ save_weights_only: False # if True, then only the model’s weights will be saved
14
+ every_n_train_steps: null # number of training steps between checkpoints
15
+ train_time_interval: null # checkpoints are monitored at the specified time interval
16
+ every_n_epochs: null # number of epochs between checkpoints
17
+ save_on_train_epoch_end: null # whether to run checkpointing at the end of the training epoch or the end of validation
audio-embeddings/configs/callbacks/model_summary.yaml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.RichModelSummary.html
2
+
3
+ model_summary:
4
+ _target_: lightning.pytorch.callbacks.RichModelSummary
5
+ max_depth: 1 # the maximum depth of layer nesting that the summary will include
audio-embeddings/configs/callbacks/none.yaml ADDED
File without changes
audio-embeddings/configs/callbacks/rich_progress_bar.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # https://lightning.ai/docs/pytorch/latest/api/lightning.pytorch.callbacks.RichProgressBar.html
2
+
3
+ rich_progress_bar:
4
+ _target_: lightning.pytorch.callbacks.RichProgressBar
audio-embeddings/configs/callbacks/wandb_offline.yaml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ wandb_offline_checkpoint:
2
+ _target_: src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback
audio-embeddings/configs/data/audioset.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: src.data.audioset_datamodule.AudioSetDataModule
2
+ data_dir: ${paths.data_dir}/AudioSet
3
+ batch_size: 64
4
+ num_workers: 4
5
+ pin_memory: True
6
+ train_h5: full_unbal_bal_train_wav.h5
7
+ train_csv: silent_files_full_unbal_bal_train_wav.csv
8
+ val_h5: eval_soxrhq.h5
9
+ val_csv: silent_files_eval_soxrhq.csv
10
+ max_audio_length_sec: 10.0 # 10 seconds
11
+ target_sample_rate: 16000
12
+ collate_mode: pad
audio-embeddings/configs/data/mock_audioset.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ _target_: src.data.mock_audioset_datamodule.MockAudioSetDataModule
2
+ batch_size: 16
3
+ num_workers: 0 # 0 is often better for simple debugging/validation on local Mac
4
+ pin_memory: True
5
+ max_audio_length_sec: 10.0
6
+ target_sample_rate: 16000
7
+ collate_mode: pad
audio-embeddings/configs/data/yt1b.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: src.data.yt1b_datamodule.YT1BDataModule
2
+ data_dir: ${paths.data_dir}/YT-Temporal-1B
3
+ batch_size: 64
4
+ num_workers: 4
5
+ pin_memory: True
6
+ train_parquet: train_metadata.parquet
7
+ val_parquet: val_metadata.parquet
8
+ test_parquet: val_metadata.parquet
9
+ max_audio_length_sec: 10.0
10
+ min_duration_sec: 10.0
11
+ target_sample_rate: 16000
12
+ collate_mode: pad
13
+ decode_window_sec: null
audio-embeddings/configs/experiment/audio_jepa/baseline.yaml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ defaults:
4
+ - override /data: audioset
5
+ - override /model: audio_jepa
6
+ - override /trainer: gpu
7
+ - override /logger: wandb
8
+ - override /callbacks: default
9
+
10
+ # all parameters below will be merged with parameters from default configurations set above
11
+ # this allows you to overwrite only specified parameters
12
+
13
+ tags: ["audioset", "jepa", "baseline", "cluster"]
14
+
15
+ trainer:
16
+ max_steps: 200000
17
+
18
+ data:
19
+ data_dir: ${paths.data_dir}
20
+ train_h5: AudioSet/full_unbal_bal_train_wav.h5
21
+ val_h5: AudioSet/eval_soxrhq.h5
22
+ batch_size: 256
23
+ num_workers: ${oc.decode:${oc.env:SLURM_CPUS_PER_TASK}}
24
+
25
+ logger:
26
+ wandb:
27
+ offline: True
28
+ log_model: False
29
+
30
+ callbacks:
31
+ rich_progress_bar: null
32
+
33
+ wandb_offline_checkpoint:
34
+ _target_: src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback
audio-embeddings/configs/experiment/audio_jepa/large.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ defaults:
4
+ - override /data: audioset
5
+ - override /model: audio_jepa
6
+ - override /trainer: gpu
7
+ - override /logger: wandb
8
+ - override /callbacks: default
9
+
10
+ # all parameters below will be merged with parameters from default configurations set above
11
+ # this allows you to overwrite only specified parameters
12
+
13
+ tags: ["audioset", "jepa", "large", "cluster", "1GPU"]
14
+
15
+ trainer:
16
+ max_steps: 200000
17
+
18
+ data:
19
+ data_dir: ${paths.data_dir}
20
+ train_h5: AudioSet/full_unbal_bal_train_wav.h5
21
+ val_h5: AudioSet/eval_soxrhq.h5
22
+ batch_size: 256
23
+ num_workers: ${oc.decode:${oc.env:SLURM_CPUS_PER_TASK}}
24
+
25
+ logger:
26
+ wandb:
27
+ offline: True
28
+ log_model: False
29
+
30
+ callbacks:
31
+ rich_progress_bar: null
32
+
33
+ wandb_offline_checkpoint:
34
+ _target_: src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback
35
+
36
+ # ViT Large Configuration
37
+ model:
38
+ net:
39
+ patch_embed:
40
+ embed_dim: 1024
41
+ encoder:
42
+ embed_dim: 1024
43
+ depth: 24
44
+ num_heads: 16
audio-embeddings/configs/experiment/audio_jepa/rope.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ defaults:
4
+ - override /data: audioset
5
+ - override /model: audio_jepa
6
+ - override /trainer: gpu
7
+ - override /logger: wandb
8
+ - override /callbacks: default
9
+
10
+ # all parameters below will be merged with parameters from default configurations set above
11
+ # this allows you to overwrite only specified parameters
12
+
13
+ tags: ["audioset", "jepa", "RoPE", "cluster"]
14
+
15
+ trainer:
16
+ max_steps: 200000
17
+
18
+ data:
19
+ data_dir: ${paths.data_dir}
20
+ train_h5: AudioSet/full_unbal_bal_train_wav.h5
21
+ val_h5: AudioSet/eval_soxrhq.h5
22
+ batch_size: 256
23
+ num_workers: ${oc.decode:${oc.env:SLURM_CPUS_PER_TASK}}
24
+
25
+ logger:
26
+ wandb:
27
+ offline: True
28
+ log_model: False
29
+
30
+ callbacks:
31
+ rich_progress_bar: null
32
+
33
+ wandb_offline_checkpoint:
34
+ _target_: src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback
35
+
36
+ model:
37
+ net:
38
+ encoder:
39
+ pos_embed_type: rope
40
+ predictor:
41
+ pos_embed_type: rope
audio-embeddings/configs/experiment/audio_jepa/time_res2x.yaml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ defaults:
4
+ - override /data: audioset
5
+ - override /model: audio_jepa
6
+ - override /trainer: gpu
7
+ - override /logger: wandb
8
+ - override /callbacks: default
9
+
10
+ # all parameters below will be merged with parameters from default configurations set above
11
+ # this allows you to overwrite only specified parameters
12
+
13
+ tags: ["audioset", "jepa", "time_res2x", "cluster"]
14
+
15
+ trainer:
16
+ max_steps: 200000
17
+
18
+ data:
19
+ data_dir: ${paths.data_dir}
20
+ train_h5: AudioSet/full_unbal_bal_train_wav.h5
21
+ val_h5: AudioSet/eval_soxrhq.h5
22
+ batch_size: 256
23
+ num_workers: ${oc.decode:${oc.env:SLURM_CPUS_PER_TASK}}
24
+
25
+ logger:
26
+ wandb:
27
+ offline: True
28
+ log_model: False
29
+
30
+ callbacks:
31
+ rich_progress_bar: null
32
+
33
+ wandb_offline_checkpoint:
34
+ _target_: src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback
35
+
36
+ model:
37
+ net:
38
+ spectrogram:
39
+ win_length_ms: 64 # halved
40
+ hop_length_ms: 19.53125 # halved
41
+
42
+ patch_embed:
43
+ img_size: [128, 512] # width doubled because hop is halved
44
+
45
+ masking:
46
+ input_size: [128, 512]
47
+
48
+ encoder:
49
+ num_patches: 256 # (128/16) * (512/16) = 8 * 32 = 256
50
+ img_size: [128, 512] # Explicitly set img_size for ViT to match patch_embed
51
+
52
+ predictor:
53
+ num_patches: 256
54
+ img_size: [128, 512]
audio-embeddings/configs/experiment/audio_jepa/time_res4x.yaml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ defaults:
4
+ - override /data: audioset
5
+ - override /model: audio_jepa
6
+ - override /trainer: gpu
7
+ - override /logger: wandb
8
+ - override /callbacks: default
9
+
10
+ # all parameters below will be merged with parameters from default configurations set above
11
+ # this allows you to overwrite only specified parameters
12
+
13
+ tags: ["audioset", "jepa", "time_res4x", "cluster"]
14
+
15
+ trainer:
16
+ max_steps: 200000
17
+
18
+ data:
19
+ data_dir: ${paths.data_dir}
20
+ train_h5: AudioSet/full_unbal_bal_train_wav.h5
21
+ val_h5: AudioSet/eval_soxrhq.h5
22
+ batch_size: 256
23
+ num_workers: ${oc.decode:${oc.env:SLURM_CPUS_PER_TASK}}
24
+
25
+ logger:
26
+ wandb:
27
+ offline: True
28
+ log_model: False
29
+
30
+ callbacks:
31
+ rich_progress_bar: null
32
+
33
+ wandb_offline_checkpoint:
34
+ _target_: src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback
35
+
36
+ model:
37
+ net:
38
+ spectrogram:
39
+ win_length_ms: 32 # quartered
40
+ hop_length_ms: 9.765625 # quartered
41
+
42
+ patch_embed:
43
+ img_size: [128, 1024] # width quadrupled because hop is quartered
44
+
45
+ masking:
46
+ input_size: [128, 1024]
47
+
48
+ encoder:
49
+ num_patches: 512 # (128/16) * (1024/16) = 8 * 64 = 512
50
+ img_size: [128, 1024]
51
+
52
+ predictor:
53
+ num_patches: 512
54
+ img_size: [128, 1024]
audio-embeddings/configs/experiment/best_rq/audioset.yaml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # to execute this experiment run:
4
+ # python train.py experiment=best_rq/audioset
5
+
6
+ defaults:
7
+ - override /data: audioset
8
+ - override /model: best_rq
9
+ - override /trainer: gpu
10
+ - override /logger: wandb
11
+
12
+ # all parameters below will be merged with parameters from default configurations set above
13
+ # this allows you to overwrite only specified parameters
14
+
15
+ tags: ["audioset", "best-rq", "baseline", "cluster GPU"]
16
+
17
+ trainer:
18
+ max_steps: 200000
19
+
20
+ data:
21
+ batch_size: 256
22
+ num_workers: ${oc.decode:${oc.env:SLURM_CPUS_PER_TASK}}
23
+
24
+ logger:
25
+ wandb:
26
+ offline: True
27
+ log_model: False
28
+
29
+ callbacks:
30
+ rich_progress_bar: null
31
+
32
+ wandb_offline_checkpoint:
33
+ _target_: src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback
audio-embeddings/configs/experiment/best_rq/yt1b.yaml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ defaults:
4
+ - override /data: yt1b
5
+ - override /model: best_rq
6
+ - override /trainer: gpu
7
+ - override /logger: wandb
8
+ - override /callbacks: default
9
+
10
+ # all parameters below will be merged with parameters from default configurations set above
11
+ # this allows you to overwrite only specified parameters
12
+
13
+ tags: ["yt1b", "best-rq", "baseline", "cluster GPU"]
14
+
15
+ trainer:
16
+ max_steps: 200000
17
+
18
+ data:
19
+ batch_size: 256
20
+ num_workers: ${oc.decode:${oc.env:SLURM_CPUS_PER_TASK}}
21
+
22
+ logger:
23
+ wandb:
24
+ offline: True
25
+ log_model: False
26
+
27
+ callbacks:
28
+ rich_progress_bar: null
29
+
30
+ wandb_offline_checkpoint:
31
+ _target_: src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback
audio-embeddings/configs/experiment/best_rq_2/audioset.yaml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # to execute this experiment run:
4
+ # python train.py experiment=best_rq_2/audioset
5
+
6
+ defaults:
7
+ - override /data: audioset
8
+ - override /model: best_rq2
9
+ - override /trainer: gpu
10
+ - override /logger: wandb
11
+
12
+ # all parameters below will be merged with parameters from default configurations set above
13
+ # this allows you to overwrite only specified parameters
14
+
15
+ tags: ["audioset", "best-rq-2", "baseline", "cluster GPU"]
16
+
17
+ trainer:
18
+ max_steps: 200000
19
+
20
+ data:
21
+ batch_size: 256
22
+ num_workers: ${oc.decode:${oc.env:SLURM_CPUS_PER_TASK}}
23
+
24
+ logger:
25
+ wandb:
26
+ offline: True
27
+ log_model: False
28
+
29
+ callbacks:
30
+ rich_progress_bar: null
31
+
32
+ wandb_offline_checkpoint:
33
+ _target_: src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback
audio-embeddings/configs/experiment/best_rq_2/audioset_100k_512bs.yaml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # to execute this experiment run:
4
+ # python train.py experiment=best_rq_2/audioset_100k_512bs
5
+
6
+ defaults:
7
+ - override /data: audioset
8
+ - override /model: best_rq2
9
+ - override /trainer: gpu
10
+ - override /logger: wandb
11
+
12
+ tags: ["audioset", "best-rq-2", "100k", "512bs", "cluster GPU"]
13
+
14
+ trainer:
15
+ max_steps: 100000
16
+
17
+ data:
18
+ batch_size: 512
19
+ num_workers: ${oc.decode:${oc.env:SLURM_CPUS_PER_TASK}}
20
+
21
+ logger:
22
+ wandb:
23
+ offline: True
24
+ log_model: False
25
+
26
+ callbacks:
27
+ rich_progress_bar: null
28
+
29
+ wandb_offline_checkpoint:
30
+ _target_: src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback
audio-embeddings/configs/experiment/best_rq_2/audioset_1m_128bs_4gpu.yaml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # to execute this experiment run:
4
+ # python train.py experiment=best_rq_2/audioset_1m_128bs_4gpu
5
+
6
+ defaults:
7
+ - override /data: audioset
8
+ - override /model: best_rq2
9
+ - override /trainer: ddp
10
+ - override /logger: wandb
11
+
12
+ tags: ["audioset", "best-rq-2", "1m", "128bs", "4GPU", "cluster GPU"]
13
+
14
+ trainer:
15
+ max_steps: 1000000
16
+
17
+ data:
18
+ batch_size: 128
19
+ num_workers: ${oc.decode:${oc.env:SLURM_CPUS_PER_TASK}}
20
+
21
+ logger:
22
+ wandb:
23
+ offline: True
24
+ log_model: False
25
+
26
+ callbacks:
27
+ rich_progress_bar: null
28
+
29
+ wandb_offline_checkpoint:
30
+ _target_: src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback
audio-embeddings/configs/experiment/best_rq_2/audioset_200k_256bs_4gpu.yaml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # to execute this experiment run:
4
+ # python train.py experiment=best_rq_2/audioset_200k_256bs_4gpu
5
+
6
+ defaults:
7
+ - override /data: audioset
8
+ - override /model: best_rq2
9
+ - override /trainer: ddp
10
+ - override /logger: wandb
11
+
12
+ tags: ["audioset", "best-rq-2", "200k", "256bs", "4GPU", "cluster GPU"]
13
+
14
+ trainer:
15
+ max_steps: 200000
16
+
17
+ data:
18
+ batch_size: 256
19
+ num_workers: ${oc.decode:${oc.env:SLURM_CPUS_PER_TASK}}
20
+
21
+ logger:
22
+ wandb:
23
+ offline: True
24
+ log_model: False
25
+
26
+ callbacks:
27
+ rich_progress_bar: null
28
+
29
+ wandb_offline_checkpoint:
30
+ _target_: src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback
audio-embeddings/configs/experiment/best_rq_2/audioset_400k_128bs.yaml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # to execute this experiment run:
4
+ # python train.py experiment=best_rq_2/audioset_400k_128bs
5
+
6
+ defaults:
7
+ - override /data: audioset
8
+ - override /model: best_rq2
9
+ - override /trainer: gpu
10
+ - override /logger: wandb
11
+
12
+ tags: ["audioset", "best-rq-2", "400k", "128bs", "cluster GPU"]
13
+
14
+ trainer:
15
+ max_steps: 400000
16
+
17
+ data:
18
+ batch_size: 128
19
+ num_workers: ${oc.decode:${oc.env:SLURM_CPUS_PER_TASK}}
20
+
21
+ logger:
22
+ wandb:
23
+ offline: True
24
+ log_model: False
25
+
26
+ callbacks:
27
+ rich_progress_bar: null
28
+
29
+ wandb_offline_checkpoint:
30
+ _target_: src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback
audio-embeddings/configs/experiment/best_rq_2/audioset_400k_128bs_4gpu.yaml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # to execute this experiment run:
4
+ # python train.py experiment=best_rq_2/audioset_400k_128bs_4gpu
5
+
6
+ defaults:
7
+ - override /data: audioset
8
+ - override /model: best_rq2
9
+ - override /trainer: ddp
10
+ - override /logger: wandb
11
+
12
+ tags: ["audioset", "best-rq-2", "400k", "128bs", "4GPU", "cluster GPU"]
13
+
14
+ trainer:
15
+ max_steps: 400000
16
+
17
+ data:
18
+ batch_size: 128
19
+ num_workers: ${oc.decode:${oc.env:SLURM_CPUS_PER_TASK}}
20
+
21
+ logger:
22
+ wandb:
23
+ offline: True
24
+ log_model: False
25
+
26
+ callbacks:
27
+ rich_progress_bar: null
28
+
29
+ wandb_offline_checkpoint:
30
+ _target_: src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback
audio-embeddings/configs/experiment/best_rq_2/audioset_800k_64bs_4gpu.yaml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # to execute this experiment run:
4
+ # python train.py experiment=best_rq_2/audioset_800k_64bs_4gpu
5
+
6
+ defaults:
7
+ - override /data: audioset
8
+ - override /model: best_rq2
9
+ - override /trainer: ddp
10
+ - override /logger: wandb
11
+
12
+ tags: ["audioset", "best-rq-2", "800k", "64bs", "4GPU", "cluster GPU"]
13
+
14
+ trainer:
15
+ max_steps: 800000
16
+
17
+ data:
18
+ batch_size: 64
19
+ num_workers: ${oc.decode:${oc.env:SLURM_CPUS_PER_TASK}}
20
+
21
+ logger:
22
+ wandb:
23
+ offline: True
24
+ log_model: False
25
+
26
+ callbacks:
27
+ rich_progress_bar: null
28
+
29
+ wandb_offline_checkpoint:
30
+ _target_: src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback
audio-embeddings/configs/experiment/best_rq_2/audioset_ema.yaml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # to execute this experiment run:
4
+ # python train.py experiment=best_rq_2/audioset_ema
5
+
6
+ defaults:
7
+ - /callbacks/ema_weight_averaging
8
+ - override /data: audioset
9
+ - override /model: best_rq2
10
+ - override /trainer: gpu
11
+ - override /logger: wandb
12
+
13
+ tags: ["audioset", "best-rq-2", "ema", "cluster GPU"]
14
+
15
+ trainer:
16
+ max_steps: 200000
17
+
18
+ data:
19
+ batch_size: 256
20
+ num_workers: ${oc.decode:${oc.env:SLURM_CPUS_PER_TASK}}
21
+
22
+ logger:
23
+ wandb:
24
+ offline: True
25
+ log_model: False
26
+
27
+ callbacks:
28
+ rich_progress_bar: null
29
+
30
+ wandb_offline_checkpoint:
31
+ _target_: src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback
32
+
33
+ model:
34
+ ema:
35
+ enabled: true
audio-embeddings/configs/experiment/best_rq_2/audioset_ema_600k.yaml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # to execute this experiment run:
4
+ # python train.py experiment=best_rq_2/audioset_ema_600k
5
+
6
+ defaults:
7
+ - /callbacks/ema_weight_averaging
8
+ - override /data: audioset
9
+ - override /model: best_rq2
10
+ - override /trainer: gpu
11
+ - override /logger: wandb
12
+
13
+ tags: ["audioset", "best-rq-2", "ema", "600k", "cluster GPU"]
14
+
15
+ trainer:
16
+ max_steps: 600000
17
+
18
+ data:
19
+ batch_size: 256
20
+ num_workers: ${oc.decode:${oc.env:SLURM_CPUS_PER_TASK}}
21
+
22
+ logger:
23
+ wandb:
24
+ offline: True
25
+ log_model: False
26
+
27
+ callbacks:
28
+ rich_progress_bar: null
29
+
30
+ wandb_offline_checkpoint:
31
+ _target_: src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback
32
+
33
+ model:
34
+ ema:
35
+ enabled: true
36
+ decay_numerator: 40.0
audio-embeddings/configs/experiment/best_rq_2/yt1b_ema.yaml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # to execute this experiment run:
4
+ # python train.py experiment=best_rq_2/yt1b_ema
5
+
6
+ defaults:
7
+ - /callbacks/ema_weight_averaging
8
+ - override /data: yt1b
9
+ - override /model: best_rq2
10
+ - override /trainer: gpu
11
+ - override /logger: wandb
12
+
13
+ tags: ["yt1b", "best-rq-2", "ema", "cluster GPU"]
14
+
15
+ trainer:
16
+ max_steps: 600000
17
+
18
+ data:
19
+ batch_size: 256
20
+ num_workers: ${oc.decode:${oc.env:SLURM_CPUS_PER_TASK}}
21
+
22
+ logger:
23
+ wandb:
24
+ offline: True
25
+ log_model: False
26
+
27
+ callbacks:
28
+ rich_progress_bar: null
29
+
30
+ wandb_offline_checkpoint:
31
+ _target_: src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback
32
+
33
+ model:
34
+ warmup_pct: 0.03
35
+ ema:
36
+ enabled: true
37
+ decay_numerator: 40.0
audio-embeddings/configs/experiment/local/audio_jepa.yaml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # to execute this experiment run:
4
+ # python train.py experiment=audioset_baseline
5
+
6
+ defaults:
7
+ - override /data: audioset
8
+ - override /model: audio_jepa
9
+ - override /trainer: gpu
10
+ - override /logger: wandb
11
+
12
+ # all parameters below will be merged with parameters from default configurations set above
13
+ # this allows you to overwrite only specified parameters
14
+
15
+ tags: ["audioset", "jepa", "baseline", "local GPU"]
16
+
17
+ trainer:
18
+ devices: 1
19
+ max_steps: 200000
20
+
21
+ data:
22
+ batch_size: 128
23
+ num_workers: 16
24
+
25
+ logger:
26
+ wandb:
27
+ name: "local-jepa-audioset-baseline-200k-128x1bs"
28
+ offline: False
29
+ log_model: True
audio-embeddings/configs/experiment/local/audio_jepa_rope.yaml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # to execute this experiment run:
4
+ # python train.py experiment=audioset_rope
5
+
6
+ defaults:
7
+ - override /data: audioset
8
+ - override /model: audio_jepa
9
+ - override /trainer: gpu
10
+ - override /logger: wandb
11
+
12
+ # all parameters below will be merged with parameters from default configurations set above
13
+ # this allows you to overwrite only specified parameters
14
+
15
+ tags: ["audioset", "jepa", "rope", "local GPU"]
16
+
17
+ trainer:
18
+ devices: 1
19
+ max_steps: 200000
20
+
21
+ data:
22
+ batch_size: 128
23
+ num_workers: 16
24
+
25
+ logger:
26
+ wandb:
27
+ name: "local-jepa-audioset-rope-200k-128x1bs"
28
+ offline: False
29
+ log_model: True
30
+
31
+ model:
32
+ net:
33
+ encoder:
34
+ pos_embed_type: rope
35
+ predictor:
36
+ pos_embed_type: rope
audio-embeddings/configs/experiment/local/best_rq.yaml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # to execute this experiment run:
4
+ # python train.py experiment=local/best_rq
5
+
6
+ defaults:
7
+ - override /data: audioset
8
+ - override /model: best_rq
9
+ - override /trainer: gpu
10
+ - override /logger: wandb
11
+
12
+ # all parameters below will be merged with parameters from default configurations set above
13
+ # this allows you to overwrite only specified parameters
14
+
15
+ tags: ["audioset", "best-rq", "baseline", "local GPU"]
16
+
17
+ trainer:
18
+ devices: 1
19
+ max_steps: 200000
20
+
21
+ data:
22
+ batch_size: 128
23
+ num_workers: 16
24
+
25
+ logger:
26
+ wandb:
27
+ name: "local-best-rq-vit-audioset-200k-128x1bs"
28
+ offline: False
29
+ log_model: True
audio-embeddings/configs/experiment/local/best_rq2.yaml ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # to execute this experiment run:
4
+ # python train.py experiment=local/best_rq2
5
+
6
+ defaults:
7
+ - override /data: audioset
8
+ - override /model: best_rq2
9
+ - override /trainer: gpu
10
+ - override /logger: wandb
11
+
12
+ # all parameters below will be merged with parameters from default configurations set above
13
+ # this allows you to overwrite only specified parameters
14
+
15
+ tags: ["audioset", "best-rq-2", "baseline", "local GPU"]
16
+
17
+ trainer:
18
+ devices: 1
19
+ max_steps: 100000
20
+
21
+ data:
22
+ batch_size: 128
23
+ num_workers: 16
24
+
25
+ logger:
26
+ wandb:
27
+ name: "cluster-best-rq-2-audioset-100k-128bs"
28
+ offline: True
29
+ log_model: False
30
+
31
+ callbacks:
32
+ # model_checkpoint:
33
+ # save_weights_only: True
34
+
35
+ rich_progress_bar: null
36
+
37
+ wandb_offline_checkpoint:
38
+ _target_: src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback
audio-embeddings/configs/experiment/local/m4_mock_jepa.yaml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # to execute this experiment run:
4
+ # python train.py experiment=local/m4_mock_jepa
5
+
6
+ defaults:
7
+ - override /data: mock_audioset
8
+ - override /model: audio_jepa
9
+ - override /trainer: mps
10
+ - override /logger: wandb
11
+
12
+ # all parameters below will be merged with parameters from default configurations set above
13
+ # this allows you to overwrite only specified parameters
14
+
15
+ tags: ["mock", "jepa", "local", "mps"]
16
+
17
+ trainer:
18
+ # devices: 1 # set in trainer/mps.yaml
19
+ max_steps: 100
20
+ log_every_n_steps: 1
21
+ val_check_interval: 50
22
+ limit_val_batches: 5
23
+
24
+ data:
25
+ batch_size: 8
26
+
27
+ logger:
28
+ wandb:
29
+ name: "local-m4-mock-jepa"
30
+ offline: True
31
+ log_model: False
32
+
33
+ callbacks:
34
+ model_checkpoint:
35
+ save_weights_only: True
36
+ monitor: null # verify we can save without monitoring
37
+ device_stats: null
audio-embeddings/configs/experiment/local/rqa_jepa.yaml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # to execute this experiment run:
4
+ # python train.py experiment=local_rqa_jepa_audioset
5
+
6
+ defaults:
7
+ - override /data: audioset
8
+ - override /model: rqa_jepa
9
+ - override /trainer: gpu
10
+ - override /logger: wandb
11
+
12
+ # all parameters below will be merged with parameters from default configurations set above
13
+ # this allows you to overwrite only specified parameters
14
+
15
+ tags: ["audioset", "rqa-jepa", "baseline", "local GPU"]
16
+
17
+ trainer:
18
+ devices: 1
19
+ max_steps: 200000
20
+
21
+ data:
22
+ batch_size: 128
23
+ num_workers: 16
24
+
25
+ logger:
26
+ wandb:
27
+ name: "local-rqa-jepa-audioset-200k-128x1bs"
28
+ offline: False
29
+ log_model: True
audio-embeddings/configs/experiment/rqa_jepa/audioset.yaml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # to execute this experiment run:
4
+ # python train.py experiment=cluster_rqa_jepa_audioset
5
+
6
+ defaults:
7
+ - override /data: audioset
8
+ - override /model: rqa_jepa
9
+ - override /trainer: gpu
10
+ - override /logger: wandb
11
+
12
+ # all parameters below will be merged with parameters from default configurations set above
13
+ # this allows you to overwrite only specified parameters
14
+
15
+ tags: ["audioset", "rqa-jepa", "baseline", "cluster GPU"]
16
+
17
+ trainer:
18
+ max_steps: 200000
19
+
20
+ data:
21
+ batch_size: 256
22
+ num_workers: ${oc.decode:${oc.env:SLURM_CPUS_PER_TASK}}
23
+
24
+ logger:
25
+ wandb:
26
+ offline: True
27
+ log_model: False
28
+
29
+ callbacks:
30
+ rich_progress_bar: null
31
+
32
+ wandb_offline_checkpoint:
33
+ _target_: src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback
audio-embeddings/configs/experiment/rqa_jepa/yt1b.yaml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ defaults:
4
+ - override /data: yt1b
5
+ - override /model: rqa_jepa
6
+ - override /trainer: gpu
7
+ - override /logger: wandb
8
+ - override /callbacks: default
9
+
10
+ # all parameters below will be merged with parameters from default configurations set above
11
+ # this allows you to overwrite only specified parameters
12
+
13
+ tags: ["yt1b", "rqa-jepa", "baseline", "cluster GPU"]
14
+
15
+ trainer:
16
+ max_steps: 200000
17
+
18
+ data:
19
+ batch_size: 256
20
+ num_workers: ${oc.decode:${oc.env:SLURM_CPUS_PER_TASK}}
21
+
22
+ logger:
23
+ wandb:
24
+ offline: True
25
+ log_model: False
26
+
27
+ callbacks:
28
+ rich_progress_bar: null
29
+
30
+ wandb_offline_checkpoint:
31
+ _target_: src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback