Upload folder using huggingface_hub
- .gitignore +1 -0
- README.md +11 -6
- app.py +556 -0
- requirements.txt +17 -0
.gitignore
ADDED
@@ -0,0 +1 @@
+__pycache__/
README.md
CHANGED
@@ -1,12 +1,17 @@
 ---
-title: ACE
-emoji:
-colorFrom:
-colorTo:
+title: ACE-Step CPU
+emoji: 🎵
+colorFrom: blue
+colorTo: purple
 sdk: gradio
-sdk_version:
+sdk_version: 5.50.0
 app_file: app.py
 pinned: false
+license: mit
+startup_duration_timeout: 3600
+python_version: 3.11
 ---
 
-
+# ACE-Step 1.5 Music Generation (CPU)
+
+Generate music from text descriptions and train LoRA adapters, all on CPU.
app.py
ADDED
@@ -0,0 +1,556 @@
+"""
+ACE-Step 1.5 Music Generation + LoRA Training (CPU)
+Runs on HuggingFace Spaces free CPU tier.
+"""
+
+import os
+import sys
+import gc
+import time
+import tempfile
+import shutil
+from pathlib import Path
+
+# Force CPU, no CUDA
+os.environ["CUDA_VISIBLE_DEVICES"] = ""
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+os.environ["TORCHAUDIO_USE_BACKEND"] = "ffmpeg"
+os.environ["ACESTEP_DISABLE_TQDM"] = "1"
+
+import torch
+torch.set_default_dtype(torch.float32)
+
+import numpy as np
+import gradio as gr
+import soundfile as sf
+
+# ---------------------------------------------------------------------------
+# Clone ACE-Step repo if not present
+# ---------------------------------------------------------------------------
+REPO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ace-step-source")
+if not os.path.isdir(REPO_DIR):
+    print("[Setup] Cloning ACE-Step 1.5 repository...")
+    os.system(f"git clone --depth 1 https://github.com/ace-step/ACE-Step-1.5 {REPO_DIR}")
+
+# Add repo to path
+if REPO_DIR not in sys.path:
+    sys.path.insert(0, REPO_DIR)
+
+# ---------------------------------------------------------------------------
+# Lazy-load handler (downloads model on first use)
+# ---------------------------------------------------------------------------
+_dit_handler = None
+_init_status = None
+
+CHECKPOINT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "checkpoints")
+LORA_OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "lora_output")
+CURRENT_LM_SIZE = "1.7B"  # Track current LM size
+
+
+def get_handler():
+    """Get or initialize the ACE-Step handler (lazy, first call downloads model)."""
+    global _dit_handler, _init_status
+
+    if _dit_handler is not None and _dit_handler.model is not None:
+        return _dit_handler, _init_status
+
+    from acestep.handler import AceStepHandler
+    from acestep.model_downloader import ensure_main_model
+
+    print("[Init] Ensuring model is downloaded...")
+    success, msg = ensure_main_model(
+        checkpoints_dir=Path(CHECKPOINT_DIR),
+        prefer_source="huggingface",
+    )
+    print(f"[Init] Model download: {msg}")
+
+    if not success:
+        _init_status = f"Model download failed: {msg}"
+        return None, _init_status
+
+    _dit_handler = AceStepHandler()
+    project_root = os.path.dirname(os.path.abspath(__file__))
+
+    os.environ["ACESTEP_PROJECT_ROOT"] = project_root
+
+    status, ok = _dit_handler.initialize_service(
+        project_root=project_root,
+        config_path="acestep-v15-turbo",
+        device="cpu",
+        use_flash_attention=False,
+        compile_model=False,
+        offload_to_cpu=False,
+        offload_dit_to_cpu=False,
+        quantization=None,
+        use_mlx_dit=False,
+    )
+
+    _init_status = status
+    if not ok:
+        print(f"[Init] FAILED: {status}")
+        _dit_handler = None
+        return None, _init_status
+
+    # Force float32 on everything
+    _dit_handler.dtype = torch.float32
+    if _dit_handler.model is not None:
+        _dit_handler.model = _dit_handler.model.float().to("cpu")
+    if _dit_handler.vae is not None:
+        _dit_handler.vae = _dit_handler.vae.float().to("cpu")
+    if _dit_handler.text_encoder is not None:
+        _dit_handler.text_encoder = _dit_handler.text_encoder.float().to("cpu")
+
+    print(f"[Init] OK: {status}")
+    return _dit_handler, _init_status
+
+
+def get_trained_loras():
+    """List available trained LoRAs."""
+    loras = ["None (no LoRA)"]
+    if os.path.isdir(LORA_OUTPUT_DIR):
+        for name in sorted(os.listdir(LORA_OUTPUT_DIR)):
+            lora_dir = os.path.join(LORA_OUTPUT_DIR, name)
+            if os.path.isdir(lora_dir):
+                # Check for any .safetensors or .pt files
+                for f in os.listdir(lora_dir):
+                    if f.endswith((".safetensors", ".pt", ".bin")):
+                        loras.append(name)
+                        break
+    return loras
+
+
+# ---------------------------------------------------------------------------
+# Generate Tab
+# ---------------------------------------------------------------------------
+def generate_music(
+    caption,
+    lyrics,
+    instrumental,
+    bpm,
+    duration,
+    seed,
+    inference_steps,
+    lm_size,
+    lora_choice,
+    progress=gr.Progress(track_tqdm=True),
+):
+    """Generate music from text prompt on CPU."""
+    t0 = time.time()
+
+    handler, status = get_handler()
+    if handler is None:
+        return None, f"Model not ready: {status}"
+
+    # Apply trained LoRA if selected
+    if lora_choice and lora_choice != "None (no LoRA)":
+        lora_dir = os.path.join(LORA_OUTPUT_DIR, lora_choice)
+        if os.path.isdir(lora_dir):
+            try:
+                handler.load_lora(lora_dir)
+                print(f"[Gen] Loaded LoRA: {lora_choice}")
+            except Exception as e:
+                print(f"[Gen] LoRA load failed: {e}")
+
+    # TODO: LM size switching requires re-downloading the LM model
+    # For now, log the selected size
+    if lm_size != CURRENT_LM_SIZE:
+        print(f"[Gen] LM size {lm_size} requested (current: {CURRENT_LM_SIZE})")
+
+    # Clamp values
+    duration = max(10, min(float(duration), 120))  # cap at 120s for CPU
+    inference_steps = max(1, min(int(inference_steps), 32))
+    bpm_val = int(bpm) if bpm and int(bpm) > 0 else None
+    seed_val = int(seed) if seed and int(seed) >= 0 else -1
+
+    try:
+        result = handler.generate_music(
+            captions=caption or "upbeat electronic dance music",
+            lyrics=lyrics or "[Instrumental]",
+            bpm=bpm_val,
+            audio_duration=duration,
+            inference_steps=inference_steps,
+            guidance_scale=1.0,  # turbo model, no CFG needed
+            use_random_seed=(seed_val < 0),
+            seed=str(seed_val) if seed_val >= 0 else "",
+            batch_size=1,
+            task_type="text2music",
+            vocal_language="en",
+            shift=1.0,
+            infer_method="ode",
+            progress=None,
+        )
+
+        elapsed = time.time() - t0
+
+        if not result.get("success", False):
+            error = result.get("error", result.get("status_message", "Unknown error"))
+            return None, f"Generation failed: {error}"
+
+        audios = result.get("audios", [])
+        if not audios:
+            return None, "No audio generated"
+
+        audio_tensor = audios[0].get("tensor")
+        sample_rate = audios[0].get("sample_rate", 48000)
+
+        if audio_tensor is None:
+            return None, "Audio tensor is None"
+
+        # Convert to numpy
+        if isinstance(audio_tensor, torch.Tensor):
+            audio_np = audio_tensor.cpu().float().numpy()
+        else:
+            audio_np = np.array(audio_tensor, dtype=np.float32)
+
+        # Save to temp file
+        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+        # soundfile expects (samples, channels)
+        if audio_np.ndim == 2:
+            audio_np = audio_np.T  # (channels, samples) -> (samples, channels)
+        sf.write(tmp.name, audio_np, sample_rate)
+
+        status_msg = (
+            f"Generated in {elapsed:.1f}s | "
+            f"Duration: {duration}s | Steps: {inference_steps} | "
+            f"Seed: {seed_val}"
+        )
+        return tmp.name, status_msg
+
+    except Exception as e:
+        import traceback
+        return None, f"Error: {e}\n{traceback.format_exc()}"
+    finally:
+        gc.collect()
+
+
+# ---------------------------------------------------------------------------
+# Train LoRA Tab
+# ---------------------------------------------------------------------------
+def train_lora(
+    audio_files,
+    lora_name,
+    epochs,
+    learning_rate,
+    lora_rank,
+    progress=gr.Progress(track_tqdm=True),
+):
+    """Train a LoRA adapter from uploaded audio files on CPU."""
+    if not audio_files:
+        return "No audio files uploaded."
+
+    handler, status = get_handler()
+    if handler is None:
+        return f"Model not ready: {status}"
+
+    lora_name = lora_name.strip() or "my_lora"
+    epochs = max(1, min(int(epochs), 10))
+    lr = float(learning_rate)
+    rank = max(1, min(int(lora_rank), 64))
+
+    output_dir = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), "lora_output", lora_name
+    )
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Create a temp directory for audio files
+    audio_dir = os.path.join(output_dir, "audio_input")
+    os.makedirs(audio_dir, exist_ok=True)
+
+    # Copy uploaded files
+    for f in audio_files:
+        src = f.name if hasattr(f, "name") else str(f)
+        dst = os.path.join(audio_dir, os.path.basename(src))
+        shutil.copy2(src, dst)
+
+    log_lines = []
+    log_lines.append(f"LoRA Training: '{lora_name}'")
+    log_lines.append(f"Audio files: {len(audio_files)}")
+    log_lines.append(f"Epochs: {epochs}, LR: {lr}, Rank: {rank}")
+    log_lines.append(f"Output: {output_dir}")
+    log_lines.append("")
+
+    try:
+        # Preprocessing step: encode audio files to tensors
+        log_lines.append("[Step 1/2] Preprocessing audio files...")
+
+        tensor_dir = os.path.join(output_dir, "preprocessed_tensors")
+        os.makedirs(tensor_dir, exist_ok=True)
+
+        from acestep.training_v2.preprocess import preprocess_audio_files
+
+        preprocess_result = preprocess_audio_files(
+            audio_dir=audio_dir,
+            output_dir=tensor_dir,
+            checkpoint_dir=CHECKPOINT_DIR,
+            variant="turbo",
+            max_duration=60.0,
+            device="cpu",
+            precision="float32",
+        )
+
+        processed = preprocess_result.get("processed", 0)
+        total = preprocess_result.get("total", 0)
+        failed = preprocess_result.get("failed", 0)
+        log_lines.append(f"  Preprocessed: {processed}/{total} (failed: {failed})")
+
+        if processed == 0:
+            log_lines.append("ERROR: No files were preprocessed successfully.")
+            return "\n".join(log_lines)
+
+        # Training step
+        log_lines.append("[Step 2/2] Training LoRA adapter...")
+
+        from acestep.training_v2.model_loader import load_decoder_for_training
+        from acestep.training_v2.trainer_fixed import FixedLoRATrainer
+        from acestep.training_v2.fixed_lora_module import AdapterConfig
+        from acestep.training_v2.configs import TrainingConfigV2
+
+        # Load model for training
+        model = load_decoder_for_training(
+            checkpoint_dir=CHECKPOINT_DIR,
+            variant="turbo",
+            device="cpu",
+            precision="float32",
+        )
+
+        adapter_cfg = AdapterConfig(
+            rank=rank,
+            alpha=rank,
+            dropout=0.0,
+            adapter_type="lora",
+        )
+
+        train_cfg = TrainingConfigV2(
+            checkpoint_dir=CHECKPOINT_DIR,
+            model_variant="turbo",
+            dataset_dir=tensor_dir,
+            output_dir=output_dir,
+            max_epochs=epochs,
+            batch_size=1,
+            learning_rate=lr,
+            device="cpu",
+            precision="float32",
+            seed=42,
+            num_workers=0,
+            pin_memory=False,
+        )
+
+        trainer = FixedLoRATrainer(model, adapter_cfg, train_cfg)
+
+        step_count = 0
+        last_loss = 0.0
+        for update in trainer.train():
+            if hasattr(update, "step"):
+                step_count = update.step
+                last_loss = update.loss
+                if step_count % 5 == 0:
+                    log_lines.append(f"  Step {step_count}: loss={last_loss:.4f}")
+            elif isinstance(update, tuple) and len(update) >= 2:
+                step_count = update[0]
+                last_loss = update[1]
+                if step_count % 5 == 0:
+                    log_lines.append(f"  Step {step_count}: loss={last_loss:.4f}")
+
+        log_lines.append(f"Training complete! Final step: {step_count}, loss: {last_loss:.4f}")
+        log_lines.append(f"LoRA saved to: {output_dir}")
+
+        # Cleanup
+        del model, trainer
+        gc.collect()
+
+    except Exception as e:
+        import traceback
+        log_lines.append(f"ERROR: {e}")
+        log_lines.append(traceback.format_exc())
+
+    return "\n".join(log_lines)
+
+
+# ---------------------------------------------------------------------------
+# Gradio UI
+# ---------------------------------------------------------------------------
+def build_ui():
+    theme = gr.themes.Default()
+    try:
+        theme = gr.Theme.from_hub("NoCrypt/miku")
+    except Exception:
+        pass
+
+    with gr.Blocks(
+        theme=theme,
+        title="ACE-Step 1.5 CPU",
+        css="""
+        .main-title { text-align: center; margin-bottom: 0.5em; }
+        .status-box { font-family: monospace; font-size: 0.85em; }
+        """,
+    ) as demo:
+        gr.HTML("<h1 class='main-title'>ACE-Step 1.5 Music Generation (CPU)</h1>")
+        gr.HTML(
+            "<p style='text-align:center;'>Text-to-music generation and LoRA training, "
+            "running entirely on CPU. Based on "
+            "<a href='https://github.com/ace-step/ACE-Step-1.5'>ACE-Step 1.5</a>.</p>"
+        )
+
+        with gr.Tabs():
+            # ---- Generate Tab ----
+            with gr.Tab("Generate Music"):
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        caption_input = gr.Textbox(
+                            label="Music Description",
+                            placeholder="e.g. upbeat electronic dance music, 120 BPM",
+                            lines=3,
+                            value="upbeat electronic dance music, energetic synth leads, driving bassline",
+                        )
+                        lyrics_input = gr.Textbox(
+                            label="Lyrics (use [Instrumental] for no vocals)",
+                            placeholder="[Instrumental]",
+                            lines=3,
+                            value="[Instrumental]",
+                        )
+                        instrumental_cb = gr.Checkbox(
+                            label="Instrumental (no vocals)",
+                            value=True,
+                        )
+                    with gr.Column(scale=1):
+                        bpm_input = gr.Number(
+                            label="BPM (0 = auto)",
+                            value=120,
+                            minimum=0,
+                            maximum=300,
+                        )
+                        duration_input = gr.Slider(
+                            label="Duration (seconds)",
+                            minimum=10,
+                            maximum=120,
+                            value=10,
+                            step=5,
+                        )
+                        seed_input = gr.Number(
+                            label="Seed (-1 = random)",
+                            value=-1,
+                        )
+                        steps_input = gr.Slider(
+                            label="Inference Steps (fewer = faster)",
+                            minimum=1,
+                            maximum=32,
+                            value=8,
+                            step=1,
+                        )
+                        lm_size_input = gr.Dropdown(
+                            label="LM Model Size",
+                            choices=["0.6B (fast)", "1.7B (balanced)", "4B (best quality)"],
+                            value="1.7B (balanced)",
+                            info="Language model for music understanding",
+                        )
+                        lora_select = gr.Dropdown(
+                            label="Use Trained LoRA",
+                            choices=get_trained_loras(),
+                            value="None (no LoRA)",
+                            info="Select a LoRA you trained to apply it",
+                        )
+
+                generate_btn = gr.Button("Generate Music", variant="primary")
+
+                with gr.Row():
+                    audio_output = gr.Audio(
+                        label="Generated Audio",
+                        type="filepath",
+                    )
+                    gen_status = gr.Textbox(
+                        label="Status",
+                        interactive=False,
+                        elem_classes="status-box",
+                    )
+
+                generate_btn.click(
+                    fn=generate_music,
+                    inputs=[
+                        caption_input,
+                        lyrics_input,
+                        instrumental_cb,
+                        bpm_input,
+                        duration_input,
+                        seed_input,
+                        steps_input,
+                        lm_size_input,
+                        lora_select,
+                    ],
+                    outputs=[audio_output, gen_status],
+                )
+
+            # ---- Train LoRA Tab ----
+            with gr.Tab("Train LoRA"):
+                gr.Markdown(
+                    "### Train a LoRA adapter on your audio files\n"
+                    "Upload WAV/MP3/FLAC files to fine-tune the model. "
+                    "Training runs on CPU so keep epochs low and files short."
+                )
+                with gr.Row():
+                    with gr.Column():
+                        audio_upload = gr.File(
+                            label="Upload Audio Files",
+                            file_count="multiple",
+                            file_types=["audio"],
+                        )
+                        lora_name_input = gr.Textbox(
+                            label="LoRA Name",
+                            value="my_lora",
+                        )
+                    with gr.Column():
+                        epochs_input = gr.Slider(
+                            label="Epochs",
+                            minimum=1,
+                            maximum=10,
+                            value=1,
+                            step=1,
+                        )
+                        lr_input = gr.Number(
+                            label="Learning Rate",
+                            value=1e-4,
+                        )
+                        rank_input = gr.Slider(
+                            label="LoRA Rank",
+                            minimum=1,
+                            maximum=64,
+                            value=8,
+                            step=1,
+                        )
+
+                train_btn = gr.Button("Start Training", variant="primary")
+                train_log = gr.Textbox(
+                    label="Training Log",
+                    interactive=False,
+                    lines=15,
+                    elem_classes="status-box",
+                )
+
+                def train_and_refresh(*args):
+                    log = train_lora(*args)
+                    new_loras = get_trained_loras()
+                    return log, gr.update(choices=new_loras, value=new_loras[-1] if len(new_loras) > 1 else "None (no LoRA)")
+
+                train_btn.click(
+                    fn=train_and_refresh,
+                    inputs=[
+                        audio_upload,
+                        lora_name_input,
+                        epochs_input,
+                        lr_input,
+                        rank_input,
+                    ],
+                    outputs=[train_log, lora_select],
+                )
+
+    return demo
+
+
+if __name__ == "__main__":
+    demo = build_ui()
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True,
+        ssr_mode=False,
+    )
+
requirements.txt
ADDED
@@ -0,0 +1,17 @@
+# ACE-Step CPU requirements
+torch
+torchaudio
+torchvision
+safetensors
+transformers>=4.51.0,<4.58.0
+diffusers
+matplotlib>=3.7.5
+scipy>=1.10.1
+soundfile>=0.13.1
+loguru>=0.7.3
+einops>=0.8.1
+accelerate>=1.12.0
+numba>=0.63.1
+vector-quantize-pytorch>=1.27.15
+peft>=0.18.0
+huggingface_hub