"""ACE-Step 1.5 Music Generation + LoRA Training (CPU).

Runs on HuggingFace Spaces free CPU tier.
"""

import os
import sys
import gc
import time
import tempfile
import shutil
from pathlib import Path

# Force CPU, no CUDA — must be set before torch is imported.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["TORCHAUDIO_USE_BACKEND"] = "ffmpeg"
os.environ["ACESTEP_DISABLE_TQDM"] = "1"

import torch

torch.set_default_dtype(torch.float32)

import numpy as np
import gradio as gr
import soundfile as sf

# ---------------------------------------------------------------------------
# Clone ACE-Step repo if not present
# ---------------------------------------------------------------------------
REPO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ace-step-source")
if not os.path.isdir(REPO_DIR):
    print("[Setup] Cloning ACE-Step 1.5 repository...")
    os.system(f"git clone --depth 1 https://github.com/ace-step/ACE-Step-1.5 {REPO_DIR}")

# Add repo to path so `acestep.*` imports resolve.
if REPO_DIR not in sys.path:
    sys.path.insert(0, REPO_DIR)

# ---------------------------------------------------------------------------
# Lazy-load handler (downloads model on first use)
# ---------------------------------------------------------------------------
_dit_handler = None
_init_status = None
CHECKPOINT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "checkpoints")
LORA_OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "lora_output")
CURRENT_LM_SIZE = "1.7B"  # Track current LM size


def get_handler():
    """Get or initialize the ACE-Step handler (lazy, first call downloads model).

    Returns:
        Tuple of (handler or None, status message). ``handler`` is ``None``
        when download or service initialization failed.
    """
    global _dit_handler, _init_status

    # Fast path: already initialized with a loaded model.
    if _dit_handler is not None and _dit_handler.model is not None:
        return _dit_handler, _init_status

    from acestep.handler import AceStepHandler
    from acestep.model_downloader import ensure_main_model

    print("[Init] Ensuring model is downloaded...")
    success, msg = ensure_main_model(
        checkpoints_dir=Path(CHECKPOINT_DIR),
        prefer_source="huggingface",
    )
    print(f"[Init] Model download: {msg}")
    if not success:
        _init_status = f"Model download failed: {msg}"
        return None, _init_status

    _dit_handler = AceStepHandler()
    project_root = os.path.dirname(os.path.abspath(__file__))
    os.environ["ACESTEP_PROJECT_ROOT"] = project_root
    status, ok = _dit_handler.initialize_service(
        project_root=project_root,
        config_path="acestep-v15-turbo",
        device="cpu",
        use_flash_attention=False,
        compile_model=False,
        offload_to_cpu=False,
        offload_dit_to_cpu=False,
        quantization=None,
        use_mlx_dit=False,
    )
    _init_status = status
    if not ok:
        print(f"[Init] FAILED: {status}")
        _dit_handler = None
        return None, _init_status

    # Force float32 on everything — CPU inference; half precision is both
    # slow and inaccurate on most CPUs.
    _dit_handler.dtype = torch.float32
    if _dit_handler.model is not None:
        _dit_handler.model = _dit_handler.model.float().to("cpu")
    if _dit_handler.vae is not None:
        _dit_handler.vae = _dit_handler.vae.float().to("cpu")
    if _dit_handler.text_encoder is not None:
        _dit_handler.text_encoder = _dit_handler.text_encoder.float().to("cpu")

    print(f"[Init] OK: {status}")
    return _dit_handler, _init_status


def get_trained_loras():
    """List available trained LoRAs (directories containing weight files)."""
    loras = ["None (no LoRA)"]
    if os.path.isdir(LORA_OUTPUT_DIR):
        for name in sorted(os.listdir(LORA_OUTPUT_DIR)):
            lora_dir = os.path.join(LORA_OUTPUT_DIR, name)
            if os.path.isdir(lora_dir):
                # Check for any .safetensors or .pt files
                for f in os.listdir(lora_dir):
                    if f.endswith((".safetensors", ".pt", ".bin")):
                        loras.append(name)
                        break
    return loras


# ---------------------------------------------------------------------------
# Generate Tab
# ---------------------------------------------------------------------------
def generate_music(
    caption,
    lyrics,
    instrumental,
    bpm,
    duration,
    seed,
    inference_steps,
    lm_size,
    lora_choice,
    progress=gr.Progress(track_tqdm=True),
):
    """Generate music from text prompt on CPU.

    Returns:
        Tuple of (wav file path or None, status message).
    """
    t0 = time.time()
    handler, status = get_handler()
    if handler is None:
        return None, f"Model not ready: {status}"

    # TODO(review): `instrumental` is accepted from the UI checkbox but not
    # used — confirm whether it should override `lyrics` with "[Instrumental]".

    # Apply trained LoRA if selected
    if lora_choice and lora_choice != "None (no LoRA)":
        lora_dir = os.path.join(LORA_OUTPUT_DIR, lora_choice)
        if os.path.isdir(lora_dir):
            try:
                handler.load_lora(lora_dir)
                print(f"[Gen] Loaded LoRA: {lora_choice}")
            except Exception as e:
                print(f"[Gen] LoRA load failed: {e}")

    # TODO: LM size switching requires re-downloading the LM model
    # For now, log the selected size.
    # BUG FIX: the dropdown value is e.g. "1.7B (balanced)" while
    # CURRENT_LM_SIZE is "1.7B"; compare only the size token so the log
    # does not fire spuriously on every generation.
    if lm_size and lm_size.split()[0] != CURRENT_LM_SIZE:
        print(f"[Gen] LM size {lm_size} requested (current: {CURRENT_LM_SIZE})")

    # Clamp values to CPU-friendly ranges.
    duration = max(10, min(float(duration), 120))  # cap at 120s for CPU
    inference_steps = max(1, min(int(inference_steps), 32))
    bpm_val = int(bpm) if bpm and int(bpm) > 0 else None  # 0/empty = auto
    # BUG FIX: the original `seed and int(seed) >= 0` treated seed == 0 as
    # falsy, silently turning an explicit seed of 0 into a random seed.
    seed_val = int(seed) if seed is not None and int(seed) >= 0 else -1

    try:
        result = handler.generate_music(
            captions=caption or "upbeat electronic dance music",
            lyrics=lyrics or "[Instrumental]",
            bpm=bpm_val,
            audio_duration=duration,
            inference_steps=inference_steps,
            guidance_scale=1.0,  # turbo model, no CFG needed
            use_random_seed=(seed_val < 0),
            seed=str(seed_val) if seed_val >= 0 else "",
            batch_size=1,
            task_type="text2music",
            vocal_language="en",
            shift=1.0,
            infer_method="ode",
            progress=None,
        )
        elapsed = time.time() - t0

        if not result.get("success", False):
            error = result.get("error", result.get("status_message", "Unknown error"))
            return None, f"Generation failed: {error}"

        audios = result.get("audios", [])
        if not audios:
            return None, "No audio generated"

        audio_tensor = audios[0].get("tensor")
        sample_rate = audios[0].get("sample_rate", 48000)
        if audio_tensor is None:
            return None, "Audio tensor is None"

        # Convert to numpy
        if isinstance(audio_tensor, torch.Tensor):
            audio_np = audio_tensor.cpu().float().numpy()
        else:
            audio_np = np.array(audio_tensor, dtype=np.float32)

        # Save to temp file.
        # BUG FIX: close the handle before sf.write reopens the path —
        # leaving it open leaks the descriptor and fails on Windows.
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        tmp.close()
        # soundfile expects (samples, channels)
        if audio_np.ndim == 2:
            audio_np = audio_np.T  # (channels, samples) -> (samples, channels)
        sf.write(tmp.name, audio_np, sample_rate)

        status_msg = (
            f"Generated in {elapsed:.1f}s | "
            f"Duration: {duration}s | Steps: {inference_steps} | "
            f"Seed: {seed_val}"
        )
        return tmp.name, status_msg

    except Exception as e:
        import traceback

        return None, f"Error: {e}\n{traceback.format_exc()}"
    finally:
        gc.collect()


# ---------------------------------------------------------------------------
# Train LoRA Tab
# ---------------------------------------------------------------------------
def train_lora(
    audio_files,
    lora_name,
    epochs,
    learning_rate,
    lora_rank,
    progress=gr.Progress(track_tqdm=True),
):
    """Train a LoRA adapter from uploaded audio files on CPU.

    Returns:
        A multi-line log string describing the run (or the failure).
    """
    if not audio_files:
        return "No audio files uploaded."

    handler, status = get_handler()
    if handler is None:
        return f"Model not ready: {status}"

    # Sanitize / clamp training hyperparameters.
    lora_name = lora_name.strip() or "my_lora"
    epochs = max(1, min(int(epochs), 10))
    lr = float(learning_rate)
    rank = max(1, min(int(lora_rank), 64))

    output_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "lora_output", lora_name
    )
    os.makedirs(output_dir, exist_ok=True)

    # Create a temp directory for audio files
    audio_dir = os.path.join(output_dir, "audio_input")
    os.makedirs(audio_dir, exist_ok=True)

    # Copy uploaded files (gradio may hand back file objects or plain paths).
    for f in audio_files:
        src = f.name if hasattr(f, "name") else str(f)
        dst = os.path.join(audio_dir, os.path.basename(src))
        shutil.copy2(src, dst)

    log_lines = []
    log_lines.append(f"LoRA Training: '{lora_name}'")
    log_lines.append(f"Audio files: {len(audio_files)}")
    log_lines.append(f"Epochs: {epochs}, LR: {lr}, Rank: {rank}")
    log_lines.append(f"Output: {output_dir}")
    log_lines.append("")

    try:
        # Preprocessing step: encode audio files to tensors
        log_lines.append("[Step 1/2] Preprocessing audio files...")
        tensor_dir = os.path.join(output_dir, "preprocessed_tensors")
        os.makedirs(tensor_dir, exist_ok=True)

        from acestep.training_v2.preprocess import preprocess_audio_files

        preprocess_result = preprocess_audio_files(
            audio_dir=audio_dir,
            output_dir=tensor_dir,
            checkpoint_dir=CHECKPOINT_DIR,
            variant="turbo",
            max_duration=60.0,
            device="cpu",
            precision="float32",
        )
        processed = preprocess_result.get("processed", 0)
        total = preprocess_result.get("total", 0)
        failed = preprocess_result.get("failed", 0)
        log_lines.append(f" Preprocessed: {processed}/{total} (failed: {failed})")
        if processed == 0:
            log_lines.append("ERROR: No files were preprocessed successfully.")
            return "\n".join(log_lines)

        # Training step
        log_lines.append("[Step 2/2] Training LoRA adapter...")
        from acestep.training_v2.model_loader import load_decoder_for_training
        from acestep.training_v2.trainer_fixed import FixedLoRATrainer
        from acestep.training_v2.fixed_lora_module import AdapterConfig
        from acestep.training_v2.configs import TrainingConfigV2

        # Load model for training
        model = load_decoder_for_training(
            checkpoint_dir=CHECKPOINT_DIR,
            variant="turbo",
            device="cpu",
            precision="float32",
        )
        adapter_cfg = AdapterConfig(
            rank=rank,
            alpha=rank,
            dropout=0.0,
            adapter_type="lora",
        )
        train_cfg = TrainingConfigV2(
            checkpoint_dir=CHECKPOINT_DIR,
            model_variant="turbo",
            dataset_dir=tensor_dir,
            output_dir=output_dir,
            max_epochs=epochs,
            batch_size=1,
            learning_rate=lr,
            device="cpu",
            precision="float32",
            seed=42,
            num_workers=0,
            pin_memory=False,
        )

        trainer = FixedLoRATrainer(model, adapter_cfg, train_cfg)

        # The trainer yields progress updates either as objects with
        # .step/.loss or as (step, loss, ...) tuples; handle both.
        step_count = 0
        last_loss = 0.0
        for update in trainer.train():
            if hasattr(update, "step"):
                step_count = update.step
                last_loss = update.loss
                if step_count % 5 == 0:
                    log_lines.append(f" Step {step_count}: loss={last_loss:.4f}")
            elif isinstance(update, tuple) and len(update) >= 2:
                step_count = update[0]
                last_loss = update[1]
                if step_count % 5 == 0:
                    log_lines.append(f" Step {step_count}: loss={last_loss:.4f}")

        log_lines.append(
            f"Training complete! Final step: {step_count}, loss: {last_loss:.4f}"
        )
        log_lines.append(f"LoRA saved to: {output_dir}")

        # Cleanup — release the training model before returning.
        del model, trainer
        gc.collect()

    except Exception as e:
        import traceback

        log_lines.append(f"ERROR: {e}")
        log_lines.append(traceback.format_exc())

    return "\n".join(log_lines)


# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------
def build_ui():
    """Build and return the Gradio Blocks app (Generate + Train tabs)."""
    theme = gr.themes.Default()
    try:
        theme = gr.Theme.from_hub("NoCrypt/miku")
    except Exception:
        # Hub theme is a nice-to-have; fall back to the default silently.
        pass

    with gr.Blocks(
        theme=theme,
        title="ACE-Step 1.5 CPU",
        css="""
        .main-title { text-align: center; margin-bottom: 0.5em; }
        .status-box { font-family: monospace; font-size: 0.85em; }
        """,
    ) as demo:
        # NOTE(review): the original HTML markup was garbled in this file;
        # reconstructed minimally from the visible text and the .main-title
        # CSS rule — confirm against the upstream app.
        gr.HTML('<h1 class="main-title">ACE-Step 1.5 Music Generation (CPU)</h1>')
        gr.HTML(
            '<div style="text-align: center;">'
            "Text-to-music generation and LoRA training, "
            "running entirely on CPU. Based on "
            '<a href="https://github.com/ace-step/ACE-Step-1.5" target="_blank">'
            "ACE-Step 1.5</a>.</div>"
        )

        with gr.Tabs():
            # ---- Generate Tab ----
            with gr.Tab("Generate Music"):
                with gr.Row():
                    with gr.Column(scale=2):
                        caption_input = gr.Textbox(
                            label="Music Description",
                            placeholder="e.g. upbeat electronic dance music, 120 BPM",
                            lines=3,
                            value="upbeat electronic dance music, energetic synth leads, driving bassline",
                        )
                        lyrics_input = gr.Textbox(
                            label="Lyrics (use [Instrumental] for no vocals)",
                            placeholder="[Instrumental]",
                            lines=3,
                            value="[Instrumental]",
                        )
                        instrumental_cb = gr.Checkbox(
                            label="Instrumental (no vocals)",
                            value=True,
                        )
                    with gr.Column(scale=1):
                        bpm_input = gr.Number(
                            label="BPM (0 = auto)",
                            value=120,
                            minimum=0,
                            maximum=300,
                        )
                        duration_input = gr.Slider(
                            label="Duration (seconds)",
                            minimum=10,
                            maximum=120,
                            value=10,
                            step=5,
                        )
                        seed_input = gr.Number(
                            label="Seed (-1 = random)",
                            value=-1,
                        )
                        steps_input = gr.Slider(
                            label="Inference Steps (fewer = faster)",
                            minimum=1,
                            maximum=32,
                            value=8,
                            step=1,
                        )
                        lm_size_input = gr.Dropdown(
                            label="LM Model Size",
                            choices=["0.6B (fast)", "1.7B (balanced)", "4B (best quality)"],
                            value="1.7B (balanced)",
                            info="Language model for music understanding",
                        )
                        lora_select = gr.Dropdown(
                            label="Use Trained LoRA",
                            choices=get_trained_loras(),
                            value="None (no LoRA)",
                            info="Select a LoRA you trained to apply it",
                        )

                generate_btn = gr.Button("Generate Music", variant="primary")

                with gr.Row():
                    audio_output = gr.Audio(
                        label="Generated Audio",
                        type="filepath",
                    )
                gen_status = gr.Textbox(
                    label="Status",
                    interactive=False,
                    elem_classes="status-box",
                )

                generate_btn.click(
                    fn=generate_music,
                    inputs=[
                        caption_input,
                        lyrics_input,
                        instrumental_cb,
                        bpm_input,
                        duration_input,
                        seed_input,
                        steps_input,
                        lm_size_input,
                        lora_select,
                    ],
                    outputs=[audio_output, gen_status],
                )

            # ---- Train LoRA Tab ----
            with gr.Tab("Train LoRA"):
                gr.Markdown(
                    "### Train a LoRA adapter on your audio files\n"
                    "Upload WAV/MP3/FLAC files to fine-tune the model. "
                    "Training runs on CPU so keep epochs low and files short."
                )
                with gr.Row():
                    with gr.Column():
                        audio_upload = gr.File(
                            label="Upload Audio Files",
                            file_count="multiple",
                            file_types=["audio"],
                        )
                        lora_name_input = gr.Textbox(
                            label="LoRA Name",
                            value="my_lora",
                        )
                    with gr.Column():
                        epochs_input = gr.Slider(
                            label="Epochs",
                            minimum=1,
                            maximum=10,
                            value=1,
                            step=1,
                        )
                        lr_input = gr.Number(
                            label="Learning Rate",
                            value=1e-4,
                        )
                        rank_input = gr.Slider(
                            label="LoRA Rank",
                            minimum=1,
                            maximum=64,
                            value=8,
                            step=1,
                        )

                train_btn = gr.Button("Start Training", variant="primary")
                train_log = gr.Textbox(
                    label="Training Log",
                    interactive=False,
                    lines=15,
                    elem_classes="status-box",
                )

                def train_and_refresh(*args):
                    """Run training, then refresh the LoRA dropdown.

                    BUG FIX: select the LoRA that was just trained (args[1]
                    is the name textbox, normalized the same way train_lora
                    does) instead of whichever entry happens to sort last.
                    """
                    log = train_lora(*args)
                    new_loras = get_trained_loras()
                    trained = (args[1] or "").strip() or "my_lora"
                    value = trained if trained in new_loras else "None (no LoRA)"
                    return log, gr.update(choices=new_loras, value=value)

                train_btn.click(
                    fn=train_and_refresh,
                    inputs=[
                        audio_upload,
                        lora_name_input,
                        epochs_input,
                        lr_input,
                        rank_input,
                    ],
                    outputs=[train_log, lora_select],
                )

    return demo


if __name__ == "__main__":
    demo = build_ui()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        ssr_mode=False,
    )