ComfyUI/generate_audio_standalone.py

#!/usr/bin/env python3
"""
Standalone script to generate music from text using Stable Audio in ComfyUI.
Based on the workflow: user/default/workflows/audio_stable_audio_example.json

This script replicates the workflow:
1. Load checkpoint model (stable-audio-open-1.0.safetensors)
2. Load CLIP text encoder (t5-base.safetensors)
3. Encode positive prompt (music description)
4. Encode negative prompt (empty)
5. Create empty latent audio (47.6 seconds)
6. Sample using KSampler
7. Decode audio from latent using VAE
8. Save as MP3

Requirements:
- stable-audio-open-1.0.safetensors in models/checkpoints/
- t5-base.safetensors in models/text_encoders/
"""

import torch
import sys
import os
import random
import av
from io import BytesIO

# Add ComfyUI to path
script_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, script_dir)

import comfy.sd
import comfy.sample
import comfy.samplers
import comfy.model_management
import folder_paths
import latent_preview
import comfy.utils


def load_checkpoint(ckpt_name):
    """Load checkpoint model - returns MODEL, CLIP, VAE"""
    print(f"Loading checkpoint: {ckpt_name}")
    ckpt_path = folder_paths.get_full_path_or_raise("checkpoints", ckpt_name)
    out = comfy.sd.load_checkpoint_guess_config(
        ckpt_path,
        output_vae=True,
        output_clip=True,
        embedding_directory=folder_paths.get_folder_paths("embeddings")
    )
    return out[:3]  # MODEL, CLIP, VAE


def load_clip(clip_name, clip_type="stable_audio"):
    """Load CLIP text encoder"""
    print(f"Loading CLIP: {clip_name}")
    clip_type_enum = getattr(comfy.sd.CLIPType, clip_type.upper(), comfy.sd.CLIPType.STABLE_DIFFUSION)

    clip_path = folder_paths.get_full_path_or_raise("text_encoders", clip_name)
    clip = comfy.sd.load_clip(
        ckpt_paths=[clip_path],
        embedding_directory=folder_paths.get_folder_paths("embeddings"),
        clip_type=clip_type_enum,
        model_options={}
    )
    return clip


def encode_text(clip, text):
    """Encode text using CLIP - returns CONDITIONING"""
    print(f"Encoding text: '{text}'")
    if clip is None:
        raise RuntimeError("ERROR: clip input is invalid: None")
    tokens = clip.tokenize(text)
    return clip.encode_from_tokens_scheduled(tokens)


def create_empty_latent_audio(seconds, batch_size=1):
    """Create empty latent audio tensor"""
    print(f"Creating empty latent audio: {seconds} seconds")
    length = round((seconds * 44100 / 2048) / 2) * 2
    latent = torch.zeros(
        [batch_size, 64, length],
        device=comfy.model_management.intermediate_device()
    )
    return {"samples": latent, "type": "audio"}


def sample_audio(model, seed, steps, cfg, sampler_name, scheduler,
                positive, negative, latent_image, denoise=1.0):
    """Run KSampler to generate audio latents"""
    print(f"Sampling with seed={seed}, steps={steps}, cfg={cfg}, sampler={sampler_name}, scheduler={scheduler}")

    latent_samples = latent_image["samples"]
    latent_samples = comfy.sample.fix_empty_latent_channels(model, latent_samples)

    # Prepare noise
    batch_inds = latent_image["batch_index"] if "batch_index" in latent_image else None
    noise = comfy.sample.prepare_noise(latent_samples, seed, batch_inds)

    # Check for noise mask
    noise_mask = latent_image.get("noise_mask", None)

    # Prepare callback for progress
    callback = latent_preview.prepare_callback(model, steps)
    disable_pbar = not comfy.utils.PROGRESS_BAR_ENABLED

    # Sample
    samples = comfy.sample.sample(
        model, noise, steps, cfg, sampler_name, scheduler,
        positive, negative, latent_samples,
        denoise=denoise,
        disable_noise=False,
        start_step=None,
        last_step=None,
        force_full_denoise=False,
        noise_mask=noise_mask,
        callback=callback,
        disable_pbar=disable_pbar,
        seed=seed
    )

    out = latent_image.copy()
    out["samples"] = samples
    return out


def decode_audio(vae, samples):
    """Decode audio from latent samples using VAE"""
    print("Decoding audio from latents")
    audio = vae.decode(samples["samples"]).movedim(-1, 1)

    # Normalize audio
    std = torch.std(audio, dim=[1, 2], keepdim=True) * 5.0
    std[std < 1.0] = 1.0
    audio /= std

    return {"waveform": audio, "sample_rate": 44100}


def save_audio_mp3(audio, filename, quality="V0"):
    """Save audio as MP3 file using PyAV (same as ComfyUI)"""
    print(f"Saving audio to: {filename}")

    # Create output directory if needed
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    waveform = audio["waveform"]
    sample_rate = audio["sample_rate"]

    # Ensure audio is in CPU
    waveform = waveform.cpu()

    # Process each audio in batch (usually just 1)
    for batch_number, waveform_item in enumerate(waveform):
        if batch_number > 0:
            # Add batch number to filename if multiple
            base, ext = os.path.splitext(filename)
            output_path = f"{base}_{batch_number}{ext}"
        else:
            output_path = filename

        # Create output buffer
        output_buffer = BytesIO()
        output_container = av.open(output_buffer, mode="w", format="mp3")

        # Determine audio layout - waveform_item shape is [channels, samples]
        num_channels = waveform_item.shape[0] if waveform_item.dim() > 1 else 1
        layout = "mono" if num_channels == 1 else "stereo"

        # Set up the MP3 output stream
        out_stream = output_container.add_stream("libmp3lame", rate=sample_rate, layout=layout)

        # Set quality
        if quality == "V0":
            out_stream.codec_context.qscale = 1  # Highest VBR quality
        elif quality == "128k":
            out_stream.bit_rate = 128000
        elif quality == "320k":
            out_stream.bit_rate = 320000

        # Prepare waveform for PyAV: needs to be [samples, channels]
        # Use detach() to avoid gradient tracking issues
        if waveform_item.dim() == 1:
            # Mono audio, add channel dimension
            waveform_numpy = waveform_item.unsqueeze(1).float().detach().numpy()
        else:
            # Transpose from [channels, samples] to [samples, channels]
            waveform_numpy = waveform_item.transpose(0, 1).float().detach().numpy()

        # Reshape to [1, samples * channels] for PyAV
        waveform_numpy = waveform_numpy.reshape(1, -1)

        # Create audio frame
        frame = av.AudioFrame.from_ndarray(
            waveform_numpy,
            format="flt",
            layout=layout,
        )
        frame.sample_rate = sample_rate
        frame.pts = 0

        # Encode
        output_container.mux(out_stream.encode(frame))

        # Flush encoder
        output_container.mux(out_stream.encode(None))

        # Close container
        output_container.close()

        # Write to file
        output_buffer.seek(0)
        with open(output_path, "wb") as f:
            f.write(output_buffer.getbuffer())

        print(f"Audio saved successfully: {output_path}")


def main():
    # Configuration
    checkpoint_name = "stable-audio-open-1.0.safetensors"
    clip_name = "t5-base.safetensors"
    positive_prompt = "A soft melodious acoustic guitar music"
    negative_prompt = ""
    audio_duration = 47.6  # seconds
    seed = random.randint(0, 0xffffffffffffffff)  # Random seed, or use specific value
    steps = 50
    cfg = 4.98
    sampler_name = "dpmpp_3m_sde_gpu"
    scheduler = "exponential"
    denoise = 1.0
    output_filename = "output/audio/generated_music.mp3"
    quality = "V0"

    print("=" * 60)
    print("Stable Audio - Music Generation Script")
    print("=" * 60)
    print(f"Positive Prompt: {positive_prompt}")
    print(f"Duration: {audio_duration} seconds")
    print(f"Seed: {seed}")
    print("=" * 60)

    # 1. Load checkpoint (MODEL, CLIP, VAE)
    model, checkpoint_clip, vae = load_checkpoint(checkpoint_name)

    # 2. Load separate CLIP text encoder for stable audio
    clip = load_clip(clip_name, "stable_audio")

    # 3. Encode positive and negative prompts
    positive_conditioning = encode_text(clip, positive_prompt)
    negative_conditioning = encode_text(clip, negative_prompt)

    # 4. Create empty latent audio
    latent_audio = create_empty_latent_audio(audio_duration, batch_size=1)

    # 5. Sample using KSampler
    sampled_latent = sample_audio(
        model, seed, steps, cfg, sampler_name, scheduler,
        positive_conditioning, negative_conditioning, latent_audio, denoise
    )

    # 6. Decode audio from latent using VAE
    audio = decode_audio(vae, sampled_latent)

    # 7. Save as MP3
    save_audio_mp3(audio, output_filename, quality)

    print("=" * 60)
    print("Generation complete!")
    print("=" * 60)


if __name__ == "__main__":
    main()