#!/usr/bin/env python3
"""
Standalone script to generate music from text using Stable Audio in ComfyUI.
Based on the workflow: user/default/workflows/audio_stable_audio_example.json
This script replicates the workflow:
1. Load checkpoint model (stable-audio-open-1.0.safetensors)
2. Load CLIP text encoder (t5-base.safetensors)
3. Encode positive prompt (music description)
4. Encode negative prompt (empty)
5. Create empty latent audio (47.6 seconds)
6. Sample using KSampler
7. Decode audio from latent using VAE
8. Save as MP3
Requirements:
- stable-audio-open-1.0.safetensors in models/checkpoints/
- t5-base.safetensors in models/text_encoders/
"""
import os
import random
import sys
from io import BytesIO

import av
import torch

# Make the ComfyUI modules importable when running from the repo root
script_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, script_dir)

import comfy.model_management
import comfy.sample
import comfy.samplers
import comfy.sd
import comfy.utils
import folder_paths
import latent_preview


def load_checkpoint(ckpt_name):
    """Load checkpoint model - returns MODEL, CLIP, VAE"""
    print(f"Loading checkpoint: {ckpt_name}")
    ckpt_path = folder_paths.get_full_path_or_raise("checkpoints", ckpt_name)
    out = comfy.sd.load_checkpoint_guess_config(
        ckpt_path,
        output_vae=True,
        output_clip=True,
        embedding_directory=folder_paths.get_folder_paths("embeddings")
    )
    return out[:3]  # MODEL, CLIP, VAE; remaining tuple entries are unused here


def load_clip(clip_name, clip_type="stable_audio"):
    """Load CLIP text encoder"""
    print(f"Loading CLIP: {clip_name}")
    # Fall back to STABLE_DIFFUSION if the requested CLIPType is unknown
    clip_type_enum = getattr(comfy.sd.CLIPType, clip_type.upper(), comfy.sd.CLIPType.STABLE_DIFFUSION)
    clip_path = folder_paths.get_full_path_or_raise("text_encoders", clip_name)
    clip = comfy.sd.load_clip(
        ckpt_paths=[clip_path],
        embedding_directory=folder_paths.get_folder_paths("embeddings"),
        clip_type=clip_type_enum,
        model_options={}
    )
    return clip


def encode_text(clip, text):
    """Encode text using CLIP - returns CONDITIONING"""
    print(f"Encoding text: '{text}'")
    if clip is None:
        raise RuntimeError("ERROR: clip input is invalid: None")
    tokens = clip.tokenize(text)
    return clip.encode_from_tokens_scheduled(tokens)


def create_empty_latent_audio(seconds, batch_size=1):
    """Create empty latent audio tensor"""
    print(f"Creating empty latent audio: {seconds} seconds")
    length = round((seconds * 44100 / 2048) / 2) * 2
    latent = torch.zeros(
        [batch_size, 64, length],
        device=comfy.model_management.intermediate_device()
    )
    return {"samples": latent, "type": "audio"}


def sample_audio(model, seed, steps, cfg, sampler_name, scheduler,
                 positive, negative, latent_image, denoise=1.0):
    """Run KSampler to generate audio latents"""
    print(f"Sampling with seed={seed}, steps={steps}, cfg={cfg}, "
          f"sampler={sampler_name}, scheduler={scheduler}")
    latent_samples = latent_image["samples"]
    latent_samples = comfy.sample.fix_empty_latent_channels(model, latent_samples)
    # Prepare noise
    batch_inds = latent_image.get("batch_index")
    noise = comfy.sample.prepare_noise(latent_samples, seed, batch_inds)
    # Check for a noise mask
    noise_mask = latent_image.get("noise_mask")
    # Prepare callback for progress
    callback = latent_preview.prepare_callback(model, steps)
    disable_pbar = not comfy.utils.PROGRESS_BAR_ENABLED
    # Sample
    samples = comfy.sample.sample(
        model, noise, steps, cfg, sampler_name, scheduler,
        positive, negative, latent_samples,
        denoise=denoise,
        disable_noise=False,
        start_step=None,
        last_step=None,
        force_full_denoise=False,
        noise_mask=noise_mask,
        callback=callback,
        disable_pbar=disable_pbar,
        seed=seed
    )
    out = latent_image.copy()
    out["samples"] = samples
    return out
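

# Note: comfy.sample.prepare_noise derives the initial noise deterministically
# from `seed`, so re-running with the same seed and settings should reproduce
# the same clip on the same hardware.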


def decode_audio(vae, samples):
    """Decode audio from latent samples using VAE"""
    print("Decoding audio from latents")
    audio = vae.decode(samples["samples"]).movedim(-1, 1)
    # Normalize loud outputs: scale down any clip whose std exceeds 0.2
    # (std * 5.0 > 1.0), mirroring ComfyUI's VAEDecodeAudio node
    std = torch.std(audio, dim=[1, 2], keepdim=True) * 5.0
    std[std < 1.0] = 1.0
    audio /= std
    return {"waveform": audio, "sample_rate": 44100}


def save_audio_mp3(audio, filename, quality="V0"):
    """Save audio as MP3 file using PyAV (same as ComfyUI)"""
    print(f"Saving audio to: {filename}")
    # Create output directory if needed (skip when saving to the CWD)
    out_dir = os.path.dirname(filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    waveform = audio["waveform"]
    sample_rate = audio["sample_rate"]
    # Move audio to the CPU before converting to numpy
    waveform = waveform.cpu()
    # Process each audio in the batch (usually just 1)
    for batch_number, waveform_item in enumerate(waveform):
        if batch_number > 0:
            # Add the batch number to the filename if there are multiple
            base, ext = os.path.splitext(filename)
            output_path = f"{base}_{batch_number}{ext}"
        else:
            output_path = filename
        # Encode into an in-memory buffer first
        output_buffer = BytesIO()
        output_container = av.open(output_buffer, mode="w", format="mp3")
        # Determine audio layout - waveform_item shape is [channels, samples]
        num_channels = waveform_item.shape[0] if waveform_item.dim() > 1 else 1
        layout = "mono" if num_channels == 1 else "stereo"
        # Set up the MP3 output stream
        out_stream = output_container.add_stream("libmp3lame", rate=sample_rate, layout=layout)
        # Set quality
        if quality == "V0":
            out_stream.codec_context.qscale = 1  # Highest VBR quality
        elif quality == "128k":
            out_stream.bit_rate = 128000
        elif quality == "320k":
            out_stream.bit_rate = 320000
        # Prepare waveform for PyAV: needs to be [samples, channels];
        # detach() avoids gradient-tracking issues
        if waveform_item.dim() == 1:
            # Mono audio, add channel dimension
            waveform_numpy = waveform_item.unsqueeze(1).float().detach().numpy()
        else:
            # Transpose from [channels, samples] to [samples, channels]
            waveform_numpy = waveform_item.transpose(0, 1).float().detach().numpy()
        # Flatten to [1, samples * channels] (interleaved) for the packed "flt" format
        waveform_numpy = waveform_numpy.reshape(1, -1)
        # Create audio frame
        frame = av.AudioFrame.from_ndarray(
            waveform_numpy,
            format="flt",
            layout=layout,
        )
        frame.sample_rate = sample_rate
        frame.pts = 0
        # Encode
        output_container.mux(out_stream.encode(frame))
        # Flush encoder
        output_container.mux(out_stream.encode(None))
        # Close container
        output_container.close()
        # Write the buffer to disk
        output_buffer.seek(0)
        with open(output_path, "wb") as f:
            f.write(output_buffer.getbuffer())
        print(f"Audio saved successfully: {output_path}")


def main():
    # Configuration
    checkpoint_name = "stable-audio-open-1.0.safetensors"
    clip_name = "t5-base.safetensors"
    positive_prompt = "A soft melodious acoustic guitar music"
    negative_prompt = ""
    audio_duration = 47.6  # seconds
    seed = random.randint(0, 0xffffffffffffffff)  # Random seed; use a fixed value for reproducibility
    steps = 50
    cfg = 4.98
    sampler_name = "dpmpp_3m_sde_gpu"
    scheduler = "exponential"
    denoise = 1.0
    output_filename = "output/audio/generated_music.mp3"
    quality = "V0"  # "V0" (VBR), "128k", or "320k"
print("=" * 60)
print("Stable Audio - Music Generation Script")
print("=" * 60)
print(f"Positive Prompt: {positive_prompt}")
print(f"Duration: {audio_duration} seconds")
print(f"Seed: {seed}")
print("=" * 60)
# 1. Load checkpoint (MODEL, CLIP, VAE)
model, checkpoint_clip, vae = load_checkpoint(checkpoint_name)
# 2. Load separate CLIP text encoder for stable audio
clip = load_clip(clip_name, "stable_audio")
# 3. Encode positive and negative prompts
positive_conditioning = encode_text(clip, positive_prompt)
negative_conditioning = encode_text(clip, negative_prompt)
# 4. Create empty latent audio
latent_audio = create_empty_latent_audio(audio_duration, batch_size=1)
# 5. Sample using KSampler
sampled_latent = sample_audio(
model, seed, steps, cfg, sampler_name, scheduler,
positive_conditioning, negative_conditioning, latent_audio, denoise
)
# 6. Decode audio from latent using VAE
audio = decode_audio(vae, sampled_latent)
# 7. Save as MP3
save_audio_mp3(audio, output_filename, quality)
print("=" * 60)
print("Generation complete!")
print("=" * 60)


if __name__ == "__main__":
    main()