#!/usr/bin/env python3
"""
Standalone script to generate music from text using Stable Audio in ComfyUI.
Based on the workflow: user/default/workflows/audio_stable_audio_example.json

This script replicates the workflow:
1. Load checkpoint model (stable-audio-open-1.0.safetensors)
2. Load CLIP text encoder (t5-base.safetensors)
3. Encode positive prompt (music description)
4. Encode negative prompt (empty)
5. Create empty latent audio (47.6 seconds)
6. Sample using KSampler
7. Decode audio from latent using VAE
8. Save as MP3

Requirements:
- stable-audio-open-1.0.safetensors in models/checkpoints/
- t5-base.safetensors in models/text_encoders/
"""

import torch
import sys
import os
import random

import av  # PyAV, used for MP3 encoding
from io import BytesIO

# Add ComfyUI to path so the comfy modules below resolve
script_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, script_dir)

import comfy.sd
import comfy.sample
import comfy.samplers
import comfy.model_management
import folder_paths
import latent_preview
import comfy.utils


def load_checkpoint(ckpt_name):
    """Load checkpoint model - returns MODEL, CLIP, VAE"""
    print(f"Loading checkpoint: {ckpt_name}")
    ckpt_path = folder_paths.get_full_path_or_raise("checkpoints", ckpt_name)
    out = comfy.sd.load_checkpoint_guess_config(
        ckpt_path,
        output_vae=True,
        output_clip=True,
        embedding_directory=folder_paths.get_folder_paths("embeddings")
    )
    return out[:3]  # MODEL, CLIP, VAE


def load_clip(clip_name, clip_type="stable_audio"):
    """Load CLIP text encoder"""
    print(f"Loading CLIP: {clip_name}")
    clip_type_enum = getattr(comfy.sd.CLIPType, clip_type.upper(), comfy.sd.CLIPType.STABLE_DIFFUSION)

    clip_path = folder_paths.get_full_path_or_raise("text_encoders", clip_name)
    clip = comfy.sd.load_clip(
        ckpt_paths=[clip_path],
        embedding_directory=folder_paths.get_folder_paths("embeddings"),
        clip_type=clip_type_enum,
        model_options={}
    )
    return clip


def encode_text(clip, text):
    """Encode text using CLIP - returns CONDITIONING"""
    print(f"Encoding text: '{text}'")
    if clip is None:
        raise RuntimeError("ERROR: clip input is invalid: None")
    tokens = clip.tokenize(text)
    return clip.encode_from_tokens_scheduled(tokens)


def create_empty_latent_audio(seconds, batch_size=1):
    """Create empty latent audio tensor"""
    print(f"Creating empty latent audio: {seconds} seconds")
    # 44100 Hz audio compressed 2048x by the VAE; the length is snapped to an even integer
    length = round((seconds * 44100 / 2048) / 2) * 2
    latent = torch.zeros(
        [batch_size, 64, length],
        device=comfy.model_management.intermediate_device()
    )
    return {"samples": latent, "type": "audio"}
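
# Worked example of the length formula above, using this script's defaults:
# 47.6 s * 44100 Hz / 2048 ≈ 1024.98 latent frames; halving, rounding, and
# doubling snaps that to the nearest even integer, 1024, so the latent tensor
# ends up with shape [1, 64, 1024].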


def sample_audio(model, seed, steps, cfg, sampler_name, scheduler,
                 positive, negative, latent_image, denoise=1.0):
    """Run KSampler to generate audio latents"""
    print(f"Sampling with seed={seed}, steps={steps}, cfg={cfg}, sampler={sampler_name}, scheduler={scheduler}")

    latent_samples = latent_image["samples"]
    latent_samples = comfy.sample.fix_empty_latent_channels(model, latent_samples)

    # Prepare noise
    batch_inds = latent_image["batch_index"] if "batch_index" in latent_image else None
    noise = comfy.sample.prepare_noise(latent_samples, seed, batch_inds)

    # Check for noise mask
    noise_mask = latent_image.get("noise_mask", None)

    # Prepare callback for progress
    callback = latent_preview.prepare_callback(model, steps)
    disable_pbar = not comfy.utils.PROGRESS_BAR_ENABLED

    # Sample
    samples = comfy.sample.sample(
        model, noise, steps, cfg, sampler_name, scheduler,
        positive, negative, latent_samples,
        denoise=denoise,
        disable_noise=False,
        start_step=None,
        last_step=None,
        force_full_denoise=False,
        noise_mask=noise_mask,
        callback=callback,
        disable_pbar=disable_pbar,
        seed=seed
    )

    out = latent_image.copy()
    out["samples"] = samples
    return out
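
# Example usage (hypothetical values): fixing the seed makes a run reproducible
# for the same model, prompts, and sampler settings:
#   out = sample_audio(model, seed=42, steps=50, cfg=5.0,
#                      sampler_name="dpmpp_3m_sde_gpu", scheduler="exponential",
#                      positive=pos, negative=neg, latent_image=latent)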


def decode_audio(vae, samples):
    """Decode audio from latent samples using VAE"""
    print("Decoding audio from latents")
    audio = vae.decode(samples["samples"]).movedim(-1, 1)

    # Normalize audio: if five standard deviations exceed 1.0, scale the clip
    # down by that factor; quiet clips (5*std < 1.0) are left untouched
    std = torch.std(audio, dim=[1, 2], keepdim=True) * 5.0
    std[std < 1.0] = 1.0
    audio /= std

    return {"waveform": audio, "sample_rate": 44100}


def save_audio_mp3(audio, filename, quality="V0"):
    """Save audio as an MP3 file using PyAV (same approach as ComfyUI)"""
    print(f"Saving audio to: {filename}")

    # Create the output directory if needed
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    waveform = audio["waveform"]
    sample_rate = audio["sample_rate"]

    # Ensure the audio is on the CPU
    waveform = waveform.cpu()

    # Process each audio clip in the batch (usually just one)
    for batch_number, waveform_item in enumerate(waveform):
        if batch_number > 0:
            # Add the batch number to the filename if there are multiple clips
            base, ext = os.path.splitext(filename)
            output_path = f"{base}_{batch_number}{ext}"
        else:
            output_path = filename

        # Create the output buffer
        output_buffer = BytesIO()
        output_container = av.open(output_buffer, mode="w", format="mp3")

        # Determine the audio layout - waveform_item shape is [channels, samples]
        num_channels = waveform_item.shape[0] if waveform_item.dim() > 1 else 1
        layout = "mono" if num_channels == 1 else "stereo"

        # Set up the MP3 output stream
        out_stream = output_container.add_stream("libmp3lame", rate=sample_rate, layout=layout)

        # Set quality
        if quality == "V0":
            out_stream.codec_context.qscale = 1  # Highest VBR quality
        elif quality == "128k":
            out_stream.bit_rate = 128000
        elif quality == "320k":
            out_stream.bit_rate = 320000

        # Prepare the waveform for PyAV: it needs to be [samples, channels].
        # Use detach() to avoid gradient tracking issues.
        if waveform_item.dim() == 1:
            # Mono audio: add a channel dimension
            waveform_numpy = waveform_item.unsqueeze(1).float().detach().numpy()
        else:
            # Transpose from [channels, samples] to [samples, channels]
            waveform_numpy = waveform_item.transpose(0, 1).float().detach().numpy()

        # Reshape to [1, samples * channels] (interleaved) for PyAV
        waveform_numpy = waveform_numpy.reshape(1, -1)

        # Create the audio frame
        frame = av.AudioFrame.from_ndarray(
            waveform_numpy,
            format="flt",
            layout=layout,
        )
        frame.sample_rate = sample_rate
        frame.pts = 0

        # Encode
        output_container.mux(out_stream.encode(frame))

        # Flush the encoder
        output_container.mux(out_stream.encode(None))

        # Close the container
        output_container.close()

        # Write to file
        output_buffer.seek(0)
        with open(output_path, "wb") as f:
            f.write(output_buffer.getbuffer())

        print(f"Audio saved successfully: {output_path}")
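

# An alternative writer (a sketch, assuming the optional torchaudio package is
# installed; it is not required by the MP3 path above): save a clip as WAV
# instead of MP3. This helper is hypothetical and not part of the workflow.
def save_audio_wav(audio, filename):
    """Save the first batch item as a WAV file via torchaudio (hypothetical helper)."""
    import torchaudio
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    # torchaudio.save expects a [channels, samples] tensor on the CPU
    torchaudio.save(filename, audio["waveform"][0].cpu(), audio["sample_rate"])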


def main():
    # Configuration
    checkpoint_name = "stable-audio-open-1.0.safetensors"
    clip_name = "t5-base.safetensors"
    positive_prompt = "A soft melodious acoustic guitar music"
    negative_prompt = ""
    audio_duration = 47.6  # seconds
    seed = random.randint(0, 0xffffffffffffffff)  # Random seed, or use a specific value
    steps = 50
    cfg = 4.98
    sampler_name = "dpmpp_3m_sde_gpu"
    scheduler = "exponential"
    denoise = 1.0
    output_filename = "output/audio/generated_music.mp3"
    quality = "V0"

    print("=" * 60)
    print("Stable Audio - Music Generation Script")
    print("=" * 60)
    print(f"Positive Prompt: {positive_prompt}")
    print(f"Duration: {audio_duration} seconds")
    print(f"Seed: {seed}")
    print("=" * 60)

    # 1. Load checkpoint (MODEL, CLIP, VAE)
    model, checkpoint_clip, vae = load_checkpoint(checkpoint_name)

    # 2. Load separate CLIP text encoder for stable audio
    clip = load_clip(clip_name, "stable_audio")

    # 3. Encode positive and negative prompts
    positive_conditioning = encode_text(clip, positive_prompt)
    negative_conditioning = encode_text(clip, negative_prompt)

    # 4. Create empty latent audio
    latent_audio = create_empty_latent_audio(audio_duration, batch_size=1)

    # 5. Sample using KSampler
    sampled_latent = sample_audio(
        model, seed, steps, cfg, sampler_name, scheduler,
        positive_conditioning, negative_conditioning, latent_audio, denoise
    )

    # 6. Decode audio from latent using VAE
    audio = decode_audio(vae, sampled_latent)

    # 7. Save as MP3
    save_audio_mp3(audio, output_filename, quality)

    print("=" * 60)
    print("Generation complete!")
    print("=" * 60)
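
# Possible extension (a sketch, not part of the original workflow): replace the
# hard-coded configuration in main() with command-line flags, e.g.:
#
#   import argparse
#   parser = argparse.ArgumentParser(description="Stable Audio music generation")
#   parser.add_argument("--prompt", default="A soft melodious acoustic guitar music")
#   parser.add_argument("--seconds", type=float, default=47.6)
#   parser.add_argument("--seed", type=int, default=None)
#   args = parser.parse_args()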


if __name__ == "__main__":
    main()