From 6125b8097952a374009af39639ff45da85f65500 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Wed, 4 Feb 2026 18:29:22 -0800
Subject: [PATCH 01/10] Add llm sampling options and make reference audio work
on ace step 1.5 (#12295)
---
comfy/ldm/ace/ace_step15.py | 3 +--
comfy/model_base.py | 19 ++++++++++++-------
comfy/text_encoders/ace15.py | 31 +++++++++++++++++++++++--------
comfy_extras/nodes_ace.py | 16 +++++++++++-----
4 files changed, 47 insertions(+), 22 deletions(-)
diff --git a/comfy/ldm/ace/ace_step15.py b/comfy/ldm/ace/ace_step15.py
index d90549658..17a37e573 100644
--- a/comfy/ldm/ace/ace_step15.py
+++ b/comfy/ldm/ace/ace_step15.py
@@ -1035,8 +1035,7 @@ class AceStepConditionGenerationModel(nn.Module):
audio_codes = torch.nn.functional.pad(audio_codes, (0, math.ceil(src_latents.shape[1] / 5) - audio_codes.shape[1]), "constant", 35847)
lm_hints_5Hz = self.tokenizer.quantizer.get_output_from_indices(audio_codes, dtype=text_hidden_states.dtype)
else:
- assert False
- # TODO ?
+ lm_hints_5Hz, indices = self.tokenizer.tokenize(refer_audio_acoustic_hidden_states_packed)
lm_hints = self.detokenizer(lm_hints_5Hz)
diff --git a/comfy/model_base.py b/comfy/model_base.py
index 89944548c..a2a34f191 100644
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -1548,6 +1548,7 @@ class ACEStep15(BaseModel):
def extra_conds(self, **kwargs):
out = super().extra_conds(**kwargs)
device = kwargs["device"]
+ noise = kwargs["noise"]
cross_attn = kwargs.get("cross_attn", None)
if cross_attn is not None:
@@ -1571,15 +1572,19 @@ class ACEStep15(BaseModel):
1.4844e-01, 9.4727e-02, 3.8477e-01, -1.2578e+00, -3.3203e-01,
-8.5547e-01, 4.3359e-01, 4.2383e-01, -8.9453e-01, -5.0391e-01,
-5.6152e-02, -2.9219e+00, -2.4658e-02, 5.0391e-01, 9.8438e-01,
- 7.2754e-02, -2.1582e-01, 6.3672e-01, 1.0000e+00]]], device=device).movedim(-1, 1).repeat(1, 1, 750)
+ 7.2754e-02, -2.1582e-01, 6.3672e-01, 1.0000e+00]]], device=device).movedim(-1, 1).repeat(1, 1, noise.shape[2])
+ pass_audio_codes = True
else:
- refer_audio = refer_audio[-1]
+ refer_audio = refer_audio[-1][:, :, :noise.shape[2]]
+ pass_audio_codes = False
+
+ if pass_audio_codes:
+ audio_codes = kwargs.get("audio_codes", None)
+ if audio_codes is not None:
+ out['audio_codes'] = comfy.conds.CONDRegular(torch.tensor(audio_codes, device=device))
+ refer_audio = refer_audio[:, :, :750]
+
out['refer_audio'] = comfy.conds.CONDRegular(refer_audio)
-
- audio_codes = kwargs.get("audio_codes", None)
- if audio_codes is not None:
- out['audio_codes'] = comfy.conds.CONDRegular(torch.tensor(audio_codes, device=device))
-
return out
class Omnigen2(BaseModel):
diff --git a/comfy/text_encoders/ace15.py b/comfy/text_encoders/ace15.py
index fce2b67ce..74e62733e 100644
--- a/comfy/text_encoders/ace15.py
+++ b/comfy/text_encoders/ace15.py
@@ -101,9 +101,7 @@ def sample_manual_loop_no_classes(
return output_audio_codes
-def generate_audio_codes(model, positive, negative, min_tokens=1, max_tokens=1024, seed=0):
- cfg_scale = 2.0
-
+def generate_audio_codes(model, positive, negative, min_tokens=1, max_tokens=1024, seed=0, cfg_scale=2.0, temperature=0.85, top_p=0.9, top_k=0):
positive = [[token for token, _ in inner_list] for inner_list in positive]
negative = [[token for token, _ in inner_list] for inner_list in negative]
positive = positive[0]
@@ -120,7 +118,7 @@ def generate_audio_codes(model, positive, negative, min_tokens=1, max_tokens=102
positive = [model.special_tokens["pad"]] * pos_pad + positive
paddings = [pos_pad, neg_pad]
- return sample_manual_loop_no_classes(model, [positive, negative], paddings, cfg_scale=cfg_scale, seed=seed, min_tokens=min_tokens, max_new_tokens=max_tokens)
+ return sample_manual_loop_no_classes(model, [positive, negative], paddings, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, seed=seed, min_tokens=min_tokens, max_new_tokens=max_tokens)
class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
@@ -137,6 +135,12 @@ class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
language = kwargs.get("language", "en")
seed = kwargs.get("seed", 0)
+ generate_audio_codes = kwargs.get("generate_audio_codes", True)
+ cfg_scale = kwargs.get("cfg_scale", 2.0)
+ temperature = kwargs.get("temperature", 0.85)
+ top_p = kwargs.get("top_p", 0.9)
+ top_k = kwargs.get("top_k", 0.0)
+
duration = math.ceil(duration)
meta_lm = 'bpm: {}\nduration: {}\nkeyscale: {}\ntimesignature: {}'.format(bpm, duration, keyscale, timesignature)
lm_template = "<|im_start|>system\n# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n<|im_end|>\n<|im_start|>user\n# Caption\n{}\n{}\n<|im_end|>\n<|im_start|>assistant\n\n{}\n\n\n<|im_end|>\n"
@@ -147,7 +151,14 @@ class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
out["lyrics"] = self.qwen3_06b.tokenize_with_weights("# Languages\n{}\n\n# Lyric{}<|endoftext|><|endoftext|>".format(language, lyrics), return_word_ids, disable_weights=True, **kwargs)
out["qwen3_06b"] = self.qwen3_06b.tokenize_with_weights("# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n# Caption\n{}# Metas\n{}<|endoftext|>\n<|endoftext|>".format(text, meta_cap), return_word_ids, **kwargs)
- out["lm_metadata"] = {"min_tokens": duration * 5, "seed": seed}
+ out["lm_metadata"] = {"min_tokens": duration * 5,
+ "seed": seed,
+ "generate_audio_codes": generate_audio_codes,
+ "cfg_scale": cfg_scale,
+ "temperature": temperature,
+ "top_p": top_p,
+ "top_k": top_k,
+ }
return out
@@ -203,10 +214,14 @@ class ACE15TEModel(torch.nn.Module):
self.qwen3_06b.set_clip_options({"layer": [0]})
lyrics_embeds, _, extra_l = self.qwen3_06b.encode_token_weights(token_weight_pairs_lyrics)
- lm_metadata = token_weight_pairs["lm_metadata"]
- audio_codes = generate_audio_codes(getattr(self, self.lm_model, self.qwen3_06b), token_weight_pairs["lm_prompt"], token_weight_pairs["lm_prompt_negative"], min_tokens=lm_metadata["min_tokens"], max_tokens=lm_metadata["min_tokens"], seed=lm_metadata["seed"])
+ out = {"conditioning_lyrics": lyrics_embeds[:, 0]}
- return base_out, None, {"conditioning_lyrics": lyrics_embeds[:, 0], "audio_codes": [audio_codes]}
+ lm_metadata = token_weight_pairs["lm_metadata"]
+ if lm_metadata["generate_audio_codes"]:
+ audio_codes = generate_audio_codes(getattr(self, self.lm_model, self.qwen3_06b), token_weight_pairs["lm_prompt"], token_weight_pairs["lm_prompt_negative"], min_tokens=lm_metadata["min_tokens"], max_tokens=lm_metadata["min_tokens"], seed=lm_metadata["seed"], cfg_scale=lm_metadata["cfg_scale"], temperature=lm_metadata["temperature"], top_p=lm_metadata["top_p"], top_k=lm_metadata["top_k"])
+ out["audio_codes"] = [audio_codes]
+
+ return base_out, None, out
def set_clip_options(self, options):
self.qwen3_06b.set_clip_options(options)
diff --git a/comfy_extras/nodes_ace.py b/comfy_extras/nodes_ace.py
index 376584e5c..dde5bbd2a 100644
--- a/comfy_extras/nodes_ace.py
+++ b/comfy_extras/nodes_ace.py
@@ -44,13 +44,18 @@ class TextEncodeAceStepAudio15(io.ComfyNode):
io.Combo.Input("timesignature", options=['2', '3', '4', '6']),
io.Combo.Input("language", options=["en", "ja", "zh", "es", "de", "fr", "pt", "ru", "it", "nl", "pl", "tr", "vi", "cs", "fa", "id", "ko", "uk", "hu", "ar", "sv", "ro", "el"]),
io.Combo.Input("keyscale", options=[f"{root} {quality}" for quality in ["major", "minor"] for root in ["C", "C#", "Db", "D", "D#", "Eb", "E", "F", "F#", "Gb", "G", "G#", "Ab", "A", "A#", "Bb", "B"]]),
+ io.Boolean.Input("generate_audio_codes", default=True, tooltip="Enable the LLM that generates audio codes. This can be slow but will increase the quality of the generated audio. Turn this off if you are giving the model an audio reference.", advanced=True),
+ io.Float.Input("cfg_scale", default=2.0, min=0.0, max=100.0, step=0.1, advanced=True),
+ io.Float.Input("temperature", default=0.85, min=0.0, max=2.0, step=0.01, advanced=True),
+ io.Float.Input("top_p", default=0.9, min=0.0, max=2000.0, step=0.01, advanced=True),
+ io.Int.Input("top_k", default=0, min=0, max=100, advanced=True),
],
outputs=[io.Conditioning.Output()],
)
@classmethod
- def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale) -> io.NodeOutput:
- tokens = clip.tokenize(tags, lyrics=lyrics, bpm=bpm, duration=duration, timesignature=int(timesignature), language=language, keyscale=keyscale, seed=seed)
+ def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale, generate_audio_codes, cfg_scale, temperature, top_p, top_k) -> io.NodeOutput:
+ tokens = clip.tokenize(tags, lyrics=lyrics, bpm=bpm, duration=duration, timesignature=int(timesignature), language=language, keyscale=keyscale, seed=seed, generate_audio_codes=generate_audio_codes, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k)
conditioning = clip.encode_from_tokens_scheduled(tokens)
return io.NodeOutput(conditioning)
@@ -100,14 +105,15 @@ class EmptyAceStep15LatentAudio(io.ComfyNode):
latent = torch.zeros([batch_size, 64, length], device=comfy.model_management.intermediate_device())
return io.NodeOutput({"samples": latent, "type": "audio"})
-class ReferenceTimbreAudio(io.ComfyNode):
+class ReferenceAudio(io.ComfyNode):
@classmethod
def define_schema(cls):
return io.Schema(
node_id="ReferenceTimbreAudio",
+ display_name="Reference Audio",
category="advanced/conditioning/audio",
is_experimental=True,
- description="This node sets the reference audio for timbre (for ace step 1.5)",
+ description="This node sets the reference audio for ace step 1.5",
inputs=[
io.Conditioning.Input("conditioning"),
io.Latent.Input("latent", optional=True),
@@ -131,7 +137,7 @@ class AceExtension(ComfyExtension):
EmptyAceStepLatentAudio,
TextEncodeAceStepAudio15,
EmptyAceStep15LatentAudio,
- ReferenceTimbreAudio,
+ ReferenceAudio,
]
async def comfy_entrypoint() -> AceExtension:
From a50c32d63fe55d073edd7af2242f0536f50b362e Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Wed, 4 Feb 2026 19:15:30 -0800
Subject: [PATCH 02/10] Disable sage attention on ace step 1.5 (#12297)
---
comfy/ldm/ace/ace_step15.py | 2 +-
comfy/ldm/modules/attention.py | 3 +++
2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/comfy/ldm/ace/ace_step15.py b/comfy/ldm/ace/ace_step15.py
index 17a37e573..f2b130bc1 100644
--- a/comfy/ldm/ace/ace_step15.py
+++ b/comfy/ldm/ace/ace_step15.py
@@ -183,7 +183,7 @@ class AceStepAttention(nn.Module):
else:
attn_bias = window_bias
- attn_output = optimized_attention(query_states, key_states, value_states, self.num_heads, attn_bias, skip_reshape=True)
+ attn_output = optimized_attention(query_states, key_states, value_states, self.num_heads, attn_bias, skip_reshape=True, low_precision_attention=False)
attn_output = self.o_proj(attn_output)
return attn_output
diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py
index ccf690945..10d051325 100644
--- a/comfy/ldm/modules/attention.py
+++ b/comfy/ldm/modules/attention.py
@@ -524,6 +524,9 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha
@wrap_attn
def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False, **kwargs):
+ if kwargs.get("low_precision_attention", True) is False:
+ return attention_pytorch(q, k, v, heads, mask=mask, skip_reshape=skip_reshape, skip_output_reshape=skip_output_reshape, **kwargs)
+
exception_fallback = False
if skip_reshape:
b, _, _, dim_head = q.shape
From a246cc02b274104d5f656b68ce505354c164aef8 Mon Sep 17 00:00:00 2001
From: blepping <157360029+blepping@users.noreply.github.com>
Date: Wed, 4 Feb 2026 22:17:37 -0700
Subject: [PATCH 03/10] Improvements to ACE-Steps 1.5 text encoding (#12283)
---
comfy/text_encoders/ace15.py | 56 +++++++++++++++++++++++++++++-------
1 file changed, 45 insertions(+), 11 deletions(-)
diff --git a/comfy/text_encoders/ace15.py b/comfy/text_encoders/ace15.py
index 74e62733e..00dd5ba90 100644
--- a/comfy/text_encoders/ace15.py
+++ b/comfy/text_encoders/ace15.py
@@ -3,6 +3,7 @@ import comfy.text_encoders.llama
from comfy import sd1_clip
import torch
import math
+import yaml
import comfy.utils
@@ -125,14 +126,43 @@ class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen3_06b", tokenizer=Qwen3Tokenizer)
+ def _metas_to_cot(self, *, return_yaml: bool = False, **kwargs) -> str:
+ user_metas = {
+ k: kwargs.pop(k)
+ for k in ("bpm", "duration", "keyscale", "timesignature", "language", "caption")
+ if k in kwargs
+ }
+ timesignature = user_metas.get("timesignature")
+ if isinstance(timesignature, str) and timesignature.endswith("/4"):
+ user_metas["timesignature"] = timesignature.rsplit("/", 1)[0]
+ user_metas = {
+ k: v if not isinstance(v, str) or not v.isdigit() else int(v)
+ for k, v in user_metas.items()
+ if v not in {"unspecified", None}
+ }
+ if len(user_metas):
+ meta_yaml = yaml.dump(user_metas, allow_unicode=True, sort_keys=True).strip()
+ else:
+ meta_yaml = ""
+ return f"\n{meta_yaml}\n" if not return_yaml else meta_yaml
+
+ def _metas_to_cap(self, **kwargs) -> str:
+ use_keys = ("bpm", "duration", "keyscale", "timesignature")
+ user_metas = { k: kwargs.pop(k, "N/A") for k in use_keys }
+ duration = user_metas["duration"]
+ if duration == "N/A":
+ user_metas["duration"] = "30 seconds"
+ elif isinstance(duration, (str, int, float)):
+ user_metas["duration"] = f"{math.ceil(float(duration))} seconds"
+ else:
+ raise TypeError("Unexpected type for duration key, must be str, int or float")
+ return "\n".join(f"- {k}: {user_metas[k]}" for k in use_keys)
+
def tokenize_with_weights(self, text, return_word_ids=False, **kwargs):
out = {}
lyrics = kwargs.get("lyrics", "")
- bpm = kwargs.get("bpm", 120)
duration = kwargs.get("duration", 120)
- keyscale = kwargs.get("keyscale", "C major")
- timesignature = kwargs.get("timesignature", 2)
- language = kwargs.get("language", "en")
+ language = kwargs.get("language")
seed = kwargs.get("seed", 0)
generate_audio_codes = kwargs.get("generate_audio_codes", True)
@@ -141,16 +171,20 @@ class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
top_p = kwargs.get("top_p", 0.9)
top_k = kwargs.get("top_k", 0.0)
+
duration = math.ceil(duration)
- meta_lm = 'bpm: {}\nduration: {}\nkeyscale: {}\ntimesignature: {}'.format(bpm, duration, keyscale, timesignature)
- lm_template = "<|im_start|>system\n# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n<|im_end|>\n<|im_start|>user\n# Caption\n{}\n{}\n<|im_end|>\n<|im_start|>assistant\n\n{}\n\n\n<|im_end|>\n"
+ kwargs["duration"] = duration
- meta_cap = '- bpm: {}\n- timesignature: {}\n- keyscale: {}\n- duration: {}\n'.format(bpm, timesignature, keyscale, duration)
- out["lm_prompt"] = self.qwen3_06b.tokenize_with_weights(lm_template.format(text, lyrics, meta_lm), disable_weights=True)
- out["lm_prompt_negative"] = self.qwen3_06b.tokenize_with_weights(lm_template.format(text, lyrics, ""), disable_weights=True)
+ cot_text = self._metas_to_cot(caption = text, **kwargs)
+ meta_cap = self._metas_to_cap(**kwargs)
- out["lyrics"] = self.qwen3_06b.tokenize_with_weights("# Languages\n{}\n\n# Lyric{}<|endoftext|><|endoftext|>".format(language, lyrics), return_word_ids, disable_weights=True, **kwargs)
- out["qwen3_06b"] = self.qwen3_06b.tokenize_with_weights("# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n# Caption\n{}# Metas\n{}<|endoftext|>\n<|endoftext|>".format(text, meta_cap), return_word_ids, **kwargs)
+ lm_template = "<|im_start|>system\n# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n<|im_end|>\n<|im_start|>user\n# Caption\n{}\n# Lyric\n{}\n<|im_end|>\n<|im_start|>assistant\n{}\n<|im_end|>\n"
+
+ out["lm_prompt"] = self.qwen3_06b.tokenize_with_weights(lm_template.format(text, lyrics, cot_text), disable_weights=True)
+ out["lm_prompt_negative"] = self.qwen3_06b.tokenize_with_weights(lm_template.format(text, lyrics, "\n"), disable_weights=True)
+
+ out["lyrics"] = self.qwen3_06b.tokenize_with_weights("# Languages\n{}\n\n# Lyric\n{}<|endoftext|><|endoftext|>".format(language if language is not None else "", lyrics), return_word_ids, disable_weights=True, **kwargs)
+ out["qwen3_06b"] = self.qwen3_06b.tokenize_with_weights("# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n# Caption\n{}\n# Metas\n{}\n<|endoftext|>\n<|endoftext|>".format(text, meta_cap), return_word_ids, **kwargs)
out["lm_metadata"] = {"min_tokens": duration * 5,
"seed": seed,
"generate_audio_codes": generate_audio_codes,
From 35183543e004d8b7509c043e7a680bee07171622 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Wed, 4 Feb 2026 22:12:04 -0800
Subject: [PATCH 04/10] Add VAE tiled decode node for audio. (#12299)
---
comfy/sd.py | 2 +-
comfy_extras/nodes_audio.py | 43 +++++++++++++++++++++++++++++++------
2 files changed, 38 insertions(+), 7 deletions(-)
diff --git a/comfy/sd.py b/comfy/sd.py
index bc63d6ced..bc9407405 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -976,7 +976,7 @@ class VAE:
if overlap is not None:
args["overlap"] = overlap
- if dims == 1:
+ if dims == 1 or self.extra_1d_channel is not None:
args.pop("tile_y")
output = self.decode_tiled_1d(samples, **args)
elif dims == 2:
diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py
index bef723dce..b63dd8e97 100644
--- a/comfy_extras/nodes_audio.py
+++ b/comfy_extras/nodes_audio.py
@@ -94,6 +94,19 @@ class VAEEncodeAudio(IO.ComfyNode):
encode = execute # TODO: remove
+def vae_decode_audio(vae, samples, tile=None, overlap=None):
+ if tile is not None:
+ audio = vae.decode_tiled(samples["samples"], tile_y=tile, overlap=overlap).movedim(-1, 1)
+ else:
+ audio = vae.decode(samples["samples"]).movedim(-1, 1)
+
+ std = torch.std(audio, dim=[1, 2], keepdim=True) * 5.0
+ std[std < 1.0] = 1.0
+ audio /= std
+ vae_sample_rate = getattr(vae, "audio_sample_rate", 44100)
+ return {"waveform": audio, "sample_rate": vae_sample_rate if "sample_rate" not in samples else samples["sample_rate"]}
+
+
class VAEDecodeAudio(IO.ComfyNode):
@classmethod
def define_schema(cls):
@@ -111,16 +124,33 @@ class VAEDecodeAudio(IO.ComfyNode):
@classmethod
def execute(cls, vae, samples) -> IO.NodeOutput:
- audio = vae.decode(samples["samples"]).movedim(-1, 1)
- std = torch.std(audio, dim=[1,2], keepdim=True) * 5.0
- std[std < 1.0] = 1.0
- audio /= std
- vae_sample_rate = getattr(vae, "audio_sample_rate", 44100)
- return IO.NodeOutput({"waveform": audio, "sample_rate": vae_sample_rate if "sample_rate" not in samples else samples["sample_rate"]})
+ return IO.NodeOutput(vae_decode_audio(vae, samples))
decode = execute # TODO: remove
+class VAEDecodeAudioTiled(IO.ComfyNode):
+ @classmethod
+ def define_schema(cls):
+ return IO.Schema(
+ node_id="VAEDecodeAudioTiled",
+ search_aliases=["latent to audio"],
+ display_name="VAE Decode Audio (Tiled)",
+ category="latent/audio",
+ inputs=[
+ IO.Latent.Input("samples"),
+ IO.Vae.Input("vae"),
+ IO.Int.Input("tile_size", default=512, min=32, max=8192, step=8),
+ IO.Int.Input("overlap", default=64, min=0, max=1024, step=8),
+ ],
+ outputs=[IO.Audio.Output()],
+ )
+
+ @classmethod
+ def execute(cls, vae, samples, tile_size, overlap) -> IO.NodeOutput:
+ return IO.NodeOutput(vae_decode_audio(vae, samples, tile_size, overlap))
+
+
class SaveAudio(IO.ComfyNode):
@classmethod
def define_schema(cls):
@@ -675,6 +705,7 @@ class AudioExtension(ComfyExtension):
EmptyLatentAudio,
VAEEncodeAudio,
VAEDecodeAudio,
+ VAEDecodeAudioTiled,
SaveAudio,
SaveAudioMP3,
SaveAudioOpus,
From cb459573c8fa025bbf9ecf312f6af376d659f567 Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Thu, 5 Feb 2026 01:13:35 -0500
Subject: [PATCH 05/10] ComfyUI v0.12.3
---
comfyui_version.py | 2 +-
pyproject.toml | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/comfyui_version.py b/comfyui_version.py
index 5d296cd1b..706b37763 100644
--- a/comfyui_version.py
+++ b/comfyui_version.py
@@ -1,3 +1,3 @@
# This file is automatically generated by the build process when version is
# updated in pyproject.toml.
-__version__ = "0.12.2"
+__version__ = "0.12.3"
diff --git a/pyproject.toml b/pyproject.toml
index 1ddcc3596..f7925b92a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "ComfyUI"
-version = "0.12.2"
+version = "0.12.3"
readme = "README.md"
license = { file = "LICENSE" }
requires-python = ">=3.10"
From 00efcc6cd028206ad81a90dec177c9a470a20a2a Mon Sep 17 00:00:00 2001
From: Comfy Org PR Bot
Date: Thu, 5 Feb 2026 15:17:37 +0900
Subject: [PATCH 06/10] Bump comfyui-frontend-package to 1.38.13 (#12238)
---
requirements.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/requirements.txt b/requirements.txt
index 0c401873a..41cc9174b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-comfyui-frontend-package==1.37.11
+comfyui-frontend-package==1.38.13
comfyui-workflow-templates==0.8.31
comfyui-embedded-docs==0.4.0
torch
From 2b70ab9ad0fd6a38b11546a18c546ce40cc176a1 Mon Sep 17 00:00:00 2001
From: AustinMroz
Date: Wed, 4 Feb 2026 22:18:21 -0800
Subject: [PATCH 07/10] Add a Create List node (#12173)
---
comfy_extras/nodes_toolkit.py | 47 +++++++++++++++++++++++++++++++++++
nodes.py | 3 ++-
2 files changed, 49 insertions(+), 1 deletion(-)
create mode 100644 comfy_extras/nodes_toolkit.py
diff --git a/comfy_extras/nodes_toolkit.py b/comfy_extras/nodes_toolkit.py
new file mode 100644
index 000000000..71faf7226
--- /dev/null
+++ b/comfy_extras/nodes_toolkit.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+from typing_extensions import override
+from comfy_api.latest import ComfyExtension, io
+
+
+class CreateList(io.ComfyNode):
+ @classmethod
+ def define_schema(cls):
+ template_matchtype = io.MatchType.Template("type")
+ template_autogrow = io.Autogrow.TemplatePrefix(
+ input=io.MatchType.Input("input", template=template_matchtype),
+ prefix="input",
+ )
+ return io.Schema(
+ node_id="CreateList",
+ display_name="Create List",
+ category="logic",
+ is_input_list=True,
+ search_aliases=["Image Iterator", "Text Iterator", "Iterator"],
+ inputs=[io.Autogrow.Input("inputs", template=template_autogrow)],
+ outputs=[
+ io.MatchType.Output(
+ template=template_matchtype,
+ is_output_list=True,
+ display_name="list",
+ ),
+ ],
+ )
+
+ @classmethod
+ def execute(cls, inputs: io.Autogrow.Type) -> io.NodeOutput:
+ output_list = []
+ for input in inputs.values():
+ output_list += input
+ return io.NodeOutput(output_list)
+
+
+class ToolkitExtension(ComfyExtension):
+ @override
+ async def get_node_list(self) -> list[type[io.ComfyNode]]:
+ return [
+ CreateList,
+ ]
+
+
+async def comfy_entrypoint() -> ToolkitExtension:
+ return ToolkitExtension()
diff --git a/nodes.py b/nodes.py
index e11a8ed80..91de7a9d7 100644
--- a/nodes.py
+++ b/nodes.py
@@ -2433,7 +2433,8 @@ async def init_builtin_extra_nodes():
"nodes_image_compare.py",
"nodes_zimage.py",
"nodes_lora_debug.py",
- "nodes_color.py"
+ "nodes_color.py",
+ "nodes_toolkit.py",
]
import_failed = []
From 6555dc65b82c5f072dcad87f0dbccb4fc5f85e6b Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Thu, 5 Feb 2026 13:43:45 -0800
Subject: [PATCH 08/10] Make ace step 1.5 work without the llm. (#12311)
---
comfy/ldm/ace/ace_step15.py | 72 +++++++++++++++++++++++++++++++++----
comfy/model_base.py | 17 +++------
2 files changed, 70 insertions(+), 19 deletions(-)
diff --git a/comfy/ldm/ace/ace_step15.py b/comfy/ldm/ace/ace_step15.py
index f2b130bc1..7fc7f1e8e 100644
--- a/comfy/ldm/ace/ace_step15.py
+++ b/comfy/ldm/ace/ace_step15.py
@@ -7,6 +7,67 @@ from comfy.ldm.modules.attention import optimized_attention
import comfy.model_management
from comfy.ldm.flux.layers import timestep_embedding
+def get_silence_latent(length, device):
+ head = torch.tensor([[[ 0.5707, 0.0982, 0.6909, -0.5658, 0.6266, 0.6996, -0.1365, -0.1291,
+ -0.0776, -0.1171, -0.2743, -0.8422, -0.1168, 1.5539, -4.6936, 0.7436,
+ -1.1846, -0.2637, 0.6933, -6.7266, 0.0966, -0.1187, -0.3501, -1.1736,
+ 0.0587, -2.0517, -1.3651, 0.7508, -0.2490, -1.3548, -0.1290, -0.7261,
+ 1.1132, -0.3249, 0.2337, 0.3004, 0.6605, -0.0298, -0.1989, -0.4041,
+ 0.2843, -1.0963, -0.5519, 0.2639, -1.0436, -0.1183, 0.0640, 0.4460,
+ -1.1001, -0.6172, -1.3241, 1.1379, 0.5623, -0.1507, -0.1963, -0.4742,
+ -2.4697, 0.5302, 0.5381, 0.4636, -0.1782, -0.0687, 1.0333, 0.4202],
+ [ 0.3040, -0.1367, 0.6200, 0.0665, -0.0642, 0.4655, -0.1187, -0.0440,
+ 0.2941, -0.2753, 0.0173, -0.2421, -0.0147, 1.5603, -2.7025, 0.7907,
+ -0.9736, -0.0682, 0.1294, -5.0707, -0.2167, 0.3302, -0.1513, -0.8100,
+ -0.3894, -0.2884, -0.3149, 0.8660, -0.3817, -1.7061, 0.5824, -0.4840,
+ 0.6938, 0.1859, 0.1753, 0.3081, 0.0195, 0.1403, -0.0754, -0.2091,
+ 0.1251, -0.1578, -0.4968, -0.1052, -0.4554, -0.0320, 0.1284, 0.4974,
+ -1.1889, -0.0344, -0.8313, 0.2953, 0.5445, -0.6249, -0.1595, -0.0682,
+ -3.1412, 0.0484, 0.4153, 0.8260, -0.1526, -0.0625, 0.5366, 0.8473],
+ [ 5.3524e-02, -1.7534e-01, 5.4443e-01, -4.3501e-01, -2.1317e-03,
+ 3.7200e-01, -4.0143e-03, -1.5516e-01, -1.2968e-01, -1.5375e-01,
+ -7.7107e-02, -2.0593e-01, -3.2780e-01, 1.5142e+00, -2.6101e+00,
+ 5.8698e-01, -1.2716e+00, -2.4773e-01, -2.7933e-02, -5.0799e+00,
+ 1.1601e-01, 4.0987e-01, -2.2030e-02, -6.6495e-01, -2.0995e-01,
+ -6.3474e-01, -1.5893e-01, 8.2745e-01, -2.2992e-01, -1.6816e+00,
+ 5.4440e-01, -4.9579e-01, 5.5128e-01, 3.0477e-01, 8.3052e-02,
+ -6.1782e-02, 5.9036e-03, 2.9553e-01, -8.0645e-02, -1.0060e-01,
+ 1.9144e-01, -3.8124e-01, -7.2949e-01, 2.4520e-02, -5.0814e-01,
+ 2.3977e-01, 9.2943e-02, 3.9256e-01, -1.1993e+00, -3.2752e-01,
+ -7.2707e-01, 2.9476e-01, 4.3542e-01, -8.8597e-01, -4.1686e-01,
+ -8.5390e-02, -2.9018e+00, 6.4988e-02, 5.3945e-01, 9.1988e-01,
+ 5.8762e-02, -7.0098e-02, 6.4772e-01, 8.9118e-01],
+ [-3.2225e-02, -1.3195e-01, 5.6411e-01, -5.4766e-01, -5.2170e-03,
+ 3.1425e-01, -5.4367e-02, -1.9419e-01, -1.3059e-01, -1.3660e-01,
+ -9.0984e-02, -1.9540e-01, -2.5590e-01, 1.5440e+00, -2.6349e+00,
+ 6.8273e-01, -1.2532e+00, -1.9810e-01, -2.2793e-02, -5.0506e+00,
+ 1.8818e-01, 5.0109e-01, 7.3546e-03, -6.8771e-01, -3.0676e-01,
+ -7.3257e-01, -1.6687e-01, 9.2232e-01, -1.8987e-01, -1.7267e+00,
+ 5.3355e-01, -5.3179e-01, 4.4953e-01, 2.8820e-01, 1.3012e-01,
+ -2.0943e-01, -1.1348e-01, 3.3929e-01, -1.5069e-01, -1.2919e-01,
+ 1.8929e-01, -3.6166e-01, -8.0756e-01, 6.6387e-02, -5.8867e-01,
+ 1.6978e-01, 1.0134e-01, 3.3877e-01, -1.2133e+00, -3.2492e-01,
+ -8.1237e-01, 3.8101e-01, 4.3765e-01, -8.0596e-01, -4.4531e-01,
+ -4.7513e-02, -2.9266e+00, 1.1741e-03, 4.5123e-01, 9.3075e-01,
+ 5.3688e-02, -1.9621e-01, 6.4530e-01, 9.3870e-01]]], device=device).movedim(-1, 1)
+
+ silence_latent = torch.tensor([[[-1.3672e-01, -1.5820e-01, 5.8594e-01, -5.7422e-01, 3.0273e-02,
+ 2.7930e-01, -2.5940e-03, -2.0703e-01, -1.6113e-01, -1.4746e-01,
+ -2.7710e-02, -1.8066e-01, -2.9688e-01, 1.6016e+00, -2.6719e+00,
+ 7.7734e-01, -1.3516e+00, -1.9434e-01, -7.1289e-02, -5.0938e+00,
+ 2.4316e-01, 4.7266e-01, 4.6387e-02, -6.6406e-01, -2.1973e-01,
+ -6.7578e-01, -1.5723e-01, 9.5312e-01, -2.0020e-01, -1.7109e+00,
+ 5.8984e-01, -5.7422e-01, 5.1562e-01, 2.8320e-01, 1.4551e-01,
+ -1.8750e-01, -5.9814e-02, 3.6719e-01, -1.0059e-01, -1.5723e-01,
+ 2.0605e-01, -4.3359e-01, -8.2812e-01, 4.5654e-02, -6.6016e-01,
+ 1.4844e-01, 9.4727e-02, 3.8477e-01, -1.2578e+00, -3.3203e-01,
+ -8.5547e-01, 4.3359e-01, 4.2383e-01, -8.9453e-01, -5.0391e-01,
+ -5.6152e-02, -2.9219e+00, -2.4658e-02, 5.0391e-01, 9.8438e-01,
+ 7.2754e-02, -2.1582e-01, 6.3672e-01, 1.0000e+00]]], device=device).movedim(-1, 1).repeat(1, 1, length)
+ silence_latent[:, :, :head.shape[-1]] = head
+ return silence_latent
+
+
def get_layer_class(operations, layer_name):
if operations is not None and hasattr(operations, layer_name):
return getattr(operations, layer_name)
@@ -1040,22 +1101,21 @@ class AceStepConditionGenerationModel(nn.Module):
lm_hints = self.detokenizer(lm_hints_5Hz)
lm_hints = lm_hints[:, :src_latents.shape[1], :]
- if is_covers is None:
+ if is_covers is None or is_covers is True:
src_latents = lm_hints
- else:
- src_latents = torch.where(is_covers.unsqueeze(-1).unsqueeze(-1) > 0, lm_hints, src_latents)
+ elif is_covers is False:
+ src_latents = refer_audio_acoustic_hidden_states_packed
context_latents = torch.cat([src_latents, chunk_masks.to(src_latents.dtype)], dim=-1)
return encoder_hidden, encoder_mask, context_latents
- def forward(self, x, timestep, context, lyric_embed=None, refer_audio=None, audio_codes=None, **kwargs):
+ def forward(self, x, timestep, context, lyric_embed=None, refer_audio=None, audio_codes=None, is_covers=None, **kwargs):
text_attention_mask = None
lyric_attention_mask = None
refer_audio_order_mask = None
attention_mask = None
chunk_masks = None
- is_covers = None
src_latents = None
precomputed_lm_hints_25Hz = None
lyric_hidden_states = lyric_embed
@@ -1067,7 +1127,7 @@ class AceStepConditionGenerationModel(nn.Module):
if refer_audio_order_mask is None:
refer_audio_order_mask = torch.zeros((x.shape[0],), device=x.device, dtype=torch.long)
- if src_latents is None and is_covers is None:
+ if src_latents is None:
src_latents = x
if chunk_masks is None:
diff --git a/comfy/model_base.py b/comfy/model_base.py
index a2a34f191..dcbf12074 100644
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -1560,22 +1560,11 @@ class ACEStep15(BaseModel):
refer_audio = kwargs.get("reference_audio_timbre_latents", None)
if refer_audio is None or len(refer_audio) == 0:
- refer_audio = torch.tensor([[[-1.3672e-01, -1.5820e-01, 5.8594e-01, -5.7422e-01, 3.0273e-02,
- 2.7930e-01, -2.5940e-03, -2.0703e-01, -1.6113e-01, -1.4746e-01,
- -2.7710e-02, -1.8066e-01, -2.9688e-01, 1.6016e+00, -2.6719e+00,
- 7.7734e-01, -1.3516e+00, -1.9434e-01, -7.1289e-02, -5.0938e+00,
- 2.4316e-01, 4.7266e-01, 4.6387e-02, -6.6406e-01, -2.1973e-01,
- -6.7578e-01, -1.5723e-01, 9.5312e-01, -2.0020e-01, -1.7109e+00,
- 5.8984e-01, -5.7422e-01, 5.1562e-01, 2.8320e-01, 1.4551e-01,
- -1.8750e-01, -5.9814e-02, 3.6719e-01, -1.0059e-01, -1.5723e-01,
- 2.0605e-01, -4.3359e-01, -8.2812e-01, 4.5654e-02, -6.6016e-01,
- 1.4844e-01, 9.4727e-02, 3.8477e-01, -1.2578e+00, -3.3203e-01,
- -8.5547e-01, 4.3359e-01, 4.2383e-01, -8.9453e-01, -5.0391e-01,
- -5.6152e-02, -2.9219e+00, -2.4658e-02, 5.0391e-01, 9.8438e-01,
- 7.2754e-02, -2.1582e-01, 6.3672e-01, 1.0000e+00]]], device=device).movedim(-1, 1).repeat(1, 1, noise.shape[2])
+ refer_audio = comfy.ldm.ace.ace_step15.get_silence_latent(noise.shape[2], device)
pass_audio_codes = True
else:
refer_audio = refer_audio[-1][:, :, :noise.shape[2]]
+ out['is_covers'] = comfy.conds.CONDConstant(True)
pass_audio_codes = False
if pass_audio_codes:
@@ -1583,6 +1572,8 @@ class ACEStep15(BaseModel):
if audio_codes is not None:
out['audio_codes'] = comfy.conds.CONDRegular(torch.tensor(audio_codes, device=device))
refer_audio = refer_audio[:, :, :750]
+ else:
+ out['is_covers'] = comfy.conds.CONDConstant(False)
out['refer_audio'] = comfy.conds.CONDRegular(refer_audio)
return out
From 458292fef0077470f5675ba52555e7bb4c28102e Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Thu, 5 Feb 2026 16:15:04 -0800
Subject: [PATCH 09/10] Fix some lowvram stuff with ace step 1.5 (#12312)
---
comfy/ldm/ace/ace_step15.py | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/comfy/ldm/ace/ace_step15.py b/comfy/ldm/ace/ace_step15.py
index 7fc7f1e8e..69338336d 100644
--- a/comfy/ldm/ace/ace_step15.py
+++ b/comfy/ldm/ace/ace_step15.py
@@ -738,7 +738,7 @@ class AttentionPooler(nn.Module):
def forward(self, x):
B, T, P, D = x.shape
x = self.embed_tokens(x)
- special = self.special_token.expand(B, T, 1, -1)
+ special = comfy.model_management.cast_to(self.special_token, device=x.device, dtype=x.dtype).expand(B, T, 1, -1)
x = torch.cat([special, x], dim=2)
x = x.view(B * T, P + 1, D)
@@ -789,7 +789,7 @@ class FSQ(nn.Module):
self.register_buffer('implicit_codebook', implicit_codebook, persistent=False)
def bound(self, z):
- levels_minus_1 = (self._levels - 1).to(z.dtype)
+ levels_minus_1 = (comfy.model_management.cast_to(self._levels, device=z.device, dtype=z.dtype) - 1)
scale = 2. / levels_minus_1
bracket = (levels_minus_1 * (torch.tanh(z) + 1) / 2.) + 0.5
@@ -804,8 +804,8 @@ class FSQ(nn.Module):
return codes_non_centered.float() * (2. / (self._levels.float() - 1)) - 1.
def codes_to_indices(self, zhat):
- zhat_normalized = (zhat + 1.) / (2. / (self._levels.to(zhat.dtype) - 1))
- return (zhat_normalized * self._basis.to(zhat.dtype)).sum(dim=-1).round().to(torch.int32)
+ zhat_normalized = (zhat + 1.) / (2. / (comfy.model_management.cast_to(self._levels, device=zhat.device, dtype=zhat.dtype) - 1))
+ return (zhat_normalized * comfy.model_management.cast_to(self._basis, device=zhat.device, dtype=zhat.dtype)).sum(dim=-1).round().to(torch.int32)
def forward(self, z):
orig_dtype = z.dtype
@@ -887,7 +887,7 @@ class ResidualFSQ(nn.Module):
x = self.project_in(x)
if hasattr(self, 'soft_clamp_input_value'):
- sc_val = self.soft_clamp_input_value.to(x.dtype)
+ sc_val = comfy.model_management.cast_to(self.soft_clamp_input_value, device=x.device, dtype=x.dtype)
x = (x / sc_val).tanh() * sc_val
quantized_out = torch.tensor(0., device=x.device, dtype=x.dtype)
@@ -895,7 +895,7 @@ class ResidualFSQ(nn.Module):
all_indices = []
for layer, scale in zip(self.layers, self.scales):
- scale = scale.to(residual.dtype)
+ scale = comfy.model_management.cast_to(scale, device=x.device, dtype=x.dtype)
quantized, indices = layer(residual / scale)
quantized = quantized * scale
From c2d7f07dbf312ef9034c65102f1a45c4a3355c1a Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Thu, 5 Feb 2026 16:24:09 -0800
Subject: [PATCH 10/10] Fix issue when using disable_unet_model_creation
(#12315)
---
comfy/model_base.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/comfy/model_base.py b/comfy/model_base.py
index dcbf12074..3bb54f59e 100644
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -147,11 +147,11 @@ class BaseModel(torch.nn.Module):
self.diffusion_model.to(memory_format=torch.channels_last)
logging.debug("using channels last mode for diffusion model")
logging.info("model weight dtype {}, manual cast: {}".format(self.get_dtype(), self.manual_cast_dtype))
+ comfy.model_management.archive_model_dtypes(self.diffusion_model)
+
self.model_type = model_type
self.model_sampling = model_sampling(model_config, model_type)
- comfy.model_management.archive_model_dtypes(self.diffusion_model)
-
self.adm_channels = unet_config.get("adm_in_channels", None)
if self.adm_channels is None:
self.adm_channels = 0