From 7ce3f64c784430e15731d344affffb48c55a0eaa Mon Sep 17 00:00:00 2001 From: "Daxiong (Lin)" Date: Wed, 15 Apr 2026 08:35:27 +0800 Subject: [PATCH 01/32] Update workflow templates to v0.9.54 (#13412) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 7f065e0d4..e45a20aaf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.42.11 -comfyui-workflow-templates==0.9.50 +comfyui-workflow-templates==0.9.54 comfyui-embedded-docs==0.4.3 torch torchsde From cb0bbde402cfb72559cc8b00f679d7735dff5c40 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 14 Apr 2026 19:54:47 -0700 Subject: [PATCH 02/32] Fix ernie on devices that don't support fp64. (#13414) --- comfy/ldm/ernie/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/ldm/ernie/model.py b/comfy/ldm/ernie/model.py index 3dbab8dc0..f7cdb51e6 100644 --- a/comfy/ldm/ernie/model.py +++ b/comfy/ldm/ernie/model.py @@ -15,7 +15,7 @@ def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor: scale = torch.arange(0, dim, 2, dtype=torch.float64, device=device) / dim omega = 1.0 / (theta**scale) - out = torch.einsum("...n,d->...nd", pos, omega) + out = torch.einsum("...n,d->...nd", pos.to(device), omega) out = torch.stack([torch.cos(out), torch.sin(out)], dim=0) return out.to(dtype=torch.float32, device=pos.device) From 8f374716ee98d378d403ebc61250e091ecd3a25c Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Tue, 14 Apr 2026 22:56:13 -0400 Subject: [PATCH 03/32] ComfyUI v0.19.1 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 0da11d5fa..3c6dac3d9 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.19.0" +__version__ = "0.19.1" diff --git a/pyproject.toml b/pyproject.toml index e8d4a9742..006ed9985 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.19.0" +version = "0.19.1" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.10" From 1de83f91c34a3396c667d764224054ba87027e82 Mon Sep 17 00:00:00 2001 From: Jun Yamog Date: Wed, 15 Apr 2026 21:10:36 +1200 Subject: [PATCH 04/32] Fix OOM regression in _apply() for quantized models during inference (#13372) Skip unnecessary clone of inference-mode tensors when already inside torch.inference_mode(), matching the existing guard in set_attr_param. The unconditional clone introduced in 20561aa9 caused transient VRAM doubling during model movement for FP8/quantized models. --- comfy/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/ops.py b/comfy/ops.py index b5cd1d47e..7a9b4b84c 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -1151,7 +1151,7 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec if param is None: continue p = fn(param) - if p.is_inference(): + if (not torch.is_inference_mode_enabled()) and p.is_inference(): p = p.clone() self.register_parameter(key, torch.nn.Parameter(p, requires_grad=False)) for key, buf in self._buffers.items(): From e9a2d1e4cc34ade8a655b386a3919a6d05aa290a Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 15 Apr 2026 19:59:08 -0700 Subject: [PATCH 05/32] Add a way to disable default template in text gen node. (#13424) --- comfy_extras/nodes_textgen.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/comfy_extras/nodes_textgen.py b/comfy_extras/nodes_textgen.py index f1aeb63fa..eed26c582 100644 --- a/comfy_extras/nodes_textgen.py +++ b/comfy_extras/nodes_textgen.py @@ -35,6 +35,7 @@ class TextGenerate(io.ComfyNode): io.Int.Input("max_length", default=256, min=1, max=2048), io.DynamicCombo.Input("sampling_mode", options=sampling_options, display_name="Sampling Mode"), io.Boolean.Input("thinking", optional=True, default=False, tooltip="Operate in thinking mode if the model supports it."), + io.Boolean.Input("use_default_template", optional=True, default=True, tooltip="Use the built in system prompt/template if the model has one.", advanced=True), ], outputs=[ io.String.Output(display_name="generated_text"), @@ -42,9 +43,9 @@ class TextGenerate(io.ComfyNode): ) @classmethod - def execute(cls, clip, prompt, max_length, sampling_mode, image=None, thinking=False) -> io.NodeOutput: + def execute(cls, clip, prompt, max_length, sampling_mode, image=None, thinking=False, use_default_template=True) -> io.NodeOutput: - tokens = clip.tokenize(prompt, image=image, skip_template=False, min_length=1, thinking=thinking) + tokens = clip.tokenize(prompt, image=image, skip_template=not use_default_template, min_length=1, thinking=thinking) # Get sampling parameters from dynamic combo do_sample = sampling_mode.get("sampling_mode") == "on" From b41ab53b6f289b4d7688ab96e0a06248ec1fd86b Mon Sep 17 00:00:00 2001 From: Bedovyy <137917911+bedovyy@users.noreply.github.com> Date: Thu, 16 Apr 2026 23:11:58 +0900 Subject: [PATCH 06/32] Use `ErnieTEModel_` not `ErnieTEModel`. (#13431) --- comfy/text_encoders/ernie.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/text_encoders/ernie.py b/comfy/text_encoders/ernie.py index 2c7df78fe..46d24d222 100644 --- a/comfy/text_encoders/ernie.py +++ b/comfy/text_encoders/ernie.py @@ -35,4 +35,4 @@ def te(dtype_llama=None, llama_quantization_metadata=None): model_options = model_options.copy() model_options["quantization_metadata"] = llama_quantization_metadata super().__init__(device=device, dtype=dtype, model_options=model_options) - return ErnieTEModel + return ErnieTEModel_ From d0c53c50c2a1edf11aa63967d09aa3efbfd43cfe Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 17 Apr 2026 04:32:04 +0300 Subject: [PATCH 07/32] feat(api-nodes): add 1080p resolution for SeeDance 2.0 model (#13437) Signed-off-by: bigcat88 --- comfy_api_nodes/nodes_bytedance.py | 38 ++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/comfy_api_nodes/nodes_bytedance.py b/comfy_api_nodes/nodes_bytedance.py index 1cca72f6e..429c32444 100644 --- a/comfy_api_nodes/nodes_bytedance.py +++ b/comfy_api_nodes/nodes_bytedance.py @@ -1066,7 +1066,7 @@ PRICE_BADGE_VIDEO = IO.PriceBadge( ) -def _seedance2_text_inputs(): +def _seedance2_text_inputs(resolutions: list[str]): return [ IO.String.Input( "prompt", @@ -1076,7 +1076,7 @@ def _seedance2_text_inputs(): ), IO.Combo.Input( "resolution", - options=["480p", "720p"], + options=resolutions, tooltip="Resolution of the output video.", ), IO.Combo.Input( @@ -1114,8 +1114,8 @@ class ByteDance2TextToVideoNode(IO.ComfyNode): IO.DynamicCombo.Input( "model", options=[ - IO.DynamicCombo.Option("Seedance 2.0", _seedance2_text_inputs()), - IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_text_inputs()), + IO.DynamicCombo.Option("Seedance 2.0", _seedance2_text_inputs(["480p", "720p", "1080p"])), + IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_text_inputs(["480p", "720p"])), ], tooltip="Seedance 2.0 for maximum quality; Seedance 2.0 Fast for speed optimization.", ), @@ -1152,11 +1152,14 @@ class ByteDance2TextToVideoNode(IO.ComfyNode): ( $rate480 := 10044; $rate720 := 21600; + $rate1080 := 48800; $m := widgets.model; $pricePer1K := $contains($m, "fast") ? 0.008008 : 0.01001; $res := $lookup(widgets, "model.resolution"); $dur := $lookup(widgets, "model.duration"); - $rate := $res = "720p" ? $rate720 : $rate480; + $rate := $res = "1080p" ? $rate1080 : + $res = "720p" ? $rate720 : + $rate480; $cost := $dur * $rate * $pricePer1K / 1000; {"type": "usd", "usd": $cost, "format": {"approximate": true}} ) @@ -1195,6 +1198,7 @@ class ByteDance2TextToVideoNode(IO.ComfyNode): status_extractor=lambda r: r.status, price_extractor=_seedance2_price_extractor(model_id, has_video_input=False), poll_interval=9, + max_poll_attempts=180, ) return IO.NodeOutput(await download_url_to_video_output(response.content.video_url)) @@ -1212,8 +1216,8 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode): IO.DynamicCombo.Input( "model", options=[ - IO.DynamicCombo.Option("Seedance 2.0", _seedance2_text_inputs()), - IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_text_inputs()), + IO.DynamicCombo.Option("Seedance 2.0", _seedance2_text_inputs(["480p", "720p", "1080p"])), + IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_text_inputs(["480p", "720p"])), ], tooltip="Seedance 2.0 for maximum quality; Seedance 2.0 Fast for speed optimization.", ), @@ -1259,11 +1263,14 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode): ( $rate480 := 10044; $rate720 := 21600; + $rate1080 := 48800; $m := widgets.model; $pricePer1K := $contains($m, "fast") ? 0.008008 : 0.01001; $res := $lookup(widgets, "model.resolution"); $dur := $lookup(widgets, "model.duration"); - $rate := $res = "720p" ? $rate720 : $rate480; + $rate := $res = "1080p" ? $rate1080 : + $res = "720p" ? $rate720 : + $rate480; $cost := $dur * $rate * $pricePer1K / 1000; {"type": "usd", "usd": $cost, "format": {"approximate": true}} ) @@ -1324,13 +1331,14 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode): status_extractor=lambda r: r.status, price_extractor=_seedance2_price_extractor(model_id, has_video_input=False), poll_interval=9, + max_poll_attempts=180, ) return IO.NodeOutput(await download_url_to_video_output(response.content.video_url)) -def _seedance2_reference_inputs(): +def _seedance2_reference_inputs(resolutions: list[str]): return [ - *_seedance2_text_inputs(), + *_seedance2_text_inputs(resolutions), IO.Autogrow.Input( "reference_images", template=IO.Autogrow.TemplateNames( @@ -1382,8 +1390,8 @@ class ByteDance2ReferenceNode(IO.ComfyNode): IO.DynamicCombo.Input( "model", options=[ - IO.DynamicCombo.Option("Seedance 2.0", _seedance2_reference_inputs()), - IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_reference_inputs()), + IO.DynamicCombo.Option("Seedance 2.0", _seedance2_reference_inputs(["480p", "720p", "1080p"])), + IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_reference_inputs(["480p", "720p"])), ], tooltip="Seedance 2.0 for maximum quality; Seedance 2.0 Fast for speed optimization.", ), @@ -1423,13 +1431,16 @@ class ByteDance2ReferenceNode(IO.ComfyNode): ( $rate480 := 10044; $rate720 := 21600; + $rate1080 := 48800; $m := widgets.model; $hasVideo := $lookup(inputGroups, "model.reference_videos") > 0; $noVideoPricePer1K := $contains($m, "fast") ? 0.008008 : 0.01001; $videoPricePer1K := $contains($m, "fast") ? 0.004719 : 0.006149; $res := $lookup(widgets, "model.resolution"); $dur := $lookup(widgets, "model.duration"); - $rate := $res = "720p" ? $rate720 : $rate480; + $rate := $res = "1080p" ? $rate1080 : + $res = "720p" ? $rate720 : + $rate480; $noVideoCost := $dur * $rate * $noVideoPricePer1K / 1000; $minVideoFactor := $ceil($dur * 5 / 3); $minVideoCost := $minVideoFactor * $rate * $videoPricePer1K / 1000; @@ -1559,6 +1570,7 @@ class ByteDance2ReferenceNode(IO.ComfyNode): status_extractor=lambda r: r.status, price_extractor=_seedance2_price_extractor(model_id, has_video_input=has_video_input), poll_interval=9, + max_poll_attempts=180, ) return IO.NodeOutput(await download_url_to_video_output(response.content.video_url)) From 1391579c33db4921a5d40c7e0e71a938b28eb047 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 16 Apr 2026 21:20:16 -0700 Subject: [PATCH 08/32] Add JsonExtractString node. (#13435) --- comfy_extras/nodes_string.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/comfy_extras/nodes_string.py b/comfy_extras/nodes_string.py index 75a8bb4ee..604076c4e 100644 --- a/comfy_extras/nodes_string.py +++ b/comfy_extras/nodes_string.py @@ -1,4 +1,5 @@ import re +import json from typing_extensions import override from comfy_api.latest import ComfyExtension, io @@ -375,6 +376,39 @@ class RegexReplace(io.ComfyNode): return io.NodeOutput(result) +class JsonExtractString(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="JsonExtractString", + display_name="Extract String from JSON", + category="utils/string", + search_aliases=["json", "extract json", "parse json", "json value", "read json"], + inputs=[ + io.String.Input("json_string", multiline=True), + io.String.Input("key", multiline=False), + ], + outputs=[ + io.String.Output(), + ] + ) + + @classmethod + def execute(cls, json_string, key): + try: + data = json.loads(json_string) + if isinstance(data, dict) and key in data: + value = data[key] + if value is None: + return io.NodeOutput("") + + return io.NodeOutput(str(value)) + + return io.NodeOutput("") + + except (json.JSONDecodeError, TypeError): + return io.NodeOutput("") + class StringExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[io.ComfyNode]]: @@ -390,6 +424,7 @@ class StringExtension(ComfyExtension): RegexMatch, RegexExtract, RegexReplace, + JsonExtractString, ] async def comfy_entrypoint() -> StringExtension: From c033bbf516ad8fcd079b45c318e73ee8b5e22962 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Fri, 17 Apr 2026 00:26:35 -0400 Subject: [PATCH 09/32] ComfyUI v0.19.2 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 3c6dac3d9..98b8337b4 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.19.1" +__version__ = "0.19.2" diff --git a/pyproject.toml b/pyproject.toml index 006ed9985..c4b006486 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.19.1" +version = "0.19.2" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.10" From 05f75311489c94e905d958c2bc4b22db5be78699 Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Sat, 18 Apr 2026 02:20:09 +1000 Subject: [PATCH 10/32] nodes_textgen: Implement use_default_template for LTX (#13451) --- comfy_extras/nodes_textgen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy_extras/nodes_textgen.py b/comfy_extras/nodes_textgen.py index eed26c582..1f46d820f 100644 --- a/comfy_extras/nodes_textgen.py +++ b/comfy_extras/nodes_textgen.py @@ -161,12 +161,12 @@ class TextGenerateLTX2Prompt(TextGenerate): ) @classmethod - def execute(cls, clip, prompt, max_length, sampling_mode, image=None, thinking=False) -> io.NodeOutput: + def execute(cls, clip, prompt, max_length, sampling_mode, image=None, thinking=False, use_default_template=True) -> io.NodeOutput: if image is None: formatted_prompt = f"system\n{LTX2_T2V_SYSTEM_PROMPT.strip()}\nuser\nUser Raw Input Prompt: {prompt}.\nmodel\n" else: formatted_prompt = f"system\n{LTX2_I2V_SYSTEM_PROMPT.strip()}\nuser\n\n\n\nUser Raw Input Prompt: {prompt}.\nmodel\n" - return super().execute(clip, formatted_prompt, max_length, sampling_mode, image, thinking) + return super().execute(clip, formatted_prompt, max_length, sampling_mode, image, thinking, use_default_template) class TextgenExtension(ComfyExtension): From 541fd10bbe5ac5e963619bb9594c4993f977e9e1 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 17 Apr 2026 19:44:08 +0300 Subject: [PATCH 11/32] fix(api-nodes): corrected StabilityAI price badges (#13454) Signed-off-by: bigcat88 --- comfy_api_nodes/nodes_stability.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comfy_api_nodes/nodes_stability.py b/comfy_api_nodes/nodes_stability.py index 9ef13c83b..906d8ff35 100644 --- a/comfy_api_nodes/nodes_stability.py +++ b/comfy_api_nodes/nodes_stability.py @@ -401,7 +401,7 @@ class StabilityUpscaleConservativeNode(IO.ComfyNode): ], is_api_node=True, price_badge=IO.PriceBadge( - expr="""{"type":"usd","usd":0.25}""", + expr="""{"type":"usd","usd":0.4}""", ), ) @@ -510,7 +510,7 @@ class StabilityUpscaleCreativeNode(IO.ComfyNode): ], is_api_node=True, price_badge=IO.PriceBadge( - expr="""{"type":"usd","usd":0.25}""", + expr="""{"type":"usd","usd":0.6}""", ), ) @@ -593,7 +593,7 @@ class StabilityUpscaleFastNode(IO.ComfyNode): ], is_api_node=True, price_badge=IO.PriceBadge( - expr="""{"type":"usd","usd":0.01}""", + expr="""{"type":"usd","usd":0.02}""", ), ) From 4f48be41388f67022d58f4f07f2f785adb8bfeea Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 17 Apr 2026 20:02:06 +0300 Subject: [PATCH 12/32] feat(api-nodes): add new "arrow-1.1" and "arrow-1.1-max" SVG models (#13447) Signed-off-by: bigcat88 --- comfy_api_nodes/nodes_quiver.py | 135 +++++++++++++++----------------- 1 file changed, 65 insertions(+), 70 deletions(-) diff --git a/comfy_api_nodes/nodes_quiver.py b/comfy_api_nodes/nodes_quiver.py index 61533263f..28862e368 100644 --- a/comfy_api_nodes/nodes_quiver.py +++ b/comfy_api_nodes/nodes_quiver.py @@ -17,6 +17,44 @@ from comfy_api_nodes.util import ( ) from comfy_extras.nodes_images import SVG +_ARROW_MODELS = ["arrow-1.1", "arrow-1.1-max", "arrow-preview"] + + +def _arrow_sampling_inputs(): + """Shared sampling inputs for all Arrow model variants.""" + return [ + IO.Float.Input( + "temperature", + default=1.0, + min=0.0, + max=2.0, + step=0.1, + display_mode=IO.NumberDisplay.slider, + tooltip="Randomness control. Higher values increase randomness.", + advanced=True, + ), + IO.Float.Input( + "top_p", + default=1.0, + min=0.05, + max=1.0, + step=0.05, + display_mode=IO.NumberDisplay.slider, + tooltip="Nucleus sampling parameter.", + advanced=True, + ), + IO.Float.Input( + "presence_penalty", + default=0.0, + min=-2.0, + max=2.0, + step=0.1, + display_mode=IO.NumberDisplay.slider, + tooltip="Token presence penalty.", + advanced=True, + ), + ] + class QuiverTextToSVGNode(IO.ComfyNode): @classmethod @@ -39,6 +77,7 @@ class QuiverTextToSVGNode(IO.ComfyNode): default="", tooltip="Additional style or formatting guidance.", optional=True, + advanced=True, ), IO.Autogrow.Input( "reference_images", @@ -53,43 +92,7 @@ class QuiverTextToSVGNode(IO.ComfyNode): ), IO.DynamicCombo.Input( "model", - options=[ - IO.DynamicCombo.Option( - "arrow-preview", - [ - IO.Float.Input( - "temperature", - default=1.0, - min=0.0, - max=2.0, - step=0.1, - display_mode=IO.NumberDisplay.slider, - tooltip="Randomness control. Higher values increase randomness.", - advanced=True, - ), - IO.Float.Input( - "top_p", - default=1.0, - min=0.05, - max=1.0, - step=0.05, - display_mode=IO.NumberDisplay.slider, - tooltip="Nucleus sampling parameter.", - advanced=True, - ), - IO.Float.Input( - "presence_penalty", - default=0.0, - min=-2.0, - max=2.0, - step=0.1, - display_mode=IO.NumberDisplay.slider, - tooltip="Token presence penalty.", - advanced=True, - ), - ], - ), - ], + options=[IO.DynamicCombo.Option(m, _arrow_sampling_inputs()) for m in _ARROW_MODELS], tooltip="Model to use for SVG generation.", ), IO.Int.Input( @@ -112,7 +115,16 @@ class QuiverTextToSVGNode(IO.ComfyNode): ], is_api_node=True, price_badge=IO.PriceBadge( - expr="""{"type":"usd","usd":0.429}""", + depends_on=IO.PriceBadgeDepends(widgets=["model"]), + expr=""" + ( + $contains(widgets.model, "max") + ? {"type":"usd","usd":0.3575} + : $contains(widgets.model, "preview") + ? {"type":"usd","usd":0.429} + : {"type":"usd","usd":0.286} + ) + """, ), ) @@ -176,12 +188,13 @@ class QuiverImageToSVGNode(IO.ComfyNode): "auto_crop", default=False, tooltip="Automatically crop to the dominant subject.", + advanced=True, ), IO.DynamicCombo.Input( "model", options=[ IO.DynamicCombo.Option( - "arrow-preview", + m, [ IO.Int.Input( "target_size", @@ -189,39 +202,12 @@ class QuiverImageToSVGNode(IO.ComfyNode): min=128, max=4096, tooltip="Square resize target in pixels.", - ), - IO.Float.Input( - "temperature", - default=1.0, - min=0.0, - max=2.0, - step=0.1, - display_mode=IO.NumberDisplay.slider, - tooltip="Randomness control. Higher values increase randomness.", - advanced=True, - ), - IO.Float.Input( - "top_p", - default=1.0, - min=0.05, - max=1.0, - step=0.05, - display_mode=IO.NumberDisplay.slider, - tooltip="Nucleus sampling parameter.", - advanced=True, - ), - IO.Float.Input( - "presence_penalty", - default=0.0, - min=-2.0, - max=2.0, - step=0.1, - display_mode=IO.NumberDisplay.slider, - tooltip="Token presence penalty.", advanced=True, ), + *_arrow_sampling_inputs(), ], - ), + ) + for m in _ARROW_MODELS ], tooltip="Model to use for SVG vectorization.", ), @@ -245,7 +231,16 @@ class QuiverImageToSVGNode(IO.ComfyNode): ], is_api_node=True, price_badge=IO.PriceBadge( - expr="""{"type":"usd","usd":0.429}""", + depends_on=IO.PriceBadgeDepends(widgets=["model"]), + expr=""" + ( + $contains(widgets.model, "max") + ? {"type":"usd","usd":0.3575} + : $contains(widgets.model, "preview") + ? {"type":"usd","usd":0.429} + : {"type":"usd","usd":0.286} + ) + """, ), ) From f8d92cf3138092050955fabf9172b3defcd89484 Mon Sep 17 00:00:00 2001 From: "Daxiong (Lin)" Date: Sat, 18 Apr 2026 01:16:39 +0800 Subject: [PATCH 13/32] chore: update workflow templates to v0.9.57 (#13455) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e45a20aaf..3de845f48 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.42.11 -comfyui-workflow-templates==0.9.54 +comfyui-workflow-templates==0.9.57 comfyui-embedded-docs==0.4.3 torch torchsde From 9635c2ec9b92f8fa1113660ace7660c1fea67e0e Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 17 Apr 2026 20:31:37 +0300 Subject: [PATCH 14/32] fix(api-nodes): make "obj" output optional in Hunyuan3D Text and Image to 3D (#13449) Signed-off-by: bigcat88 Co-authored-by: Jedrzej Kosinski --- comfy_api_nodes/nodes_hunyuan3d.py | 34 ++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/comfy_api_nodes/nodes_hunyuan3d.py b/comfy_api_nodes/nodes_hunyuan3d.py index 44c94a98e..5fc31bccd 100644 --- a/comfy_api_nodes/nodes_hunyuan3d.py +++ b/comfy_api_nodes/nodes_hunyuan3d.py @@ -221,14 +221,17 @@ class TencentTextToModelNode(IO.ComfyNode): response_model=To3DProTaskResultResponse, status_extractor=lambda r: r.Status, ) - obj_result = await download_and_extract_obj_zip(get_file_from_response(result.ResultFile3Ds, "obj").Url) + obj_file_response = get_file_from_response(result.ResultFile3Ds, "obj", raise_if_not_found=False) + obj_result = None + if obj_file_response: + obj_result = await download_and_extract_obj_zip(obj_file_response.Url) return IO.NodeOutput( f"{task_id}.glb", await download_url_to_file_3d( get_file_from_response(result.ResultFile3Ds, "glb").Url, "glb", task_id=task_id ), - obj_result.obj, - obj_result.texture, + obj_result.obj if obj_result else None, + obj_result.texture if obj_result else None, ) @@ -378,17 +381,30 @@ class TencentImageToModelNode(IO.ComfyNode): response_model=To3DProTaskResultResponse, status_extractor=lambda r: r.Status, ) - obj_result = await download_and_extract_obj_zip(get_file_from_response(result.ResultFile3Ds, "obj").Url) + obj_file_response = get_file_from_response(result.ResultFile3Ds, "obj", raise_if_not_found=False) + if obj_file_response: + obj_result = await download_and_extract_obj_zip(obj_file_response.Url) + return IO.NodeOutput( + f"{task_id}.glb", + await download_url_to_file_3d( + get_file_from_response(result.ResultFile3Ds, "glb").Url, "glb", task_id=task_id + ), + obj_result.obj, + obj_result.texture, + obj_result.metallic if obj_result.metallic is not None else torch.zeros(1, 1, 1, 3), + obj_result.normal if obj_result.normal is not None else torch.zeros(1, 1, 1, 3), + obj_result.roughness if obj_result.roughness is not None else torch.zeros(1, 1, 1, 3), + ) return IO.NodeOutput( f"{task_id}.glb", await download_url_to_file_3d( get_file_from_response(result.ResultFile3Ds, "glb").Url, "glb", task_id=task_id ), - obj_result.obj, - obj_result.texture, - obj_result.metallic if obj_result.metallic is not None else torch.zeros(1, 1, 1, 3), - obj_result.normal if obj_result.normal is not None else torch.zeros(1, 1, 1, 3), - obj_result.roughness if obj_result.roughness is not None else torch.zeros(1, 1, 1, 3), + None, + None, + None, + None, + None, ) From 3086026401180c9216bcb6ace442a4e3587d2c66 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Fri, 17 Apr 2026 13:35:01 -0400 Subject: [PATCH 15/32] ComfyUI v0.19.3 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 98b8337b4..2a1eb9905 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.19.2" +__version__ = "0.19.3" diff --git a/pyproject.toml b/pyproject.toml index c4b006486..8fa92ecbe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.19.2" +version = "0.19.3" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.10" From b9dedea57d9f8be9861811aef3ced3e221eb8068 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com> Date: Sun, 19 Apr 2026 06:02:01 +0300 Subject: [PATCH 16/32] feat: SUPIR model support (CORE-17) (#13250) --- .../modules/diffusionmodules/openaimodel.py | 30 ++- comfy/ldm/supir/__init__.py | 0 comfy/ldm/supir/supir_modules.py | 226 ++++++++++++++++++ comfy/ldm/supir/supir_patch.py | 103 ++++++++ comfy/model_patcher.py | 4 + comfy_extras/nodes_model_patch.py | 104 ++++++++ comfy_extras/nodes_post_processing.py | 224 +++++++++++++++++ 7 files changed, 680 insertions(+), 11 deletions(-) create mode 100644 comfy/ldm/supir/__init__.py create mode 100644 comfy/ldm/supir/supir_modules.py create mode 100644 comfy/ldm/supir/supir_patch.py diff --git a/comfy/ldm/modules/diffusionmodules/openaimodel.py b/comfy/ldm/modules/diffusionmodules/openaimodel.py index 295310df6..4b92c44cf 100644 --- a/comfy/ldm/modules/diffusionmodules/openaimodel.py +++ b/comfy/ldm/modules/diffusionmodules/openaimodel.py @@ -34,6 +34,16 @@ class TimestepBlock(nn.Module): #This is needed because accelerate makes a copy of transformer_options which breaks "transformer_index" def forward_timestep_embed(ts, x, emb, context=None, transformer_options={}, output_shape=None, time_context=None, num_video_frames=None, image_only_indicator=None): for layer in ts: + if "patches" in transformer_options and "forward_timestep_embed_patch" in transformer_options["patches"]: + found_patched = False + for class_type, handler in transformer_options["patches"]["forward_timestep_embed_patch"]: + if isinstance(layer, class_type): + x = handler(layer, x, emb, context, transformer_options, output_shape, time_context, num_video_frames, image_only_indicator) + found_patched = True + break + if found_patched: + continue + if isinstance(layer, VideoResBlock): x = layer(x, emb, num_video_frames, image_only_indicator) elif isinstance(layer, TimestepBlock): @@ -49,15 +59,6 @@ def forward_timestep_embed(ts, x, emb, context=None, transformer_options={}, out elif isinstance(layer, Upsample): x = layer(x, output_shape=output_shape) else: - if "patches" in transformer_options and "forward_timestep_embed_patch" in transformer_options["patches"]: - found_patched = False - for class_type, handler in transformer_options["patches"]["forward_timestep_embed_patch"]: - if isinstance(layer, class_type): - x = handler(layer, x, emb, context, transformer_options, output_shape, time_context, num_video_frames, image_only_indicator) - found_patched = True - break - if found_patched: - continue x = layer(x) return x @@ -894,6 +895,12 @@ class UNetModel(nn.Module): h = forward_timestep_embed(self.middle_block, h, emb, context, transformer_options, time_context=time_context, num_video_frames=num_video_frames, image_only_indicator=image_only_indicator) h = apply_control(h, control, 'middle') + if "middle_block_after_patch" in transformer_patches: + patch = transformer_patches["middle_block_after_patch"] + for p in patch: + out = p({"h": h, "x": x, "emb": emb, "context": context, "y": y, + "timesteps": timesteps, "transformer_options": transformer_options}) + h = out["h"] for id, module in enumerate(self.output_blocks): transformer_options["block"] = ("output", id) @@ -905,8 +912,9 @@ class UNetModel(nn.Module): for p in patch: h, hsp = p(h, hsp, transformer_options) - h = th.cat([h, hsp], dim=1) - del hsp + if hsp is not None: + h = th.cat([h, hsp], dim=1) + del hsp if len(hs) > 0: output_shape = hs[-1].shape else: diff --git a/comfy/ldm/supir/__init__.py b/comfy/ldm/supir/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/comfy/ldm/supir/supir_modules.py b/comfy/ldm/supir/supir_modules.py new file mode 100644 index 000000000..7389b01d2 --- /dev/null +++ b/comfy/ldm/supir/supir_modules.py @@ -0,0 +1,226 @@ +import torch +import torch.nn as nn + +from comfy.ldm.modules.diffusionmodules.util import timestep_embedding +from comfy.ldm.modules.diffusionmodules.openaimodel import Downsample, TimestepEmbedSequential, ResBlock, SpatialTransformer +from comfy.ldm.modules.attention import optimized_attention + + +class ZeroSFT(nn.Module): + def __init__(self, label_nc, norm_nc, concat_channels=0, dtype=None, device=None, operations=None): + super().__init__() + + ks = 3 + pw = ks // 2 + + self.param_free_norm = operations.GroupNorm(32, norm_nc + concat_channels, dtype=dtype, device=device) + + nhidden = 128 + + self.mlp_shared = nn.Sequential( + operations.Conv2d(label_nc, nhidden, kernel_size=ks, padding=pw, dtype=dtype, device=device), + nn.SiLU() + ) + self.zero_mul = operations.Conv2d(nhidden, norm_nc + concat_channels, kernel_size=ks, padding=pw, dtype=dtype, device=device) + self.zero_add = operations.Conv2d(nhidden, norm_nc + concat_channels, kernel_size=ks, padding=pw, dtype=dtype, device=device) + + self.zero_conv = operations.Conv2d(label_nc, norm_nc, 1, 1, 0, dtype=dtype, device=device) + self.pre_concat = bool(concat_channels != 0) + + def forward(self, c, h, h_ori=None, control_scale=1): + if h_ori is not None and self.pre_concat: + h_raw = torch.cat([h_ori, h], dim=1) + else: + h_raw = h + + h = h + self.zero_conv(c) + if h_ori is not None and self.pre_concat: + h = torch.cat([h_ori, h], dim=1) + actv = self.mlp_shared(c) + gamma = self.zero_mul(actv) + beta = self.zero_add(actv) + h = self.param_free_norm(h) + h = torch.addcmul(h + beta, h, gamma) + if h_ori is not None and not self.pre_concat: + h = torch.cat([h_ori, h], dim=1) + return torch.lerp(h_raw, h, control_scale) + + +class _CrossAttnInner(nn.Module): + """Inner cross-attention module matching the state_dict layout of the original CrossAttention.""" + def __init__(self, query_dim, context_dim, heads, dim_head, dtype=None, device=None, operations=None): + super().__init__() + inner_dim = dim_head * heads + self.heads = heads + self.to_q = operations.Linear(query_dim, inner_dim, bias=False, dtype=dtype, device=device) + self.to_k = operations.Linear(context_dim, inner_dim, bias=False, dtype=dtype, device=device) + self.to_v = operations.Linear(context_dim, inner_dim, bias=False, dtype=dtype, device=device) + self.to_out = nn.Sequential( + operations.Linear(inner_dim, query_dim, dtype=dtype, device=device), + ) + + def forward(self, x, context): + q = self.to_q(x) + k = self.to_k(context) + v = self.to_v(context) + return self.to_out(optimized_attention(q, k, v, self.heads)) + + +class ZeroCrossAttn(nn.Module): + def __init__(self, context_dim, query_dim, dtype=None, device=None, operations=None): + super().__init__() + heads = query_dim // 64 + dim_head = 64 + self.attn = _CrossAttnInner(query_dim, context_dim, heads, dim_head, dtype=dtype, device=device, operations=operations) + self.norm1 = operations.GroupNorm(32, query_dim, dtype=dtype, device=device) + self.norm2 = operations.GroupNorm(32, context_dim, dtype=dtype, device=device) + + def forward(self, context, x, control_scale=1): + b, c, h, w = x.shape + x_in = x + + x = self.attn( + self.norm1(x).flatten(2).transpose(1, 2), + self.norm2(context).flatten(2).transpose(1, 2), + ).transpose(1, 2).unflatten(2, (h, w)) + + return x_in + x * control_scale + + +class GLVControl(nn.Module): + """SUPIR's Guided Latent Vector control encoder. Truncated UNet (input + middle blocks only).""" + def __init__( + self, + in_channels=4, + model_channels=320, + num_res_blocks=2, + attention_resolutions=(4, 2), + channel_mult=(1, 2, 4), + num_head_channels=64, + transformer_depth=(1, 2, 10), + context_dim=2048, + adm_in_channels=2816, + use_linear_in_transformer=True, + use_checkpoint=False, + dtype=None, + device=None, + operations=None, + **kwargs, + ): + super().__init__() + self.model_channels = model_channels + time_embed_dim = model_channels * 4 + + self.time_embed = nn.Sequential( + operations.Linear(model_channels, time_embed_dim, dtype=dtype, device=device), + nn.SiLU(), + operations.Linear(time_embed_dim, time_embed_dim, dtype=dtype, device=device), + ) + + self.label_emb = nn.Sequential( + nn.Sequential( + operations.Linear(adm_in_channels, time_embed_dim, dtype=dtype, device=device), + nn.SiLU(), + operations.Linear(time_embed_dim, time_embed_dim, dtype=dtype, device=device), + ) + ) + + self.input_blocks = nn.ModuleList([ + TimestepEmbedSequential( + operations.Conv2d(in_channels, model_channels, 3, padding=1, dtype=dtype, device=device) + ) + ]) + ch = model_channels + ds = 1 + for level, mult in enumerate(channel_mult): + for nr in range(num_res_blocks): + layers = [ + ResBlock(ch, time_embed_dim, 0, out_channels=mult * model_channels, + dtype=dtype, device=device, operations=operations) + ] + ch = mult * model_channels + if ds in attention_resolutions: + num_heads = ch // num_head_channels + layers.append( + SpatialTransformer(ch, num_heads, num_head_channels, + depth=transformer_depth[level], context_dim=context_dim, + use_linear=use_linear_in_transformer, + use_checkpoint=use_checkpoint, + dtype=dtype, device=device, operations=operations) + ) + self.input_blocks.append(TimestepEmbedSequential(*layers)) + if level != len(channel_mult) - 1: + self.input_blocks.append( + TimestepEmbedSequential( + Downsample(ch, True, out_channels=ch, dtype=dtype, device=device, operations=operations) + ) + ) + ds *= 2 + + num_heads = ch // num_head_channels + self.middle_block = TimestepEmbedSequential( + ResBlock(ch, time_embed_dim, 0, dtype=dtype, device=device, operations=operations), + SpatialTransformer(ch, num_heads, num_head_channels, + depth=transformer_depth[-1], context_dim=context_dim, + use_linear=use_linear_in_transformer, + use_checkpoint=use_checkpoint, + dtype=dtype, device=device, operations=operations), + ResBlock(ch, time_embed_dim, 0, dtype=dtype, device=device, operations=operations), + ) + + self.input_hint_block = TimestepEmbedSequential( + operations.Conv2d(in_channels, model_channels, 3, padding=1, dtype=dtype, device=device) + ) + + def forward(self, x, timesteps, xt, context=None, y=None, **kwargs): + t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False).to(x.dtype) + emb = self.time_embed(t_emb) + self.label_emb(y) + + guided_hint = self.input_hint_block(x, emb, context) + + hs = [] + h = xt + for module in self.input_blocks: + if guided_hint is not None: + h = module(h, emb, context) + h += guided_hint + guided_hint = None + else: + h = module(h, emb, context) + hs.append(h) + h = self.middle_block(h, emb, context) + hs.append(h) + return hs + + +class SUPIR(nn.Module): + """ + SUPIR model containing GLVControl (control encoder) and project_modules (adapters). + State dict keys match the original SUPIR checkpoint layout: + control_model.* -> GLVControl + project_modules.* -> nn.ModuleList of ZeroSFT/ZeroCrossAttn + """ + def __init__(self, device=None, dtype=None, operations=None): + super().__init__() + + self.control_model = GLVControl(dtype=dtype, device=device, operations=operations) + + project_channel_scale = 2 + cond_output_channels = [320] * 4 + [640] * 3 + [1280] * 3 + project_channels = [int(c * project_channel_scale) for c in [160] * 4 + [320] * 3 + [640] * 3] + concat_channels = [320] * 2 + [640] * 3 + [1280] * 4 + [0] + cross_attn_insert_idx = [6, 3] + + self.project_modules = nn.ModuleList() + for i in range(len(cond_output_channels)): + self.project_modules.append(ZeroSFT( + project_channels[i], cond_output_channels[i], + concat_channels=concat_channels[i], + dtype=dtype, device=device, operations=operations, + )) + + for i in cross_attn_insert_idx: + self.project_modules.insert(i, ZeroCrossAttn( + cond_output_channels[i], concat_channels[i], + dtype=dtype, device=device, operations=operations, + )) diff --git a/comfy/ldm/supir/supir_patch.py b/comfy/ldm/supir/supir_patch.py new file mode 100644 index 000000000..b67ab4cd8 --- /dev/null +++ b/comfy/ldm/supir/supir_patch.py @@ -0,0 +1,103 @@ +import torch +from comfy.ldm.modules.diffusionmodules.openaimodel import Upsample + + +class SUPIRPatch: + """ + Holds GLVControl (control encoder) + project_modules (ZeroSFT/ZeroCrossAttn adapters). + Runs GLVControl lazily on first patch invocation per step, applies adapters through + middle_block_after_patch, output_block_merge_patch, and forward_timestep_embed_patch. + """ + SIGMA_MAX = 14.6146 + + def __init__(self, model_patch, project_modules, hint_latent, strength_start, strength_end): + self.model_patch = model_patch # CoreModelPatcher wrapping GLVControl + self.project_modules = project_modules # nn.ModuleList of ZeroSFT/ZeroCrossAttn + self.hint_latent = hint_latent # encoded LQ image latent + self.strength_start = strength_start + self.strength_end = strength_end + self.cached_features = None + self.adapter_idx = 0 + self.control_idx = 0 + self.current_control_idx = 0 + self.active = True + + def _ensure_features(self, kwargs): + """Run GLVControl on first call per step, cache results.""" + if self.cached_features is not None: + return + x = kwargs["x"] + b = x.shape[0] + hint = self.hint_latent.to(device=x.device, dtype=x.dtype) + if hint.shape[0] != b: + hint = hint.expand(b, -1, -1, -1) if hint.shape[0] == 1 else hint.repeat((b + hint.shape[0] - 1) // hint.shape[0], 1, 1, 1)[:b] + self.cached_features = self.model_patch.model.control_model( + hint, kwargs["timesteps"], x, + kwargs["context"], kwargs["y"] + ) + self.adapter_idx = len(self.project_modules) - 1 + self.control_idx = len(self.cached_features) - 1 + + def _get_control_scale(self, kwargs): + if self.strength_start == self.strength_end: + return self.strength_end + sigma = kwargs["transformer_options"].get("sigmas") + if sigma is None: + return self.strength_end + s = sigma[0].item() if sigma.dim() > 0 else sigma.item() + t = min(s / self.SIGMA_MAX, 1.0) + return t * (self.strength_start - self.strength_end) + self.strength_end + + def middle_after(self, kwargs): + """middle_block_after_patch: run GLVControl lazily, apply last adapter after middle block.""" + self.cached_features = None # reset from previous step + self.current_scale = self._get_control_scale(kwargs) + self.active = self.current_scale > 0 + if not self.active: + return {"h": kwargs["h"]} + self._ensure_features(kwargs) + h = kwargs["h"] + h = self.project_modules[self.adapter_idx]( + self.cached_features[self.control_idx], h, control_scale=self.current_scale + ) + self.adapter_idx -= 1 + self.control_idx -= 1 + return {"h": h} + + def output_block(self, h, hsp, transformer_options): + """output_block_patch: ZeroSFT adapter fusion replaces cat([h, hsp]). Returns (h, None) to skip cat.""" + if not self.active: + return h, hsp + self.current_control_idx = self.control_idx + h = self.project_modules[self.adapter_idx]( + self.cached_features[self.control_idx], hsp, h, control_scale=self.current_scale + ) + self.adapter_idx -= 1 + self.control_idx -= 1 + return h, None + + def pre_upsample(self, layer, x, emb, context, transformer_options, output_shape, *args, **kw): + """forward_timestep_embed_patch for Upsample: extra cross-attn adapter before upsample.""" + block_type, _ = transformer_options["block"] + if block_type == "output" and self.active and self.cached_features is not None: + x = self.project_modules[self.adapter_idx]( + self.cached_features[self.current_control_idx], x, control_scale=self.current_scale + ) + self.adapter_idx -= 1 + return layer(x, output_shape=output_shape) + + def to(self, device_or_dtype): + if isinstance(device_or_dtype, torch.device): + self.cached_features = None + if self.hint_latent is not None: + self.hint_latent = self.hint_latent.to(device_or_dtype) + return self + + def models(self): + return [self.model_patch] + + def register(self, model_patcher): + """Register all patches on a cloned model patcher.""" + model_patcher.set_model_patch(self.middle_after, "middle_block_after_patch") + model_patcher.set_model_output_block_patch(self.output_block) + model_patcher.set_model_patch((Upsample, self.pre_upsample), "forward_timestep_embed_patch") diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 6deb71e12..93d19d6fe 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -506,6 +506,10 @@ class ModelPatcher: def set_model_noise_refiner_patch(self, patch): self.set_model_patch(patch, "noise_refiner") + def set_model_middle_block_after_patch(self, patch): + self.set_model_patch(patch, "middle_block_after_patch") + + def set_model_rope_options(self, scale_x, shift_x, scale_y, shift_y, scale_t, shift_t, **kwargs): rope_options = self.model_options["transformer_options"].get("rope_options", {}) rope_options["scale_x"] = scale_x diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py index 176e6bc2f..748559a6b 100644 --- a/comfy_extras/nodes_model_patch.py +++ b/comfy_extras/nodes_model_patch.py @@ -7,7 +7,10 @@ import comfy.model_management import comfy.ldm.common_dit import comfy.latent_formats import comfy.ldm.lumina.controlnet +import comfy.ldm.supir.supir_modules from comfy.ldm.wan.model_multitalk import WanMultiTalkAttentionBlock, MultiTalkAudioProjModel +from comfy_api.latest import io +from comfy.ldm.supir.supir_patch import SUPIRPatch class BlockWiseControlBlock(torch.nn.Module): @@ -266,6 +269,27 @@ class ModelPatchLoader: out_dim=sd["audio_proj.norm.weight"].shape[0], device=comfy.model_management.unet_offload_device(), operations=comfy.ops.manual_cast) + elif 'model.control_model.input_hint_block.0.weight' in sd or 'control_model.input_hint_block.0.weight' in sd: + prefix_replace = {} + if 'model.control_model.input_hint_block.0.weight' in sd: + prefix_replace["model.control_model."] = "control_model." + prefix_replace["model.diffusion_model.project_modules."] = "project_modules." + else: + prefix_replace["control_model."] = "control_model." + prefix_replace["project_modules."] = "project_modules." + + # Extract denoise_encoder weights before filter_keys discards them + de_prefix = "first_stage_model.denoise_encoder." + denoise_encoder_sd = {} + for k in list(sd.keys()): + if k.startswith(de_prefix): + denoise_encoder_sd[k[len(de_prefix):]] = sd.pop(k) + + sd = comfy.utils.state_dict_prefix_replace(sd, prefix_replace, filter_keys=True) + sd.pop("control_model.mask_LQ", None) + model = comfy.ldm.supir.supir_modules.SUPIR(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast) + if denoise_encoder_sd: + model.denoise_encoder_sd = denoise_encoder_sd model_patcher = comfy.model_patcher.CoreModelPatcher(model, load_device=comfy.model_management.get_torch_device(), offload_device=comfy.model_management.unet_offload_device()) model.load_state_dict(sd, assign=model_patcher.is_dynamic()) @@ -565,9 +589,89 @@ class MultiTalkModelPatch(torch.nn.Module): ) +class SUPIRApply(io.ComfyNode): + @classmethod + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="SUPIRApply", + category="model_patches/supir", + is_experimental=True, + inputs=[ + io.Model.Input("model"), + io.ModelPatch.Input("model_patch"), + io.Vae.Input("vae"), + io.Image.Input("image"), + io.Float.Input("strength_start", default=1.0, min=0.0, max=10.0, step=0.01, + tooltip="Control strength at the start of sampling (high sigma)."), + io.Float.Input("strength_end", default=1.0, min=0.0, max=10.0, step=0.01, + tooltip="Control strength at the end of sampling (low sigma). Linearly interpolated from start."), + io.Float.Input("restore_cfg", default=4.0, min=0.0, max=20.0, step=0.1, advanced=True, + tooltip="Pulls denoised output toward the input latent. Higher = stronger fidelity to input. 0 to disable."), + io.Float.Input("restore_cfg_s_tmin", default=0.05, min=0.0, max=1.0, step=0.01, advanced=True, + tooltip="Sigma threshold below which restore_cfg is disabled."), + ], + outputs=[io.Model.Output()], + ) + + @classmethod + def _encode_with_denoise_encoder(cls, vae, model_patch, image): + """Encode using denoise_encoder weights from SUPIR checkpoint if available.""" + denoise_sd = getattr(model_patch.model, 'denoise_encoder_sd', None) + if not denoise_sd: + return vae.encode(image) + + # Clone VAE patcher, apply denoise_encoder weights to clone, encode + orig_patcher = vae.patcher + vae.patcher = orig_patcher.clone() + patches = {f"encoder.{k}": (v,) for k, v in denoise_sd.items()} + vae.patcher.add_patches(patches, strength_patch=1.0, strength_model=0.0) + try: + return vae.encode(image) + finally: + vae.patcher = orig_patcher + + @classmethod + def execute(cls, *, model: io.Model.Type, model_patch: io.ModelPatch.Type, vae: io.Vae.Type, image: io.Image.Type, + strength_start: float, strength_end: float, restore_cfg: float, restore_cfg_s_tmin: float) -> io.NodeOutput: + model_patched = model.clone() + hint_latent = model.get_model_object("latent_format").process_in( + cls._encode_with_denoise_encoder(vae, model_patch, image[:, :, :, :3])) + patch = SUPIRPatch(model_patch, model_patch.model.project_modules, hint_latent, strength_start, strength_end) + patch.register(model_patched) + + if restore_cfg > 0.0: + # Round-trip to match original pipeline: decode hint, re-encode with regular VAE + latent_format = model.get_model_object("latent_format") + decoded = vae.decode(latent_format.process_out(hint_latent)) + x_center = latent_format.process_in(vae.encode(decoded[:, :, :, :3])) + sigma_max = 14.6146 + + def restore_cfg_function(args): + denoised = args["denoised"] + sigma = args["sigma"] + if sigma.dim() > 0: + s = sigma[0].item() + else: + s = sigma.item() + if s > restore_cfg_s_tmin: + ref = x_center.to(device=denoised.device, dtype=denoised.dtype) + b = denoised.shape[0] + if ref.shape[0] != b: + ref = ref.expand(b, -1, -1, -1) if ref.shape[0] == 1 else ref.repeat((b + ref.shape[0] - 1) // ref.shape[0], 1, 1, 1)[:b] + sigma_val = sigma.view(-1, 1, 1, 1) if sigma.dim() > 0 else sigma + d_center = denoised - ref + denoised = denoised - d_center * ((sigma_val / sigma_max) ** restore_cfg) + return denoised + + model_patched.set_model_sampler_post_cfg_function(restore_cfg_function) + + return io.NodeOutput(model_patched) + + NODE_CLASS_MAPPINGS = { "ModelPatchLoader": ModelPatchLoader, "QwenImageDiffsynthControlnet": QwenImageDiffsynthControlnet, "ZImageFunControlnet": ZImageFunControlnet, "USOStyleReference": USOStyleReference, + "SUPIRApply": SUPIRApply, } diff --git a/comfy_extras/nodes_post_processing.py b/comfy_extras/nodes_post_processing.py index 9037c3d20..c932b747a 100644 --- a/comfy_extras/nodes_post_processing.py +++ b/comfy_extras/nodes_post_processing.py @@ -6,6 +6,7 @@ from PIL import Image import math from enum import Enum from typing import TypedDict, Literal +import kornia import comfy.utils import comfy.model_management @@ -660,6 +661,228 @@ class BatchImagesMasksLatentsNode(io.ComfyNode): return io.NodeOutput(batched) +class ColorTransfer(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="ColorTransfer", + category="image/postprocessing", + description="Match the colors of one image to another using various algorithms.", + search_aliases=["color match", "color grading", "color correction", "match colors", "color transform", "mkl", "reinhard", "histogram"], + inputs=[ + io.Image.Input("image_target", tooltip="Image(s) to apply the color transform to."), + io.Image.Input("image_ref", optional=True, tooltip="Reference image(s) to match colors to. If not provided, processing is skipped"), + io.Combo.Input("method", options=['reinhard_lab', 'mkl_lab', 'histogram'],), + io.DynamicCombo.Input("source_stats", + tooltip="per_frame: each frame matched to image_ref individually. uniform: pool stats across all source frames as baseline, match to image_ref. target_frame: use one chosen frame as the baseline for the transform to image_ref, applied uniformly to all frames (preserves relative differences)", + options=[ + io.DynamicCombo.Option("per_frame", []), + io.DynamicCombo.Option("uniform", []), + io.DynamicCombo.Option("target_frame", [ + io.Int.Input("target_index", default=0, min=0, max=10000, + tooltip="Frame index used as the source baseline for computing the transform to image_ref"), + ]), + ]), + io.Float.Input("strength", default=1.0, min=0.0, max=10.0, step=0.01), + ], + outputs=[ + io.Image.Output(display_name="image"), + ], + ) + + @staticmethod + def _to_lab(images, i, device): + return kornia.color.rgb_to_lab( + images[i:i+1].to(device, dtype=torch.float32).permute(0, 3, 1, 2)) + + @staticmethod + def _pool_stats(images, device, is_reinhard, eps): + """Two-pass pooled mean + std/cov across all frames.""" + N, C = images.shape[0], images.shape[3] + HW = images.shape[1] * images.shape[2] + mean = torch.zeros(C, 1, device=device, dtype=torch.float32) + for i in range(N): + mean += ColorTransfer._to_lab(images, i, device).view(C, -1).mean(dim=-1, keepdim=True) + mean /= N + acc = torch.zeros(C, 1 if is_reinhard else C, device=device, dtype=torch.float32) + for i in range(N): + centered = ColorTransfer._to_lab(images, i, device).view(C, -1) - mean + if is_reinhard: + acc += (centered * centered).mean(dim=-1, keepdim=True) + else: + acc += centered @ centered.T / HW + if is_reinhard: + return mean, torch.sqrt(acc / N).clamp_min_(eps) + return mean, acc / N + + @staticmethod + def _frame_stats(lab_flat, hw, is_reinhard, eps): + """Per-frame mean + std/cov.""" + mean = lab_flat.mean(dim=-1, keepdim=True) + if is_reinhard: + return mean, lab_flat.std(dim=-1, keepdim=True, unbiased=False).clamp_min_(eps) + centered = lab_flat - mean + return mean, centered @ centered.T / hw + + @staticmethod + def _mkl_matrix(cov_s, cov_r, eps): + """Compute MKL 3x3 transform matrix from source and ref covariances.""" + eig_val_s, eig_vec_s = torch.linalg.eigh(cov_s) + sqrt_val_s = torch.sqrt(eig_val_s.clamp_min(0)).clamp_min_(eps) + + scaled_V = eig_vec_s * sqrt_val_s.unsqueeze(0) + mid = scaled_V.T @ cov_r @ scaled_V + eig_val_m, eig_vec_m = torch.linalg.eigh(mid) + sqrt_m = torch.sqrt(eig_val_m.clamp_min(0)) + + inv_sqrt_s = 1.0 / sqrt_val_s + inv_scaled_V = eig_vec_s * inv_sqrt_s.unsqueeze(0) + M_half = (eig_vec_m * sqrt_m.unsqueeze(0)) @ eig_vec_m.T + return inv_scaled_V @ M_half @ inv_scaled_V.T + + @staticmethod + def _histogram_lut(src, ref, bins=256): + """Build per-channel LUT from source and ref histograms. src/ref: (C, HW) in [0,1].""" + s_bins = (src * (bins - 1)).long().clamp(0, bins - 1) + r_bins = (ref * (bins - 1)).long().clamp(0, bins - 1) + s_hist = torch.zeros(src.shape[0], bins, device=src.device, dtype=src.dtype) + r_hist = torch.zeros(src.shape[0], bins, device=src.device, dtype=src.dtype) + ones_s = torch.ones_like(src) + ones_r = torch.ones_like(ref) + s_hist.scatter_add_(1, s_bins, ones_s) + r_hist.scatter_add_(1, r_bins, ones_r) + s_cdf = s_hist.cumsum(1) + s_cdf = s_cdf / s_cdf[:, -1:] + r_cdf = r_hist.cumsum(1) + r_cdf = r_cdf / r_cdf[:, -1:] + return torch.searchsorted(r_cdf, s_cdf).clamp_max_(bins - 1).float() / (bins - 1) + + @classmethod + def _pooled_cdf(cls, images, device, num_bins=256): + """Build pooled CDF across all frames, one frame at a time.""" + C = images.shape[3] + hist = torch.zeros(C, num_bins, device=device, dtype=torch.float32) + for i in range(images.shape[0]): + frame = images[i].to(device, dtype=torch.float32).permute(2, 0, 1).reshape(C, -1) + bins = (frame * (num_bins - 1)).long().clamp(0, num_bins - 1) + hist.scatter_add_(1, bins, torch.ones_like(frame)) + cdf = hist.cumsum(1) + return cdf / cdf[:, -1:] + + @classmethod + def _build_histogram_transform(cls, image_target, image_ref, device, stats_mode, target_index, B): + """Build per-frame or uniform LUT transform for histogram mode.""" + if stats_mode == 'per_frame': + return None # LUT computed per-frame in the apply loop + + r_cdf = cls._pooled_cdf(image_ref, device) + if stats_mode == 'target_frame': + ti = min(target_index, B - 1) + s_cdf = cls._pooled_cdf(image_target[ti:ti+1], device) + else: + s_cdf = cls._pooled_cdf(image_target, device) + return torch.searchsorted(r_cdf, s_cdf).clamp_max_(255).float() / 255.0 + + @classmethod + def _build_lab_transform(cls, image_target, image_ref, device, stats_mode, target_index, is_reinhard): + """Build transform parameters for Lab-based methods. Returns a transform function.""" + eps = 1e-6 + B, H, W, C = image_target.shape + B_ref = image_ref.shape[0] + single_ref = B_ref == 1 + HW = H * W + HW_ref = image_ref.shape[1] * image_ref.shape[2] + + # Precompute ref stats + if single_ref or stats_mode in ('uniform', 'target_frame'): + ref_mean, ref_sc = cls._pool_stats(image_ref, device, is_reinhard, eps) + + # Uniform/target_frame: precompute single affine transform + if stats_mode in ('uniform', 'target_frame'): + if stats_mode == 'target_frame': + ti = min(target_index, B - 1) + s_lab = cls._to_lab(image_target, ti, device).view(C, -1) + s_mean, s_sc = cls._frame_stats(s_lab, HW, is_reinhard, eps) + else: + s_mean, s_sc = cls._pool_stats(image_target, device, is_reinhard, eps) + + if is_reinhard: + scale = ref_sc / s_sc + offset = ref_mean - scale * s_mean + return lambda src_flat, **_: src_flat * scale + offset + T = cls._mkl_matrix(s_sc, ref_sc, eps) + offset = ref_mean - T @ s_mean + return lambda src_flat, **_: T @ src_flat + offset + + # per_frame + def per_frame_transform(src_flat, frame_idx): + s_mean, s_sc = cls._frame_stats(src_flat, HW, is_reinhard, eps) + + if single_ref: + r_mean, r_sc = ref_mean, ref_sc + else: + ri = min(frame_idx, B_ref - 1) + r_mean, r_sc = cls._frame_stats(cls._to_lab(image_ref, ri, device).view(C, -1), HW_ref, is_reinhard, eps) + + centered = src_flat - s_mean + if is_reinhard: + return centered * (r_sc / s_sc) + r_mean + T = cls._mkl_matrix(centered @ centered.T / HW, r_sc, eps) + return T @ centered + r_mean + + return per_frame_transform + + @classmethod + def execute(cls, image_target, image_ref, method, source_stats, strength=1.0) -> io.NodeOutput: + stats_mode = source_stats["source_stats"] + target_index = source_stats.get("target_index", 0) + + if strength == 0 or image_ref is None: + return io.NodeOutput(image_target) + + device = comfy.model_management.get_torch_device() + intermediate_device = comfy.model_management.intermediate_device() + intermediate_dtype = comfy.model_management.intermediate_dtype() + + B, H, W, C = image_target.shape + B_ref = image_ref.shape[0] + pbar = comfy.utils.ProgressBar(B) + out = torch.empty(B, H, W, C, device=intermediate_device, dtype=intermediate_dtype) + + if method == 'histogram': + uniform_lut = cls._build_histogram_transform( + image_target, image_ref, device, stats_mode, target_index, B) + + for i in range(B): + src = image_target[i].to(device, dtype=torch.float32).permute(2, 0, 1) + src_flat = src.reshape(C, -1) + if uniform_lut is not None: + lut = uniform_lut + else: + ri = min(i, B_ref - 1) + ref = image_ref[ri].to(device, dtype=torch.float32).permute(2, 0, 1).reshape(C, -1) + lut = cls._histogram_lut(src_flat, ref) + bin_idx = (src_flat * 255).long().clamp(0, 255) + matched = lut.gather(1, bin_idx).view(C, H, W) + result = matched if strength == 1.0 else torch.lerp(src, matched, strength) + out[i] = result.permute(1, 2, 0).clamp_(0, 1).to(device=intermediate_device, dtype=intermediate_dtype) + pbar.update(1) + else: + transform = cls._build_lab_transform(image_target, image_ref, device, stats_mode, target_index, is_reinhard=method == "reinhard_lab") + + for i in range(B): + src_frame = cls._to_lab(image_target, i, device) + corrected = transform(src_frame.view(C, -1), frame_idx=i) + if strength == 1.0: + result = kornia.color.lab_to_rgb(corrected.view(1, C, H, W)) + else: + result = kornia.color.lab_to_rgb(torch.lerp(src_frame, corrected.view(1, C, H, W), strength)) + out[i] = result.squeeze(0).permute(1, 2, 0).clamp_(0, 1).to(device=intermediate_device, dtype=intermediate_dtype) + pbar.update(1) + + return io.NodeOutput(out) + + class PostProcessingExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[io.ComfyNode]]: @@ -673,6 +896,7 @@ class PostProcessingExtension(ComfyExtension): BatchImagesNode, BatchMasksNode, BatchLatentsNode, + ColorTransfer, # BatchImagesMasksLatentsNode, ] From 3d816db07f9721525c8326bc8d525cd81f00a7fa Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 18 Apr 2026 20:02:29 -0700 Subject: [PATCH 17/32] Some optimizations to make Ernie inference a bit faster. (#13472) --- comfy/ldm/ernie/model.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/comfy/ldm/ernie/model.py b/comfy/ldm/ernie/model.py index f7cdb51e6..eba661aec 100644 --- a/comfy/ldm/ernie/model.py +++ b/comfy/ldm/ernie/model.py @@ -118,8 +118,6 @@ class ErnieImageAttention(nn.Module): query = apply_rotary_emb(query, image_rotary_emb) key = apply_rotary_emb(key, image_rotary_emb) - query, key = query.to(x.dtype), key.to(x.dtype) - q_flat = query.reshape(B, S, -1) k_flat = key.reshape(B, S, -1) @@ -161,16 +159,16 @@ class ErnieImageSharedAdaLNBlock(nn.Module): residual = x x_norm = self.adaLN_sa_ln(x) - x_norm = (x_norm.float() * (1 + scale_msa.float()) + shift_msa.float()).to(x.dtype) + x_norm = x_norm * (1 + scale_msa) + shift_msa attn_out = self.self_attention(x_norm, attention_mask=attention_mask, image_rotary_emb=rotary_pos_emb) - x = residual + (gate_msa.float() * attn_out.float()).to(x.dtype) + x = residual + gate_msa * attn_out residual = x x_norm = self.adaLN_mlp_ln(x) - x_norm = (x_norm.float() * (1 + scale_mlp.float()) + shift_mlp.float()).to(x.dtype) + x_norm = x_norm * (1 + scale_mlp) + shift_mlp - return residual + (gate_mlp.float() * self.mlp(x_norm).float()).to(x.dtype) + return residual + gate_mlp * self.mlp(x_norm) class ErnieImageAdaLNContinuous(nn.Module): def __init__(self, hidden_size: int, eps: float = 1e-6, operations=None, device=None, dtype=None): @@ -183,7 +181,7 @@ class ErnieImageAdaLNContinuous(nn.Module): def forward(self, x: torch.Tensor, conditioning: torch.Tensor) -> torch.Tensor: scale, shift = self.linear(conditioning).chunk(2, dim=-1) x = self.norm(x) - x = x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1) + x = torch.addcmul(shift.unsqueeze(1), x, 1 + scale.unsqueeze(1)) return x class ErnieImageModel(nn.Module): From 138571da955d85935baa09371ac2b67ea8b7a8ca Mon Sep 17 00:00:00 2001 From: Abdul Rehman <76230556+Abdulrehman-PIAIC80387@users.noreply.github.com> Date: Sun, 19 Apr 2026 08:21:22 +0500 Subject: [PATCH 18/32] fix: append directory type annotation to internal files endpoint response (#13078) (#13305) --- api_server/routes/internal/internal_routes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api_server/routes/internal/internal_routes.py b/api_server/routes/internal/internal_routes.py index b224306da..1477afa01 100644 --- a/api_server/routes/internal/internal_routes.py +++ b/api_server/routes/internal/internal_routes.py @@ -67,7 +67,7 @@ class InternalRoutes: (entry for entry in os.scandir(directory) if is_visible_file(entry)), key=lambda entry: -entry.stat().st_mtime ) - return web.json_response([entry.name for entry in sorted_files], status=200) + return web.json_response([f"{entry.name} [{directory_type}]" for entry in sorted_files], status=200) def get_app(self): From fc5f4a996b9c752a716badbbc11bacc396281466 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 19 Apr 2026 17:26:12 -0700 Subject: [PATCH 19/32] Add link to Intel portable to Readme. (#13477) --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1eeb810de..f05311421 100644 --- a/README.md +++ b/README.md @@ -195,7 +195,9 @@ The portable above currently comes with python 3.13 and pytorch cuda 13.0. Updat #### Alternative Downloads: -[Experimental portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z) +[Portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z) + +[Experimental portable for Intel GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_intel.7z) [Portable with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs). From 543e9fba64e968426aa562baca1c0f4c5054b61c Mon Sep 17 00:00:00 2001 From: Octopus Date: Tue, 21 Apr 2026 06:30:23 +0800 Subject: [PATCH 20/32] fix: pin SQLAlchemy>=2.0 in requirements.txt (fixes #13036) (#13316) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3de845f48..63d7c41cf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,7 +19,7 @@ scipy tqdm psutil alembic -SQLAlchemy +SQLAlchemy>=2.0 filelock av>=14.2.0 comfy-kitchen>=0.2.8 From c514890325b150c0a2a22732e3ae571afea189cb Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 20 Apr 2026 18:59:26 -0700 Subject: [PATCH 21/32] Refactor io to IO in nodes_ace.py (#13485) --- comfy_extras/nodes_ace.py | 104 +++++++++++++++++++------------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/comfy_extras/nodes_ace.py b/comfy_extras/nodes_ace.py index cbfaf913d..1602add84 100644 --- a/comfy_extras/nodes_ace.py +++ b/comfy_extras/nodes_ace.py @@ -3,136 +3,136 @@ from typing_extensions import override import comfy.model_management import node_helpers -from comfy_api.latest import ComfyExtension, io +from comfy_api.latest import ComfyExtension, IO -class TextEncodeAceStepAudio(io.ComfyNode): +class TextEncodeAceStepAudio(IO.ComfyNode): @classmethod def define_schema(cls): - return io.Schema( + return IO.Schema( node_id="TextEncodeAceStepAudio", category="conditioning", inputs=[ - io.Clip.Input("clip"), - io.String.Input("tags", multiline=True, dynamic_prompts=True), - io.String.Input("lyrics", multiline=True, dynamic_prompts=True), - io.Float.Input("lyrics_strength", default=1.0, min=0.0, max=10.0, step=0.01), + IO.Clip.Input("clip"), + IO.String.Input("tags", multiline=True, dynamic_prompts=True), + IO.String.Input("lyrics", multiline=True, dynamic_prompts=True), + IO.Float.Input("lyrics_strength", default=1.0, min=0.0, max=10.0, step=0.01), ], - outputs=[io.Conditioning.Output()], + outputs=[IO.Conditioning.Output()], ) @classmethod - def execute(cls, clip, tags, lyrics, lyrics_strength) -> io.NodeOutput: + def execute(cls, clip, tags, lyrics, lyrics_strength) -> IO.NodeOutput: tokens = clip.tokenize(tags, lyrics=lyrics) conditioning = clip.encode_from_tokens_scheduled(tokens) conditioning = node_helpers.conditioning_set_values(conditioning, {"lyrics_strength": lyrics_strength}) - return io.NodeOutput(conditioning) + return IO.NodeOutput(conditioning) -class TextEncodeAceStepAudio15(io.ComfyNode): +class TextEncodeAceStepAudio15(IO.ComfyNode): @classmethod def define_schema(cls): - return io.Schema( + return IO.Schema( node_id="TextEncodeAceStepAudio1.5", category="conditioning", inputs=[ - io.Clip.Input("clip"), - io.String.Input("tags", multiline=True, dynamic_prompts=True), - io.String.Input("lyrics", multiline=True, dynamic_prompts=True), - io.Int.Input("seed", default=0, min=0, max=0xffffffffffffffff, control_after_generate=True), - io.Int.Input("bpm", default=120, min=10, max=300), - io.Float.Input("duration", default=120.0, min=0.0, max=2000.0, step=0.1), - io.Combo.Input("timesignature", options=['2', '3', '4', '6']), - io.Combo.Input("language", options=["en", "ja", "zh", "es", "de", "fr", "pt", "ru", "it", "nl", "pl", "tr", "vi", "cs", "fa", "id", "ko", "uk", "hu", "ar", "sv", "ro", "el"]), - io.Combo.Input("keyscale", options=[f"{root} {quality}" for quality in ["major", "minor"] for root in ["C", "C#", "Db", "D", "D#", "Eb", "E", "F", "F#", "Gb", "G", "G#", "Ab", "A", "A#", "Bb", "B"]]), - io.Boolean.Input("generate_audio_codes", default=True, tooltip="Enable the LLM that generates audio codes. This can be slow but will increase the quality of the generated audio. Turn this off if you are giving the model an audio reference.", advanced=True), - io.Float.Input("cfg_scale", default=2.0, min=0.0, max=100.0, step=0.1, advanced=True), - io.Float.Input("temperature", default=0.85, min=0.0, max=2.0, step=0.01, advanced=True), - io.Float.Input("top_p", default=0.9, min=0.0, max=2000.0, step=0.01, advanced=True), - io.Int.Input("top_k", default=0, min=0, max=100, advanced=True), - io.Float.Input("min_p", default=0.000, min=0.0, max=1.0, step=0.001, advanced=True), + IO.Clip.Input("clip"), + IO.String.Input("tags", multiline=True, dynamic_prompts=True), + IO.String.Input("lyrics", multiline=True, dynamic_prompts=True), + IO.Int.Input("seed", default=0, min=0, max=0xffffffffffffffff, control_after_generate=True), + IO.Int.Input("bpm", default=120, min=10, max=300), + IO.Float.Input("duration", default=120.0, min=0.0, max=2000.0, step=0.1), + IO.Combo.Input("timesignature", options=['2', '3', '4', '6']), + IO.Combo.Input("language", options=["en", "ja", "zh", "es", "de", "fr", "pt", "ru", "it", "nl", "pl", "tr", "vi", "cs", "fa", "id", "ko", "uk", "hu", "ar", "sv", "ro", "el"]), + IO.Combo.Input("keyscale", options=[f"{root} {quality}" for quality in ["major", "minor"] for root in ["C", "C#", "Db", "D", "D#", "Eb", "E", "F", "F#", "Gb", "G", "G#", "Ab", "A", "A#", "Bb", "B"]]), + IO.Boolean.Input("generate_audio_codes", default=True, tooltip="Enable the LLM that generates audio codes. This can be slow but will increase the quality of the generated audio. Turn this off if you are giving the model an audio reference.", advanced=True), + IO.Float.Input("cfg_scale", default=2.0, min=0.0, max=100.0, step=0.1, advanced=True), + IO.Float.Input("temperature", default=0.85, min=0.0, max=2.0, step=0.01, advanced=True), + IO.Float.Input("top_p", default=0.9, min=0.0, max=2000.0, step=0.01, advanced=True), + IO.Int.Input("top_k", default=0, min=0, max=100, advanced=True), + IO.Float.Input("min_p", default=0.000, min=0.0, max=1.0, step=0.001, advanced=True), ], - outputs=[io.Conditioning.Output()], + outputs=[IO.Conditioning.Output()], ) @classmethod - def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale, generate_audio_codes, cfg_scale, temperature, top_p, top_k, min_p) -> io.NodeOutput: + def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale, generate_audio_codes, cfg_scale, temperature, top_p, top_k, min_p) -> IO.NodeOutput: tokens = clip.tokenize(tags, lyrics=lyrics, bpm=bpm, duration=duration, timesignature=int(timesignature), language=language, keyscale=keyscale, seed=seed, generate_audio_codes=generate_audio_codes, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p) conditioning = clip.encode_from_tokens_scheduled(tokens) - return io.NodeOutput(conditioning) + return IO.NodeOutput(conditioning) -class EmptyAceStepLatentAudio(io.ComfyNode): +class EmptyAceStepLatentAudio(IO.ComfyNode): @classmethod def define_schema(cls): - return io.Schema( + return IO.Schema( node_id="EmptyAceStepLatentAudio", display_name="Empty Ace Step 1.0 Latent Audio", category="latent/audio", inputs=[ - io.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.1), - io.Int.Input( + IO.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.1), + IO.Int.Input( "batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch." ), ], - outputs=[io.Latent.Output()], + outputs=[IO.Latent.Output()], ) @classmethod - def execute(cls, seconds, batch_size) -> io.NodeOutput: + def execute(cls, seconds, batch_size) -> IO.NodeOutput: length = int(seconds * 44100 / 512 / 8) latent = torch.zeros([batch_size, 8, 16, length], device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype()) - return io.NodeOutput({"samples": latent, "type": "audio"}) + return IO.NodeOutput({"samples": latent, "type": "audio"}) -class EmptyAceStep15LatentAudio(io.ComfyNode): +class EmptyAceStep15LatentAudio(IO.ComfyNode): @classmethod def define_schema(cls): - return io.Schema( + return IO.Schema( node_id="EmptyAceStep1.5LatentAudio", display_name="Empty Ace Step 1.5 Latent Audio", category="latent/audio", inputs=[ - io.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.01), - io.Int.Input( + IO.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.01), + IO.Int.Input( "batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch." ), ], - outputs=[io.Latent.Output()], + outputs=[IO.Latent.Output()], ) @classmethod - def execute(cls, seconds, batch_size) -> io.NodeOutput: + def execute(cls, seconds, batch_size) -> IO.NodeOutput: length = round((seconds * 48000 / 1920)) latent = torch.zeros([batch_size, 64, length], device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype()) - return io.NodeOutput({"samples": latent, "type": "audio"}) + return IO.NodeOutput({"samples": latent, "type": "audio"}) -class ReferenceAudio(io.ComfyNode): +class ReferenceAudio(IO.ComfyNode): @classmethod def define_schema(cls): - return io.Schema( + return IO.Schema( node_id="ReferenceTimbreAudio", display_name="Reference Audio", category="advanced/conditioning/audio", is_experimental=True, description="This node sets the reference audio for ace step 1.5", inputs=[ - io.Conditioning.Input("conditioning"), - io.Latent.Input("latent", optional=True), + IO.Conditioning.Input("conditioning"), + IO.Latent.Input("latent", optional=True), ], outputs=[ - io.Conditioning.Output(), + IO.Conditioning.Output(), ] ) @classmethod - def execute(cls, conditioning, latent=None) -> io.NodeOutput: + def execute(cls, conditioning, latent=None) -> IO.NodeOutput: if latent is not None: conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_audio_timbre_latents": [latent["samples"]]}, append=True) - return io.NodeOutput(conditioning) + return IO.NodeOutput(conditioning) class AceExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ TextEncodeAceStepAudio, EmptyAceStepLatentAudio, From e75f775ae8e9b1a1fd2b78806c86338fd830bcd7 Mon Sep 17 00:00:00 2001 From: Comfy Org PR Bot Date: Tue, 21 Apr 2026 16:43:11 +0900 Subject: [PATCH 22/32] Bump comfyui-frontend-package to 1.42.12 (#13489) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 63d7c41cf..671bd5693 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.42.11 +comfyui-frontend-package==1.42.12 comfyui-workflow-templates==0.9.57 comfyui-embedded-docs==0.4.3 torch From ad94d472216ba52ab2660536af44faa92cf4b5d0 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 21 Apr 2026 08:02:42 -0700 Subject: [PATCH 23/32] Make the ltx audio vae more native. (#13486) --- comfy/ldm/lightricks/vae/audio_vae.py | 55 +++------------------------ comfy/sd.py | 18 +++++++++ comfy_extras/nodes_audio.py | 2 +- comfy_extras/nodes_lt_audio.py | 36 ++++++++---------- 4 files changed, 41 insertions(+), 70 deletions(-) diff --git a/comfy/ldm/lightricks/vae/audio_vae.py b/comfy/ldm/lightricks/vae/audio_vae.py index fa0a00748..dd5320c8f 100644 --- a/comfy/ldm/lightricks/vae/audio_vae.py +++ b/comfy/ldm/lightricks/vae/audio_vae.py @@ -4,9 +4,6 @@ import math import torch import torchaudio -import comfy.model_management -import comfy.model_patcher -import comfy.utils as utils from comfy.ldm.mmaudio.vae.distributions import DiagonalGaussianDistribution from comfy.ldm.lightricks.symmetric_patchifier import AudioPatchifier from comfy.ldm.lightricks.vae.causal_audio_autoencoder import ( @@ -43,30 +40,6 @@ class AudioVAEComponentConfig: return cls(autoencoder=audio_config, vocoder=vocoder_config) - -class ModelDeviceManager: - """Manages device placement and GPU residency for the composed model.""" - - def __init__(self, module: torch.nn.Module): - load_device = comfy.model_management.get_torch_device() - offload_device = comfy.model_management.vae_offload_device() - self.patcher = comfy.model_patcher.ModelPatcher(module, load_device, offload_device) - - def ensure_model_loaded(self) -> None: - comfy.model_management.free_memory( - self.patcher.model_size(), - self.patcher.load_device, - ) - comfy.model_management.load_model_gpu(self.patcher) - - def move_to_load_device(self, tensor: torch.Tensor) -> torch.Tensor: - return tensor.to(self.patcher.load_device) - - @property - def load_device(self): - return self.patcher.load_device - - class AudioLatentNormalizer: """Applies per-channel statistics in patch space and restores original layout.""" @@ -132,23 +105,17 @@ class AudioPreprocessor: class AudioVAE(torch.nn.Module): """High-level Audio VAE wrapper exposing encode and decode entry points.""" - def __init__(self, state_dict: dict, metadata: dict): + def __init__(self, metadata: dict): super().__init__() component_config = AudioVAEComponentConfig.from_metadata(metadata) - vae_sd = utils.state_dict_prefix_replace(state_dict, {"audio_vae.": ""}, filter_keys=True) - vocoder_sd = utils.state_dict_prefix_replace(state_dict, {"vocoder.": ""}, filter_keys=True) - self.autoencoder = CausalAudioAutoencoder(config=component_config.autoencoder) if "bwe" in component_config.vocoder: self.vocoder = VocoderWithBWE(config=component_config.vocoder) else: self.vocoder = Vocoder(config=component_config.vocoder) - self.autoencoder.load_state_dict(vae_sd, strict=False) - self.vocoder.load_state_dict(vocoder_sd, strict=False) - autoencoder_config = self.autoencoder.get_config() self.normalizer = AudioLatentNormalizer( AudioPatchifier( @@ -168,18 +135,12 @@ class AudioVAE(torch.nn.Module): n_fft=autoencoder_config["n_fft"], ) - self.device_manager = ModelDeviceManager(self) - - def encode(self, audio: dict) -> torch.Tensor: + def encode(self, audio, sample_rate=44100) -> torch.Tensor: """Encode a waveform dictionary into normalized latent tensors.""" - waveform = audio["waveform"] - waveform_sample_rate = audio["sample_rate"] + waveform = audio + waveform_sample_rate = sample_rate input_device = waveform.device - # Ensure that Audio VAE is loaded on the correct device. - self.device_manager.ensure_model_loaded() - - waveform = self.device_manager.move_to_load_device(waveform) expected_channels = self.autoencoder.encoder.in_channels if waveform.shape[1] != expected_channels: if waveform.shape[1] == 1: @@ -190,7 +151,7 @@ class AudioVAE(torch.nn.Module): ) mel_spec = self.preprocessor.waveform_to_mel( - waveform, waveform_sample_rate, device=self.device_manager.load_device + waveform, waveform_sample_rate, device=waveform.device ) latents = self.autoencoder.encode(mel_spec) @@ -204,17 +165,13 @@ class AudioVAE(torch.nn.Module): """Decode normalized latent tensors into an audio waveform.""" original_shape = latents.shape - # Ensure that Audio VAE is loaded on the correct device. - self.device_manager.ensure_model_loaded() - - latents = self.device_manager.move_to_load_device(latents) latents = self.normalizer.denormalize(latents) target_shape = self.target_shape_from_latents(original_shape) mel_spec = self.autoencoder.decode(latents, target_shape=target_shape) waveform = self.run_vocoder(mel_spec) - return self.device_manager.move_to_load_device(waveform) + return waveform def target_shape_from_latents(self, latents_shape): batch, _, time, _ = latents_shape diff --git a/comfy/sd.py b/comfy/sd.py index e573804a5..a4d3ee269 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -12,6 +12,7 @@ from .ldm.cascade.stage_c_coder import StageC_coder from .ldm.audio.autoencoder import AudioOobleckVAE import comfy.ldm.genmo.vae.model import comfy.ldm.lightricks.vae.causal_video_autoencoder +import comfy.ldm.lightricks.vae.audio_vae import comfy.ldm.cosmos.vae import comfy.ldm.wan.vae import comfy.ldm.wan.vae2_2 @@ -805,6 +806,23 @@ class VAE: self.downscale_index_formula = (4, 8, 8) self.memory_used_encode = lambda shape, dtype: (700 * (max(1, (shape[-3] ** 0.66 * 0.11)) * shape[-2] * shape[-1]) * model_management.dtype_size(dtype)) self.memory_used_decode = lambda shape, dtype: (50 * (max(1, (shape[-3] ** 0.65 * 0.26)) * shape[-2] * shape[-1] * 32 * 32) * model_management.dtype_size(dtype)) + elif "vocoder.resblocks.0.convs1.0.weight" in sd or "vocoder.vocoder.resblocks.0.convs1.0.weight" in sd: # LTX Audio + self.first_stage_model = comfy.ldm.lightricks.vae.audio_vae.AudioVAE(metadata=metadata) + self.memory_used_encode = lambda shape, dtype: (shape[2] * 330) * model_management.dtype_size(dtype) + self.memory_used_decode = lambda shape, dtype: (shape[2] * shape[3] * 87000) * model_management.dtype_size(dtype) + self.latent_channels = self.first_stage_model.latent_channels + self.audio_sample_rate_output = self.first_stage_model.output_sample_rate + self.autoencoder = self.first_stage_model.autoencoder # TODO: remove hack for ltxv custom nodes + self.output_channels = 2 + self.pad_channel_value = "replicate" + self.upscale_ratio = 4096 + self.downscale_ratio = 4096 + self.latent_dim = 2 + self.process_output = lambda audio: audio + self.process_input = lambda audio: audio + self.working_dtypes = [torch.float32] + self.disable_offload = True + self.extra_1d_channel = 16 else: logging.warning("WARNING: No VAE weights detected, VAE not initalized.") self.first_stage_model = None diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index a395392d8..5f514716f 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -104,7 +104,7 @@ def vae_decode_audio(vae, samples, tile=None, overlap=None): std = torch.std(audio, dim=[1, 2], keepdim=True) * 5.0 std[std < 1.0] = 1.0 audio /= std - vae_sample_rate = getattr(vae, "audio_sample_rate", 44100) + vae_sample_rate = getattr(vae, "audio_sample_rate_output", getattr(vae, "audio_sample_rate", 44100)) return {"waveform": audio, "sample_rate": vae_sample_rate if "sample_rate" not in samples else samples["sample_rate"]} diff --git a/comfy_extras/nodes_lt_audio.py b/comfy_extras/nodes_lt_audio.py index 3e4222264..3ec635c75 100644 --- a/comfy_extras/nodes_lt_audio.py +++ b/comfy_extras/nodes_lt_audio.py @@ -3,9 +3,8 @@ import comfy.utils import comfy.model_management import torch -from comfy.ldm.lightricks.vae.audio_vae import AudioVAE from comfy_api.latest import ComfyExtension, io - +from comfy_extras.nodes_audio import VAEEncodeAudio class LTXVAudioVAELoader(io.ComfyNode): @classmethod @@ -28,10 +27,14 @@ class LTXVAudioVAELoader(io.ComfyNode): def execute(cls, ckpt_name: str) -> io.NodeOutput: ckpt_path = folder_paths.get_full_path_or_raise("checkpoints", ckpt_name) sd, metadata = comfy.utils.load_torch_file(ckpt_path, return_metadata=True) - return io.NodeOutput(AudioVAE(sd, metadata)) + sd = comfy.utils.state_dict_prefix_replace(sd, {"audio_vae.": "autoencoder.", "vocoder.": "vocoder."}, filter_keys=True) + vae = comfy.sd.VAE(sd=sd, metadata=metadata) + vae.throw_exception_if_invalid() + + return io.NodeOutput(vae) -class LTXVAudioVAEEncode(io.ComfyNode): +class LTXVAudioVAEEncode(VAEEncodeAudio): @classmethod def define_schema(cls) -> io.Schema: return io.Schema( @@ -50,15 +53,8 @@ class LTXVAudioVAEEncode(io.ComfyNode): ) @classmethod - def execute(cls, audio, audio_vae: AudioVAE) -> io.NodeOutput: - audio_latents = audio_vae.encode(audio) - return io.NodeOutput( - { - "samples": audio_latents, - "sample_rate": int(audio_vae.sample_rate), - "type": "audio", - } - ) + def execute(cls, audio, audio_vae) -> io.NodeOutput: + return super().execute(audio_vae, audio) class LTXVAudioVAEDecode(io.ComfyNode): @@ -80,12 +76,12 @@ class LTXVAudioVAEDecode(io.ComfyNode): ) @classmethod - def execute(cls, samples, audio_vae: AudioVAE) -> io.NodeOutput: + def execute(cls, samples, audio_vae) -> io.NodeOutput: audio_latent = samples["samples"] if audio_latent.is_nested: audio_latent = audio_latent.unbind()[-1] - audio = audio_vae.decode(audio_latent).to(audio_latent.device) - output_audio_sample_rate = audio_vae.output_sample_rate + audio = audio_vae.decode(audio_latent).movedim(-1, 1).to(audio_latent.device) + output_audio_sample_rate = audio_vae.first_stage_model.output_sample_rate return io.NodeOutput( { "waveform": audio, @@ -143,17 +139,17 @@ class LTXVEmptyLatentAudio(io.ComfyNode): frames_number: int, frame_rate: int, batch_size: int, - audio_vae: AudioVAE, + audio_vae, ) -> io.NodeOutput: """Generate empty audio latents matching the reference pipeline structure.""" assert audio_vae is not None, "Audio VAE model is required" z_channels = audio_vae.latent_channels - audio_freq = audio_vae.latent_frequency_bins - sampling_rate = int(audio_vae.sample_rate) + audio_freq = audio_vae.first_stage_model.latent_frequency_bins + sampling_rate = int(audio_vae.first_stage_model.sample_rate) - num_audio_latents = audio_vae.num_of_latents_from_frames(frames_number, frame_rate) + num_audio_latents = audio_vae.first_stage_model.num_of_latents_from_frames(frames_number, frame_rate) audio_latents = torch.zeros( (batch_size, z_channels, num_audio_latents, audio_freq), From b38dd0ff23037cd4f03f12274a5c7e04d224febd Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 21 Apr 2026 20:45:10 +0300 Subject: [PATCH 24/32] feat(api-nodes): add automatic downscaling of videos for ByteDance 2 nodes (#13465) --- comfy_api_nodes/apis/bytedance.py | 13 +++- comfy_api_nodes/nodes_bytedance.py | 33 +++++++-- comfy_api_nodes/util/__init__.py | 2 + comfy_api_nodes/util/conversions.py | 104 +++++++++++++++++++++++++--- 4 files changed, 134 insertions(+), 18 deletions(-) diff --git a/comfy_api_nodes/apis/bytedance.py b/comfy_api_nodes/apis/bytedance.py index 3755323ac..dc3bc3213 100644 --- a/comfy_api_nodes/apis/bytedance.py +++ b/comfy_api_nodes/apis/bytedance.py @@ -158,10 +158,17 @@ RECOMMENDED_PRESETS_SEEDREAM_4 = [ ("Custom", None, None), ] -# Seedance 2.0 reference video pixel count limits per model. +# Seedance 2.0 reference video pixel count limits per model and output resolution. SEEDANCE2_REF_VIDEO_PIXEL_LIMITS = { - "dreamina-seedance-2-0-260128": {"min": 409_600, "max": 927_408}, - "dreamina-seedance-2-0-fast-260128": {"min": 409_600, "max": 927_408}, + "dreamina-seedance-2-0-260128": { + "480p": {"min": 409_600, "max": 927_408}, + "720p": {"min": 409_600, "max": 927_408}, + "1080p": {"min": 409_600, "max": 2_073_600}, + }, + "dreamina-seedance-2-0-fast-260128": { + "480p": {"min": 409_600, "max": 927_408}, + "720p": {"min": 409_600, "max": 927_408}, + }, } # The time in this dictionary are given for 10 seconds duration. diff --git a/comfy_api_nodes/nodes_bytedance.py b/comfy_api_nodes/nodes_bytedance.py index 429c32444..bc564782d 100644 --- a/comfy_api_nodes/nodes_bytedance.py +++ b/comfy_api_nodes/nodes_bytedance.py @@ -35,6 +35,7 @@ from comfy_api_nodes.util import ( get_number_of_images, image_tensor_pair_to_batch, poll_op, + resize_video_to_pixel_budget, sync_op, upload_audio_to_comfyapi, upload_image_to_comfyapi, @@ -69,9 +70,12 @@ DEPRECATED_MODELS = {"seedance-1-0-lite-t2v-250428", "seedance-1-0-lite-i2v-2504 logger = logging.getLogger(__name__) -def _validate_ref_video_pixels(video: Input.Video, model_id: str, index: int) -> None: - """Validate reference video pixel count against Seedance 2.0 model limits.""" - limits = SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id) +def _validate_ref_video_pixels(video: Input.Video, model_id: str, resolution: str, index: int) -> None: + """Validate reference video pixel count against Seedance 2.0 model limits for the selected resolution.""" + model_limits = SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id) + if not model_limits: + return + limits = model_limits.get(resolution) if not limits: return try: @@ -1373,6 +1377,14 @@ def _seedance2_reference_inputs(resolutions: list[str]): min=0, ), ), + IO.Boolean.Input( + "auto_downscale", + default=False, + advanced=True, + optional=True, + tooltip="Automatically downscale reference videos that exceed the model's pixel budget " + "for the selected resolution. Aspect ratio is preserved; videos already within limits are untouched.", + ), ] @@ -1480,10 +1492,23 @@ class ByteDance2ReferenceNode(IO.ComfyNode): model_id = SEEDANCE_MODELS[model["model"]] has_video_input = len(reference_videos) > 0 + + if model.get("auto_downscale") and reference_videos: + max_px = ( + SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id, {}) + .get(model["resolution"], {}) + .get("max") + ) + if max_px: + for key in reference_videos: + reference_videos[key] = resize_video_to_pixel_budget( + reference_videos[key], max_px + ) + total_video_duration = 0.0 for i, key in enumerate(reference_videos, 1): video = reference_videos[key] - _validate_ref_video_pixels(video, model_id, i) + _validate_ref_video_pixels(video, model_id, model["resolution"], i) try: dur = video.get_duration() if dur < 1.8: diff --git a/comfy_api_nodes/util/__init__.py b/comfy_api_nodes/util/__init__.py index 0cb9a47c7..f3584aba9 100644 --- a/comfy_api_nodes/util/__init__.py +++ b/comfy_api_nodes/util/__init__.py @@ -19,6 +19,7 @@ from .conversions import ( image_tensor_pair_to_batch, pil_to_bytesio, resize_mask_to_image, + resize_video_to_pixel_budget, tensor_to_base64_string, tensor_to_bytesio, tensor_to_pil, @@ -90,6 +91,7 @@ __all__ = [ "image_tensor_pair_to_batch", "pil_to_bytesio", "resize_mask_to_image", + "resize_video_to_pixel_budget", "tensor_to_base64_string", "tensor_to_bytesio", "tensor_to_pil", diff --git a/comfy_api_nodes/util/conversions.py b/comfy_api_nodes/util/conversions.py index 82b6d22a5..be5d5719b 100644 --- a/comfy_api_nodes/util/conversions.py +++ b/comfy_api_nodes/util/conversions.py @@ -129,22 +129,38 @@ def pil_to_bytesio(img: Image.Image, mime_type: str = "image/png") -> BytesIO: return img_byte_arr +def _compute_downscale_dims(src_w: int, src_h: int, total_pixels: int) -> tuple[int, int] | None: + """Return downscaled (w, h) with even dims fitting ``total_pixels``, or None if already fits. + + Source aspect ratio is preserved; output may drift by a fraction of a percent because both dimensions + are rounded down to even values (many codecs require divisible-by-2). + """ + pixels = src_w * src_h + if pixels <= total_pixels: + return None + scale = math.sqrt(total_pixels / pixels) + new_w = max(2, int(src_w * scale)) + new_h = max(2, int(src_h * scale)) + new_w -= new_w % 2 + new_h -= new_h % 2 + return new_w, new_h + + def downscale_image_tensor(image: torch.Tensor, total_pixels: int = 1536 * 1024) -> torch.Tensor: - """Downscale input image tensor to roughly the specified total pixels.""" + """Downscale input image tensor to roughly the specified total pixels. + + Output dimensions are rounded down to even values so that the result is guaranteed to fit within ``total_pixels`` + and is compatible with codecs that require even dimensions (e.g. yuv420p). + """ samples = image.movedim(-1, 1) - total = int(total_pixels) - scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2])) - if scale_by >= 1: + dims = _compute_downscale_dims(samples.shape[3], samples.shape[2], int(total_pixels)) + if dims is None: return image - width = round(samples.shape[3] * scale_by) - height = round(samples.shape[2] * scale_by) - - s = common_upscale(samples, width, height, "lanczos", "disabled") - s = s.movedim(1, -1) - return s + new_w, new_h = dims + return common_upscale(samples, new_w, new_h, "lanczos", "disabled").movedim(1, -1) -def downscale_image_tensor_by_max_side(image: torch.Tensor, *, max_side: int) -> torch.Tensor: +def downscale_image_tensor_by_max_side(image: torch.Tensor, *, max_side: int) -> torch.Tensor: """Downscale input image tensor so the largest dimension is at most max_side pixels.""" samples = image.movedim(-1, 1) height, width = samples.shape[2], samples.shape[3] @@ -399,6 +415,72 @@ def trim_video(video: Input.Video, duration_sec: float) -> Input.Video: raise RuntimeError(f"Failed to trim video: {str(e)}") from e +def resize_video_to_pixel_budget(video: Input.Video, total_pixels: int) -> Input.Video: + """Downscale a video to fit within ``total_pixels`` (w * h), preserving aspect ratio. + + Returns the original video object untouched when it already fits. Preserves frame rate, duration, and audio. + Aspect ratio is preserved up to a fraction of a percent (even-dim rounding). + """ + src_w, src_h = video.get_dimensions() + scale_dims = _compute_downscale_dims(src_w, src_h, total_pixels) + if scale_dims is None: + return video + return _apply_video_scale(video, scale_dims) + + +def _apply_video_scale(video: Input.Video, scale_dims: tuple[int, int]) -> Input.Video: + """Re-encode ``video`` scaled to ``scale_dims`` with a single decode/encode pass.""" + out_w, out_h = scale_dims + output_buffer = BytesIO() + input_container = None + output_container = None + + try: + input_source = video.get_stream_source() + input_container = av.open(input_source, mode="r") + output_container = av.open(output_buffer, mode="w", format="mp4") + + video_stream = output_container.add_stream("h264", rate=video.get_frame_rate()) + video_stream.width = out_w + video_stream.height = out_h + video_stream.pix_fmt = "yuv420p" + + audio_stream = None + for stream in input_container.streams: + if isinstance(stream, av.AudioStream): + audio_stream = output_container.add_stream("aac", rate=stream.sample_rate) + audio_stream.sample_rate = stream.sample_rate + audio_stream.layout = stream.layout + break + + for frame in input_container.decode(video=0): + frame = frame.reformat(width=out_w, height=out_h, format="yuv420p") + for packet in video_stream.encode(frame): + output_container.mux(packet) + for packet in video_stream.encode(): + output_container.mux(packet) + + if audio_stream is not None: + input_container.seek(0) + for audio_frame in input_container.decode(audio=0): + for packet in audio_stream.encode(audio_frame): + output_container.mux(packet) + for packet in audio_stream.encode(): + output_container.mux(packet) + + output_container.close() + input_container.close() + output_buffer.seek(0) + return InputImpl.VideoFromFile(output_buffer) + + except Exception as e: + if input_container is not None: + input_container.close() + if output_container is not None: + output_container.close() + raise RuntimeError(f"Failed to resize video: {str(e)}") from e + + def _f32_pcm(wav: torch.Tensor) -> torch.Tensor: """Convert audio to float 32 bits PCM format. Copy-paste from nodes_audio.py file.""" if wav.dtype.is_floating_point: From eb2222538739c4ebd396cd0a40cb6d80befd04fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com> Date: Tue, 21 Apr 2026 20:46:37 +0300 Subject: [PATCH 25/32] Support standalone LTXV audio VAEs (#13499) --- comfy/sd.py | 1 + 1 file changed, 1 insertion(+) diff --git a/comfy/sd.py b/comfy/sd.py index a4d3ee269..736fe35de 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -807,6 +807,7 @@ class VAE: self.memory_used_encode = lambda shape, dtype: (700 * (max(1, (shape[-3] ** 0.66 * 0.11)) * shape[-2] * shape[-1]) * model_management.dtype_size(dtype)) self.memory_used_decode = lambda shape, dtype: (50 * (max(1, (shape[-3] ** 0.65 * 0.26)) * shape[-2] * shape[-1] * 32 * 32) * model_management.dtype_size(dtype)) elif "vocoder.resblocks.0.convs1.0.weight" in sd or "vocoder.vocoder.resblocks.0.convs1.0.weight" in sd: # LTX Audio + sd = comfy.utils.state_dict_prefix_replace(sd, {"audio_vae.": "autoencoder."}) self.first_stage_model = comfy.ldm.lightricks.vae.audio_vae.AudioVAE(metadata=metadata) self.memory_used_encode = lambda shape, dtype: (shape[2] * 330) * model_management.dtype_size(dtype) self.memory_used_decode = lambda shape, dtype: (shape[2] * shape[3] * 87000) * model_management.dtype_size(dtype) From 1e1d4f12548ceb296e61af1dff217bb8e4414345 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 21 Apr 2026 21:27:35 +0300 Subject: [PATCH 26/32] [Partner Nodes] added 4K resolution for Veo models; added Veo 3 Lite model (#13330) * feat(api nodes): added 4K resolution for Veo models; added Veo 3 Lite model Signed-off-by: bigcat88 * increase poll_interval from 5 to 9 --------- Signed-off-by: bigcat88 Co-authored-by: Jedrzej Kosinski --- comfy_api_nodes/nodes_veo2.py | 171 +++++++++++++++++++++++++--------- 1 file changed, 128 insertions(+), 43 deletions(-) diff --git a/comfy_api_nodes/nodes_veo2.py b/comfy_api_nodes/nodes_veo2.py index 13fc1cc36..084b086a8 100644 --- a/comfy_api_nodes/nodes_veo2.py +++ b/comfy_api_nodes/nodes_veo2.py @@ -24,8 +24,9 @@ from comfy_api_nodes.util import ( AVERAGE_DURATION_VIDEO_GEN = 32 MODELS_MAP = { "veo-2.0-generate-001": "veo-2.0-generate-001", - "veo-3.1-generate": "veo-3.1-generate-preview", - "veo-3.1-fast-generate": "veo-3.1-fast-generate-preview", + "veo-3.1-generate": "veo-3.1-generate-001", + "veo-3.1-fast-generate": "veo-3.1-fast-generate-001", + "veo-3.1-lite": "veo-3.1-lite-generate-001", "veo-3.0-generate-001": "veo-3.0-generate-001", "veo-3.0-fast-generate-001": "veo-3.0-fast-generate-001", } @@ -247,17 +248,8 @@ class VeoVideoGenerationNode(IO.ComfyNode): raise Exception("Video generation completed but no video was returned") -class Veo3VideoGenerationNode(VeoVideoGenerationNode): - """ - Generates videos from text prompts using Google's Veo 3 API. - - Supported models: - - veo-3.0-generate-001 - - veo-3.0-fast-generate-001 - - This node extends the base Veo node with Veo 3 specific features including - audio generation and fixed 8-second duration. - """ +class Veo3VideoGenerationNode(IO.ComfyNode): + """Generates videos from text prompts using Google's Veo 3 API.""" @classmethod def define_schema(cls): @@ -279,6 +271,13 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode): default="16:9", tooltip="Aspect ratio of the output video", ), + IO.Combo.Input( + "resolution", + options=["720p", "1080p", "4k"], + default="720p", + tooltip="Output video resolution. 4K is not available for veo-3.1-lite and veo-3.0 models.", + optional=True, + ), IO.String.Input( "negative_prompt", multiline=True, @@ -289,11 +288,11 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode): IO.Int.Input( "duration_seconds", default=8, - min=8, + min=4, max=8, - step=1, + step=2, display_mode=IO.NumberDisplay.number, - tooltip="Duration of the output video in seconds (Veo 3 only supports 8 seconds)", + tooltip="Duration of the output video in seconds", optional=True, ), IO.Boolean.Input( @@ -332,10 +331,10 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode): options=[ "veo-3.1-generate", "veo-3.1-fast-generate", + "veo-3.1-lite", "veo-3.0-generate-001", "veo-3.0-fast-generate-001", ], - default="veo-3.0-generate-001", tooltip="Veo 3 model to use for video generation", optional=True, ), @@ -356,21 +355,111 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode): ], is_api_node=True, price_badge=IO.PriceBadge( - depends_on=IO.PriceBadgeDepends(widgets=["model", "generate_audio"]), + depends_on=IO.PriceBadgeDepends(widgets=["model", "generate_audio", "resolution", "duration_seconds"]), expr=""" ( $m := widgets.model; + $r := widgets.resolution; $a := widgets.generate_audio; - ($contains($m,"veo-3.0-fast-generate-001") or $contains($m,"veo-3.1-fast-generate")) - ? {"type":"usd","usd": ($a ? 1.2 : 0.8)} - : ($contains($m,"veo-3.0-generate-001") or $contains($m,"veo-3.1-generate")) - ? {"type":"usd","usd": ($a ? 3.2 : 1.6)} - : {"type":"range_usd","min_usd":0.8,"max_usd":3.2} + $seconds := widgets.duration_seconds; + $pps := + $contains($m, "lite") + ? ($r = "1080p" ? ($a ? 0.08 : 0.05) : ($a ? 0.05 : 0.03)) + : $contains($m, "3.1-fast") + ? ($r = "4k" ? ($a ? 0.30 : 0.25) : $r = "1080p" ? ($a ? 0.12 : 0.10) : ($a ? 0.10 : 0.08)) + : $contains($m, "3.1-generate") + ? ($r = "4k" ? ($a ? 0.60 : 0.40) : ($a ? 0.40 : 0.20)) + : $contains($m, "3.0-fast") + ? ($a ? 0.15 : 0.10) + : ($a ? 0.40 : 0.20); + {"type":"usd","usd": $pps * $seconds} ) """, ), ) + @classmethod + async def execute( + cls, + prompt, + aspect_ratio="16:9", + resolution="720p", + negative_prompt="", + duration_seconds=8, + enhance_prompt=True, + person_generation="ALLOW", + seed=0, + image=None, + model="veo-3.0-generate-001", + generate_audio=False, + ): + if "lite" in model and resolution == "4k": + raise Exception("4K resolution is not supported by the veo-3.1-lite model.") + + model = MODELS_MAP[model] + + instances = [{"prompt": prompt}] + if image is not None: + image_base64 = tensor_to_base64_string(image) + if image_base64: + instances[0]["image"] = {"bytesBase64Encoded": image_base64, "mimeType": "image/png"} + + parameters = { + "aspectRatio": aspect_ratio, + "personGeneration": person_generation, + "durationSeconds": duration_seconds, + "enhancePrompt": True, + "generateAudio": generate_audio, + } + if negative_prompt: + parameters["negativePrompt"] = negative_prompt + if seed > 0: + parameters["seed"] = seed + if "veo-3.1" in model: + parameters["resolution"] = resolution + + initial_response = await sync_op( + cls, + ApiEndpoint(path=f"/proxy/veo/{model}/generate", method="POST"), + response_model=VeoGenVidResponse, + data=VeoGenVidRequest( + instances=instances, + parameters=parameters, + ), + ) + + poll_response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/veo/{model}/poll", method="POST"), + response_model=VeoGenVidPollResponse, + status_extractor=lambda r: "completed" if r.done else "pending", + data=VeoGenVidPollRequest(operationName=initial_response.name), + poll_interval=9.0, + estimated_duration=AVERAGE_DURATION_VIDEO_GEN, + ) + + if poll_response.error: + raise Exception(f"Veo API error: {poll_response.error.message} (code: {poll_response.error.code})") + + response = poll_response.response + filtered_count = response.raiMediaFilteredCount + if filtered_count: + reasons = response.raiMediaFilteredReasons or [] + reason_part = f": {reasons[0]}" if reasons else "" + raise Exception( + f"Content blocked by Google's Responsible AI filters{reason_part} " + f"({filtered_count} video{'s' if filtered_count != 1 else ''} filtered)." + ) + + if response.videos: + video = response.videos[0] + if video.bytesBase64Encoded: + return IO.NodeOutput(InputImpl.VideoFromFile(BytesIO(base64.b64decode(video.bytesBase64Encoded)))) + if video.gcsUri: + return IO.NodeOutput(await download_url_to_video_output(video.gcsUri)) + raise Exception("Video returned but no data or URL was provided") + raise Exception("Video generation completed but no video was returned") + class Veo3FirstLastFrameNode(IO.ComfyNode): @@ -394,7 +483,7 @@ class Veo3FirstLastFrameNode(IO.ComfyNode): default="", tooltip="Negative text prompt to guide what to avoid in the video", ), - IO.Combo.Input("resolution", options=["720p", "1080p"]), + IO.Combo.Input("resolution", options=["720p", "1080p", "4k"]), IO.Combo.Input( "aspect_ratio", options=["16:9", "9:16"], @@ -424,8 +513,7 @@ class Veo3FirstLastFrameNode(IO.ComfyNode): IO.Image.Input("last_frame", tooltip="End frame"), IO.Combo.Input( "model", - options=["veo-3.1-generate", "veo-3.1-fast-generate"], - default="veo-3.1-fast-generate", + options=["veo-3.1-generate", "veo-3.1-fast-generate", "veo-3.1-lite"], ), IO.Boolean.Input( "generate_audio", @@ -443,26 +531,20 @@ class Veo3FirstLastFrameNode(IO.ComfyNode): ], is_api_node=True, price_badge=IO.PriceBadge( - depends_on=IO.PriceBadgeDepends(widgets=["model", "generate_audio", "duration"]), + depends_on=IO.PriceBadgeDepends(widgets=["model", "generate_audio", "duration", "resolution"]), expr=""" ( - $prices := { - "veo-3.1-fast-generate": { "audio": 0.15, "no_audio": 0.10 }, - "veo-3.1-generate": { "audio": 0.40, "no_audio": 0.20 } - }; $m := widgets.model; - $ga := (widgets.generate_audio = "true"); + $r := widgets.resolution; + $ga := widgets.generate_audio; $seconds := widgets.duration; - $modelKey := - $contains($m, "veo-3.1-fast-generate") ? "veo-3.1-fast-generate" : - $contains($m, "veo-3.1-generate") ? "veo-3.1-generate" : - ""; - $audioKey := $ga ? "audio" : "no_audio"; - $modelPrices := $lookup($prices, $modelKey); - $pps := $lookup($modelPrices, $audioKey); - ($pps != null) - ? {"type":"usd","usd": $pps * $seconds} - : {"type":"range_usd","min_usd": 0.4, "max_usd": 3.2} + $pps := + $contains($m, "lite") + ? ($r = "1080p" ? ($ga ? 0.08 : 0.05) : ($ga ? 0.05 : 0.03)) + : $contains($m, "fast") + ? ($r = "4k" ? ($ga ? 0.30 : 0.25) : $r = "1080p" ? ($ga ? 0.12 : 0.10) : ($ga ? 0.10 : 0.08)) + : ($r = "4k" ? ($ga ? 0.60 : 0.40) : ($ga ? 0.40 : 0.20)); + {"type":"usd","usd": $pps * $seconds} ) """, ), @@ -482,6 +564,9 @@ class Veo3FirstLastFrameNode(IO.ComfyNode): model: str, generate_audio: bool, ): + if "lite" in model and resolution == "4k": + raise Exception("4K resolution is not supported by the veo-3.1-lite model.") + model = MODELS_MAP[model] initial_response = await sync_op( cls, @@ -519,7 +604,7 @@ class Veo3FirstLastFrameNode(IO.ComfyNode): data=VeoGenVidPollRequest( operationName=initial_response.name, ), - poll_interval=5.0, + poll_interval=9.0, estimated_duration=AVERAGE_DURATION_VIDEO_GEN, ) From 102773cd2c13bdbe8729fdc897031dfb61dea346 Mon Sep 17 00:00:00 2001 From: Comfy Org PR Bot Date: Wed, 22 Apr 2026 03:35:45 +0900 Subject: [PATCH 27/32] Bump comfyui-frontend-package to 1.42.14 (#13493) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 671bd5693..ccdd47674 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.42.12 +comfyui-frontend-package==1.42.14 comfyui-workflow-templates==0.9.57 comfyui-embedded-docs==0.4.3 torch From 43a1263b609b923b2f69a0510bcf7ac95097e41b Mon Sep 17 00:00:00 2001 From: AustinMroz Date: Tue, 21 Apr 2026 17:58:59 -0700 Subject: [PATCH 28/32] Add gpt-image-2 as version option (#13501) --- comfy_api_nodes/nodes_openai.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py index 4ee896fa8..90a29c2f2 100644 --- a/comfy_api_nodes/nodes_openai.py +++ b/comfy_api_nodes/nodes_openai.py @@ -363,7 +363,7 @@ class OpenAIGPTImage1(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="OpenAIGPTImage1", - display_name="OpenAI GPT Image 1.5", + display_name="OpenAI GPT Image 2", category="api node/image/OpenAI", description="Generates images synchronously via OpenAI's GPT Image endpoint.", inputs=[ @@ -427,8 +427,8 @@ class OpenAIGPTImage1(IO.ComfyNode): ), IO.Combo.Input( "model", - options=["gpt-image-1", "gpt-image-1.5"], - default="gpt-image-1.5", + options=["gpt-image-1", "gpt-image-1.5", 'gpt-image-2'], + default="gpt-image-2", optional=True, ), ], @@ -487,6 +487,8 @@ class OpenAIGPTImage1(IO.ComfyNode): price_extractor = calculate_tokens_price_image_1 elif model == "gpt-image-1.5": price_extractor = calculate_tokens_price_image_1_5 + elif model == "gpt-image-2": + price_extractor = calculate_tokens_price_image_1_5 else: raise ValueError(f"Unknown model: {model}") From 529c80255f3f2370c39780c62a9454d95344014d Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 21 Apr 2026 19:59:31 -0700 Subject: [PATCH 29/32] Allow logging in comfy app files. (#13505) --- main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index 12b04719d..dbaf2745c 100644 --- a/main.py +++ b/main.py @@ -9,6 +9,8 @@ import folder_paths import time from comfy.cli_args import args, enables_dynamic_vram from app.logger import setup_logger +setup_logger(log_level=args.verbose, use_stdout=args.log_stdout) + from app.assets.seeder import asset_seeder from app.assets.services import register_output_files import itertools @@ -27,8 +29,6 @@ if __name__ == "__main__": os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1' os.environ['DO_NOT_TRACK'] = '1' -setup_logger(log_level=args.verbose, use_stdout=args.log_stdout) - faulthandler.enable(file=sys.stderr, all_threads=False) import comfy_aimdo.control From 6045c11d8b32d5f761c555d6ca026e4d731ac8d5 Mon Sep 17 00:00:00 2001 From: "Daxiong (Lin)" Date: Wed, 22 Apr 2026 11:45:25 +0800 Subject: [PATCH 30/32] chore: update workflow templates to v0.9.59 (#13507) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ccdd47674..a25bc0667 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.42.14 -comfyui-workflow-templates==0.9.57 +comfyui-workflow-templates==0.9.59 comfyui-embedded-docs==0.4.3 torch torchsde From 91e1f45d80fba14d992269b0b98de7a4a14c81b9 Mon Sep 17 00:00:00 2001 From: Matt Miller Date: Tue, 21 Apr 2026 22:31:36 -0700 Subject: [PATCH 31/32] fix(veo): reject 4K resolution for veo-3.0 models in Veo3VideoGenerationNode (#13504) The tooltip on the resolution input states that 4K is not available for veo-3.1-lite or veo-3.0 models, but the execute guard only rejected the lite combination. Selecting 4K with veo-3.0-generate-001 or veo-3.0-fast-generate-001 would fall through and hit the upstream API with an invalid request. Broaden the guard to match the documented behavior and update the error message accordingly. Co-authored-by: Jedrzej Kosinski --- comfy_api_nodes/nodes_veo2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy_api_nodes/nodes_veo2.py b/comfy_api_nodes/nodes_veo2.py index 084b086a8..2ff75d9b2 100644 --- a/comfy_api_nodes/nodes_veo2.py +++ b/comfy_api_nodes/nodes_veo2.py @@ -393,8 +393,8 @@ class Veo3VideoGenerationNode(IO.ComfyNode): model="veo-3.0-generate-001", generate_audio=False, ): - if "lite" in model and resolution == "4k": - raise Exception("4K resolution is not supported by the veo-3.1-lite model.") + if resolution == "4k" and ("lite" in model or "3.0" in model): + raise Exception("4K resolution is not supported by the veo-3.1-lite or veo-3.0 models.") model = MODELS_MAP[model] From db85cf03ff33f5be09d02f2a52334971209d25d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com> Date: Wed, 22 Apr 2026 14:16:02 +0300 Subject: [PATCH 32/32] feat: RIFE and FILM frame interpolation model support (CORE-29) (#13258) * initial RIFE support * Also support FILM * Better RAM usage, reduce FILM VRAM peak * Add model folder placeholder * Fix oom fallback frame loss * Remove torch.compile for now * Rename model input * Shorter input type name --------- --- .../frame_interpolation_models/film_net.py | 258 ++++++++++++++++++ .../frame_interpolation_models/ifnet.py | 128 +++++++++ comfy_extras/nodes_frame_interpolation.py | 211 ++++++++++++++ folder_paths.py | 2 + .../put_frame_interpolation_models_here | 0 nodes.py | 3 +- 6 files changed, 601 insertions(+), 1 deletion(-) create mode 100644 comfy_extras/frame_interpolation_models/film_net.py create mode 100644 comfy_extras/frame_interpolation_models/ifnet.py create mode 100644 comfy_extras/nodes_frame_interpolation.py create mode 100644 models/frame_interpolation/put_frame_interpolation_models_here diff --git a/comfy_extras/frame_interpolation_models/film_net.py b/comfy_extras/frame_interpolation_models/film_net.py new file mode 100644 index 000000000..cf4f6e1e1 --- /dev/null +++ b/comfy_extras/frame_interpolation_models/film_net.py @@ -0,0 +1,258 @@ +"""FILM: Frame Interpolation for Large Motion (ECCV 2022).""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import comfy.ops + +ops = comfy.ops.disable_weight_init + + +class FilmConv2d(nn.Module): + """Conv2d with optional LeakyReLU and FILM-style padding.""" + + def __init__(self, in_channels, out_channels, size, activation=True, device=None, dtype=None, operations=ops): + super().__init__() + self.even_pad = not size % 2 + self.conv = operations.Conv2d(in_channels, out_channels, kernel_size=size, padding=size // 2 if size % 2 else 0, device=device, dtype=dtype) + self.activation = nn.LeakyReLU(0.2) if activation else None + + def forward(self, x): + if self.even_pad: + x = F.pad(x, (0, 1, 0, 1)) + x = self.conv(x) + if self.activation is not None: + x = self.activation(x) + return x + + +def _warp_core(image, flow, grid_x, grid_y): + dtype = image.dtype + H, W = flow.shape[2], flow.shape[3] + dx = flow[:, 0].float() / (W * 0.5) + dy = flow[:, 1].float() / (H * 0.5) + grid = torch.stack([grid_x[None, None, :] + dx, grid_y[None, :, None] + dy], dim=3) + return F.grid_sample(image.float(), grid, mode="bilinear", padding_mode="border", align_corners=False).to(dtype) + + +def build_image_pyramid(image, pyramid_levels): + pyramid = [image] + for _ in range(1, pyramid_levels): + image = F.avg_pool2d(image, 2, 2) + pyramid.append(image) + return pyramid + + +def flow_pyramid_synthesis(residual_pyramid): + flow = residual_pyramid[-1] + flow_pyramid = [flow] + for residual_flow in residual_pyramid[:-1][::-1]: + flow = F.interpolate(flow, size=residual_flow.shape[2:4], mode="bilinear", scale_factor=None).mul_(2).add_(residual_flow) + flow_pyramid.append(flow) + flow_pyramid.reverse() + return flow_pyramid + + +def multiply_pyramid(pyramid, scalar): + return [image * scalar[:, None, None, None] for image in pyramid] + + +def pyramid_warp(feature_pyramid, flow_pyramid, warp_fn): + return [warp_fn(features, flow) for features, flow in zip(feature_pyramid, flow_pyramid)] + + +def concatenate_pyramids(pyramid1, pyramid2): + return [torch.cat([f1, f2], dim=1) for f1, f2 in zip(pyramid1, pyramid2)] + + +class SubTreeExtractor(nn.Module): + def __init__(self, in_channels=3, channels=64, n_layers=4, device=None, dtype=None, operations=ops): + super().__init__() + convs = [] + for i in range(n_layers): + out_ch = channels << i + convs.append(nn.Sequential( + FilmConv2d(in_channels, out_ch, 3, device=device, dtype=dtype, operations=operations), + FilmConv2d(out_ch, out_ch, 3, device=device, dtype=dtype, operations=operations))) + in_channels = out_ch + self.convs = nn.ModuleList(convs) + + def forward(self, image, n): + head = image + pyramid = [] + for i, layer in enumerate(self.convs): + head = layer(head) + pyramid.append(head) + if i < n - 1: + head = F.avg_pool2d(head, 2, 2) + return pyramid + + +class FeatureExtractor(nn.Module): + def __init__(self, in_channels=3, channels=64, sub_levels=4, device=None, dtype=None, operations=ops): + super().__init__() + self.extract_sublevels = SubTreeExtractor(in_channels, channels, sub_levels, device=device, dtype=dtype, operations=operations) + self.sub_levels = sub_levels + + def forward(self, image_pyramid): + sub_pyramids = [self.extract_sublevels(image_pyramid[i], min(len(image_pyramid) - i, self.sub_levels)) + for i in range(len(image_pyramid))] + feature_pyramid = [] + for i in range(len(image_pyramid)): + features = sub_pyramids[i][0] + for j in range(1, self.sub_levels): + if j <= i: + features = torch.cat([features, sub_pyramids[i - j][j]], dim=1) + feature_pyramid.append(features) + # Free sub-pyramids no longer needed by future levels + if i >= self.sub_levels - 1: + sub_pyramids[i - self.sub_levels + 1] = None + return feature_pyramid + + +class FlowEstimator(nn.Module): + def __init__(self, in_channels, num_convs, num_filters, device=None, dtype=None, operations=ops): + super().__init__() + self._convs = nn.ModuleList() + for _ in range(num_convs): + self._convs.append(FilmConv2d(in_channels, num_filters, 3, device=device, dtype=dtype, operations=operations)) + in_channels = num_filters + self._convs.append(FilmConv2d(in_channels, num_filters // 2, 1, device=device, dtype=dtype, operations=operations)) + self._convs.append(FilmConv2d(num_filters // 2, 2, 1, activation=False, device=device, dtype=dtype, operations=operations)) + + def forward(self, features_a, features_b): + net = torch.cat([features_a, features_b], dim=1) + for conv in self._convs: + net = conv(net) + return net + + +class PyramidFlowEstimator(nn.Module): + def __init__(self, filters=64, flow_convs=(3, 3, 3, 3), flow_filters=(32, 64, 128, 256), device=None, dtype=None, operations=ops): + super().__init__() + in_channels = filters << 1 + predictors = [] + for i in range(len(flow_convs)): + predictors.append(FlowEstimator(in_channels, flow_convs[i], flow_filters[i], device=device, dtype=dtype, operations=operations)) + in_channels += filters << (i + 2) + self._predictor = predictors[-1] + self._predictors = nn.ModuleList(predictors[:-1][::-1]) + + def forward(self, feature_pyramid_a, feature_pyramid_b, warp_fn): + levels = len(feature_pyramid_a) + v = self._predictor(feature_pyramid_a[-1], feature_pyramid_b[-1]) + residuals = [v] + # Coarse-to-fine: shared predictor for deep levels, then specialized predictors for fine levels + steps = [(i, self._predictor) for i in range(levels - 2, len(self._predictors) - 1, -1)] + steps += [(len(self._predictors) - 1 - k, p) for k, p in enumerate(self._predictors)] + for i, predictor in steps: + v = F.interpolate(v, size=feature_pyramid_a[i].shape[2:4], mode="bilinear").mul_(2) + v_residual = predictor(feature_pyramid_a[i], warp_fn(feature_pyramid_b[i], v)) + residuals.append(v_residual) + v = v.add_(v_residual) + residuals.reverse() + return residuals + + +def _get_fusion_channels(level, filters): + # Per direction: multi-scale features + RGB image (3ch) + flow (2ch), doubled for both directions + return (sum(filters << i for i in range(level)) + 3 + 2) * 2 + + +class Fusion(nn.Module): + def __init__(self, n_layers=4, specialized_layers=3, filters=64, device=None, dtype=None, operations=ops): + super().__init__() + self.output_conv = operations.Conv2d(filters, 3, kernel_size=1, device=device, dtype=dtype) + self.convs = nn.ModuleList() + in_channels = _get_fusion_channels(n_layers, filters) + increase = 0 + for i in range(n_layers)[::-1]: + num_filters = (filters << i) if i < specialized_layers else (filters << specialized_layers) + self.convs.append(nn.ModuleList([ + FilmConv2d(in_channels, num_filters, 2, activation=False, device=device, dtype=dtype, operations=operations), + FilmConv2d(in_channels + (increase or num_filters), num_filters, 3, device=device, dtype=dtype, operations=operations), + FilmConv2d(num_filters, num_filters, 3, device=device, dtype=dtype, operations=operations)])) + in_channels = num_filters + increase = _get_fusion_channels(i, filters) - num_filters // 2 + + def forward(self, pyramid): + net = pyramid[-1] + for k, layers in enumerate(self.convs): + i = len(self.convs) - 1 - k + net = layers[0](F.interpolate(net, size=pyramid[i].shape[2:4], mode="nearest")) + net = layers[2](layers[1](torch.cat([pyramid[i], net], dim=1))) + return self.output_conv(net) + + +class FILMNet(nn.Module): + def __init__(self, pyramid_levels=7, fusion_pyramid_levels=5, specialized_levels=3, sub_levels=4, + filters=64, flow_convs=(3, 3, 3, 3), flow_filters=(32, 64, 128, 256), device=None, dtype=None, operations=ops): + super().__init__() + self.pyramid_levels = pyramid_levels + self.fusion_pyramid_levels = fusion_pyramid_levels + self.extract = FeatureExtractor(3, filters, sub_levels, device=device, dtype=dtype, operations=operations) + self.predict_flow = PyramidFlowEstimator(filters, flow_convs, flow_filters, device=device, dtype=dtype, operations=operations) + self.fuse = Fusion(sub_levels, specialized_levels, filters, device=device, dtype=dtype, operations=operations) + self._warp_grids = {} + + def get_dtype(self): + return self.extract.extract_sublevels.convs[0][0].conv.weight.dtype + + def _build_warp_grids(self, H, W, device): + """Pre-compute warp grids for all pyramid levels.""" + if (H, W) in self._warp_grids: + return + self._warp_grids = {} # clear old resolution grids to prevent memory leaks + for _ in range(self.pyramid_levels): + self._warp_grids[(H, W)] = ( + torch.linspace(-(1 - 1 / W), 1 - 1 / W, W, dtype=torch.float32, device=device), + torch.linspace(-(1 - 1 / H), 1 - 1 / H, H, dtype=torch.float32, device=device), + ) + H, W = H // 2, W // 2 + + def warp(self, image, flow): + grid_x, grid_y = self._warp_grids[(flow.shape[2], flow.shape[3])] + return _warp_core(image, flow, grid_x, grid_y) + + def extract_features(self, img): + """Extract image and feature pyramids for a single frame. Can be cached across pairs.""" + image_pyramid = build_image_pyramid(img, self.pyramid_levels) + feature_pyramid = self.extract(image_pyramid) + return image_pyramid, feature_pyramid + + def forward(self, img0, img1, timestep=0.5, cache=None): + # FILM uses a scalar timestep per batch element (spatially-varying timesteps not supported) + t = timestep.mean(dim=(1, 2, 3)).item() if isinstance(timestep, torch.Tensor) else timestep + return self.forward_multi_timestep(img0, img1, [t], cache=cache) + + def forward_multi_timestep(self, img0, img1, timesteps, cache=None): + """Compute flow once, synthesize at multiple timesteps. Expects batch=1 inputs.""" + self._build_warp_grids(img0.shape[2], img0.shape[3], img0.device) + + image_pyr0, feat_pyr0 = cache["img0"] if cache and "img0" in cache else self.extract_features(img0) + image_pyr1, feat_pyr1 = cache["img1"] if cache and "img1" in cache else self.extract_features(img1) + + fwd_flow = flow_pyramid_synthesis(self.predict_flow(feat_pyr0, feat_pyr1, self.warp))[:self.fusion_pyramid_levels] + bwd_flow = flow_pyramid_synthesis(self.predict_flow(feat_pyr1, feat_pyr0, self.warp))[:self.fusion_pyramid_levels] + + # Build warp targets and free full pyramids (only first fpl levels needed from here) + fpl = self.fusion_pyramid_levels + p2w = [concatenate_pyramids(image_pyr0[:fpl], feat_pyr0[:fpl]), + concatenate_pyramids(image_pyr1[:fpl], feat_pyr1[:fpl])] + del image_pyr0, image_pyr1, feat_pyr0, feat_pyr1 + + results = [] + dt_tensors = torch.tensor(timesteps, device=img0.device, dtype=img0.dtype) + for idx in range(len(timesteps)): + batch_dt = dt_tensors[idx:idx + 1] + bwd_scaled = multiply_pyramid(bwd_flow, batch_dt) + fwd_scaled = multiply_pyramid(fwd_flow, 1 - batch_dt) + fwd_warped = pyramid_warp(p2w[0], bwd_scaled, self.warp) + bwd_warped = pyramid_warp(p2w[1], fwd_scaled, self.warp) + aligned = [torch.cat([fw, bw, bf, ff], dim=1) + for fw, bw, bf, ff in zip(fwd_warped, bwd_warped, bwd_scaled, fwd_scaled)] + del fwd_warped, bwd_warped, bwd_scaled, fwd_scaled + results.append(self.fuse(aligned)) + del aligned + return torch.cat(results, dim=0) diff --git a/comfy_extras/frame_interpolation_models/ifnet.py b/comfy_extras/frame_interpolation_models/ifnet.py new file mode 100644 index 000000000..03cb34c50 --- /dev/null +++ b/comfy_extras/frame_interpolation_models/ifnet.py @@ -0,0 +1,128 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +import comfy.ops + +ops = comfy.ops.disable_weight_init + + +def _warp(img, flow, warp_grids): + B, _, H, W = img.shape + base_grid, flow_div = warp_grids[(H, W)] + flow_norm = torch.cat([flow[:, 0:1] / flow_div[0], flow[:, 1:2] / flow_div[1]], 1).float() + grid = (base_grid.expand(B, -1, -1, -1) + flow_norm).permute(0, 2, 3, 1) + return F.grid_sample(img.float(), grid, mode="bilinear", padding_mode="border", align_corners=True).to(img.dtype) + + +class Head(nn.Module): + def __init__(self, out_ch=4, device=None, dtype=None, operations=ops): + super().__init__() + self.cnn0 = operations.Conv2d(3, 16, 3, 2, 1, device=device, dtype=dtype) + self.cnn1 = operations.Conv2d(16, 16, 3, 1, 1, device=device, dtype=dtype) + self.cnn2 = operations.Conv2d(16, 16, 3, 1, 1, device=device, dtype=dtype) + self.cnn3 = operations.ConvTranspose2d(16, out_ch, 4, 2, 1, device=device, dtype=dtype) + self.relu = nn.LeakyReLU(0.2, True) + + def forward(self, x): + x = self.relu(self.cnn0(x)) + x = self.relu(self.cnn1(x)) + x = self.relu(self.cnn2(x)) + return self.cnn3(x) + + +class ResConv(nn.Module): + def __init__(self, c, device=None, dtype=None, operations=ops): + super().__init__() + self.conv = operations.Conv2d(c, c, 3, 1, 1, device=device, dtype=dtype) + self.beta = nn.Parameter(torch.ones((1, c, 1, 1), device=device, dtype=dtype)) + self.relu = nn.LeakyReLU(0.2, True) + + def forward(self, x): + return self.relu(torch.addcmul(x, self.conv(x), self.beta)) + + +class IFBlock(nn.Module): + def __init__(self, in_planes, c=64, device=None, dtype=None, operations=ops): + super().__init__() + self.conv0 = nn.Sequential( + nn.Sequential(operations.Conv2d(in_planes, c // 2, 3, 2, 1, device=device, dtype=dtype), nn.LeakyReLU(0.2, True)), + nn.Sequential(operations.Conv2d(c // 2, c, 3, 2, 1, device=device, dtype=dtype), nn.LeakyReLU(0.2, True))) + self.convblock = nn.Sequential(*(ResConv(c, device=device, dtype=dtype, operations=operations) for _ in range(8))) + self.lastconv = nn.Sequential(operations.ConvTranspose2d(c, 4 * 13, 4, 2, 1, device=device, dtype=dtype), nn.PixelShuffle(2)) + + def forward(self, x, flow=None, scale=1): + x = F.interpolate(x, scale_factor=1.0 / scale, mode="bilinear") + if flow is not None: + flow = F.interpolate(flow, scale_factor=1.0 / scale, mode="bilinear").div_(scale) + x = torch.cat((x, flow), 1) + feat = self.convblock(self.conv0(x)) + tmp = F.interpolate(self.lastconv(feat), scale_factor=scale, mode="bilinear") + return tmp[:, :4] * scale, tmp[:, 4:5], tmp[:, 5:] + + +class IFNet(nn.Module): + def __init__(self, head_ch=4, channels=(192, 128, 96, 64, 32), device=None, dtype=None, operations=ops): + super().__init__() + self.encode = Head(out_ch=head_ch, device=device, dtype=dtype, operations=operations) + block_in = [7 + 2 * head_ch] + [8 + 4 + 8 + 2 * head_ch] * 4 + self.blocks = nn.ModuleList([IFBlock(block_in[i], channels[i], device=device, dtype=dtype, operations=operations) for i in range(5)]) + self.scale_list = [16, 8, 4, 2, 1] + self.pad_align = 64 + self._warp_grids = {} + + def get_dtype(self): + return self.encode.cnn0.weight.dtype + + def _build_warp_grids(self, H, W, device): + if (H, W) in self._warp_grids: + return + self._warp_grids = {} # clear old resolution grids to prevent memory leaks + grid_y, grid_x = torch.meshgrid( + torch.linspace(-1.0, 1.0, H, device=device, dtype=torch.float32), + torch.linspace(-1.0, 1.0, W, device=device, dtype=torch.float32), indexing="ij") + self._warp_grids[(H, W)] = ( + torch.stack((grid_x, grid_y), dim=0).unsqueeze(0), + torch.tensor([(W - 1.0) / 2.0, (H - 1.0) / 2.0], dtype=torch.float32, device=device)) + + def warp(self, img, flow): + return _warp(img, flow, self._warp_grids) + + def extract_features(self, img): + """Extract head features for a single frame. Can be cached across pairs.""" + return self.encode(img) + + def forward(self, img0, img1, timestep=0.5, cache=None): + if not isinstance(timestep, torch.Tensor): + timestep = torch.full((img0.shape[0], 1, img0.shape[2], img0.shape[3]), timestep, device=img0.device, dtype=img0.dtype) + + self._build_warp_grids(img0.shape[2], img0.shape[3], img0.device) + + B = img0.shape[0] + f0 = cache["img0"].expand(B, -1, -1, -1) if cache and "img0" in cache else self.encode(img0) + f1 = cache["img1"].expand(B, -1, -1, -1) if cache and "img1" in cache else self.encode(img1) + flow = mask = feat = None + warped_img0, warped_img1 = img0, img1 + for i, block in enumerate(self.blocks): + if flow is None: + flow, mask, feat = block(torch.cat((img0, img1, f0, f1, timestep), 1), None, scale=self.scale_list[i]) + else: + fd, mask, feat = block( + torch.cat((warped_img0, warped_img1, self.warp(f0, flow[:, :2]), self.warp(f1, flow[:, 2:4]), timestep, mask, feat), 1), + flow, scale=self.scale_list[i]) + flow = flow.add_(fd) + warped_img0 = self.warp(img0, flow[:, :2]) + warped_img1 = self.warp(img1, flow[:, 2:4]) + return torch.lerp(warped_img1, warped_img0, torch.sigmoid(mask)) + + +def detect_rife_config(state_dict): + head_ch = state_dict["encode.cnn3.weight"].shape[1] # ConvTranspose2d: (in_ch, out_ch, kH, kW) + channels = [] + for i in range(5): + key = f"blocks.{i}.conv0.1.0.weight" + if key in state_dict: + channels.append(state_dict[key].shape[0]) + if len(channels) != 5: + raise ValueError(f"Unsupported RIFE model: expected 5 blocks, found {len(channels)}") + return head_ch, channels diff --git a/comfy_extras/nodes_frame_interpolation.py b/comfy_extras/nodes_frame_interpolation.py new file mode 100644 index 000000000..a3b00d36e --- /dev/null +++ b/comfy_extras/nodes_frame_interpolation.py @@ -0,0 +1,211 @@ +import torch +from tqdm import tqdm +from typing_extensions import override + +import comfy.model_patcher +import comfy.utils +import folder_paths +from comfy import model_management +from comfy_extras.frame_interpolation_models.ifnet import IFNet, detect_rife_config +from comfy_extras.frame_interpolation_models.film_net import FILMNet +from comfy_api.latest import ComfyExtension, io + +FrameInterpolationModel = io.Custom("INTERP_MODEL") + + +class FrameInterpolationModelLoader(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="FrameInterpolationModelLoader", + display_name="Load Frame Interpolation Model", + category="loaders", + inputs=[ + io.Combo.Input("model_name", options=folder_paths.get_filename_list("frame_interpolation"), + tooltip="Select a frame interpolation model to load. Models must be placed in the 'frame_interpolation' folder."), + ], + outputs=[ + FrameInterpolationModel.Output(), + ], + ) + + @classmethod + def execute(cls, model_name) -> io.NodeOutput: + model_path = folder_paths.get_full_path_or_raise("frame_interpolation", model_name) + sd = comfy.utils.load_torch_file(model_path, safe_load=True) + + model = cls._detect_and_load(sd) + dtype = torch.float16 if model_management.should_use_fp16(model_management.get_torch_device()) else torch.float32 + model.eval().to(dtype) + patcher = comfy.model_patcher.ModelPatcher( + model, + load_device=model_management.get_torch_device(), + offload_device=model_management.unet_offload_device(), + ) + return io.NodeOutput(patcher) + + @classmethod + def _detect_and_load(cls, sd): + # Try FILM + if "extract.extract_sublevels.convs.0.0.conv.weight" in sd: + model = FILMNet() + model.load_state_dict(sd) + return model + + # Try RIFE (needs key remapping for raw checkpoints) + sd = comfy.utils.state_dict_prefix_replace(sd, {"module.": "", "flownet.": ""}) + key_map = {} + for k in sd: + for i in range(5): + if k.startswith(f"block{i}."): + key_map[k] = f"blocks.{i}.{k[len(f'block{i}.'):]}" + if key_map: + sd = {key_map.get(k, k): v for k, v in sd.items()} + sd = {k: v for k, v in sd.items() if not k.startswith(("teacher.", "caltime."))} + + try: + head_ch, channels = detect_rife_config(sd) + except (KeyError, ValueError): + raise ValueError("Unrecognized frame interpolation model format") + model = IFNet(head_ch=head_ch, channels=channels) + model.load_state_dict(sd) + return model + + +class FrameInterpolate(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="FrameInterpolate", + display_name="Frame Interpolate", + category="image/video", + search_aliases=["rife", "film", "frame interpolation", "slow motion", "interpolate frames", "vfi"], + inputs=[ + FrameInterpolationModel.Input("interp_model"), + io.Image.Input("images"), + io.Int.Input("multiplier", default=2, min=2, max=16), + ], + outputs=[ + io.Image.Output(), + ], + ) + + @classmethod + def execute(cls, interp_model, images, multiplier) -> io.NodeOutput: + offload_device = model_management.intermediate_device() + + num_frames = images.shape[0] + if num_frames < 2 or multiplier < 2: + return io.NodeOutput(images) + + model_management.load_model_gpu(interp_model) + device = interp_model.load_device + dtype = interp_model.model_dtype() + inference_model = interp_model.model + + # Free VRAM for inference activations (model weights + ~20x a single frame's worth) + H, W = images.shape[1], images.shape[2] + activation_mem = H * W * 3 * images.element_size() * 20 + model_management.free_memory(activation_mem, device) + align = getattr(inference_model, "pad_align", 1) + + # Prepare a single padded frame on device for determining output dimensions + def prepare_frame(idx): + frame = images[idx:idx + 1].movedim(-1, 1).to(dtype=dtype, device=device) + if align > 1: + from comfy.ldm.common_dit import pad_to_patch_size + frame = pad_to_patch_size(frame, (align, align), padding_mode="reflect") + return frame + + # Count total interpolation passes for progress bar + total_pairs = num_frames - 1 + num_interp = multiplier - 1 + total_steps = total_pairs * num_interp + pbar = comfy.utils.ProgressBar(total_steps) + tqdm_bar = tqdm(total=total_steps, desc="Frame interpolation") + + batch = num_interp # reduced on OOM and persists across pairs (same resolution = same limit) + t_values = [t / multiplier for t in range(1, multiplier)] + + out_dtype = model_management.intermediate_dtype() + total_out_frames = total_pairs * multiplier + 1 + result = torch.empty((total_out_frames, 3, H, W), dtype=out_dtype, device=offload_device) + result[0] = images[0].movedim(-1, 0).to(out_dtype) + out_idx = 1 + + # Pre-compute timestep tensor on device (padded dimensions needed) + sample = prepare_frame(0) + pH, pW = sample.shape[2], sample.shape[3] + ts_full = torch.tensor(t_values, device=device, dtype=dtype).reshape(num_interp, 1, 1, 1) + ts_full = ts_full.expand(-1, 1, pH, pW) + del sample + + multi_fn = getattr(inference_model, "forward_multi_timestep", None) + feat_cache = {} + prev_frame = None + + try: + for i in range(total_pairs): + img0_single = prev_frame if prev_frame is not None else prepare_frame(i) + img1_single = prepare_frame(i + 1) + prev_frame = img1_single + + # Cache features: img1 of pair N becomes img0 of pair N+1 + feat_cache["img0"] = feat_cache.pop("next") if "next" in feat_cache else inference_model.extract_features(img0_single) + feat_cache["img1"] = inference_model.extract_features(img1_single) + feat_cache["next"] = feat_cache["img1"] + + used_multi = False + if multi_fn is not None: + # Models with timestep-independent flow can compute it once for all timesteps + try: + mids = multi_fn(img0_single, img1_single, t_values, cache=feat_cache) + result[out_idx:out_idx + num_interp] = mids[:, :, :H, :W].to(out_dtype) + out_idx += num_interp + pbar.update(num_interp) + tqdm_bar.update(num_interp) + used_multi = True + except model_management.OOM_EXCEPTION: + model_management.soft_empty_cache() + multi_fn = None # fall through to single-timestep path + + if not used_multi: + j = 0 + while j < num_interp: + b = min(batch, num_interp - j) + try: + img0 = img0_single.expand(b, -1, -1, -1) + img1 = img1_single.expand(b, -1, -1, -1) + mids = inference_model(img0, img1, timestep=ts_full[j:j + b], cache=feat_cache) + result[out_idx:out_idx + b] = mids[:, :, :H, :W].to(out_dtype) + out_idx += b + pbar.update(b) + tqdm_bar.update(b) + j += b + except model_management.OOM_EXCEPTION: + if batch <= 1: + raise + batch = max(1, batch // 2) + model_management.soft_empty_cache() + + result[out_idx] = images[i + 1].movedim(-1, 0).to(out_dtype) + out_idx += 1 + finally: + tqdm_bar.close() + + # BCHW -> BHWC + result = result.movedim(1, -1).clamp_(0.0, 1.0) + return io.NodeOutput(result) + + +class FrameInterpolationExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + FrameInterpolationModelLoader, + FrameInterpolate, + ] + + +async def comfy_entrypoint() -> FrameInterpolationExtension: + return FrameInterpolationExtension() diff --git a/folder_paths.py b/folder_paths.py index 9c96540e3..80f4b291a 100644 --- a/folder_paths.py +++ b/folder_paths.py @@ -52,6 +52,8 @@ folder_names_and_paths["model_patches"] = ([os.path.join(models_dir, "model_patc folder_names_and_paths["audio_encoders"] = ([os.path.join(models_dir, "audio_encoders")], supported_pt_extensions) +folder_names_and_paths["frame_interpolation"] = ([os.path.join(models_dir, "frame_interpolation")], supported_pt_extensions) + output_directory = os.path.join(base_path, "output") temp_directory = os.path.join(base_path, "temp") input_directory = os.path.join(base_path, "input") diff --git a/models/frame_interpolation/put_frame_interpolation_models_here b/models/frame_interpolation/put_frame_interpolation_models_here new file mode 100644 index 000000000..e69de29bb diff --git a/nodes.py b/nodes.py index 299b3d758..bb38e07b8 100644 --- a/nodes.py +++ b/nodes.py @@ -2457,7 +2457,8 @@ async def init_builtin_extra_nodes(): "nodes_number_convert.py", "nodes_painter.py", "nodes_curve.py", - "nodes_rtdetr.py" + "nodes_rtdetr.py", + "nodes_frame_interpolation.py", ] import_failed = []