From b0fd65e88441073d2f79a1065a51d62b49cb0408 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com>
Date: Thu, 26 Mar 2026 18:55:05 +0200
Subject: [PATCH 01/29] fix: regression in text generate with LTXAV model
 (#13170)

---
 comfy/text_encoders/lt.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/comfy/text_encoders/lt.py b/comfy/text_encoders/lt.py
index 5e1273c6e..5aee1f4c0 100644
--- a/comfy/text_encoders/lt.py
+++ b/comfy/text_encoders/lt.py
@@ -91,11 +91,11 @@ class Gemma3_12BModel(sd1_clip.SDClipModel):
             self.dtypes.add(dtype)
 
         super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"start": 2, "pad": 0}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Gemma3_12B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
 
-    def generate(self, tokens, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed):
+    def generate(self, tokens, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, presence_penalty):
         tokens_only = [[t[0] for t in b] for b in tokens]
         embeds, _, _, embeds_info = self.process_tokens(tokens_only, self.execution_device)
         comfy.utils.normalize_image_embeddings(embeds, embeds_info, self.transformer.model.config.hidden_size ** 0.5)
-        return self.transformer.generate(embeds, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, stop_tokens=[106]) # 106 is <end_of_turn>
+        return self.transformer.generate(embeds, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, stop_tokens=[106], presence_penalty=presence_penalty) # 106 is <end_of_turn>
 
 class DualLinearProjection(torch.nn.Module):
     def __init__(self, in_dim, out_dim_video, out_dim_audio, dtype=None, device=None, operations=None):
@@ -189,8 +189,8 @@ class LTXAVTEModel(torch.nn.Module):
 
         return out.to(device=out_device, dtype=torch.float), pooled, extra
 
-    def generate(self, tokens, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed):
-        return self.gemma3_12b.generate(tokens["gemma3_12b"], do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed)
+    def generate(self, tokens, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, presence_penalty):
+        return self.gemma3_12b.generate(tokens["gemma3_12b"], do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, presence_penalty)
 
     def load_sd(self, sd):
         if "model.layers.47.self_attn.q_norm.weight" in sd:

From 8165485a179e8dc33829168c16a6cff541bde507 Mon Sep 17 00:00:00 2001
From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com>
Date: Thu, 26 Mar 2026 21:02:04 +0200
Subject: [PATCH 02/29] feat(api-nodes): added new Topaz model (#13175)

Signed-off-by: bigcat88
---
 comfy_api_nodes/nodes_topaz.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/comfy_api_nodes/nodes_topaz.py b/comfy_api_nodes/nodes_topaz.py
index 6b61bd4b2..b18b31af1 100644
--- a/comfy_api_nodes/nodes_topaz.py
+++ b/comfy_api_nodes/nodes_topaz.py
@@ -38,6 +38,7 @@ from comfy_api_nodes.util import (
 UPSCALER_MODELS_MAP = {
     "Starlight (Astra) Fast": "slf-1",
     "Starlight (Astra) Creative": "slc-1",
+    "Starlight Precise 2.5": "slp-2.5",
 }
 

From 359559c9131899f7dc4788ff367dfe8e729a45bb Mon Sep 17 00:00:00 2001
From: ComfyUI Wiki
Date: Fri, 27 Mar 2026 03:07:38 +0800
Subject: [PATCH 03/29] chore: update
workflow templates to v0.9.38 (#13176) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 76f824906..d780b2f50 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.42.8 -comfyui-workflow-templates==0.9.36 +comfyui-workflow-templates==0.9.38 comfyui-embedded-docs==0.4.3 torch torchsde From 1dc64f35269b415385aaa78ba7d8290c54e3aef4 Mon Sep 17 00:00:00 2001 From: Terry Jia Date: Thu, 26 Mar 2026 21:45:05 -0400 Subject: [PATCH 04/29] feat: add curve inputs and raise uniform limit for GLSL shader node (#13158) * feat: add curve inputs and raise uniform limit for GLSL shader node * allow arbitrary size for curve --- blueprints/.glsl/Color_Balance_15.frag | 90 ++++++++++++++++++++++++++ blueprints/.glsl/Color_Curves_8.frag | 46 +++++++++++++ blueprints/Color Balance.json | 1 + blueprints/Color Curves.json | 1 + comfy_extras/nodes_glsl.py | 63 +++++++++++++++++- 5 files changed, 200 insertions(+), 1 deletion(-) create mode 100644 blueprints/.glsl/Color_Balance_15.frag create mode 100644 blueprints/.glsl/Color_Curves_8.frag create mode 100644 blueprints/Color Balance.json create mode 100644 blueprints/Color Curves.json diff --git a/blueprints/.glsl/Color_Balance_15.frag b/blueprints/.glsl/Color_Balance_15.frag new file mode 100644 index 000000000..e370aa12a --- /dev/null +++ b/blueprints/.glsl/Color_Balance_15.frag @@ -0,0 +1,90 @@ +#version 300 es +precision highp float; + +uniform sampler2D u_image0; +uniform float u_float0; +uniform float u_float1; +uniform float u_float2; +uniform float u_float3; +uniform float u_float4; +uniform float u_float5; +uniform float u_float6; +uniform float u_float7; +uniform float u_float8; +uniform bool u_bool0; + +in vec2 v_texCoord; +out vec4 fragColor; + +vec3 rgb2hsl(vec3 c) { + float maxC = max(c.r, max(c.g, c.b)); + float minC = min(c.r, min(c.g, c.b)); + float l = (maxC + minC) * 0.5; + if (maxC == minC) return vec3(0.0, 0.0, l); + float d = maxC - minC; + float s = l > 0.5 ? d / (2.0 - maxC - minC) : d / (maxC + minC); + float h; + if (maxC == c.r) { + h = (c.g - c.b) / d + (c.g < c.b ? 6.0 : 0.0); + } else if (maxC == c.g) { + h = (c.b - c.r) / d + 2.0; + } else { + h = (c.r - c.g) / d + 4.0; + } + h /= 6.0; + return vec3(h, s, l); +} + +float hue2rgb(float p, float q, float t) { + if (t < 0.0) t += 1.0; + if (t > 1.0) t -= 1.0; + if (t < 1.0 / 6.0) return p + (q - p) * 6.0 * t; + if (t < 1.0 / 2.0) return q; + if (t < 2.0 / 3.0) return p + (q - p) * (2.0 / 3.0 - t) * 6.0; + return p; +} + +vec3 hsl2rgb(vec3 hsl) { + float h = hsl.x, s = hsl.y, l = hsl.z; + if (s == 0.0) return vec3(l); + float q = l < 0.5 ? 
l * (1.0 + s) : l + s - l * s; + float p = 2.0 * l - q; + return vec3( + hue2rgb(p, q, h + 1.0 / 3.0), + hue2rgb(p, q, h), + hue2rgb(p, q, h - 1.0 / 3.0) + ); +} + +void main() { + vec4 tex = texture(u_image0, v_texCoord); + vec3 color = tex.rgb; + + vec3 shadows = vec3(u_float0, u_float1, u_float2) * 0.01; + vec3 midtones = vec3(u_float3, u_float4, u_float5) * 0.01; + vec3 highlights = vec3(u_float6, u_float7, u_float8) * 0.01; + + float maxC = max(color.r, max(color.g, color.b)); + float minC = min(color.r, min(color.g, color.b)); + float lightness = (maxC + minC) * 0.5; + + // GIMP weight curves: linear ramps with constants a=0.25, b=0.333, scale=0.7 + const float a = 0.25; + const float b = 0.333; + const float scale = 0.7; + + float sw = clamp((lightness - b) / -a + 0.5, 0.0, 1.0) * scale; + float mw = clamp((lightness - b) / a + 0.5, 0.0, 1.0) * + clamp((lightness + b - 1.0) / -a + 0.5, 0.0, 1.0) * scale; + float hw = clamp((lightness + b - 1.0) / a + 0.5, 0.0, 1.0) * scale; + + color += sw * shadows + mw * midtones + hw * highlights; + + if (u_bool0) { + vec3 hsl = rgb2hsl(clamp(color, 0.0, 1.0)); + hsl.z = lightness; + color = hsl2rgb(hsl); + } + + fragColor = vec4(clamp(color, 0.0, 1.0), tex.a); +} diff --git a/blueprints/.glsl/Color_Curves_8.frag b/blueprints/.glsl/Color_Curves_8.frag new file mode 100644 index 000000000..c39916726 --- /dev/null +++ b/blueprints/.glsl/Color_Curves_8.frag @@ -0,0 +1,46 @@ +#version 300 es +precision highp float; + +uniform sampler2D u_image0; +uniform sampler2D u_curve0; // RGB master curve (256x1 LUT) +uniform sampler2D u_curve1; // Red channel curve +uniform sampler2D u_curve2; // Green channel curve +uniform sampler2D u_curve3; // Blue channel curve + +in vec2 v_texCoord; +layout(location = 0) out vec4 fragColor0; + +// GIMP-compatible curve lookup with manual linear interpolation. +// Matches gimp_curve_map_value_inline() from gimpcurve-map.c: +// index = value * (n_samples - 1) +// f = fract(index) +// result = (1-f) * samples[floor] + f * samples[ceil] +// +// Uses texelFetch (NEAREST) to avoid GPU half-texel offset issues +// that occur with texture() + GL_LINEAR on small 256x1 LUTs. +float applyCurve(sampler2D curve, float value) { + value = clamp(value, 0.0, 1.0); + + float pos = value * 255.0; + int lo = int(floor(pos)); + int hi = min(lo + 1, 255); + float f = pos - float(lo); + + float a = texelFetch(curve, ivec2(lo, 0), 0).r; + float b = texelFetch(curve, ivec2(hi, 0), 0).r; + + return a + f * (b - a); +} + +void main() { + vec4 color = texture(u_image0, v_texCoord); + + // GIMP order: per-channel curves first, then RGB master curve. 
+ // See gimp_curve_map_pixels() default case in gimpcurve-map.c: + // dest = colors_curve( channel_curve( src ) ) + color.r = applyCurve(u_curve0, applyCurve(u_curve1, color.r)); + color.g = applyCurve(u_curve0, applyCurve(u_curve2, color.g)); + color.b = applyCurve(u_curve0, applyCurve(u_curve3, color.b)); + + fragColor0 = vec4(color.rgb, color.a); +} diff --git a/blueprints/Color Balance.json b/blueprints/Color Balance.json new file mode 100644 index 000000000..fe272d5dc --- /dev/null +++ b/blueprints/Color Balance.json @@ -0,0 +1 @@ +{"revision": 0, "last_node_id": 20, "last_link_id": 0, "nodes": [{"id": 20, "type": "243b9e93-7303-4500-8c70-58acb712f5bc", "pos": [3610, -2630], "size": [270, 420], "flags": {}, "order": 0, "mode": 0, "inputs": [{"label": "image", "localized_name": "images.image0", "name": "images.image0", "type": "IMAGE", "link": null}], "outputs": [{"label": "IMAGE", "localized_name": "IMAGE0", "name": "IMAGE0", "type": "IMAGE", "links": []}], "properties": {"proxyWidgets": [["4", "value"], ["5", "value"], ["6", "value"], ["7", "value"], ["8", "value"], ["9", "value"], ["10", "value"], ["11", "value"], ["12", "value"], ["13", "value"]]}, "widgets_values": [], "title": "Color Balance"}], "links": [], "version": 0.4, "definitions": {"subgraphs": [{"id": "243b9e93-7303-4500-8c70-58acb712f5bc", "version": 1, "state": {"lastGroupId": 0, "lastNodeId": 15, "lastLinkId": 39, "lastRerouteId": 0}, "revision": 0, "config": {}, "name": "Color Balance", "inputNode": {"id": -10, "bounding": [2660, -4500, 120, 60]}, "outputNode": {"id": -20, "bounding": [4270, -4500, 120, 60]}, "inputs": [{"id": "d24c0b6d-00bd-4e95-be80-8114e8376ec0", "name": "images.image0", "type": "IMAGE", "linkIds": [29], "localized_name": "images.image0", "label": "image", "pos": [2760, -4480]}], "outputs": [{"id": "92723f62-996e-496d-ad4f-81a38be4ad64", "name": "IMAGE0", "type": "IMAGE", "linkIds": [28], "localized_name": "IMAGE0", "label": "IMAGE", "pos": [4290, -4480]}], "widgets": [], "nodes": [{"id": 4, "type": "PrimitiveFloat", "pos": [3060, -4500], "size": [270, 58], "flags": {}, "order": 0, "mode": 0, "inputs": [{"label": "shadows red", "localized_name": "value", "name": "value", "type": "FLOAT", "widget": {"name": "value"}, "link": null}], "outputs": [{"localized_name": "FLOAT", "name": "FLOAT", "type": "FLOAT", "links": [30]}], "title": "Shadows Red", "properties": {"Node name for S&R": "PrimitiveFloat", "max": 100, "min": -100, "step": 1, "display": "gradientslider", "gradient_stops": [{"offset": 0, "color": [0, 255, 255]}, {"offset": 0.5, "color": [128, 128, 128]}, {"offset": 1, "color": [255, 0, 0]}]}, "widgets_values": [0]}, {"id": 5, "type": "PrimitiveFloat", "pos": [3060, -4390], "size": [270, 58], "flags": {}, "order": 1, "mode": 0, "inputs": [{"label": "shadows green", "localized_name": "value", "name": "value", "type": "FLOAT", "widget": {"name": "value"}, "link": null}], "outputs": [{"localized_name": "FLOAT", "name": "FLOAT", "type": "FLOAT", "links": [31]}], "title": "Shadows Green", "properties": {"Node name for S&R": "PrimitiveFloat", "max": 100, "min": -100, "step": 1, "display": "gradientslider", "gradient_stops": [{"offset": 0, "color": [255, 0, 255]}, {"offset": 0.5, "color": [128, 128, 128]}, {"offset": 1, "color": [0, 255, 0]}]}, "widgets_values": [0]}, {"id": 6, "type": "PrimitiveFloat", "pos": [3060, -4280], "size": [270, 58], "flags": {}, "order": 2, "mode": 0, "inputs": [{"label": "shadows blue", "localized_name": "value", "name": "value", "type": "FLOAT", "widget": {"name": 
"value"}, "link": null}], "outputs": [{"localized_name": "FLOAT", "name": "FLOAT", "type": "FLOAT", "links": [32]}], "title": "Shadows Blue", "properties": {"Node name for S&R": "PrimitiveFloat", "max": 100, "min": -100, "step": 1, "display": "gradientslider", "gradient_stops": [{"offset": 0, "color": [255, 255, 0]}, {"offset": 0.5, "color": [128, 128, 128]}, {"offset": 1, "color": [0, 0, 255]}]}, "widgets_values": [0]}, {"id": 7, "type": "PrimitiveFloat", "pos": [3060, -4170], "size": [270, 58], "flags": {}, "order": 3, "mode": 0, "inputs": [{"label": "midtones red", "localized_name": "value", "name": "value", "type": "FLOAT", "widget": {"name": "value"}, "link": null}], "outputs": [{"localized_name": "FLOAT", "name": "FLOAT", "type": "FLOAT", "links": [33]}], "title": "Midtones Red", "properties": {"Node name for S&R": "PrimitiveFloat", "max": 100, "min": -100, "step": 1, "display": "gradientslider", "gradient_stops": [{"offset": 0, "color": [0, 255, 255]}, {"offset": 0.5, "color": [128, 128, 128]}, {"offset": 1, "color": [255, 0, 0]}]}, "widgets_values": [0]}, {"id": 8, "type": "PrimitiveFloat", "pos": [3060, -4060], "size": [270, 58], "flags": {}, "order": 4, "mode": 0, "inputs": [{"label": "midtones green", "localized_name": "value", "name": "value", "type": "FLOAT", "widget": {"name": "value"}, "link": null}], "outputs": [{"localized_name": "FLOAT", "name": "FLOAT", "type": "FLOAT", "links": [34]}], "title": "Midtones Green", "properties": {"Node name for S&R": "PrimitiveFloat", "max": 100, "min": -100, "step": 1, "display": "gradientslider", "gradient_stops": [{"offset": 0, "color": [255, 0, 255]}, {"offset": 0.5, "color": [128, 128, 128]}, {"offset": 1, "color": [0, 255, 0]}]}, "widgets_values": [0]}, {"id": 9, "type": "PrimitiveFloat", "pos": [3060, -3950], "size": [270, 58], "flags": {}, "order": 5, "mode": 0, "inputs": [{"label": "midtones blue", "localized_name": "value", "name": "value", "type": "FLOAT", "widget": {"name": "value"}, "link": null}], "outputs": [{"localized_name": "FLOAT", "name": "FLOAT", "type": "FLOAT", "links": [35]}], "title": "Midtones Blue", "properties": {"Node name for S&R": "PrimitiveFloat", "max": 100, "min": -100, "step": 1, "display": "gradientslider", "gradient_stops": [{"offset": 0, "color": [255, 255, 0]}, {"offset": 0.5, "color": [128, 128, 128]}, {"offset": 1, "color": [0, 0, 255]}]}, "widgets_values": [0]}, {"id": 10, "type": "PrimitiveFloat", "pos": [3060, -3840], "size": [270, 58], "flags": {}, "order": 6, "mode": 0, "inputs": [{"label": "highlights red", "localized_name": "value", "name": "value", "type": "FLOAT", "widget": {"name": "value"}, "link": null}], "outputs": [{"localized_name": "FLOAT", "name": "FLOAT", "type": "FLOAT", "links": [36]}], "title": "Highlights Red", "properties": {"Node name for S&R": "PrimitiveFloat", "max": 100, "min": -100, "step": 1, "display": "gradientslider", "gradient_stops": [{"offset": 0, "color": [0, 255, 255]}, {"offset": 0.5, "color": [128, 128, 128]}, {"offset": 1, "color": [255, 0, 0]}]}, "widgets_values": [0]}, {"id": 11, "type": "PrimitiveFloat", "pos": [3060, -3730], "size": [270, 58], "flags": {}, "order": 7, "mode": 0, "inputs": [{"label": "highlights green", "localized_name": "value", "name": "value", "type": "FLOAT", "widget": {"name": "value"}, "link": null}], "outputs": [{"localized_name": "FLOAT", "name": "FLOAT", "type": "FLOAT", "links": [37]}], "title": "Highlights Green", "properties": {"Node name for S&R": "PrimitiveFloat", "max": 100, "min": -100, "step": 1, "display": 
"gradientslider", "gradient_stops": [{"offset": 0, "color": [255, 0, 255]}, {"offset": 0.5, "color": [128, 128, 128]}, {"offset": 1, "color": [0, 255, 0]}]}, "widgets_values": [0]}, {"id": 12, "type": "PrimitiveFloat", "pos": [3060, -3620], "size": [270, 58], "flags": {}, "order": 8, "mode": 0, "inputs": [{"label": "highlights blue", "localized_name": "value", "name": "value", "type": "FLOAT", "widget": {"name": "value"}, "link": null}], "outputs": [{"localized_name": "FLOAT", "name": "FLOAT", "type": "FLOAT", "links": [38]}], "title": "Highlights Blue", "properties": {"Node name for S&R": "PrimitiveFloat", "max": 100, "min": -100, "step": 1, "display": "gradientslider", "gradient_stops": [{"offset": 0, "color": [255, 255, 0]}, {"offset": 0.5, "color": [128, 128, 128]}, {"offset": 1, "color": [0, 0, 255]}]}, "widgets_values": [0]}, {"id": 13, "type": "PrimitiveBoolean", "pos": [3060, -3510], "size": [270, 58], "flags": {}, "order": 9, "mode": 0, "inputs": [{"label": "preserve luminosity", "localized_name": "value", "name": "value", "type": "BOOLEAN", "widget": {"name": "value"}, "link": null}], "outputs": [{"localized_name": "BOOLEAN", "name": "BOOLEAN", "type": "BOOLEAN", "links": [39]}], "title": "Preserve Luminosity", "properties": {"Node name for S&R": "PrimitiveBoolean"}, "widgets_values": [true]}, {"id": 15, "type": "GLSLShader", "pos": [3590, -4500], "size": [420, 500], "flags": {}, "order": 10, "mode": 0, "inputs": [{"label": "image0", "localized_name": "images.image0", "name": "images.image0", "type": "IMAGE", "link": 29}, {"label": "image1", "localized_name": "images.image1", "name": "images.image1", "shape": 7, "type": "IMAGE", "link": null}, {"label": "u_float0", "localized_name": "floats.u_float0", "name": "floats.u_float0", "shape": 7, "type": "FLOAT", "link": 30}, {"label": "u_float1", "localized_name": "floats.u_float1", "name": "floats.u_float1", "shape": 7, "type": "FLOAT", "link": 31}, {"label": "u_float2", "localized_name": "floats.u_float2", "name": "floats.u_float2", "shape": 7, "type": "FLOAT", "link": 32}, {"label": "u_float3", "localized_name": "floats.u_float3", "name": "floats.u_float3", "shape": 7, "type": "FLOAT", "link": 33}, {"label": "u_float4", "localized_name": "floats.u_float4", "name": "floats.u_float4", "shape": 7, "type": "FLOAT", "link": 34}, {"label": "u_float5", "localized_name": "floats.u_float5", "name": "floats.u_float5", "shape": 7, "type": "FLOAT", "link": 35}, {"label": "u_float6", "localized_name": "floats.u_float6", "name": "floats.u_float6", "shape": 7, "type": "FLOAT", "link": 36}, {"label": "u_float7", "localized_name": "floats.u_float7", "name": "floats.u_float7", "shape": 7, "type": "FLOAT", "link": 37}, {"label": "u_float8", "localized_name": "floats.u_float8", "name": "floats.u_float8", "shape": 7, "type": "FLOAT", "link": 38}, {"label": "u_bool0", "localized_name": "bools.u_bool0", "name": "bools.u_bool0", "shape": 7, "type": "BOOLEAN", "link": 39}, {"localized_name": "fragment_shader", "name": "fragment_shader", "type": "STRING", "widget": {"name": "fragment_shader"}, "link": null}, {"localized_name": "size_mode", "name": "size_mode", "type": "COMFY_DYNAMICCOMBO_V3", "widget": {"name": "size_mode"}, "link": null}], "outputs": [{"localized_name": "IMAGE0", "name": "IMAGE0", "type": "IMAGE", "links": [28]}, {"localized_name": "IMAGE1", "name": "IMAGE1", "type": "IMAGE", "links": null}, {"localized_name": "IMAGE2", "name": "IMAGE2", "type": "IMAGE", "links": null}, {"localized_name": "IMAGE3", "name": "IMAGE3", "type": "IMAGE", 
"links": null}], "properties": {"Node name for S&R": "GLSLShader"}, "widgets_values": ["#version 300 es\nprecision highp float;\n\nuniform sampler2D u_image0;\nuniform float u_float0; // shadows red (-100 to 100)\nuniform float u_float1; // shadows green (-100 to 100)\nuniform float u_float2; // shadows blue (-100 to 100)\nuniform float u_float3; // midtones red (-100 to 100)\nuniform float u_float4; // midtones green (-100 to 100)\nuniform float u_float5; // midtones blue (-100 to 100)\nuniform float u_float6; // highlights red (-100 to 100)\nuniform float u_float7; // highlights green (-100 to 100)\nuniform float u_float8; // highlights blue (-100 to 100)\nuniform bool u_bool0; // preserve luminosity\n\nin vec2 v_texCoord;\nout vec4 fragColor;\n\nvec3 rgb2hsl(vec3 c) {\n float maxC = max(c.r, max(c.g, c.b));\n float minC = min(c.r, min(c.g, c.b));\n float l = (maxC + minC) * 0.5;\n if (maxC == minC) return vec3(0.0, 0.0, l);\n float d = maxC - minC;\n float s = l > 0.5 ? d / (2.0 - maxC - minC) : d / (maxC + minC);\n float h;\n if (maxC == c.r) {\n h = (c.g - c.b) / d + (c.g < c.b ? 6.0 : 0.0);\n } else if (maxC == c.g) {\n h = (c.b - c.r) / d + 2.0;\n } else {\n h = (c.r - c.g) / d + 4.0;\n }\n h /= 6.0;\n return vec3(h, s, l);\n}\n\nfloat hue2rgb(float p, float q, float t) {\n if (t < 0.0) t += 1.0;\n if (t > 1.0) t -= 1.0;\n if (t < 1.0 / 6.0) return p + (q - p) * 6.0 * t;\n if (t < 1.0 / 2.0) return q;\n if (t < 2.0 / 3.0) return p + (q - p) * (2.0 / 3.0 - t) * 6.0;\n return p;\n}\n\nvec3 hsl2rgb(vec3 hsl) {\n float h = hsl.x, s = hsl.y, l = hsl.z;\n if (s == 0.0) return vec3(l);\n float q = l < 0.5 ? l * (1.0 + s) : l + s - l * s;\n float p = 2.0 * l - q;\n return vec3(\n hue2rgb(p, q, h + 1.0 / 3.0),\n hue2rgb(p, q, h),\n hue2rgb(p, q, h - 1.0 / 3.0)\n );\n}\n\nvoid main() {\n vec4 tex = texture(u_image0, v_texCoord);\n vec3 color = tex.rgb;\n\n // Build shadows/midtones/highlights vectors (scale -100..100 to -1..1)\n vec3 shadows = vec3(u_float0, u_float1, u_float2) * 0.01;\n vec3 midtones = vec3(u_float3, u_float4, u_float5) * 0.01;\n vec3 highlights = vec3(u_float6, u_float7, u_float8) * 0.01;\n\n // GIMP: HSL lightness for weight calculation\n float maxC = max(color.r, max(color.g, color.b));\n float minC = min(color.r, min(color.g, color.b));\n float lightness = (maxC + minC) * 0.5;\n\n // GIMP weight curves: linear ramps with constants a=0.25, b=0.333, scale=0.7\n const float a = 0.25;\n const float b = 0.333;\n const float scale = 0.7;\n\n float sw = clamp((lightness - b) / -a + 0.5, 0.0, 1.0) * scale;\n float mw = clamp((lightness - b) / a + 0.5, 0.0, 1.0) *\n clamp((lightness + b - 1.0) / -a + 0.5, 0.0, 1.0) * scale;\n float hw = clamp((lightness + b - 1.0) / a + 0.5, 0.0, 1.0) * scale;\n\n color += sw * shadows + mw * midtones + hw * highlights;\n\n if (u_bool0) {\n vec3 hsl = rgb2hsl(clamp(color, 0.0, 1.0));\n hsl.z = lightness;\n color = hsl2rgb(hsl);\n }\n\n fragColor = vec4(clamp(color, 0.0, 1.0), tex.a);\n}", "from_input"]}], "groups": [], "links": [{"id": 29, "origin_id": -10, "origin_slot": 0, "target_id": 15, "target_slot": 0, "type": "IMAGE"}, {"id": 28, "origin_id": 15, "origin_slot": 0, "target_id": -20, "target_slot": 0, "type": "IMAGE"}, {"id": 30, "origin_id": 4, "origin_slot": 0, "target_id": 15, "target_slot": 2, "type": "FLOAT"}, {"id": 31, "origin_id": 5, "origin_slot": 0, "target_id": 15, "target_slot": 3, "type": "FLOAT"}, {"id": 32, "origin_id": 6, "origin_slot": 0, "target_id": 15, "target_slot": 4, "type": "FLOAT"}, {"id": 33, "origin_id": 7, 
"origin_slot": 0, "target_id": 15, "target_slot": 5, "type": "FLOAT"}, {"id": 34, "origin_id": 8, "origin_slot": 0, "target_id": 15, "target_slot": 6, "type": "FLOAT"}, {"id": 35, "origin_id": 9, "origin_slot": 0, "target_id": 15, "target_slot": 7, "type": "FLOAT"}, {"id": 36, "origin_id": 10, "origin_slot": 0, "target_id": 15, "target_slot": 8, "type": "FLOAT"}, {"id": 37, "origin_id": 11, "origin_slot": 0, "target_id": 15, "target_slot": 9, "type": "FLOAT"}, {"id": 38, "origin_id": 12, "origin_slot": 0, "target_id": 15, "target_slot": 10, "type": "FLOAT"}, {"id": 39, "origin_id": 13, "origin_slot": 0, "target_id": 15, "target_slot": 11, "type": "BOOLEAN"}], "extra": {"workflowRendererVersion": "LG"}, "category": "Image Tools/Color adjust"}]}} \ No newline at end of file diff --git a/blueprints/Color Curves.json b/blueprints/Color Curves.json new file mode 100644 index 000000000..aca5bf0b0 --- /dev/null +++ b/blueprints/Color Curves.json @@ -0,0 +1 @@ +{"revision": 0,"last_node_id": 10,"last_link_id": 0,"nodes": [{"id": 10,"type": "d5c462c8-1372-4af8-84f2-547c83470d04","pos": [3610,-2630],"size": [270,420],"flags": {},"order": 0,"mode": 0,"inputs": [{"label": "image","localized_name": "images.image0","name": "images.image0","type": "IMAGE","link": null}],"outputs": [{"label": "IMAGE","localized_name": "IMAGE0","name": "IMAGE0","type": "IMAGE","links": []}],"properties": {"proxyWidgets": [["4","curve"],["5","curve"],["6","curve"],["7","curve"]]},"widgets_values": [],"title": "Color Curves"}],"links": [],"version": 0.4,"definitions": {"subgraphs": [{"id": "d5c462c8-1372-4af8-84f2-547c83470d04","version": 1,"state": {"lastGroupId": 0,"lastNodeId": 8,"lastLinkId": 33,"lastRerouteId": 0},"revision": 0,"config": {},"name": "Color Curves","inputNode": {"id": -10,"bounding": [2660,-4500,120,60]},"outputNode": {"id": -20,"bounding": [4270,-4500,120,60]},"inputs": [{"id": "abc345b7-f55e-4f32-a11d-3aa4c2b0936b","name": "images.image0","type": "IMAGE","linkIds": [29],"localized_name": "images.image0","label": "image","pos": [2760,-4480]}],"outputs": [{"id": "eb0ec079-46da-4408-8263-9ef85569d33d","name": "IMAGE0","type": "IMAGE","linkIds": [28],"localized_name": "IMAGE0","label": "IMAGE","pos": [4290,-4480]}],"widgets": [],"nodes": [{"id": 4,"type": "CurveEditor","pos": [3060,-4500],"size": [270,200],"flags": {},"order": 0,"mode": 0,"inputs": [{"label": "curve","localized_name": "curve","name": "curve","type": "CURVE","widget": {"name": "curve"},"link": null},{"label": "histogram","localized_name": "histogram","name": "histogram","type": "HISTOGRAM","shape": 7,"link": null}],"outputs": [{"localized_name": "CURVE","name": "CURVE","type": "CURVE","links": [30]}],"title": "RGB Master","properties": {"Node name for S&R": "CurveEditor"},"widgets_values": []},{"id": 5,"type": "CurveEditor","pos": [3060,-4250],"size": [270,200],"flags": {},"order": 1,"mode": 0,"inputs": [{"label": "curve","localized_name": "curve","name": "curve","type": "CURVE","widget": {"name": "curve"},"link": null},{"label": "histogram","localized_name": "histogram","name": "histogram","type": "HISTOGRAM","shape": 7,"link": null}],"outputs": [{"localized_name": "CURVE","name": "CURVE","type": "CURVE","links": [31]}],"title": "Red","properties": {"Node name for S&R": "CurveEditor"},"widgets_values": []},{"id": 6,"type": "CurveEditor","pos": [3060,-4000],"size": [270,200],"flags": {},"order": 2,"mode": 0,"inputs": [{"label": "curve","localized_name": "curve","name": "curve","type": "CURVE","widget": {"name": "curve"},"link": 
null},{"label": "histogram","localized_name": "histogram","name": "histogram","type": "HISTOGRAM","shape": 7,"link": null}],"outputs": [{"localized_name": "CURVE","name": "CURVE","type": "CURVE","links": [32]}],"title": "Green","properties": {"Node name for S&R": "CurveEditor"},"widgets_values": []},{"id": 7,"type": "CurveEditor","pos": [3060,-3750],"size": [270,200],"flags": {},"order": 3,"mode": 0,"inputs": [{"label": "curve","localized_name": "curve","name": "curve","type": "CURVE","widget": {"name": "curve"},"link": null},{"label": "histogram","localized_name": "histogram","name": "histogram","type": "HISTOGRAM","shape": 7,"link": null}],"outputs": [{"localized_name": "CURVE","name": "CURVE","type": "CURVE","links": [33]}],"title": "Blue","properties": {"Node name for S&R": "CurveEditor"},"widgets_values": []},{"id": 8,"type": "GLSLShader","pos": [3590,-4500],"size": [420,500],"flags": {},"order": 4,"mode": 0,"inputs": [{"label": "image0","localized_name": "images.image0","name": "images.image0","type": "IMAGE","link": 29},{"label": "image1","localized_name": "images.image1","name": "images.image1","shape": 7,"type": "IMAGE","link": null},{"label": "u_curve0","localized_name": "curves.u_curve0","name": "curves.u_curve0","shape": 7,"type": "CURVE","link": 30},{"label": "u_curve1","localized_name": "curves.u_curve1","name": "curves.u_curve1","shape": 7,"type": "CURVE","link": 31},{"label": "u_curve2","localized_name": "curves.u_curve2","name": "curves.u_curve2","shape": 7,"type": "CURVE","link": 32},{"label": "u_curve3","localized_name": "curves.u_curve3","name": "curves.u_curve3","shape": 7,"type": "CURVE","link": 33},{"localized_name": "fragment_shader","name": "fragment_shader","type": "STRING","widget": {"name": "fragment_shader"},"link": null},{"localized_name": "size_mode","name": "size_mode","type": "COMFY_DYNAMICCOMBO_V3","widget": {"name": "size_mode"},"link": null}],"outputs": [{"localized_name": "IMAGE0","name": "IMAGE0","type": "IMAGE","links": [28]},{"localized_name": "IMAGE1","name": "IMAGE1","type": "IMAGE","links": null},{"localized_name": "IMAGE2","name": "IMAGE2","type": "IMAGE","links": null},{"localized_name": "IMAGE3","name": "IMAGE3","type": "IMAGE","links": null}],"properties": {"Node name for S&R": "GLSLShader"},"widgets_values": ["#version 300 es\nprecision highp float;\n\nuniform sampler2D u_image0;\nuniform sampler2D u_curve0; // RGB master curve (256x1 LUT)\nuniform sampler2D u_curve1; // Red channel curve\nuniform sampler2D u_curve2; // Green channel curve\nuniform sampler2D u_curve3; // Blue channel curve\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\n\n// GIMP-compatible curve lookup with manual linear interpolation.\n// Matches gimp_curve_map_value_inline() from gimpcurve-map.c:\n// index = value * (n_samples - 1)\n// f = fract(index)\n// result = (1-f) * samples[floor] + f * samples[ceil]\n//\n// Uses texelFetch (NEAREST) to avoid GPU half-texel offset issues\n// that occur with texture() + GL_LINEAR on small 256x1 LUTs.\nfloat applyCurve(sampler2D curve, float value) {\n value = clamp(value, 0.0, 1.0);\n\n float pos = value * 255.0;\n int lo = int(floor(pos));\n int hi = min(lo + 1, 255);\n float f = pos - float(lo);\n\n float a = texelFetch(curve, ivec2(lo, 0), 0).r;\n float b = texelFetch(curve, ivec2(hi, 0), 0).r;\n\n return a + f * (b - a);\n}\n\nvoid main() {\n vec4 color = texture(u_image0, v_texCoord);\n\n // GIMP order: per-channel curves first, then RGB master curve.\n // See gimp_curve_map_pixels() default case in 
gimpcurve-map.c:\n // dest = colors_curve( channel_curve( src ) )\n color.r = applyCurve(u_curve0, applyCurve(u_curve1, color.r));\n color.g = applyCurve(u_curve0, applyCurve(u_curve2, color.g));\n color.b = applyCurve(u_curve0, applyCurve(u_curve3, color.b));\n\n fragColor0 = vec4(color.rgb, color.a);\n}\n","from_input"]}],"groups": [],"links": [{"id": 29,"origin_id": -10,"origin_slot": 0,"target_id": 8,"target_slot": 0,"type": "IMAGE"},{"id": 28,"origin_id": 8,"origin_slot": 0,"target_id": -20,"target_slot": 0,"type": "IMAGE"},{"id": 30,"origin_id": 4,"origin_slot": 0,"target_id": 8,"target_slot": 2,"type": "CURVE"},{"id": 31,"origin_id": 5,"origin_slot": 0,"target_id": 8,"target_slot": 3,"type": "CURVE"},{"id": 32,"origin_id": 6,"origin_slot": 0,"target_id": 8,"target_slot": 4,"type": "CURVE"},{"id": 33,"origin_id": 7,"origin_slot": 0,"target_id": 8,"target_slot": 5,"type": "CURVE"}],"extra": {"workflowRendererVersion": "LG"},"category": "Image Tools/Color adjust"}]}} \ No newline at end of file diff --git a/comfy_extras/nodes_glsl.py b/comfy_extras/nodes_glsl.py index 2a59a9285..0e4d957ff 100644 --- a/comfy_extras/nodes_glsl.py +++ b/comfy_extras/nodes_glsl.py @@ -87,7 +87,9 @@ class SizeModeInput(TypedDict): MAX_IMAGES = 5 # u_image0-4 -MAX_UNIFORMS = 5 # u_float0-4, u_int0-4 +MAX_UNIFORMS = 20 # u_float0-19, u_int0-19 +MAX_BOOLS = 10 # u_bool0-9 +MAX_CURVES = 4 # u_curve0-3 (1D LUT textures) MAX_OUTPUTS = 4 # fragColor0-3 (MRT) # Vertex shader using gl_VertexID trick - no VBO needed. @@ -497,6 +499,8 @@ def _render_shader_batch( image_batches: list[list[np.ndarray]], floats: list[float], ints: list[int], + bools: list[bool] | None = None, + curves: list[np.ndarray] | None = None, ) -> list[list[np.ndarray]]: """ Render a fragment shader for multiple batches efficiently. 
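(Read alongside the uniform-upload code added further down, the two new parameters are plain Python lists. A minimal sketch of what a caller prepares — the renderer's leading arguments are elided in this hunk, so only the new trailing lists are shown, and the gamma LUT here is a hypothetical stand-in for an editor-drawn curve's `to_lut()` output:)

```python
import numpy as np

# Hypothetical gamma LUT standing in for a Curve input's to_lut() result.
gamma_lut = np.power(np.linspace(0.0, 1.0, 256), 2.2).astype(np.float32)

# What the renderer does with the new lists (per the code added below):
#   bools  -> glUniform1i(u_bool{i}, 1 or 0)
#   curves -> a len(lut) x 1 GL_R32F texture on unit MAX_IMAGES + i,
#             bound to sampler2D u_curve{i} with LINEAR + CLAMP_TO_EDGE,
#             so shaders may sample texture(u_curve0, vec2(x, 0.5)).r
#             or index exact texels with texelFetch.
bool_list = [True]
curve_luts = [gamma_lut]
```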
@@ -511,6 +515,8 @@ def _render_shader_batch( image_batches: List of batches, each batch is a list of input images (H, W, C) float32 [0,1] floats: List of float uniforms ints: List of int uniforms + bools: List of bool uniforms (passed as int 0/1 to GLSL bool uniforms) + curves: List of 1D LUT arrays (float32) of arbitrary size for u_curve0-N Returns: List of batch outputs, each is a list of output images (H, W, 4) float32 [0,1] @@ -533,11 +539,17 @@ def _render_shader_batch( # Detect multi-pass rendering num_passes = _detect_pass_count(fragment_code) + if bools is None: + bools = [] + if curves is None: + curves = [] + # Track resources for cleanup program = None fbo = None output_textures = [] input_textures = [] + curve_textures = [] ping_pong_textures = [] ping_pong_fbos = [] @@ -624,6 +636,28 @@ def _render_shader_batch( if loc >= 0: gl.glUniform1i(loc, v) + for i, v in enumerate(bools): + loc = gl.glGetUniformLocation(program, f"u_bool{i}") + if loc >= 0: + gl.glUniform1i(loc, 1 if v else 0) + + # Create 1D LUT textures for curves (bound after image texture units) + for i, lut in enumerate(curves): + tex = gl.glGenTextures(1) + curve_textures.append(tex) + unit = MAX_IMAGES + i + gl.glActiveTexture(gl.GL_TEXTURE0 + unit) + gl.glBindTexture(gl.GL_TEXTURE_2D, tex) + gl.glTexImage2D(gl.GL_TEXTURE_2D, 0, gl.GL_R32F, len(lut), 1, 0, gl.GL_RED, gl.GL_FLOAT, lut) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MIN_FILTER, gl.GL_LINEAR) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_MAG_FILTER, gl.GL_LINEAR) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_WRAP_S, gl.GL_CLAMP_TO_EDGE) + gl.glTexParameteri(gl.GL_TEXTURE_2D, gl.GL_TEXTURE_WRAP_T, gl.GL_CLAMP_TO_EDGE) + + loc = gl.glGetUniformLocation(program, f"u_curve{i}") + if loc >= 0: + gl.glUniform1i(loc, unit) + # Get u_pass uniform location for multi-pass pass_loc = gl.glGetUniformLocation(program, "u_pass") @@ -718,6 +752,8 @@ def _render_shader_batch( for tex in input_textures: gl.glDeleteTextures(int(tex)) + for tex in curve_textures: + gl.glDeleteTextures(int(tex)) for tex in output_textures: gl.glDeleteTextures(int(tex)) for tex in ping_pong_textures: @@ -754,6 +790,20 @@ class GLSLShader(io.ComfyNode): max=MAX_UNIFORMS, ) + bool_template = io.Autogrow.TemplatePrefix( + io.Boolean.Input("bool", default=False), + prefix="u_bool", + min=0, + max=MAX_BOOLS, + ) + + curve_template = io.Autogrow.TemplatePrefix( + io.Curve.Input("curve"), + prefix="u_curve", + min=0, + max=MAX_CURVES, + ) + return io.Schema( node_id="GLSLShader", display_name="GLSL Shader", @@ -762,6 +812,7 @@ class GLSLShader(io.ComfyNode): "Apply GLSL ES fragment shaders to images. " "u_resolution (vec2) is always available." ), + is_experimental=True, inputs=[ io.String.Input( "fragment_shader", @@ -796,6 +847,8 @@ class GLSLShader(io.ComfyNode): io.Autogrow.Input("images", template=image_template, tooltip=f"Images are available as u_image0-{MAX_IMAGES-1} (sampler2D) in the shader code"), io.Autogrow.Input("floats", template=float_template, tooltip=f"Floats are available as u_float0-{MAX_UNIFORMS-1} in the shader code"), io.Autogrow.Input("ints", template=int_template, tooltip=f"Ints are available as u_int0-{MAX_UNIFORMS-1} in the shader code"), + io.Autogrow.Input("bools", template=bool_template, tooltip=f"Booleans are available as u_bool0-{MAX_BOOLS-1} (bool) in the shader code"), + io.Autogrow.Input("curves", template=curve_template, tooltip=f"Curves are available as u_curve0-{MAX_CURVES-1} (sampler2D, 1D LUT) in the shader code. 
Sample with texture(u_curve0, vec2(x, 0.5)).r"), ], outputs=[ io.Image.Output(display_name="IMAGE0", tooltip="Available via layout(location = 0) out vec4 fragColor0 in the shader code"), @@ -813,13 +866,19 @@ class GLSLShader(io.ComfyNode): images: io.Autogrow.Type, floats: io.Autogrow.Type = None, ints: io.Autogrow.Type = None, + bools: io.Autogrow.Type = None, + curves: io.Autogrow.Type = None, **kwargs, ) -> io.NodeOutput: + image_list = [v for v in images.values() if v is not None] float_list = ( [v if v is not None else 0.0 for v in floats.values()] if floats else [] ) int_list = [v if v is not None else 0 for v in ints.values()] if ints else [] + bool_list = [v if v is not None else False for v in bools.values()] if bools else [] + + curve_luts = [v.to_lut().astype(np.float32) for v in curves.values() if v is not None] if curves else [] if not image_list: raise ValueError("At least one input image is required") @@ -846,6 +905,8 @@ class GLSLShader(io.ComfyNode): image_batches, float_list, int_list, + bool_list, + curve_luts, ) # Collect outputs into tensors From b1fdbeb9a71ca3b51e594ba457c1d5f001359c92 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 26 Mar 2026 19:18:16 -0700 Subject: [PATCH 05/29] Fix blur and sharpen nodes not working with fp16 intermediates. (#13181) --- comfy_extras/nodes_post_processing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/comfy_extras/nodes_post_processing.py b/comfy_extras/nodes_post_processing.py index 06626f9dd..9037c3d20 100644 --- a/comfy_extras/nodes_post_processing.py +++ b/comfy_extras/nodes_post_processing.py @@ -67,11 +67,11 @@ class Blend(io.ComfyNode): def g(cls, x): return torch.where(x <= 0.25, ((16 * x - 12) * x + 4) * x, torch.sqrt(x)) -def gaussian_kernel(kernel_size: int, sigma: float, device=None): +def gaussian_kernel(kernel_size: int, sigma: float, device=None, dtype=torch.float32): x, y = torch.meshgrid(torch.linspace(-1, 1, kernel_size, device=device), torch.linspace(-1, 1, kernel_size, device=device), indexing="ij") d = torch.sqrt(x * x + y * y) g = torch.exp(-(d * d) / (2.0 * sigma * sigma)) - return g / g.sum() + return (g / g.sum()).to(dtype) class Blur(io.ComfyNode): @classmethod @@ -99,7 +99,7 @@ class Blur(io.ComfyNode): batch_size, height, width, channels = image.shape kernel_size = blur_radius * 2 + 1 - kernel = gaussian_kernel(kernel_size, sigma, device=image.device).repeat(channels, 1, 1).unsqueeze(1) + kernel = gaussian_kernel(kernel_size, sigma, device=image.device, dtype=image.dtype).repeat(channels, 1, 1).unsqueeze(1) image = image.permute(0, 3, 1, 2) # Torch wants (B, C, H, W) we use (B, H, W, C) padded_image = F.pad(image, (blur_radius,blur_radius,blur_radius,blur_radius), 'reflect') @@ -200,7 +200,7 @@ class Sharpen(io.ComfyNode): image = image.to(comfy.model_management.get_torch_device()) kernel_size = sharpen_radius * 2 + 1 - kernel = gaussian_kernel(kernel_size, sigma, device=image.device) * -(alpha*10) + kernel = gaussian_kernel(kernel_size, sigma, device=image.device, dtype=image.dtype) * -(alpha*10) kernel = kernel.to(dtype=image.dtype) center = kernel_size // 2 kernel[center, center] = kernel[center, center] - kernel.sum() + 1.0 From 225c52f6a4fb4e4591ee1fa648bbb8d4266b324a Mon Sep 17 00:00:00 2001 From: Jin Yi Date: Fri, 27 Mar 2026 14:13:29 +0900 Subject: [PATCH 06/29] fix: register image/svg+xml MIME type for .svg files (#13186) The /view endpoint returns text/plain for .svg files on some platforms because 
Python's mimetypes module does not always include SVG by default.
Explicitly register image/svg+xml so <img> tags can render SVGs correctly.

Amp-Thread-ID: https://ampcode.com/threads/T-019d2da7-6a64-726a-af91-bd9c44e7f43c
---
 utils/mime_types.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/mime_types.py b/utils/mime_types.py
index 916e963c5..a173ad109 100644
--- a/utils/mime_types.py
+++ b/utils/mime_types.py
@@ -24,6 +24,7 @@ def init_mime_types():
     # Web types (used by server.py for static file serving)
     mimetypes.add_type('application/javascript; charset=utf-8', '.js')
     mimetypes.add_type('image/webp', '.webp')
+    mimetypes.add_type('image/svg+xml', '.svg')
 
     # Model and data file types (used by asset scanning / metadata extraction)
     mimetypes.add_type("application/safetensors", ".safetensors")

From 85b74951355d272d3da6c2eefe79b46c1d5619ca Mon Sep 17 00:00:00 2001
From: ComfyUI Wiki
Date: Sat, 28 Mar 2026 01:13:02 +0800
Subject: [PATCH 07/29] chore: update workflow templates to v0.9.39 (#13196)

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index d780b2f50..6f0659a00 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 comfyui-frontend-package==1.42.8
-comfyui-workflow-templates==0.9.38
+comfyui-workflow-templates==0.9.39
 comfyui-embedded-docs==0.4.3
 torch
 torchsde

From 6a2cdb817dfee967c02f65eddd9fd85a1d7bf53e Mon Sep 17 00:00:00 2001
From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com>
Date: Fri, 27 Mar 2026 21:11:41 +0200
Subject: [PATCH 08/29] fix(api-nodes-nanobana): raise error when no output
 image is present (#13167)

Signed-off-by: bigcat88
---
 comfy_api_nodes/nodes_gemini.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py
index 25d747e76..2b77a022e 100644
--- a/comfy_api_nodes/nodes_gemini.py
+++ b/comfy_api_nodes/nodes_gemini.py
@@ -201,6 +201,16 @@ async def get_image_from_response(response: GeminiGenerateContentResponse, thoug
             returned_image = await download_url_to_image_tensor(part.fileData.fileUri)
             image_tensors.append(returned_image)
     if len(image_tensors) == 0:
+        if not thought:
+            # No images generated --> extract text response for a meaningful error
+            model_message = get_text_from_response(response).strip()
+            if model_message:
+                raise ValueError(f"Gemini did not generate an image. Model response: {model_message}")
+            raise ValueError(
+                "Gemini did not generate an image. "
+                "Try rephrasing your prompt or changing the response modality to 'IMAGE+TEXT' "
+                "to see the model's reasoning."
+            )
         return torch.zeros((1, 1024, 1024, 4))
     return torch.cat(image_tensors, dim=0)

From 3a56201da58c24d7b8048b200ef01e285b5f2b8a Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Fri, 27 Mar 2026 17:36:26 -0700
Subject: [PATCH 09/29] Allow flux conditioning without a pooled output.
(#13198) --- comfy/model_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/model_base.py b/comfy/model_base.py index 70aff886e..94579fa3e 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -890,7 +890,7 @@ class Flux(BaseModel): return torch.cat((image, mask), dim=1) def encode_adm(self, **kwargs): - return kwargs["pooled_output"] + return kwargs.get("pooled_output", None) def extra_conds(self, **kwargs): out = super().extra_conds(**kwargs) From 3696c5bad6eaab4986d584a77662dfbda46ccc01 Mon Sep 17 00:00:00 2001 From: Terry Jia Date: Fri, 27 Mar 2026 21:06:38 -0400 Subject: [PATCH 10/29] Add `has_intermediate_output` flag for nodes with interactive UI (#13048) --- comfy_api/latest/_io.py | 22 ++++++++++++++++++++++ comfy_extras/nodes_glsl.py | 1 + comfy_extras/nodes_images.py | 1 + comfy_extras/nodes_painter.py | 1 + execution.py | 29 ++++++++++++++++++++++++----- server.py | 5 +++++ 6 files changed, 54 insertions(+), 5 deletions(-) diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index 1cbc8ed26..fdeffea2d 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -1373,6 +1373,7 @@ class NodeInfoV1: price_badge: dict | None = None search_aliases: list[str]=None essentials_category: str=None + has_intermediate_output: bool=None @dataclass @@ -1496,6 +1497,16 @@ class Schema: """When True, all inputs from the prompt will be passed to the node as kwargs, even if not defined in the schema.""" essentials_category: str | None = None """Optional category for the Essentials tab. Path-based like category field (e.g., 'Basic', 'Image Tools/Editing').""" + has_intermediate_output: bool=False + """Flags this node as having intermediate output that should persist across page refreshes. + + Nodes with this flag behave like output nodes (their UI results are cached and resent + to the frontend) but do NOT automatically get added to the execution list. This means + they will only execute if they are on the dependency path of a real output node. + + Use this for nodes with interactive/operable UI regions that produce intermediate outputs + (e.g., Image Crop, Painter) rather than final outputs (e.g., Save Image). + """ def validate(self): '''Validate the schema: @@ -1595,6 +1606,7 @@ class Schema: category=self.category, description=self.description, output_node=self.is_output_node, + has_intermediate_output=self.has_intermediate_output, deprecated=self.is_deprecated, experimental=self.is_experimental, dev_only=self.is_dev_only, @@ -1886,6 +1898,14 @@ class _ComfyNodeBaseInternal(_ComfyNodeInternal): cls.GET_SCHEMA() return cls._OUTPUT_NODE + _HAS_INTERMEDIATE_OUTPUT = None + @final + @classproperty + def HAS_INTERMEDIATE_OUTPUT(cls): # noqa + if cls._HAS_INTERMEDIATE_OUTPUT is None: + cls.GET_SCHEMA() + return cls._HAS_INTERMEDIATE_OUTPUT + _INPUT_IS_LIST = None @final @classproperty @@ -1978,6 +1998,8 @@ class _ComfyNodeBaseInternal(_ComfyNodeInternal): cls._API_NODE = schema.is_api_node if cls._OUTPUT_NODE is None: cls._OUTPUT_NODE = schema.is_output_node + if cls._HAS_INTERMEDIATE_OUTPUT is None: + cls._HAS_INTERMEDIATE_OUTPUT = schema.has_intermediate_output if cls._INPUT_IS_LIST is None: cls._INPUT_IS_LIST = schema.is_input_list if cls._NOT_IDEMPOTENT is None: diff --git a/comfy_extras/nodes_glsl.py b/comfy_extras/nodes_glsl.py index 0e4d957ff..ea7420a73 100644 --- a/comfy_extras/nodes_glsl.py +++ b/comfy_extras/nodes_glsl.py @@ -813,6 +813,7 @@ class GLSLShader(io.ComfyNode): "u_resolution (vec2) is always available." 
), is_experimental=True, + has_intermediate_output=True, inputs=[ io.String.Input( "fragment_shader", diff --git a/comfy_extras/nodes_images.py b/comfy_extras/nodes_images.py index a8223cf8b..a77f0641f 100644 --- a/comfy_extras/nodes_images.py +++ b/comfy_extras/nodes_images.py @@ -59,6 +59,7 @@ class ImageCropV2(IO.ComfyNode): display_name="Image Crop", category="image/transform", essentials_category="Image Tools", + has_intermediate_output=True, inputs=[ IO.Image.Input("image"), IO.BoundingBox.Input("crop_region", component="ImageCrop"), diff --git a/comfy_extras/nodes_painter.py b/comfy_extras/nodes_painter.py index b9ecdf5ea..e104c8480 100644 --- a/comfy_extras/nodes_painter.py +++ b/comfy_extras/nodes_painter.py @@ -30,6 +30,7 @@ class PainterNode(io.ComfyNode): node_id="Painter", display_name="Painter", category="image", + has_intermediate_output=True, inputs=[ io.Image.Input( "image", diff --git a/execution.py b/execution.py index 1a6c3429c..43c3c648d 100644 --- a/execution.py +++ b/execution.py @@ -411,6 +411,19 @@ def format_value(x): else: return str(x) +def _is_intermediate_output(dynprompt, node_id): + class_type = dynprompt.get_node(node_id)["class_type"] + class_def = nodes.NODE_CLASS_MAPPINGS[class_type] + return getattr(class_def, 'HAS_INTERMEDIATE_OUTPUT', False) + +def _send_cached_ui(server, node_id, display_node_id, cached, prompt_id, ui_outputs): + if server.client_id is None: + return + cached_ui = cached.ui or {} + server.send_sync("executed", { "node": node_id, "display_node": display_node_id, "output": cached_ui.get("output", None), "prompt_id": prompt_id }, server.client_id) + if cached.ui is not None: + ui_outputs[node_id] = cached.ui + async def execute(server, dynprompt, caches, current_item, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes, ui_outputs): unique_id = current_item real_node_id = dynprompt.get_real_node_id(unique_id) @@ -421,11 +434,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, class_def = nodes.NODE_CLASS_MAPPINGS[class_type] cached = await caches.outputs.get(unique_id) if cached is not None: - if server.client_id is not None: - cached_ui = cached.ui or {} - server.send_sync("executed", { "node": unique_id, "display_node": display_node_id, "output": cached_ui.get("output",None), "prompt_id": prompt_id }, server.client_id) - if cached.ui is not None: - ui_outputs[unique_id] = cached.ui + _send_cached_ui(server, unique_id, display_node_id, cached, prompt_id, ui_outputs) get_progress_state().finish_progress(unique_id) execution_list.cache_update(unique_id, cached) return (ExecutionResult.SUCCESS, None, None) @@ -767,6 +776,16 @@ class PromptExecutor: self.caches.outputs.poll(ram_headroom=self.cache_args["ram"]) else: # Only execute when the while-loop ends without break + # Send cached UI for intermediate output nodes that weren't executed + for node_id in dynamic_prompt.all_node_ids(): + if node_id in executed: + continue + if not _is_intermediate_output(dynamic_prompt, node_id): + continue + cached = await self.caches.outputs.get(node_id) + if cached is not None: + display_node_id = dynamic_prompt.get_display_node_id(node_id) + _send_cached_ui(self.server, node_id, display_node_id, cached, prompt_id, ui_node_outputs) self.add_message("execution_success", { "prompt_id": prompt_id }, broadcast=False) ui_outputs = {} diff --git a/server.py b/server.py index 173a28376..27b14825e 100644 --- a/server.py +++ b/server.py @@ -709,6 +709,11 @@ class PromptServer(): else: 
info['output_node'] = False + if hasattr(obj_class, 'HAS_INTERMEDIATE_OUTPUT') and obj_class.HAS_INTERMEDIATE_OUTPUT == True: + info['has_intermediate_output'] = True + else: + info['has_intermediate_output'] = False + if hasattr(obj_class, 'CATEGORY'): info['category'] = obj_class.CATEGORY From b353a7c863e0ada594c0779e227915fea2e8296f Mon Sep 17 00:00:00 2001 From: rattus <46076784+rattus128@users.noreply.github.com> Date: Fri, 27 Mar 2026 18:34:16 -0700 Subject: [PATCH 11/29] Integrate RAM cache with model RAM management (#13173) --- comfy/cli_args.py | 4 +++- comfy/memory_management.py | 14 ++++++++++++ comfy/model_management.py | 8 +++---- comfy/model_patcher.py | 3 --- comfy/pinned_memory.py | 6 +++++ comfy/sd.py | 6 ----- comfy_execution/caching.py | 46 +++++++++++++++++--------------------- execution.py | 9 +++++++- main.py | 8 +++++-- 9 files changed, 61 insertions(+), 43 deletions(-) diff --git a/comfy/cli_args.py b/comfy/cli_args.py index 13612175e..dbaadf723 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -110,11 +110,13 @@ parser.add_argument("--preview-method", type=LatentPreviewMethod, default=Latent parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.") +CACHE_RAM_AUTO_GB = -1.0 + cache_group = parser.add_mutually_exclusive_group() cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.") cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.") cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.") -cache_group.add_argument("--cache-ram", nargs='?', const=4.0, type=float, default=0, help="Use RAM pressure caching with the specified headroom threshold. If available RAM drops below the threhold the cache remove large items to free RAM. Default 4GB") +cache_group.add_argument("--cache-ram", nargs='?', const=CACHE_RAM_AUTO_GB, type=float, default=0, help="Use RAM pressure caching with the specified headroom threshold. If available RAM drops below the threshold the cache removes large items to free RAM. Default (when no value is provided): 25%% of system RAM (min 4GB, max 32GB).") attn_group = parser.add_mutually_exclusive_group() attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. 
Ignored when xformers is used.") diff --git a/comfy/memory_management.py b/comfy/memory_management.py index f9078fe7c..48e3c11da 100644 --- a/comfy/memory_management.py +++ b/comfy/memory_management.py @@ -141,3 +141,17 @@ def interpret_gathered_like(tensors, gathered): return dest_views aimdo_enabled = False + +extra_ram_release_callback = None +RAM_CACHE_HEADROOM = 0 + +def set_ram_cache_release_state(callback, headroom): + global extra_ram_release_callback + global RAM_CACHE_HEADROOM + extra_ram_release_callback = callback + RAM_CACHE_HEADROOM = max(0, int(headroom)) + +def extra_ram_release(target): + if extra_ram_release_callback is None: + return 0 + return extra_ram_release_callback(target) diff --git a/comfy/model_management.py b/comfy/model_management.py index 9617d8388..ce079cf2f 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -669,7 +669,7 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins for i in range(len(current_loaded_models) -1, -1, -1): shift_model = current_loaded_models[i] - if shift_model.device == device: + if device is None or shift_model.device == device: if shift_model not in keep_loaded and not shift_model.is_dead(): can_unload.append((-shift_model.model_offloaded_memory(), sys.getrefcount(shift_model.model), shift_model.model_memory(), i)) shift_model.currently_used = False @@ -679,8 +679,8 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins i = x[-1] memory_to_free = 1e32 pins_to_free = 1e32 - if not DISABLE_SMART_MEMORY: - memory_to_free = memory_required - get_free_memory(device) + if not DISABLE_SMART_MEMORY or device is None: + memory_to_free = 0 if device is None else memory_required - get_free_memory(device) pins_to_free = pins_required - get_free_ram() if current_loaded_models[i].model.is_dynamic() and for_dynamic: #don't actually unload dynamic models for the sake of other dynamic models @@ -708,7 +708,7 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins if len(unloaded_model) > 0: soft_empty_cache() - else: + elif device is not None: if vram_state != VRAMState.HIGH_VRAM: mem_free_total, mem_free_torch = get_free_memory(device, torch_free_too=True) if mem_free_torch > mem_free_total * 0.25: diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index c26d37db2..6deb71e12 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -300,9 +300,6 @@ class ModelPatcher: def model_mmap_residency(self, free=False): return comfy.model_management.module_mmap_residency(self.model, free=free) - def get_ram_usage(self): - return self.model_size() - def loaded_size(self): return self.model.model_loaded_weight_memory diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py index f6fb806c4..6f142282d 100644 --- a/comfy/pinned_memory.py +++ b/comfy/pinned_memory.py @@ -2,6 +2,7 @@ import comfy.model_management import comfy.memory_management import comfy_aimdo.host_buffer import comfy_aimdo.torch +import psutil from comfy.cli_args import args @@ -12,6 +13,11 @@ def pin_memory(module): if module.pin_failed or args.disable_pinned_memory or get_pin(module) is not None: return #FIXME: This is a RAM cache trigger event + ram_headroom = comfy.memory_management.RAM_CACHE_HEADROOM + #we split the difference and assume half the RAM cache headroom is for us + if ram_headroom > 0 and psutil.virtual_memory().available < (ram_headroom * 0.5): + comfy.memory_management.extra_ram_release(ram_headroom) + size = 
comfy.memory_management.vram_aligned_size([ module.weight, module.bias ]) if comfy.model_management.MAX_PINNED_MEMORY <= 0 or (comfy.model_management.TOTAL_PINNED_MEMORY + size) > comfy.model_management.MAX_PINNED_MEMORY: diff --git a/comfy/sd.py b/comfy/sd.py index e2645438c..e1a2840d2 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -280,9 +280,6 @@ class CLIP: n.apply_hooks_to_conds = self.apply_hooks_to_conds return n - def get_ram_usage(self): - return self.patcher.get_ram_usage() - def add_patches(self, patches, strength_patch=1.0, strength_model=1.0): return self.patcher.add_patches(patches, strength_patch, strength_model) @@ -840,9 +837,6 @@ class VAE: self.size = comfy.model_management.module_size(self.first_stage_model) return self.size - def get_ram_usage(self): - return self.model_size() - def throw_exception_if_invalid(self): if self.first_stage_model is None: raise RuntimeError("ERROR: VAE is invalid: None\n\nIf the VAE is from a checkpoint loader node your checkpoint does not contain a valid VAE.") diff --git a/comfy_execution/caching.py b/comfy_execution/caching.py index 78212bde3..f9c913bdb 100644 --- a/comfy_execution/caching.py +++ b/comfy_execution/caching.py @@ -1,6 +1,5 @@ import asyncio import bisect -import gc import itertools import psutil import time @@ -475,6 +474,10 @@ class LRUCache(BasicCache): self._mark_used(node_id) return await self._set_immediate(node_id, value) + def set_local(self, node_id, value): + self._mark_used(node_id) + BasicCache.set_local(self, node_id, value) + async def ensure_subcache_for(self, node_id, children_ids): # Just uses subcaches for tracking 'live' nodes await super()._ensure_subcache(node_id, children_ids) @@ -489,15 +492,10 @@ class LRUCache(BasicCache): return self -#Iterating the cache for usage analysis might be expensive, so if we trigger make sure -#to take a chunk out to give breathing space on high-node / low-ram-per-node flows. +#Small baseline weight used when a cache entry has no measurable CPU tensors. +#Keeps unknown-sized entries in eviction scoring without dominating tensor-backed entries. -RAM_CACHE_HYSTERESIS = 1.1 - -#This is kinda in GB but not really. It needs to be non-zero for the below heuristic -#and as long as Multi GB models dwarf this it will approximate OOM scoring OK - -RAM_CACHE_DEFAULT_RAM_USAGE = 0.1 +RAM_CACHE_DEFAULT_RAM_USAGE = 0.05 #Exponential bias towards evicting older workflows so garbage will be taken out #in constantly changing setups. 
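(Before the hunk below, a worked sketch of the per-entry eviction score that the reworked `ram_release` computes; the multiplier's actual value is defined outside the lines shown in this diff, so the constant here is illustrative only:)

```python
RAM_CACHE_DEFAULT_RAM_USAGE = 0.05
RAM_CACHE_OLD_WORKFLOW_OOM_MULTIPLIER = 1.5  # illustrative; real value not shown in this diff

def eviction_score(generation, used_generation, cpu_tensor_bytes):
    # Entries from workflows untouched for several generations decay
    # exponentially toward eviction; CPU tensor bytes scale the score
    # linearly; ties fall back to the last-touch timestamp (pure LRU).
    age = generation - used_generation
    ram_usage = RAM_CACHE_DEFAULT_RAM_USAGE + cpu_tensor_bytes
    return (RAM_CACHE_OLD_WORKFLOW_OOM_MULTIPLIER ** age) * ram_usage

# ram_release() sorts entries by this score and drops the largest until
# psutil.virtual_memory().available reaches the requested target.
```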
@@ -521,19 +519,17 @@ class RAMPressureCache(LRUCache): self.timestamps[self.cache_key_set.get_data_key(node_id)] = time.time() return await super().get(node_id) - def poll(self, ram_headroom): - def _ram_gb(): - return psutil.virtual_memory().available / (1024**3) + def set_local(self, node_id, value): + self.timestamps[self.cache_key_set.get_data_key(node_id)] = time.time() + super().set_local(node_id, value) - if _ram_gb() > ram_headroom: - return - gc.collect() - if _ram_gb() > ram_headroom: + def ram_release(self, target): + if psutil.virtual_memory().available >= target: return clean_list = [] - for key, (outputs, _), in self.cache.items(): + for key, cache_entry in self.cache.items(): oom_score = RAM_CACHE_OLD_WORKFLOW_OOM_MULTIPLIER ** (self.generation - self.used_generation[key]) ram_usage = RAM_CACHE_DEFAULT_RAM_USAGE @@ -542,22 +538,20 @@ class RAMPressureCache(LRUCache): if outputs is None: return for output in outputs: - if isinstance(output, list): + if isinstance(output, (list, tuple)): scan_list_for_ram_usage(output) elif isinstance(output, torch.Tensor) and output.device.type == 'cpu': - #score Tensors at a 50% discount for RAM usage as they are likely to - #be high value intermediates - ram_usage += (output.numel() * output.element_size()) * 0.5 - elif hasattr(output, "get_ram_usage"): - ram_usage += output.get_ram_usage() - scan_list_for_ram_usage(outputs) + ram_usage += output.numel() * output.element_size() + scan_list_for_ram_usage(cache_entry.outputs) oom_score *= ram_usage #In the case where we have no information on the node ram usage at all, #break OOM score ties on the last touch timestamp (pure LRU) bisect.insort(clean_list, (oom_score, self.timestamps[key], key)) - while _ram_gb() < ram_headroom * RAM_CACHE_HYSTERESIS and clean_list: + while psutil.virtual_memory().available < target and clean_list: _, _, key = clean_list.pop() del self.cache[key] - gc.collect() + self.used_generation.pop(key, None) + self.timestamps.pop(key, None) + self.children.pop(key, None) diff --git a/execution.py b/execution.py index 43c3c648d..5e02dffb2 100644 --- a/execution.py +++ b/execution.py @@ -724,6 +724,9 @@ class PromptExecutor: self.add_message("execution_start", { "prompt_id": prompt_id}, broadcast=False) self._notify_prompt_lifecycle("start", prompt_id) + ram_headroom = int(self.cache_args["ram"] * (1024 ** 3)) + ram_release_callback = self.caches.outputs.ram_release if self.cache_type == CacheType.RAM_PRESSURE else None + comfy.memory_management.set_ram_cache_release_state(ram_release_callback, ram_headroom) try: with torch.inference_mode(): @@ -773,7 +776,10 @@ class PromptExecutor: execution_list.unstage_node_execution() else: # result == ExecutionResult.SUCCESS: execution_list.complete_node_execution() - self.caches.outputs.poll(ram_headroom=self.cache_args["ram"]) + + if self.cache_type == CacheType.RAM_PRESSURE: + comfy.model_management.free_memory(0, None, pins_required=ram_headroom, ram_required=ram_headroom) + comfy.memory_management.extra_ram_release(ram_headroom) else: # Only execute when the while-loop ends without break # Send cached UI for intermediate output nodes that weren't executed @@ -801,6 +807,7 @@ class PromptExecutor: if comfy.model_management.DISABLE_SMART_MEMORY: comfy.model_management.unload_all_models() finally: + comfy.memory_management.set_ram_cache_release_state(None, 0) self._notify_prompt_lifecycle("end", prompt_id) diff --git a/main.py b/main.py index 058e8e2de..12b04719d 100644 --- a/main.py +++ b/main.py @@ -275,15 +275,19 @@ def 
_collect_output_absolute_paths(history_result: dict) -> list[str]: def prompt_worker(q, server_instance): current_time: float = 0.0 + cache_ram = args.cache_ram + if cache_ram < 0: + cache_ram = min(32.0, max(4.0, comfy.model_management.total_ram * 0.25 / 1024.0)) + cache_type = execution.CacheType.CLASSIC if args.cache_lru > 0: cache_type = execution.CacheType.LRU - elif args.cache_ram > 0: + elif cache_ram > 0: cache_type = execution.CacheType.RAM_PRESSURE elif args.cache_none: cache_type = execution.CacheType.NONE - e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_args={ "lru" : args.cache_lru, "ram" : args.cache_ram } ) + e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_args={ "lru" : args.cache_lru, "ram" : cache_ram } ) last_gc_collect = 0 need_gc = False gc_collect_interval = 10.0 From fc1fdf3389aa3b6cfcc2946bb9284cae9cd15cc7 Mon Sep 17 00:00:00 2001 From: Terry Jia Date: Sat, 28 Mar 2026 13:13:05 -0400 Subject: [PATCH 12/29] fix: avoid nested sampler function calls in Color Curves shader (#13209) --- blueprints/.glsl/Color_Curves_8.frag | 9 ++++++--- blueprints/Color Curves.json | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/blueprints/.glsl/Color_Curves_8.frag b/blueprints/.glsl/Color_Curves_8.frag index c39916726..878ea6875 100644 --- a/blueprints/.glsl/Color_Curves_8.frag +++ b/blueprints/.glsl/Color_Curves_8.frag @@ -38,9 +38,12 @@ void main() { // GIMP order: per-channel curves first, then RGB master curve. // See gimp_curve_map_pixels() default case in gimpcurve-map.c: // dest = colors_curve( channel_curve( src ) ) - color.r = applyCurve(u_curve0, applyCurve(u_curve1, color.r)); - color.g = applyCurve(u_curve0, applyCurve(u_curve2, color.g)); - color.b = applyCurve(u_curve0, applyCurve(u_curve3, color.b)); + float tmp_r = applyCurve(u_curve1, color.r); + float tmp_g = applyCurve(u_curve2, color.g); + float tmp_b = applyCurve(u_curve3, color.b); + color.r = applyCurve(u_curve0, tmp_r); + color.g = applyCurve(u_curve0, tmp_g); + color.b = applyCurve(u_curve0, tmp_b); fragColor0 = vec4(color.rgb, color.a); } diff --git a/blueprints/Color Curves.json b/blueprints/Color Curves.json index aca5bf0b0..933c53978 100644 --- a/blueprints/Color Curves.json +++ b/blueprints/Color Curves.json @@ -1 +1 @@ -{"revision": 0,"last_node_id": 10,"last_link_id": 0,"nodes": [{"id": 10,"type": "d5c462c8-1372-4af8-84f2-547c83470d04","pos": [3610,-2630],"size": [270,420],"flags": {},"order": 0,"mode": 0,"inputs": [{"label": "image","localized_name": "images.image0","name": "images.image0","type": "IMAGE","link": null}],"outputs": [{"label": "IMAGE","localized_name": "IMAGE0","name": "IMAGE0","type": "IMAGE","links": []}],"properties": {"proxyWidgets": [["4","curve"],["5","curve"],["6","curve"],["7","curve"]]},"widgets_values": [],"title": "Color Curves"}],"links": [],"version": 0.4,"definitions": {"subgraphs": [{"id": "d5c462c8-1372-4af8-84f2-547c83470d04","version": 1,"state": {"lastGroupId": 0,"lastNodeId": 8,"lastLinkId": 33,"lastRerouteId": 0},"revision": 0,"config": {},"name": "Color Curves","inputNode": {"id": -10,"bounding": [2660,-4500,120,60]},"outputNode": {"id": -20,"bounding": [4270,-4500,120,60]},"inputs": [{"id": "abc345b7-f55e-4f32-a11d-3aa4c2b0936b","name": "images.image0","type": "IMAGE","linkIds": [29],"localized_name": "images.image0","label": "image","pos": [2760,-4480]}],"outputs": [{"id": "eb0ec079-46da-4408-8263-9ef85569d33d","name": "IMAGE0","type": "IMAGE","linkIds": [28],"localized_name": 
"IMAGE0","label": "IMAGE","pos": [4290,-4480]}],"widgets": [],"nodes": [{"id": 4,"type": "CurveEditor","pos": [3060,-4500],"size": [270,200],"flags": {},"order": 0,"mode": 0,"inputs": [{"label": "curve","localized_name": "curve","name": "curve","type": "CURVE","widget": {"name": "curve"},"link": null},{"label": "histogram","localized_name": "histogram","name": "histogram","type": "HISTOGRAM","shape": 7,"link": null}],"outputs": [{"localized_name": "CURVE","name": "CURVE","type": "CURVE","links": [30]}],"title": "RGB Master","properties": {"Node name for S&R": "CurveEditor"},"widgets_values": []},{"id": 5,"type": "CurveEditor","pos": [3060,-4250],"size": [270,200],"flags": {},"order": 1,"mode": 0,"inputs": [{"label": "curve","localized_name": "curve","name": "curve","type": "CURVE","widget": {"name": "curve"},"link": null},{"label": "histogram","localized_name": "histogram","name": "histogram","type": "HISTOGRAM","shape": 7,"link": null}],"outputs": [{"localized_name": "CURVE","name": "CURVE","type": "CURVE","links": [31]}],"title": "Red","properties": {"Node name for S&R": "CurveEditor"},"widgets_values": []},{"id": 6,"type": "CurveEditor","pos": [3060,-4000],"size": [270,200],"flags": {},"order": 2,"mode": 0,"inputs": [{"label": "curve","localized_name": "curve","name": "curve","type": "CURVE","widget": {"name": "curve"},"link": null},{"label": "histogram","localized_name": "histogram","name": "histogram","type": "HISTOGRAM","shape": 7,"link": null}],"outputs": [{"localized_name": "CURVE","name": "CURVE","type": "CURVE","links": [32]}],"title": "Green","properties": {"Node name for S&R": "CurveEditor"},"widgets_values": []},{"id": 7,"type": "CurveEditor","pos": [3060,-3750],"size": [270,200],"flags": {},"order": 3,"mode": 0,"inputs": [{"label": "curve","localized_name": "curve","name": "curve","type": "CURVE","widget": {"name": "curve"},"link": null},{"label": "histogram","localized_name": "histogram","name": "histogram","type": "HISTOGRAM","shape": 7,"link": null}],"outputs": [{"localized_name": "CURVE","name": "CURVE","type": "CURVE","links": [33]}],"title": "Blue","properties": {"Node name for S&R": "CurveEditor"},"widgets_values": []},{"id": 8,"type": "GLSLShader","pos": [3590,-4500],"size": [420,500],"flags": {},"order": 4,"mode": 0,"inputs": [{"label": "image0","localized_name": "images.image0","name": "images.image0","type": "IMAGE","link": 29},{"label": "image1","localized_name": "images.image1","name": "images.image1","shape": 7,"type": "IMAGE","link": null},{"label": "u_curve0","localized_name": "curves.u_curve0","name": "curves.u_curve0","shape": 7,"type": "CURVE","link": 30},{"label": "u_curve1","localized_name": "curves.u_curve1","name": "curves.u_curve1","shape": 7,"type": "CURVE","link": 31},{"label": "u_curve2","localized_name": "curves.u_curve2","name": "curves.u_curve2","shape": 7,"type": "CURVE","link": 32},{"label": "u_curve3","localized_name": "curves.u_curve3","name": "curves.u_curve3","shape": 7,"type": "CURVE","link": 33},{"localized_name": "fragment_shader","name": "fragment_shader","type": "STRING","widget": {"name": "fragment_shader"},"link": null},{"localized_name": "size_mode","name": "size_mode","type": "COMFY_DYNAMICCOMBO_V3","widget": {"name": "size_mode"},"link": null}],"outputs": [{"localized_name": "IMAGE0","name": "IMAGE0","type": "IMAGE","links": [28]},{"localized_name": "IMAGE1","name": "IMAGE1","type": "IMAGE","links": null},{"localized_name": "IMAGE2","name": "IMAGE2","type": "IMAGE","links": null},{"localized_name": "IMAGE3","name": 
"IMAGE3","type": "IMAGE","links": null}],"properties": {"Node name for S&R": "GLSLShader"},"widgets_values": ["#version 300 es\nprecision highp float;\n\nuniform sampler2D u_image0;\nuniform sampler2D u_curve0; // RGB master curve (256x1 LUT)\nuniform sampler2D u_curve1; // Red channel curve\nuniform sampler2D u_curve2; // Green channel curve\nuniform sampler2D u_curve3; // Blue channel curve\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\n\n// GIMP-compatible curve lookup with manual linear interpolation.\n// Matches gimp_curve_map_value_inline() from gimpcurve-map.c:\n// index = value * (n_samples - 1)\n// f = fract(index)\n// result = (1-f) * samples[floor] + f * samples[ceil]\n//\n// Uses texelFetch (NEAREST) to avoid GPU half-texel offset issues\n// that occur with texture() + GL_LINEAR on small 256x1 LUTs.\nfloat applyCurve(sampler2D curve, float value) {\n value = clamp(value, 0.0, 1.0);\n\n float pos = value * 255.0;\n int lo = int(floor(pos));\n int hi = min(lo + 1, 255);\n float f = pos - float(lo);\n\n float a = texelFetch(curve, ivec2(lo, 0), 0).r;\n float b = texelFetch(curve, ivec2(hi, 0), 0).r;\n\n return a + f * (b - a);\n}\n\nvoid main() {\n vec4 color = texture(u_image0, v_texCoord);\n\n // GIMP order: per-channel curves first, then RGB master curve.\n // See gimp_curve_map_pixels() default case in gimpcurve-map.c:\n // dest = colors_curve( channel_curve( src ) )\n color.r = applyCurve(u_curve0, applyCurve(u_curve1, color.r));\n color.g = applyCurve(u_curve0, applyCurve(u_curve2, color.g));\n color.b = applyCurve(u_curve0, applyCurve(u_curve3, color.b));\n\n fragColor0 = vec4(color.rgb, color.a);\n}\n","from_input"]}],"groups": [],"links": [{"id": 29,"origin_id": -10,"origin_slot": 0,"target_id": 8,"target_slot": 0,"type": "IMAGE"},{"id": 28,"origin_id": 8,"origin_slot": 0,"target_id": -20,"target_slot": 0,"type": "IMAGE"},{"id": 30,"origin_id": 4,"origin_slot": 0,"target_id": 8,"target_slot": 2,"type": "CURVE"},{"id": 31,"origin_id": 5,"origin_slot": 0,"target_id": 8,"target_slot": 3,"type": "CURVE"},{"id": 32,"origin_id": 6,"origin_slot": 0,"target_id": 8,"target_slot": 4,"type": "CURVE"},{"id": 33,"origin_id": 7,"origin_slot": 0,"target_id": 8,"target_slot": 5,"type": "CURVE"}],"extra": {"workflowRendererVersion": "LG"},"category": "Image Tools/Color adjust"}]}} \ No newline at end of file +{"revision": 0,"last_node_id": 10,"last_link_id": 0,"nodes": [{"id": 10,"type": "d5c462c8-1372-4af8-84f2-547c83470d04","pos": [3610,-2630],"size": [270,420],"flags": {},"order": 0,"mode": 0,"inputs": [{"label": "image","localized_name": "images.image0","name": "images.image0","type": "IMAGE","link": null}],"outputs": [{"label": "IMAGE","localized_name": "IMAGE0","name": "IMAGE0","type": "IMAGE","links": []}],"properties": {"proxyWidgets": [["4","curve"],["5","curve"],["6","curve"],["7","curve"]]},"widgets_values": [],"title": "Color Curves"}],"links": [],"version": 0.4,"definitions": {"subgraphs": [{"id": "d5c462c8-1372-4af8-84f2-547c83470d04","version": 1,"state": {"lastGroupId": 0,"lastNodeId": 8,"lastLinkId": 33,"lastRerouteId": 0},"revision": 0,"config": {},"name": "Color Curves","inputNode": {"id": -10,"bounding": [2660,-4500,120,60]},"outputNode": {"id": -20,"bounding": [4270,-4500,120,60]},"inputs": [{"id": "abc345b7-f55e-4f32-a11d-3aa4c2b0936b","name": "images.image0","type": "IMAGE","linkIds": [29],"localized_name": "images.image0","label": "image","pos": [2760,-4480]}],"outputs": [{"id": "eb0ec079-46da-4408-8263-9ef85569d33d","name": "IMAGE0","type": 
"IMAGE","linkIds": [28],"localized_name": "IMAGE0","label": "IMAGE","pos": [4290,-4480]}],"widgets": [],"nodes": [{"id": 4,"type": "CurveEditor","pos": [3060,-4500],"size": [270,200],"flags": {},"order": 0,"mode": 0,"inputs": [{"label": "curve","localized_name": "curve","name": "curve","type": "CURVE","widget": {"name": "curve"},"link": null},{"label": "histogram","localized_name": "histogram","name": "histogram","type": "HISTOGRAM","shape": 7,"link": null}],"outputs": [{"localized_name": "CURVE","name": "CURVE","type": "CURVE","links": [30]}],"title": "RGB Master","properties": {"Node name for S&R": "CurveEditor"},"widgets_values": []},{"id": 5,"type": "CurveEditor","pos": [3060,-4250],"size": [270,200],"flags": {},"order": 1,"mode": 0,"inputs": [{"label": "curve","localized_name": "curve","name": "curve","type": "CURVE","widget": {"name": "curve"},"link": null},{"label": "histogram","localized_name": "histogram","name": "histogram","type": "HISTOGRAM","shape": 7,"link": null}],"outputs": [{"localized_name": "CURVE","name": "CURVE","type": "CURVE","links": [31]}],"title": "Red","properties": {"Node name for S&R": "CurveEditor"},"widgets_values": []},{"id": 6,"type": "CurveEditor","pos": [3060,-4000],"size": [270,200],"flags": {},"order": 2,"mode": 0,"inputs": [{"label": "curve","localized_name": "curve","name": "curve","type": "CURVE","widget": {"name": "curve"},"link": null},{"label": "histogram","localized_name": "histogram","name": "histogram","type": "HISTOGRAM","shape": 7,"link": null}],"outputs": [{"localized_name": "CURVE","name": "CURVE","type": "CURVE","links": [32]}],"title": "Green","properties": {"Node name for S&R": "CurveEditor"},"widgets_values": []},{"id": 7,"type": "CurveEditor","pos": [3060,-3750],"size": [270,200],"flags": {},"order": 3,"mode": 0,"inputs": [{"label": "curve","localized_name": "curve","name": "curve","type": "CURVE","widget": {"name": "curve"},"link": null},{"label": "histogram","localized_name": "histogram","name": "histogram","type": "HISTOGRAM","shape": 7,"link": null}],"outputs": [{"localized_name": "CURVE","name": "CURVE","type": "CURVE","links": [33]}],"title": "Blue","properties": {"Node name for S&R": "CurveEditor"},"widgets_values": []},{"id": 8,"type": "GLSLShader","pos": [3590,-4500],"size": [420,500],"flags": {},"order": 4,"mode": 0,"inputs": [{"label": "image0","localized_name": "images.image0","name": "images.image0","type": "IMAGE","link": 29},{"label": "image1","localized_name": "images.image1","name": "images.image1","shape": 7,"type": "IMAGE","link": null},{"label": "u_curve0","localized_name": "curves.u_curve0","name": "curves.u_curve0","shape": 7,"type": "CURVE","link": 30},{"label": "u_curve1","localized_name": "curves.u_curve1","name": "curves.u_curve1","shape": 7,"type": "CURVE","link": 31},{"label": "u_curve2","localized_name": "curves.u_curve2","name": "curves.u_curve2","shape": 7,"type": "CURVE","link": 32},{"label": "u_curve3","localized_name": "curves.u_curve3","name": "curves.u_curve3","shape": 7,"type": "CURVE","link": 33},{"localized_name": "fragment_shader","name": "fragment_shader","type": "STRING","widget": {"name": "fragment_shader"},"link": null},{"localized_name": "size_mode","name": "size_mode","type": "COMFY_DYNAMICCOMBO_V3","widget": {"name": "size_mode"},"link": null}],"outputs": [{"localized_name": "IMAGE0","name": "IMAGE0","type": "IMAGE","links": [28]},{"localized_name": "IMAGE1","name": "IMAGE1","type": "IMAGE","links": null},{"localized_name": "IMAGE2","name": "IMAGE2","type": "IMAGE","links": 
null},{"localized_name": "IMAGE3","name": "IMAGE3","type": "IMAGE","links": null}],"properties": {"Node name for S&R": "GLSLShader"},"widgets_values": ["#version 300 es\nprecision highp float;\n\nuniform sampler2D u_image0;\nuniform sampler2D u_curve0; // RGB master curve (256x1 LUT)\nuniform sampler2D u_curve1; // Red channel curve\nuniform sampler2D u_curve2; // Green channel curve\nuniform sampler2D u_curve3; // Blue channel curve\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\n\n// GIMP-compatible curve lookup with manual linear interpolation.\n// Matches gimp_curve_map_value_inline() from gimpcurve-map.c:\n// index = value * (n_samples - 1)\n// f = fract(index)\n// result = (1-f) * samples[floor] + f * samples[ceil]\n//\n// Uses texelFetch (NEAREST) to avoid GPU half-texel offset issues\n// that occur with texture() + GL_LINEAR on small 256x1 LUTs.\nfloat applyCurve(sampler2D curve, float value) {\n value = clamp(value, 0.0, 1.0);\n\n float pos = value * 255.0;\n int lo = int(floor(pos));\n int hi = min(lo + 1, 255);\n float f = pos - float(lo);\n\n float a = texelFetch(curve, ivec2(lo, 0), 0).r;\n float b = texelFetch(curve, ivec2(hi, 0), 0).r;\n\n return a + f * (b - a);\n}\n\nvoid main() {\n vec4 color = texture(u_image0, v_texCoord);\n\n // GIMP order: per-channel curves first, then RGB master curve.\n // See gimp_curve_map_pixels() default case in gimpcurve-map.c:\n // dest = colors_curve( channel_curve( src ) )\n float tmp_r = applyCurve(u_curve1, color.r);\n float tmp_g = applyCurve(u_curve2, color.g);\n float tmp_b = applyCurve(u_curve3, color.b);\n color.r = applyCurve(u_curve0, tmp_r);\n color.g = applyCurve(u_curve0, tmp_g);\n color.b = applyCurve(u_curve0, tmp_b);\n\n fragColor0 = vec4(color.rgb, color.a);\n}\n","from_input"]}],"groups": [],"links": [{"id": 29,"origin_id": -10,"origin_slot": 0,"target_id": 8,"target_slot": 0,"type": "IMAGE"},{"id": 28,"origin_id": 8,"origin_slot": 0,"target_id": -20,"target_slot": 0,"type": "IMAGE"},{"id": 30,"origin_id": 4,"origin_slot": 0,"target_id": 8,"target_slot": 2,"type": "CURVE"},{"id": 31,"origin_id": 5,"origin_slot": 0,"target_id": 8,"target_slot": 3,"type": "CURVE"},{"id": 32,"origin_id": 6,"origin_slot": 0,"target_id": 8,"target_slot": 4,"type": "CURVE"},{"id": 33,"origin_id": 7,"origin_slot": 0,"target_id": 8,"target_slot": 5,"type": "CURVE"}],"extra": {"workflowRendererVersion": "LG"},"category": "Image Tools/Color adjust"}]}} \ No newline at end of file From 3f77450ef14b51039a10aa847e9137090a32ff48 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 28 Mar 2026 19:35:59 -0700 Subject: [PATCH 13/29] Fix #13214 (#13216) --- comfy/sd.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/comfy/sd.py b/comfy/sd.py index e1a2840d2..7425765a4 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -1736,15 +1736,16 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None, disable """ dtype = model_options.get("dtype", None) + custom_operations = model_options.get("custom_operations", None) + if custom_operations is None: + sd, metadata = comfy.utils.convert_old_quants(sd, "", metadata=metadata) + #Allow loading unets from checkpoint files diffusion_model_prefix = model_detection.unet_prefix_from_state_dict(sd) temp_sd = comfy.utils.state_dict_prefix_replace(sd, {diffusion_model_prefix: ""}, filter_keys=True) if len(temp_sd) > 0: sd = temp_sd - custom_operations = model_options.get("custom_operations", None) - if 
custom_operations is None: - sd, metadata = comfy.utils.convert_old_quants(sd, "", metadata=metadata) parameters = comfy.utils.calculate_parameters(sd) weight_dtype = comfy.utils.weight_dtype(sd) From a500f1edacfa797ab457e2111b5aeb1fd980d1ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com> Date: Sun, 29 Mar 2026 06:34:10 +0300 Subject: [PATCH 14/29] CORE-13 feat: Support RT-DETRv4 detection model (#12748) --- comfy/ldm/rt_detr/rtdetr_v4.py | 725 +++++++++++++++++++++++++++++++++ comfy/model_base.py | 5 + comfy/model_detection.py | 6 + comfy/supported_models.py | 17 +- comfy_extras/nodes_rtdetr.py | 154 +++++++ comfy_extras/nodes_sdpose.py | 17 +- nodes.py | 1 + 7 files changed, 922 insertions(+), 3 deletions(-) create mode 100644 comfy/ldm/rt_detr/rtdetr_v4.py create mode 100644 comfy_extras/nodes_rtdetr.py diff --git a/comfy/ldm/rt_detr/rtdetr_v4.py b/comfy/ldm/rt_detr/rtdetr_v4.py new file mode 100644 index 000000000..9443761cb --- /dev/null +++ b/comfy/ldm/rt_detr/rtdetr_v4.py @@ -0,0 +1,725 @@ +from collections import OrderedDict +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision +import comfy.model_management +from comfy.ldm.modules.attention import optimized_attention_for_device + +COCO_CLASSES = [ + 'person','bicycle','car','motorcycle','airplane','bus','train','truck','boat', + 'traffic light','fire hydrant','stop sign','parking meter','bench','bird','cat', + 'dog','horse','sheep','cow','elephant','bear','zebra','giraffe','backpack', + 'umbrella','handbag','tie','suitcase','frisbee','skis','snowboard','sports ball', + 'kite','baseball bat','baseball glove','skateboard','surfboard','tennis racket', + 'bottle','wine glass','cup','fork','knife','spoon','bowl','banana','apple', + 'sandwich','orange','broccoli','carrot','hot dog','pizza','donut','cake','chair', + 'couch','potted plant','bed','dining table','toilet','tv','laptop','mouse', + 'remote','keyboard','cell phone','microwave','oven','toaster','sink', + 'refrigerator','book','clock','vase','scissors','teddy bear','hair drier','toothbrush', +] + +# --------------------------------------------------------------------------- +# HGNetv2 backbone +# --------------------------------------------------------------------------- + +class ConvBNAct(nn.Module): + """Conv→BN→ReLU. 
padding='same' adds asymmetric zero-pad (stem).""" + def __init__(self, ic, oc, k=3, s=1, groups=1, use_act=True, device=None, dtype=None, operations=None): + super().__init__() + + self.conv = operations.Conv2d(ic, oc, k, s, (k - 1) // 2, groups=groups, bias=False, device=device, dtype=dtype) + self.bn = nn.BatchNorm2d(oc, device=device, dtype=dtype) + self.act = nn.ReLU() if use_act else nn.Identity() + + def forward(self, x): + return self.act(self.bn(self.conv(x))) + +class LightConvBNAct(nn.Module): + def __init__(self, ic, oc, k, device=None, dtype=None, operations=None): + super().__init__() + self.conv1 = ConvBNAct(ic, oc, 1, use_act=False, device=device, dtype=dtype, operations=operations) + self.conv2 = ConvBNAct(oc, oc, k, groups=oc, use_act=True, device=device, dtype=dtype, operations=operations) + + def forward(self, x): + return self.conv2(self.conv1(x)) + +class _StemBlock(nn.Module): + def __init__(self, ic, mc, oc, device=None, dtype=None, operations=None): + super().__init__() + self.stem1 = ConvBNAct(ic, mc, 3, 2, device=device, dtype=dtype, operations=operations) + # stem2a/stem2b use kernel=2, stride=1, no internal padding; + # padding is applied manually in forward (matching PaddlePaddle original) + self.stem2a = ConvBNAct(mc, mc//2, 2, 1, device=device, dtype=dtype, operations=operations) + self.stem2b = ConvBNAct(mc//2, mc, 2, 1, device=device, dtype=dtype, operations=operations) + self.stem3 = ConvBNAct(mc*2, mc, 3, 2, device=device, dtype=dtype, operations=operations) + self.stem4 = ConvBNAct(mc, oc, 1, device=device, dtype=dtype, operations=operations) + self.pool = nn.MaxPool2d(2, 1, ceil_mode=True) + + def forward(self, x): + x = self.stem1(x) + x = F.pad(x, (0, 1, 0, 1)) # pad before pool and stem2a + x2 = self.stem2a(x) + x2 = F.pad(x2, (0, 1, 0, 1)) # pad before stem2b + x2 = self.stem2b(x2) + x1 = self.pool(x) + return self.stem4(self.stem3(torch.cat([x1, x2], 1))) + + +class _HG_Block(nn.Module): + def __init__(self, ic, mc, oc, layer_num, k=3, residual=False, light=False, device=None, dtype=None, operations=None): + super().__init__() + self.residual = residual + if light: + self.layers = nn.ModuleList( + [LightConvBNAct(ic if i == 0 else mc, mc, k, device=device, dtype=dtype, operations=operations) for i in range(layer_num)]) + else: + self.layers = nn.ModuleList( + [ConvBNAct(ic if i == 0 else mc, mc, k, device=device, dtype=dtype, operations=operations) for i in range(layer_num)]) + total = ic + layer_num * mc + + self.aggregation = nn.Sequential( + ConvBNAct(total, oc // 2, 1, device=device, dtype=dtype, operations=operations), + ConvBNAct(oc // 2, oc, 1, device=device, dtype=dtype, operations=operations)) + + def forward(self, x): + identity = x + outs = [x] + for layer in self.layers: + x = layer(x) + outs.append(x) + x = self.aggregation(torch.cat(outs, 1)) + return x + identity if self.residual else x + + +class _HG_Stage(nn.Module): + # config order: ic, mc, oc, num_blocks, downsample, light, k, layer_num + def __init__(self, ic, mc, oc, num_blocks, downsample=True, light=False, k=3, layer_num=6, device=None, dtype=None, operations=None): + super().__init__() + if downsample: + self.downsample = ConvBNAct(ic, ic, 3, 2, groups=ic, use_act=False, device=device, dtype=dtype, operations=operations) + else: + self.downsample = nn.Identity() + self.blocks = nn.Sequential(*[ + _HG_Block(ic if i == 0 else oc, mc, oc, layer_num, + k=k, residual=(i != 0), light=light, device=device, dtype=dtype, operations=operations) + for i in range(num_blocks) + ]) + + 
def forward(self, x): + return self.blocks(self.downsample(x)) + + +class HGNetv2(nn.Module): + # B5 config: stem=[3,32,64], stages=[ic, mc, oc, blocks, down, light, k, layers] + _STAGE_CFGS = [[64, 64, 128, 1, False, False, 3, 6], + [128, 128, 512, 2, True, False, 3, 6], + [512, 256, 1024, 5, True, True, 5, 6], + [1024,512, 2048, 2, True, True, 5, 6]] + + def __init__(self, return_idx=(1, 2, 3), device=None, dtype=None, operations=None): + super().__init__() + self.stem = _StemBlock(3, 32, 64, device=device, dtype=dtype, operations=operations) + self.stages = nn.ModuleList([_HG_Stage(*cfg, device=device, dtype=dtype, operations=operations) for cfg in self._STAGE_CFGS]) + self.return_idx = list(return_idx) + self.out_channels = [self._STAGE_CFGS[i][2] for i in return_idx] + + def forward(self, x: torch.Tensor) -> List[torch.Tensor]: + x = self.stem(x) + outs = [] + for i, stage in enumerate(self.stages): + x = stage(x) + if i in self.return_idx: + outs.append(x) + return outs + + +# --------------------------------------------------------------------------- +# Encoder — HybridEncoder (dfine version: RepNCSPELAN4 + SCDown PAN) +# --------------------------------------------------------------------------- + +class ConvNormLayer(nn.Module): + """Conv→act (expects pre-fused BN weights).""" + def __init__(self, ic, oc, k, s, g=1, padding=None, act=None, device=None, dtype=None, operations=None): + super().__init__() + p = (k - 1) // 2 if padding is None else padding + self.conv = operations.Conv2d(ic, oc, k, s, p, groups=g, bias=True, device=device, dtype=dtype) + self.act = nn.SiLU() if act == 'silu' else nn.Identity() + + def forward(self, x): + return self.act(self.conv(x)) + + +class VGGBlock(nn.Module): + """Rep-VGG block (expects pre-fused weights).""" + def __init__(self, ic, oc, device=None, dtype=None, operations=None): + super().__init__() + self.conv = operations.Conv2d(ic, oc, 3, 1, padding=1, bias=True, device=device, dtype=dtype) + self.act = nn.SiLU() + + def forward(self, x): + return self.act(self.conv(x)) + + +class CSPLayer(nn.Module): + def __init__(self, ic, oc, num_blocks=3, expansion=1.0, act='silu', device=None, dtype=None, operations=None): + super().__init__() + h = int(oc * expansion) + self.conv1 = ConvNormLayer(ic, h, 1, 1, act=act, device=device, dtype=dtype, operations=operations) + self.conv2 = ConvNormLayer(ic, h, 1, 1, act=act, device=device, dtype=dtype, operations=operations) + self.bottlenecks = nn.Sequential(*[VGGBlock(h, h, device=device, dtype=dtype, operations=operations) for _ in range(num_blocks)]) + self.conv3 = ConvNormLayer(h, oc, 1, 1, act=act, device=device, dtype=dtype, operations=operations) if h != oc else nn.Identity() + + def forward(self, x): + return self.conv3(self.bottlenecks(self.conv1(x)) + self.conv2(x)) + + +class RepNCSPELAN4(nn.Module): + """CSP-ELAN block — the FPN/PAN block in RTv4's HybridEncoder.""" + def __init__(self, c1, c2, c3, c4, n=3, act='silu', device=None, dtype=None, operations=None): + super().__init__() + self.c = c3 // 2 + self.cv1 = ConvNormLayer(c1, c3, 1, 1, act=act, device=device, dtype=dtype, operations=operations) + self.cv2 = nn.Sequential(CSPLayer(c3 // 2, c4, n, 1.0, act=act, device=device, dtype=dtype, operations=operations), ConvNormLayer(c4, c4, 3, 1, act=act, device=device, dtype=dtype, operations=operations)) + self.cv3 = nn.Sequential(CSPLayer(c4, c4, n, 1.0, act=act, device=device, dtype=dtype, operations=operations), ConvNormLayer(c4, c4, 3, 1, act=act, device=device, dtype=dtype, 
operations=operations)) + self.cv4 = ConvNormLayer(c3 + 2 * c4, c2, 1, 1, act=act, device=device, dtype=dtype, operations=operations) + + def forward(self, x): + y = list(self.cv1(x).split((self.c, self.c), 1)) + y.extend(m(y[-1]) for m in [self.cv2, self.cv3]) + return self.cv4(torch.cat(y, 1)) + + +class SCDown(nn.Module): + """Separable conv downsampling used in HybridEncoder PAN bottom-up path.""" + def __init__(self, ic, oc, k, s, device=None, dtype=None, operations=None): + super().__init__() + self.cv1 = ConvNormLayer(ic, oc, 1, 1, device=device, dtype=dtype, operations=operations) + self.cv2 = ConvNormLayer(oc, oc, k, s, g=oc, device=device, dtype=dtype, operations=operations) + + def forward(self, x): + return self.cv2(self.cv1(x)) + + +class SelfAttention(nn.Module): + def __init__(self, embed_dim, num_heads, device=None, dtype=None, operations=None): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + self.q_proj = operations.Linear(embed_dim, embed_dim, device=device, dtype=dtype) + self.k_proj = operations.Linear(embed_dim, embed_dim, device=device, dtype=dtype) + self.v_proj = operations.Linear(embed_dim, embed_dim, device=device, dtype=dtype) + self.out_proj = operations.Linear(embed_dim, embed_dim, device=device, dtype=dtype) + + def forward(self, query, key, value, attn_mask=None): + optimized_attention = optimized_attention_for_device(query.device, False, small_input=True) + q, k, v = self.q_proj(query), self.k_proj(key), self.v_proj(value) + out = optimized_attention(q, k, v, heads=self.num_heads, mask=attn_mask) + return self.out_proj(out) + + +class _TransformerEncoderLayer(nn.Module): + """Single AIFI encoder layer (pre- or post-norm, GELU by default).""" + def __init__(self, d_model, nhead, dim_feedforward, device=None, dtype=None, operations=None): + super().__init__() + self.self_attn = SelfAttention(d_model, nhead, device=device, dtype=dtype, operations=operations) + self.linear1 = operations.Linear(d_model, dim_feedforward, device=device, dtype=dtype) + self.linear2 = operations.Linear(dim_feedforward, d_model, device=device, dtype=dtype) + self.norm1 = operations.LayerNorm(d_model, device=device, dtype=dtype) + self.norm2 = operations.LayerNorm(d_model, device=device, dtype=dtype) + self.activation = nn.GELU() + + def forward(self, src, src_mask=None, pos_embed=None): + q = k = src if pos_embed is None else src + pos_embed + src2 = self.self_attn(q, k, value=src, attn_mask=src_mask) + src = self.norm1(src + src2) + src2 = self.linear2(self.activation(self.linear1(src))) + return self.norm2(src + src2) + + +class _TransformerEncoder(nn.Module): + """Thin wrapper so state-dict keys are encoder.0.layers.N.*""" + def __init__(self, num_layers, d_model, nhead, dim_feedforward, device=None, dtype=None, operations=None): + super().__init__() + self.layers = nn.ModuleList([ + _TransformerEncoderLayer(d_model, nhead, dim_feedforward, device=device, dtype=dtype, operations=operations) + for _ in range(num_layers) + ]) + + def forward(self, src, src_mask=None, pos_embed=None): + for layer in self.layers: + src = layer(src, src_mask=src_mask, pos_embed=pos_embed) + return src + + +class HybridEncoder(nn.Module): + def __init__(self, in_channels=(512, 1024, 2048), feat_strides=(8, 16, 32), hidden_dim=256, nhead=8, dim_feedforward=2048, use_encoder_idx=(2,), num_encoder_layers=1, + pe_temperature=10000, expansion=1.0, depth_mult=1.0, act='silu', eval_spatial_size=(640, 640), device=None, dtype=None, 
operations=None): + super().__init__() + self.in_channels = list(in_channels) + self.feat_strides = list(feat_strides) + self.hidden_dim = hidden_dim + self.use_encoder_idx = list(use_encoder_idx) + self.pe_temperature = pe_temperature + self.eval_spatial_size = eval_spatial_size + self.out_channels = [hidden_dim] * len(in_channels) + self.out_strides = list(feat_strides) + + # channel projection (expects pre-fused weights) + self.input_proj = nn.ModuleList([ + nn.Sequential(OrderedDict([('conv', operations.Conv2d(ch, hidden_dim, 1, bias=True, device=device, dtype=dtype))])) + for ch in in_channels + ]) + + # AIFI transformer — use _TransformerEncoder so keys are encoder.0.layers.N.* + self.encoder = nn.ModuleList([ + _TransformerEncoder(num_encoder_layers, hidden_dim, nhead, dim_feedforward, device=device, dtype=dtype, operations=operations) + for _ in range(len(use_encoder_idx)) + ]) + + nb = round(3 * depth_mult) + exp = expansion + + # top-down FPN (dfine: lateral conv has no act) + self.lateral_convs = nn.ModuleList( + [ConvNormLayer(hidden_dim, hidden_dim, 1, 1, device=device, dtype=dtype, operations=operations) + for _ in range(len(in_channels) - 1)]) + self.fpn_blocks = nn.ModuleList( + [RepNCSPELAN4(hidden_dim * 2, hidden_dim, hidden_dim * 2, round(exp * hidden_dim // 2), nb, act=act, device=device, dtype=dtype, operations=operations) + for _ in range(len(in_channels) - 1)]) + + # bottom-up PAN (dfine: nn.Sequential(SCDown) — keeps checkpoint key .0.cv1/.0.cv2) + self.downsample_convs = nn.ModuleList( + [nn.Sequential(SCDown(hidden_dim, hidden_dim, 3, 2, device=device, dtype=dtype, operations=operations)) + for _ in range(len(in_channels) - 1)]) + self.pan_blocks = nn.ModuleList( + [RepNCSPELAN4(hidden_dim * 2, hidden_dim, hidden_dim * 2, round(exp * hidden_dim // 2), nb, act=act, device=device, dtype=dtype, operations=operations) + for _ in range(len(in_channels) - 1)]) + + # cache positional embeddings for fixed spatial size + if eval_spatial_size: + for idx in self.use_encoder_idx: + stride = self.feat_strides[idx] + pe = self._build_pe(eval_spatial_size[1] // stride, + eval_spatial_size[0] // stride, + hidden_dim, pe_temperature) + setattr(self, f'pos_embed{idx}', pe) + + @staticmethod + def _build_pe(w, h, dim=256, temp=10000.): + assert dim % 4 == 0 + gw = torch.arange(w, dtype=torch.float32) + gh = torch.arange(h, dtype=torch.float32) + gw, gh = torch.meshgrid(gw, gh, indexing='ij') + pdim = dim // 4 + omega = 1. 
/ (temp ** (torch.arange(pdim, dtype=torch.float32) / pdim)) + ow = gw.flatten()[:, None] @ omega[None] + oh = gh.flatten()[:, None] @ omega[None] + return torch.cat([ow.sin(), ow.cos(), oh.sin(), oh.cos()], 1)[None] + + def forward(self, feats: List[torch.Tensor]) -> List[torch.Tensor]: + proj = [self.input_proj[i](f) for i, f in enumerate(feats)] + + for i, enc_idx in enumerate(self.use_encoder_idx): + h, w = proj[enc_idx].shape[2:] + src = proj[enc_idx].flatten(2).permute(0, 2, 1) + pe = getattr(self, f'pos_embed{enc_idx}').to(device=src.device, dtype=src.dtype) + for layer in self.encoder[i].layers: + src = layer(src, pos_embed=pe) + proj[enc_idx] = src.permute(0, 2, 1).reshape(-1, self.hidden_dim, h, w).contiguous() + + n = len(self.in_channels) + inner = [proj[-1]] + for k in range(n - 1, 0, -1): + j = n - 1 - k + top = self.lateral_convs[j](inner[0]) + inner[0] = top + up = F.interpolate(top, scale_factor=2., mode='nearest') + inner.insert(0, self.fpn_blocks[j](torch.cat([up, proj[k - 1]], 1))) + + outs = [inner[0]] + for k in range(n - 1): + outs.append(self.pan_blocks[k]( + torch.cat([self.downsample_convs[k](outs[-1]), inner[k + 1]], 1))) + return outs + + +# --------------------------------------------------------------------------- +# Decoder — DFINETransformer +# --------------------------------------------------------------------------- + +def _deformable_attn_v2(value: list, spatial_shapes, sampling_locations: torch.Tensor, attention_weights: torch.Tensor, num_points_list: List[int]) -> torch.Tensor: + """ + value : list of per-level tensors [bs*n_head, c, h_l, w_l] + sampling_locations: [bs, Lq, n_head, sum(pts), 2] in [0,1] + attention_weights : [bs, Lq, n_head, sum(pts)] + """ + _, c = value[0].shape[:2] # bs*n_head, c + _, Lq, n_head, _, _ = sampling_locations.shape + bs = sampling_locations.shape[0] + n_h = n_head + + grids = (2 * sampling_locations - 1) # [bs, Lq, n_head, sum_pts, 2] + grids = grids.permute(0, 2, 1, 3, 4).flatten(0, 1) # [bs*n_head, Lq, sum_pts, 2] + grids_per_lvl = grids.split(num_points_list, dim=2) # list of [bs*n_head, Lq, pts_l, 2] + + sampled = [] + for lvl, (h, w) in enumerate(spatial_shapes): + val_l = value[lvl].reshape(bs * n_h, c, h, w) + sv = F.grid_sample(val_l, grids_per_lvl[lvl], mode='bilinear', padding_mode='zeros', align_corners=False) + sampled.append(sv) # sv: [bs*n_head, c, Lq, pts_l] + + attn = attention_weights.permute(0, 2, 1, 3) # [bs, n_head, Lq, sum_pts] + attn = attn.flatten(0, 1).unsqueeze(1) # [bs*n_head, 1, Lq, sum_pts] + out = (torch.cat(sampled, -1) * attn).sum(-1) # [bs*n_head, c, Lq] + out = out.reshape(bs, n_h * c, Lq) + return out.permute(0, 2, 1) # [bs, Lq, hidden] + + +class MSDeformableAttention(nn.Module): + def __init__(self, embed_dim=256, num_heads=8, num_levels=3, num_points=4, offset_scale=0.5, device=None, dtype=None, operations=None): + super().__init__() + self.embed_dim, self.num_heads = embed_dim, num_heads + self.head_dim = embed_dim // num_heads + pts = num_points if isinstance(num_points, list) else [num_points] * num_levels + self.num_points_list = pts + self.offset_scale = offset_scale + total = num_heads * sum(pts) + self.register_buffer('num_points_scale', torch.tensor([1. 
/ n for n in pts for _ in range(n)], dtype=torch.float32)) + self.sampling_offsets = operations.Linear(embed_dim, total * 2, device=device, dtype=dtype) + self.attention_weights = operations.Linear(embed_dim, total, device=device, dtype=dtype) + + def forward(self, query, ref_pts, value, spatial_shapes): + bs, Lq = query.shape[:2] + offsets = self.sampling_offsets(query).reshape( + bs, Lq, self.num_heads, sum(self.num_points_list), 2) + attn_w = F.softmax( + self.attention_weights(query).reshape( + bs, Lq, self.num_heads, sum(self.num_points_list)), -1) + scale = self.num_points_scale.to(query).unsqueeze(-1) + offset = offsets * scale * ref_pts[:, :, None, :, 2:] * self.offset_scale + locs = ref_pts[:, :, None, :, :2] + offset # [bs, Lq, n_head, sum_pts, 2] + return _deformable_attn_v2(value, spatial_shapes, locs, attn_w, self.num_points_list) + + +class Gate(nn.Module): + def __init__(self, d_model, device=None, dtype=None, operations=None): + super().__init__() + self.gate = operations.Linear(2 * d_model, 2 * d_model, device=device, dtype=dtype) + self.norm = operations.LayerNorm(d_model, device=device, dtype=dtype) + + def forward(self, x1, x2): + g1, g2 = torch.sigmoid(self.gate(torch.cat([x1, x2], -1))).chunk(2, -1) + return self.norm(g1 * x1 + g2 * x2) + + +class MLP(nn.Module): + def __init__(self, in_dim, hidden_dim, out_dim, num_layers, device=None, dtype=None, operations=None): + super().__init__() + dims = [in_dim] + [hidden_dim] * (num_layers - 1) + [out_dim] + self.layers = nn.ModuleList(operations.Linear(dims[i], dims[i + 1], device=device, dtype=dtype) for i in range(num_layers)) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = nn.SiLU()(layer(x)) if i < len(self.layers) - 1 else layer(x) + return x + + +class TransformerDecoderLayer(nn.Module): + def __init__(self, d_model=256, nhead=8, dim_feedforward=1024, num_levels=3, num_points=4, device=None, dtype=None, operations=None): + super().__init__() + self.self_attn = SelfAttention(d_model, nhead, device=device, dtype=dtype, operations=operations) + self.norm1 = operations.LayerNorm(d_model, device=device, dtype=dtype) + self.cross_attn = MSDeformableAttention(d_model, nhead, num_levels, num_points, device=device, dtype=dtype, operations=operations) + self.gateway = Gate(d_model, device=device, dtype=dtype, operations=operations) + self.linear1 = operations.Linear(d_model, dim_feedforward, device=device, dtype=dtype) + self.activation = nn.ReLU() + self.linear2 = operations.Linear(dim_feedforward, d_model, device=device, dtype=dtype) + self.norm3 = operations.LayerNorm(d_model, device=device, dtype=dtype) + + def forward(self, target, ref_pts, value, spatial_shapes, attn_mask=None, query_pos=None): + q = k = target if query_pos is None else target + query_pos + t2 = self.self_attn(q, k, value=target, attn_mask=attn_mask) + target = self.norm1(target + t2) + t2 = self.cross_attn( + target if query_pos is None else target + query_pos, + ref_pts, value, spatial_shapes) + target = self.gateway(target, t2) + t2 = self.linear2(self.activation(self.linear1(target))) + target = self.norm3((target + t2).clamp(-65504, 65504)) + return target + + +# --------------------------------------------------------------------------- +# FDR utilities +# --------------------------------------------------------------------------- + +def weighting_function(reg_max, up, reg_scale): + """Non-uniform weighting function W(n) for FDR box regression.""" + ub1 = (abs(up[0]) * abs(reg_scale)).item() + ub2 = ub1 * 2 + step = (ub1 + 
1) ** (2 / (reg_max - 2)) + left = [-(step ** i) + 1 for i in range(reg_max // 2 - 1, 0, -1)] + right = [ (step ** i) - 1 for i in range(1, reg_max // 2)] + vals = [-ub2] + left + [0] + right + [ub2] + return torch.tensor(vals, dtype=up.dtype, device=up.device) + + +def distance2bbox(points, distance, reg_scale): + """Decode edge-distances → cxcywh boxes.""" + rs = abs(reg_scale).to(dtype=points.dtype) + x1 = points[..., 0] - (0.5 * rs + distance[..., 0]) * (points[..., 2] / rs) + y1 = points[..., 1] - (0.5 * rs + distance[..., 1]) * (points[..., 3] / rs) + x2 = points[..., 0] + (0.5 * rs + distance[..., 2]) * (points[..., 2] / rs) + y2 = points[..., 1] + (0.5 * rs + distance[..., 3]) * (points[..., 3] / rs) + x0, y0, x1_, y1_ = (x1 + x2) / 2, (y1 + y2) / 2, x2 - x1, y2 - y1 + return torch.stack([x0, y0, x1_, y1_], -1) + + +class Integral(nn.Module): + """Sum Pr(n)·W(n) over the distribution bins.""" + def __init__(self, reg_max=32): + super().__init__() + self.reg_max = reg_max + + def forward(self, x, project): + shape = x.shape + x = F.softmax(x.reshape(-1, self.reg_max + 1), 1) + x = F.linear(x, project.to(device=x.device, dtype=x.dtype)).reshape(-1, 4) + return x.reshape(list(shape[:-1]) + [-1]) + + +class LQE(nn.Module): + """Location Quality Estimator — refines class scores using corner distribution.""" + def __init__(self, k=4, hidden_dim=64, num_layers=2, reg_max=32, device=None, dtype=None, operations=None): + super().__init__() + self.k, self.reg_max = k, reg_max + self.reg_conf = MLP(4 * (k + 1), hidden_dim, 1, num_layers, device=device, dtype=dtype, operations=operations) + + def forward(self, scores, pred_corners): + B, L, _ = pred_corners.shape + prob = F.softmax(pred_corners.reshape(B, L, 4, self.reg_max + 1), -1) + topk, _ = prob.topk(self.k, -1) + stat = torch.cat([topk, topk.mean(-1, keepdim=True)], -1) + return scores + self.reg_conf(stat.reshape(B, L, -1)) + + +class TransformerDecoder(nn.Module): + def __init__(self, hidden_dim, nhead, dim_feedforward, num_levels, num_points, num_layers, reg_max, reg_scale, up, eval_idx=-1, device=None, dtype=None, operations=None): + super().__init__() + self.hidden_dim = hidden_dim + self.num_layers = num_layers + self.nhead = nhead + self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx + self.up, self.reg_scale, self.reg_max = up, reg_scale, reg_max + self.layers = nn.ModuleList([ + TransformerDecoderLayer(hidden_dim, nhead, dim_feedforward, num_levels, num_points, device=device, dtype=dtype, operations=operations) + for _ in range(self.eval_idx + 1) + ]) + self.lqe_layers = nn.ModuleList([LQE(4, 64, 2, reg_max, device=device, dtype=dtype, operations=operations) for _ in range(self.eval_idx + 1)]) + self.register_buffer('project', weighting_function(reg_max, up, reg_scale)) + + def _value_op(self, memory, spatial_shapes): + """Reshape memory to per-level value tensors for deformable attention.""" + c = self.hidden_dim // self.nhead + split = [h * w for h, w in spatial_shapes] + val = memory.reshape(memory.shape[0], memory.shape[1], self.nhead, c) # memory: [bs, sum(h*w), hidden_dim] + # → [bs, n_head, c, sum_hw] + val = val.permute(0, 2, 3, 1).flatten(0, 1) # [bs*n_head, c, sum_hw] + return val.split(split, dim=-1) # list of [bs*n_head, c, h_l*w_l] + + def forward(self, target, ref_pts_unact, memory, spatial_shapes, bbox_head, score_head, query_pos_head, pre_bbox_head, integral): + val_split_flat = self._value_op(memory, spatial_shapes) # pre-split value for deformable attention + + # reshape to [bs*n_head, c, h_l, 
w_l] + value = [] + for lvl, (h, w) in enumerate(spatial_shapes): + v = val_split_flat[lvl] # [bs*n_head, c, h*w] + value.append(v.reshape(v.shape[0], v.shape[1], h, w)) + + ref_pts = F.sigmoid(ref_pts_unact) + output = target + output_detach = pred_corners_undetach = 0 + + dec_bboxes, dec_logits = [], [] + + for i, layer in enumerate(self.layers): + ref_input = ref_pts.unsqueeze(2) # [bs, Lq, 1, 4] + query_pos = query_pos_head(ref_pts).clamp(-10, 10) + output = layer(output, ref_input, value, spatial_shapes, query_pos=query_pos) + + if i == 0: + ref_unact = ref_pts.clamp(1e-5, 1 - 1e-5) + ref_unact = torch.log(ref_unact / (1 - ref_unact)) + pre_bboxes = F.sigmoid(pre_bbox_head(output) + ref_unact) + ref_pts_initial = pre_bboxes.detach() + + pred_corners = bbox_head[i](output + output_detach) + pred_corners_undetach + inter_ref_bbox = distance2bbox(ref_pts_initial, integral(pred_corners, self.project), self.reg_scale) + + if i == self.eval_idx: + scores = score_head[i](output) + scores = self.lqe_layers[i](scores, pred_corners) + dec_bboxes.append(inter_ref_bbox) + dec_logits.append(scores) + break + + pred_corners_undetach = pred_corners + ref_pts = inter_ref_bbox.detach() + output_detach = output.detach() + + return torch.stack(dec_bboxes), torch.stack(dec_logits) + + +class DFINETransformer(nn.Module): + def __init__(self, num_classes=80, hidden_dim=256, num_queries=300, feat_channels=[256, 256, 256], feat_strides=[8, 16, 32], + num_levels=3, num_points=[3, 6, 3], nhead=8, num_layers=6, dim_feedforward=1024, eval_idx=-1, eps=1e-2, reg_max=32, + reg_scale=8.0, eval_spatial_size=(640, 640), device=None, dtype=None, operations=None): + super().__init__() + assert len(feat_strides) == len(feat_channels) + self.hidden_dim = hidden_dim + self.num_queries = num_queries + self.num_levels = num_levels + self.eps = eps + self.eval_spatial_size = eval_spatial_size + + self.feat_strides = list(feat_strides) + for i in range(num_levels - len(feat_strides)): + self.feat_strides.append(feat_strides[-1] * 2 ** (i + 1)) + + # input projection (expects pre-fused weights) + self.input_proj = nn.ModuleList() + for ch in feat_channels: + if ch == hidden_dim: + self.input_proj.append(nn.Identity()) + else: + self.input_proj.append(nn.Sequential(OrderedDict([ + ('conv', operations.Conv2d(ch, hidden_dim, 1, bias=True, device=device, dtype=dtype))]))) + in_ch = feat_channels[-1] + for i in range(num_levels - len(feat_channels)): + self.input_proj.append(nn.Sequential(OrderedDict([ + ('conv', operations.Conv2d(in_ch if i == 0 else hidden_dim, + hidden_dim, 3, 2, 1, bias=True, device=device, dtype=dtype))]))) + in_ch = hidden_dim + + # FDR parameters (non-trainable placeholders, set from config) + self.up = nn.Parameter(torch.tensor([0.5]), requires_grad=False) + self.reg_scale = nn.Parameter(torch.tensor([reg_scale]), requires_grad=False) + + pts = num_points if isinstance(num_points, (list, tuple)) else [num_points] * num_levels + self.decoder = TransformerDecoder(hidden_dim, nhead, dim_feedforward, num_levels, pts, + num_layers, reg_max, self.reg_scale, self.up, eval_idx, device=device, dtype=dtype, operations=operations) + + self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, 2, device=device, dtype=dtype, operations=operations) + self.enc_output = nn.Sequential(OrderedDict([ + ('proj', operations.Linear(hidden_dim, hidden_dim, device=device, dtype=dtype)), + ('norm', operations.LayerNorm(hidden_dim, device=device, dtype=dtype))])) + self.enc_score_head = operations.Linear(hidden_dim, num_classes, 
device=device, dtype=dtype) + self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, 3, device=device, dtype=dtype, operations=operations) + + self.eval_idx_ = eval_idx if eval_idx >= 0 else num_layers + eval_idx + self.dec_score_head = nn.ModuleList( + [operations.Linear(hidden_dim, num_classes, device=device, dtype=dtype) for _ in range(self.eval_idx_ + 1)]) + self.pre_bbox_head = MLP(hidden_dim, hidden_dim, 4, 3, device=device, dtype=dtype, operations=operations) + self.dec_bbox_head = nn.ModuleList( + [MLP(hidden_dim, hidden_dim, 4 * (reg_max + 1), 3, device=device, dtype=dtype, operations=operations) + for _ in range(self.eval_idx_ + 1)]) + self.integral = Integral(reg_max) + + if eval_spatial_size: + # Register as buffers so checkpoint values override the freshly-computed defaults + anchors, valid_mask = self._gen_anchors() + self.register_buffer('anchors', anchors) + self.register_buffer('valid_mask', valid_mask) + + def _gen_anchors(self, spatial_shapes=None, grid_size=0.05, dtype=torch.float32, device='cpu'): + if spatial_shapes is None: + h0, w0 = self.eval_spatial_size + spatial_shapes = [[int(h0 / s), int(w0 / s)] for s in self.feat_strides] + anchors = [] + for lvl, (h, w) in enumerate(spatial_shapes): + gy, gx = torch.meshgrid(torch.arange(h), torch.arange(w), indexing='ij') + gxy = (torch.stack([gx, gy], -1).float() + 0.5) / torch.tensor([w, h], dtype=dtype) + wh = torch.ones_like(gxy) * grid_size * (2. ** lvl) + anchors.append(torch.cat([gxy, wh], -1).reshape(-1, h * w, 4)) + anchors = torch.cat(anchors, 1).to(device) + valid_mask = ((anchors > self.eps) & (anchors < 1 - self.eps)).all(-1, keepdim=True) + anchors = torch.log(anchors / (1 - anchors)) + anchors = torch.where(valid_mask, anchors, torch.full_like(anchors, float('inf'))) + return anchors, valid_mask + + def _encoder_input(self, feats: List[torch.Tensor]): + proj = [self.input_proj[i](f) for i, f in enumerate(feats)] + for i in range(len(feats), self.num_levels): + proj.append(self.input_proj[i](feats[-1] if i == len(feats) else proj[-1])) + flat, shapes = [], [] + for f in proj: + _, _, h, w = f.shape + flat.append(f.flatten(2).permute(0, 2, 1)) + shapes.append([h, w]) + return torch.cat(flat, 1), shapes + + def _decoder_input(self, memory: torch.Tensor): + anchors, valid_mask = self.anchors.to(memory), self.valid_mask + if memory.shape[0] > 1: + anchors = anchors.repeat(memory.shape[0], 1, 1) + + mem = valid_mask.to(memory) * memory + out_mem = self.enc_output(mem) + logits = self.enc_score_head(out_mem) + _, idx = torch.topk(logits.max(-1).values, self.num_queries, dim=-1) + idx_e = idx.unsqueeze(-1) + topk_mem = out_mem.gather(1, idx_e.expand(-1, -1, out_mem.shape[-1])) + topk_anc = anchors.gather(1, idx_e.expand(-1, -1, anchors.shape[-1])) + topk_ref = self.enc_bbox_head(topk_mem) + topk_anc + return topk_mem.detach(), topk_ref.detach() + + def forward(self, feats: List[torch.Tensor]): + memory, shapes = self._encoder_input(feats) + content, ref = self._decoder_input(memory) + out_bboxes, out_logits = self.decoder( + content, ref, memory, shapes, + self.dec_bbox_head, self.dec_score_head, + self.query_pos_head, self.pre_bbox_head, self.integral) + return {'pred_logits': out_logits[-1], 'pred_boxes': out_bboxes[-1]} + + +# --------------------------------------------------------------------------- +# Main model +# --------------------------------------------------------------------------- + +class RTv4(nn.Module): + def __init__(self, num_classes=80, num_queries=300, enc_h=256, dec_h=256, enc_ff=2048, 
dec_ff=1024, feat_strides=[8, 16, 32], device=None, dtype=None, operations=None, **kwargs): + super().__init__() + self.device = device + self.dtype = dtype + self.operations = operations + + self.backbone = HGNetv2(device=device, dtype=dtype, operations=operations) + self.encoder = HybridEncoder(hidden_dim=enc_h, dim_feedforward=enc_ff, device=device, dtype=dtype, operations=operations) + self.decoder = DFINETransformer(num_classes=num_classes, hidden_dim=dec_h, num_queries=num_queries, + feat_channels=[enc_h] * len(feat_strides), feat_strides=feat_strides, dim_feedforward=dec_ff, device=device, dtype=dtype, operations=operations) + + self.num_classes = num_classes + self.num_queries = num_queries + self.load_device = comfy.model_management.get_torch_device() + + def _forward(self, x: torch.Tensor): + return self.decoder(self.encoder(self.backbone(x))) + + def postprocess(self, outputs, orig_size: tuple = (640, 640)) -> List[dict]: + logits = outputs['pred_logits'] + boxes = torchvision.ops.box_convert(outputs['pred_boxes'], 'cxcywh', 'xyxy') + boxes = boxes * torch.tensor(orig_size, device=boxes.device, dtype=boxes.dtype).repeat(1, 2).unsqueeze(1) + scores = F.sigmoid(logits) + scores, idx = torch.topk(scores.flatten(1), self.num_queries, dim=-1) + labels = idx % self.num_classes + boxes = boxes.gather(1, (idx // self.num_classes).unsqueeze(-1).expand(-1, -1, 4)) + return [{'labels': lbl, 'boxes': b, 'scores': s} for lbl, b, s in zip(labels, boxes, scores)] + + def forward(self, x: torch.Tensor, orig_size: tuple = (640, 640), **kwargs): + outputs = self._forward(x.to(device=self.load_device, dtype=self.dtype)) + return self.postprocess(outputs, orig_size) diff --git a/comfy/model_base.py b/comfy/model_base.py index 94579fa3e..c2ae646aa 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -52,6 +52,7 @@ import comfy.ldm.qwen_image.model import comfy.ldm.kandinsky5.model import comfy.ldm.anima.model import comfy.ldm.ace.ace_step15 +import comfy.ldm.rt_detr.rtdetr_v4 import comfy.model_management import comfy.patcher_extension @@ -1957,3 +1958,7 @@ class Kandinsky5Image(Kandinsky5): def concat_cond(self, **kwargs): return None + +class RT_DETR_v4(BaseModel): + def __init__(self, model_config, model_type=ModelType.FLOW, device=None): + super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.rt_detr.rtdetr_v4.RTv4) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 35a6822e3..1c8ae2325 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -698,6 +698,12 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["audio_model"] = "ace1.5" return dit_config + if '{}encoder.pan_blocks.1.cv4.conv.weight'.format(key_prefix) in state_dict_keys: # RT-DETR_v4 + dit_config = {} + dit_config["image_model"] = "RT_DETR_v4" + dit_config["enc_h"] = state_dict['{}encoder.pan_blocks.1.cv4.conv.weight'.format(key_prefix)].shape[0] + return dit_config + if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys: return None diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 07feb31b3..9a5612716 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1734,6 +1734,21 @@ class LongCatImage(supported_models_base.BASE): hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref)) return supported_models_base.ClipTarget(comfy.text_encoders.longcat_image.LongCatImageTokenizer, 
comfy.text_encoders.longcat_image.te(**hunyuan_detect)) -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImagePixelSpace, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, WAN21_SCAIL, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima] + +class RT_DETR_v4(supported_models_base.BASE): + unet_config = { + "image_model": "RT_DETR_v4", + } + + supported_inference_dtypes = [torch.float16, torch.float32] + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.RT_DETR_v4(self, device=device) + return out + + def clip_target(self, state_dict={}): + return None + +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImagePixelSpace, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, WAN21_SCAIL, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima, RT_DETR_v4] models += [SVD_img2vid] diff --git a/comfy_extras/nodes_rtdetr.py b/comfy_extras/nodes_rtdetr.py new file mode 100644 index 000000000..61307e268 --- /dev/null +++ b/comfy_extras/nodes_rtdetr.py @@ -0,0 +1,154 @@ +from typing_extensions import override + +import torch +from comfy.ldm.rt_detr.rtdetr_v4 import COCO_CLASSES +import comfy.model_management +import comfy.utils +from comfy_api.latest import ComfyExtension, io +from torchvision.transforms import ToPILImage, ToTensor +from PIL import ImageDraw, ImageFont + + +class RTDETR_detect(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="RTDETR_detect", + display_name="RT-DETR Detect", + category="detection/", + search_aliases=["bbox", "bounding box", "object detection", "coco"], + inputs=[ + io.Model.Input("model", display_name="model"), + io.Image.Input("image", display_name="image"), + io.Float.Input("threshold", display_name="threshold", default=0.5), + io.Combo.Input("class_name", options=["all"] + COCO_CLASSES, default="all", tooltip="Filter detections by class. 
Set to 'all' to disable filtering."), + io.Int.Input("max_detections", display_name="max_detections", default=100, tooltip="Maximum number of detections to return per image. In order of descending confidence score."), + ], + outputs=[ + io.BoundingBox.Output("bboxes")], + ) + + @classmethod + def execute(cls, model, image, threshold, class_name, max_detections) -> io.NodeOutput: + B, H, W, C = image.shape + + image_in = comfy.utils.common_upscale(image.movedim(-1, 1), 640, 640, "bilinear", crop="disabled") + + comfy.model_management.load_model_gpu(model) + results = model.model.diffusion_model(image_in, (W, H)) # list of B dicts + + all_bbox_dicts = [] + + for det in results: + keep = det['scores'] > threshold + boxes = det['boxes'][keep].cpu() + labels = det['labels'][keep].cpu() + scores = det['scores'][keep].cpu() + + bbox_dicts = [ + { + "x": float(box[0]), + "y": float(box[1]), + "width": float(box[2] - box[0]), + "height": float(box[3] - box[1]), + "label": COCO_CLASSES[int(label)], + "score": float(score) + } + for box, label, score in zip(boxes, labels, scores) + if class_name == "all" or COCO_CLASSES[int(label)] == class_name + ] + bbox_dicts.sort(key=lambda d: d["score"], reverse=True) + all_bbox_dicts.append(bbox_dicts[:max_detections]) + + return io.NodeOutput(all_bbox_dicts) + + +class DrawBBoxes(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="DrawBBoxes", + display_name="Draw BBoxes", + category="detection/", + search_aliases=["bbox", "bounding box", "object detection", "rt_detr", "visualize detections", "coco"], + inputs=[ + io.Image.Input("image", optional=True), + io.BoundingBox.Input("bboxes", force_input=True), + ], + outputs=[ + io.Image.Output("out_image"), + ], + ) + + @classmethod + def execute(cls, bboxes, image=None) -> io.NodeOutput: + # Normalise to list[list[dict]], then fit to batch size B. 
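+        # For example: a single dict is treated as one frame with one
+        # detection, a flat list of dicts is broadcast to every frame in the
+        # batch, and a list of lists is kept as per-frame detections, then
+        # padded or truncated to B entries.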
+ B = image.shape[0] if image is not None else 1 + if isinstance(bboxes, dict): + bboxes = [[bboxes]] + elif not isinstance(bboxes, list) or not bboxes: + bboxes = [[]] + elif isinstance(bboxes[0], dict): + bboxes = [bboxes] # flat list → same detections for every image + + if len(bboxes) == 1: + bboxes = bboxes * B + bboxes = (bboxes + [[]] * B)[:B] + + if image is None: + B = len(bboxes) + max_w = max((int(d["x"] + d["width"]) for frame in bboxes for d in frame), default=640) + max_h = max((int(d["y"] + d["height"]) for frame in bboxes for d in frame), default=640) + image = torch.zeros((B, max_h, max_w, 3), dtype=torch.float32) + + all_out_images = [] + for i in range(B): + detections = bboxes[i] + if detections: + boxes = torch.tensor([[d["x"], d["y"], d["x"] + d["width"], d["y"] + d["height"]] for d in detections]) + labels = [d.get("label") if d.get("label") in COCO_CLASSES else None for d in detections] + scores = torch.tensor([d.get("score", 1.0) for d in detections]) + else: + boxes = torch.zeros((0, 4)) + labels = [] + scores = torch.zeros((0,)) + + pil_image = image[i].movedim(-1, 0) + img = ToPILImage()(pil_image) + if detections: + img = cls.draw_detections(img, boxes, labels, scores) + all_out_images.append(ToTensor()(img).unsqueeze(0).movedim(1, -1)) + + out_images = torch.cat(all_out_images, dim=0).to(comfy.model_management.intermediate_device()) + return io.NodeOutput(out_images) + + @classmethod + def draw_detections(cls, img, boxes, labels, scores): + draw = ImageDraw.Draw(img) + try: + font = ImageFont.truetype('arial.ttf', 16) + except Exception: + font = ImageFont.load_default() + colors = [(255,0,0),(0,200,0),(0,0,255),(255,165,0),(128,0,128), + (0,255,255),(255,20,147),(100,149,237)] + for box, label, score in sorted(zip(boxes, labels, scores), key=lambda x: x[2].item()): + x1, y1, x2, y2 = box.tolist() + color_idx = COCO_CLASSES.index(label) if label is not None else 0 + c = colors[color_idx % len(colors)] + draw.rectangle([x1, y1, x2, y2], outline=c, width=3) + if label is not None: + draw.text((x1 + 2, y1 + 2), f'{label} {score:.2f}', fill=c, font=font) + return img + + +class RTDETRExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + RTDETR_detect, + DrawBBoxes, + ] + + +async def comfy_entrypoint() -> RTDETRExtension: + return RTDETRExtension() diff --git a/comfy_extras/nodes_sdpose.py b/comfy_extras/nodes_sdpose.py index 71441848e..46b5fb226 100644 --- a/comfy_extras/nodes_sdpose.py +++ b/comfy_extras/nodes_sdpose.py @@ -661,6 +661,7 @@ class CropByBBoxes(io.ComfyNode): io.Int.Input("output_width", default=512, min=64, max=4096, step=8, tooltip="Width each crop is resized to."), io.Int.Input("output_height", default=512, min=64, max=4096, step=8, tooltip="Height each crop is resized to."), io.Int.Input("padding", default=0, min=0, max=1024, step=1, tooltip="Extra padding in pixels added on each side of the bbox before cropping."), + io.Combo.Input("keep_aspect", options=["stretch", "pad"], default="stretch", tooltip="Whether to stretch the crop to fit the output size, or pad with black pixels to preserve aspect ratio."), ], outputs=[ io.Image.Output(tooltip="All crops stacked into a single image batch."), @@ -668,7 +669,7 @@ class CropByBBoxes(io.ComfyNode): ) @classmethod - def execute(cls, image, bboxes, output_width, output_height, padding) -> io.NodeOutput: + def execute(cls, image, bboxes, output_width, output_height, padding, keep_aspect="stretch") -> io.NodeOutput: total_frames = image.shape[0] 
img_h = image.shape[1] img_w = image.shape[2] @@ -716,7 +717,19 @@ class CropByBBoxes(io.ComfyNode): x1, y1, x2, y2 = fb_x1, fb_y1, fb_x2, fb_y2 crop_chw = frame_chw[:, :, y1:y2, x1:x2] # (1, C, crop_h, crop_w) - resized = comfy.utils.common_upscale(crop_chw, output_width, output_height, upscale_method="bilinear", crop="disabled") + + if keep_aspect == "pad": + crop_h, crop_w = y2 - y1, x2 - x1 + scale = min(output_width / crop_w, output_height / crop_h) + scaled_w = int(round(crop_w * scale)) + scaled_h = int(round(crop_h * scale)) + scaled = comfy.utils.common_upscale(crop_chw, scaled_w, scaled_h, upscale_method="bilinear", crop="disabled") + pad_left = (output_width - scaled_w) // 2 + pad_top = (output_height - scaled_h) // 2 + resized = torch.zeros(1, num_ch, output_height, output_width, dtype=image.dtype, device=image.device) + resized[:, :, pad_top:pad_top + scaled_h, pad_left:pad_left + scaled_w] = scaled + else: # "stretch" + resized = comfy.utils.common_upscale(crop_chw, output_width, output_height, upscale_method="bilinear", crop="disabled") crops.append(resized) if not crops: diff --git a/nodes.py b/nodes.py index 37ceac2fc..299b3d758 100644 --- a/nodes.py +++ b/nodes.py @@ -2457,6 +2457,7 @@ async def init_builtin_extra_nodes(): "nodes_number_convert.py", "nodes_painter.py", "nodes_curve.py", + "nodes_rtdetr.py" ] import_failed = [] From d113d1cc32bccd99a83ddfc5c1e22fd9e024a6b0 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Mon, 30 Mar 2026 00:11:30 +0300 Subject: [PATCH 15/29] feat(api-nodes-Tencent3D): allow smaller possible face_count; add uv_image output (#13207) Signed-off-by: bigcat88 --- comfy_api_nodes/nodes_hunyuan3d.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/comfy_api_nodes/nodes_hunyuan3d.py b/comfy_api_nodes/nodes_hunyuan3d.py index 753c09b6e..44c94a98e 100644 --- a/comfy_api_nodes/nodes_hunyuan3d.py +++ b/comfy_api_nodes/nodes_hunyuan3d.py @@ -132,7 +132,7 @@ class TencentTextToModelNode(IO.ComfyNode): tooltip="The LowPoly option is unavailable for the `3.1` model.", ), IO.String.Input("prompt", multiline=True, default="", tooltip="Supports up to 1024 characters."), - IO.Int.Input("face_count", default=500000, min=40000, max=1500000), + IO.Int.Input("face_count", default=500000, min=3000, max=1500000), IO.DynamicCombo.Input( "generate_type", options=[ @@ -251,7 +251,7 @@ class TencentImageToModelNode(IO.ComfyNode): IO.Image.Input("image_left", optional=True), IO.Image.Input("image_right", optional=True), IO.Image.Input("image_back", optional=True), - IO.Int.Input("face_count", default=500000, min=40000, max=1500000), + IO.Int.Input("face_count", default=500000, min=3000, max=1500000), IO.DynamicCombo.Input( "generate_type", options=[ @@ -422,6 +422,7 @@ class TencentModelTo3DUVNode(IO.ComfyNode): outputs=[ IO.File3DOBJ.Output(display_name="OBJ"), IO.File3DFBX.Output(display_name="FBX"), + IO.Image.Output(display_name="uv_image"), ], hidden=[ IO.Hidden.auth_token_comfy_org, @@ -468,9 +469,16 @@ class TencentModelTo3DUVNode(IO.ComfyNode): response_model=To3DProTaskResultResponse, status_extractor=lambda r: r.Status, ) + uv_image_file = get_file_from_response(result.ResultFile3Ds, "uv_image", raise_if_not_found=False) + uv_image = ( + await download_url_to_image_tensor(uv_image_file.Url) + if uv_image_file is not None + else torch.zeros(1, 1, 1, 3) + ) return IO.NodeOutput( await download_url_to_file_3d(get_file_from_response(result.ResultFile3Ds, "obj").Url, "obj"), await 
download_url_to_file_3d(get_file_from_response(result.ResultFile3Ds, "fbx").Url, "fbx"),
+            uv_image,
         )


From 8d723d2caa959a8c4b14050c30dd83cf931e3dc1 Mon Sep 17 00:00:00 2001
From: rattus <46076784+rattus128@users.noreply.github.com>
Date: Sun, 29 Mar 2026 16:43:24 -0700
Subject: [PATCH 16/29] Fix/tweak pinned memory accounting (#13221)

* mm: Lower Windows pin threshold

Some workflows have more extraneous use of shared GPU memory than is accounted for in the 5% pin headroom. Lower this for safety.

* mm: Remove pin count clearing threshold.

TOTAL_PINNED_MEMORY is shared between the legacy and aimdo pinning systems; however, this catch-all assumes only the legacy system exists. Remove the catch-all, as the PINNED_MEMORY buffer is already coherent.

---
 comfy/model_management.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/comfy/model_management.py b/comfy/model_management.py
index ce079cf2f..0eebf1ded 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -1326,9 +1326,9 @@ MAX_PINNED_MEMORY = -1
 if not args.disable_pinned_memory:
     if is_nvidia() or is_amd():
         if WINDOWS:
-            MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.45 # Windows limit is apparently 50%
+            MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.40 # Windows limit is apparently 50%
         else:
-            MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.95
+            MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.90
         logging.info("Enabled pinned memory {}".format(MAX_PINNED_MEMORY // (1024 * 1024)))

 PINNING_ALLOWED_TYPES = set(["Tensor", "Parameter", "QuantizedTensor"])
@@ -1403,8 +1403,6 @@ def unpin_memory(tensor):
     if torch.cuda.cudart().cudaHostUnregister(ptr) == 0:
         TOTAL_PINNED_MEMORY -= PINNED_MEMORY.pop(ptr)
-        if len(PINNED_MEMORY) == 0:
-            TOTAL_PINNED_MEMORY = 0
         return True
     else:
         logging.warning("Unpin error.")

From 537c10d231cf691ada070fa98978a2ddadb064fc Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Sun, 29 Mar 2026 19:07:38 -0700
Subject: [PATCH 17/29] Update README.md with latest AMD Linux pytorch. (#13228)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 62c4f528c..021f88a31 100644
--- a/README.md
+++ b/README.md
@@ -232,7 +232,7 @@ Put your VAE in: models/vae

 AMD users can install rocm and pytorch with pip if you don't have it already installed, this is the command to install the stable version:

-```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm7.1```
+```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm7.2```

 This is the command to install the nightly with ROCm 7.2 which might have some performance improvements:

From 55e6478526590572b88272045b5014d9b60fa2a0 Mon Sep 17 00:00:00 2001
From: Christian Byrne
Date: Sun, 29 Mar 2026 21:02:44 -0700
Subject: [PATCH 18/29] Rename utils/string nodes with Text prefix and add search aliases (#13227)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rename all 11 nodes in the utils/string category to include a "Text" prefix for better discoverability and natural sorting. Regex nodes get user-friendly names without "Regex" in the display name.
Renames: - Concatenate → Text Concatenate - Substring → Text Substring - Length → Text Length - Case Converter → Text Case Converter - Trim → Text Trim - Replace → Text Replace - Contains → Text Contains - Compare → Text Compare - Regex Match → Text Match - Regex Extract → Text Extract Substring - Regex Replace → Text Replace (Regex) All renamed nodes include their old display name as a search alias so users can still find them by searching the original name. Regex nodes also include "regex" as a search alias. --- comfy_extras/nodes_string.py | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/comfy_extras/nodes_string.py b/comfy_extras/nodes_string.py index b4e5f148a..75a8bb4ee 100644 --- a/comfy_extras/nodes_string.py +++ b/comfy_extras/nodes_string.py @@ -9,9 +9,9 @@ class StringConcatenate(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StringConcatenate", - display_name="Concatenate", + display_name="Text Concatenate", category="utils/string", - search_aliases=["text concat", "join text", "merge text", "combine strings", "concat", "concatenate", "append text", "combine text", "string"], + search_aliases=["Concatenate", "text concat", "join text", "merge text", "combine strings", "concat", "concatenate", "append text", "combine text", "string"], inputs=[ io.String.Input("string_a", multiline=True), io.String.Input("string_b", multiline=True), @@ -32,8 +32,8 @@ class StringSubstring(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StringSubstring", - search_aliases=["extract text", "text portion"], - display_name="Substring", + search_aliases=["Substring", "extract text", "text portion"], + display_name="Text Substring", category="utils/string", inputs=[ io.String.Input("string", multiline=True), @@ -55,8 +55,8 @@ class StringLength(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StringLength", - search_aliases=["character count", "text size"], - display_name="Length", + search_aliases=["character count", "text size", "string length"], + display_name="Text Length", category="utils/string", inputs=[ io.String.Input("string", multiline=True), @@ -76,8 +76,8 @@ class CaseConverter(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="CaseConverter", - search_aliases=["text case", "uppercase", "lowercase", "capitalize"], - display_name="Case Converter", + search_aliases=["Case Converter", "text case", "uppercase", "lowercase", "capitalize"], + display_name="Text Case Converter", category="utils/string", inputs=[ io.String.Input("string", multiline=True), @@ -109,8 +109,8 @@ class StringTrim(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StringTrim", - search_aliases=["clean whitespace", "remove whitespace"], - display_name="Trim", + search_aliases=["Trim", "clean whitespace", "remove whitespace", "strip"], + display_name="Text Trim", category="utils/string", inputs=[ io.String.Input("string", multiline=True), @@ -140,8 +140,8 @@ class StringReplace(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StringReplace", - search_aliases=["find and replace", "substitute", "swap text"], - display_name="Replace", + search_aliases=["Replace", "find and replace", "substitute", "swap text"], + display_name="Text Replace", category="utils/string", inputs=[ io.String.Input("string", multiline=True), @@ -163,8 +163,8 @@ class StringContains(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StringContains", - search_aliases=["text includes", 
"string includes"], - display_name="Contains", + search_aliases=["Contains", "text includes", "string includes"], + display_name="Text Contains", category="utils/string", inputs=[ io.String.Input("string", multiline=True), @@ -191,8 +191,8 @@ class StringCompare(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StringCompare", - search_aliases=["text match", "string equals", "starts with", "ends with"], - display_name="Compare", + search_aliases=["Compare", "text match", "string equals", "starts with", "ends with"], + display_name="Text Compare", category="utils/string", inputs=[ io.String.Input("string_a", multiline=True), @@ -227,8 +227,8 @@ class RegexMatch(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="RegexMatch", - search_aliases=["pattern match", "text contains", "string match"], - display_name="Regex Match", + search_aliases=["Regex Match", "regex", "pattern match", "text contains", "string match"], + display_name="Text Match", category="utils/string", inputs=[ io.String.Input("string", multiline=True), @@ -268,8 +268,8 @@ class RegexExtract(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="RegexExtract", - search_aliases=["pattern extract", "text parser", "parse text"], - display_name="Regex Extract", + search_aliases=["Regex Extract", "regex", "pattern extract", "text parser", "parse text"], + display_name="Text Extract Substring", category="utils/string", inputs=[ io.String.Input("string", multiline=True), @@ -343,8 +343,8 @@ class RegexReplace(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="RegexReplace", - search_aliases=["pattern replace", "find and replace", "substitution"], - display_name="Regex Replace", + search_aliases=["Regex Replace", "regex", "pattern replace", "regex replace", "substitution"], + display_name="Text Replace (Regex)", category="utils/string", description="Find and replace text using regex patterns.", inputs=[ From 076639fed99742f43ffbf0b0df34bb8fd105b8e9 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 30 Mar 2026 20:11:02 -0700 Subject: [PATCH 19/29] Update README with note on model support (#13235) Added note about additional supported models in ComfyUI. --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 021f88a31..a47506fc8 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,7 @@ See what ComfyUI can do with the [newer template workflows](https://comfy.org/wo ## Features - Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything. +- NOTE: There are many more models supported than the list below, if you want to see what is supported see our templates list inside ComfyUI. - Image Models - SD1.x, SD2.x ([unCLIP](https://comfyanonymous.github.io/ComfyUI_examples/unclip/)) - [SDXL](https://comfyanonymous.github.io/ComfyUI_examples/sdxl/), [SDXL Turbo](https://comfyanonymous.github.io/ComfyUI_examples/sdturbo/) From e2ddf28d78f190b27d136668a7dc15c7f0ec75dc Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 31 Mar 2026 14:27:17 -0700 Subject: [PATCH 20/29] Fix some fp8 scaled checkpoints no longer working. 
 (#13239)

---
 comfy/sd.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/comfy/sd.py b/comfy/sd.py
index 7425765a4..5b6b59ea4 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -1745,6 +1745,8 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None, disable
     temp_sd = comfy.utils.state_dict_prefix_replace(sd, {diffusion_model_prefix: ""}, filter_keys=True)
     if len(temp_sd) > 0:
         sd = temp_sd
+    if custom_operations is None:
+        sd, metadata = comfy.utils.convert_old_quants(sd, "", metadata=metadata)

     parameters = comfy.utils.calculate_parameters(sd)
     weight_dtype = comfy.utils.weight_dtype(sd)

From 7d437687c260df7772c603658111148e0e863e59 Mon Sep 17 00:00:00 2001
From: "Daxiong (Lin)"
Date: Wed, 1 Apr 2026 11:23:25 +0800
Subject: [PATCH 21/29] chore: update workflow templates to v0.9.41 (#13242)

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 6f0659a00..0d88fdcfb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 comfyui-frontend-package==1.42.8
-comfyui-workflow-templates==0.9.39
+comfyui-workflow-templates==0.9.41
 comfyui-embedded-docs==0.4.3
 torch
 torchsde

From 0c63b4f6e3ce807a0f85c1826710bcf18ade3e2c Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Wed, 1 Apr 2026 17:22:06 -0700
Subject: [PATCH 22/29] Remove dead code. (#13251)

---
 comfy/ldm/modules/encoders/noise_aug_modules.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/comfy/ldm/modules/encoders/noise_aug_modules.py b/comfy/ldm/modules/encoders/noise_aug_modules.py
index a5d866030..c853e4298 100644
--- a/comfy/ldm/modules/encoders/noise_aug_modules.py
+++ b/comfy/ldm/modules/encoders/noise_aug_modules.py
@@ -3,12 +3,9 @@ from ..diffusionmodules.openaimodel import Timestep
 import torch

 class CLIPEmbeddingNoiseAugmentation(ImageConcatWithNoiseAugmentation):
-    def __init__(self, *args, clip_stats_path=None, timestep_dim=256, **kwargs):
+    def __init__(self, *args, timestep_dim=256, **kwargs):
         super().__init__(*args, **kwargs)
-        if clip_stats_path is None:
-            clip_mean, clip_std = torch.zeros(timestep_dim), torch.ones(timestep_dim)
-        else:
-            clip_mean, clip_std = torch.load(clip_stats_path, map_location="cpu")
+        clip_mean, clip_std = torch.zeros(timestep_dim), torch.ones(timestep_dim)
         self.register_buffer("data_mean", clip_mean[None, :], persistent=False)
         self.register_buffer("data_std", clip_std[None, :], persistent=False)
         self.time_embed = Timestep(timestep_dim)

From 76b75f3ad755ef5ff78b3670abbab549fb080243 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Thu, 2 Apr 2026 13:39:34 -0700
Subject: [PATCH 23/29] Fix some issue with insecure browsers. (#13261)

If you are on a recent Chromium- or Chrome-based browser, this doesn't affect you. This is to give the Firefox developers time to implement PNA.
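For reference, `Sec-Fetch-Site` is a Fetch Metadata header that compliant browsers attach to every request; its value is one of `same-origin`, `same-site`, `cross-site`, or `none`. Below is a minimal standalone sketch of this style of check, assuming a plain aiohttp app (the middleware name is made up for illustration; the actual ComfyUI change is in the diff that follows):

```python
from aiohttp import web

@web.middleware
async def block_cross_site(request: web.Request, handler):
    # Reject requests that a third-party page initiated; direct
    # navigation and same-origin requests pass through untouched.
    if request.headers.get("Sec-Fetch-Site") == "cross-site":
        return web.Response(status=403)
    return await handler(request)

app = web.Application(middlewares=[block_cross_site])
```

Browsers that never send the header skip this check entirely, which is why the existing Host/Origin hostname comparison stays in place as a fallback.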
--- server.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/server.py b/server.py index 27b14825e..881da8e66 100644 --- a/server.py +++ b/server.py @@ -146,6 +146,10 @@ def is_loopback(host): def create_origin_only_middleware(): @web.middleware async def origin_only_middleware(request: web.Request, handler): + if 'Sec-Fetch-Site' in request.headers: + sec_fetch_site = request.headers['Sec-Fetch-Site'] + if sec_fetch_site == 'cross-site': + return web.Response(status=403) #this code is used to prevent the case where a random website can queue comfy workflows by making a POST to 127.0.0.1 which browsers don't prevent for some dumb reason. #in that case the Host and Origin hostnames won't match #I know the proper fix would be to add a cookie but this should take care of the problem in the meantime From 5de94e70ec116a93cbd110fbfaec266ae1f423c5 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 3 Apr 2026 09:51:47 +0300 Subject: [PATCH 24/29] feat(api-nodes): new Partner nodes for Wan2.7 (#13264) Signed-off-by: bigcat88 --- comfy_api_nodes/apis/wan.py | 226 ++++++++ comfy_api_nodes/nodes_wan.py | 1011 +++++++++++++++++++++++++++++----- 2 files changed, 1089 insertions(+), 148 deletions(-) create mode 100644 comfy_api_nodes/apis/wan.py diff --git a/comfy_api_nodes/apis/wan.py b/comfy_api_nodes/apis/wan.py new file mode 100644 index 000000000..44b65e4f6 --- /dev/null +++ b/comfy_api_nodes/apis/wan.py @@ -0,0 +1,226 @@ +from pydantic import BaseModel, Field + + +class Text2ImageInputField(BaseModel): + prompt: str = Field(...) + negative_prompt: str | None = Field(None) + + +class Image2ImageInputField(BaseModel): + prompt: str = Field(...) + negative_prompt: str | None = Field(None) + images: list[str] = Field(..., min_length=1, max_length=2) + + +class Text2VideoInputField(BaseModel): + prompt: str = Field(...) + negative_prompt: str | None = Field(None) + audio_url: str | None = Field(None) + + +class Image2VideoInputField(BaseModel): + prompt: str = Field(...) + negative_prompt: str | None = Field(None) + img_url: str = Field(...) + audio_url: str | None = Field(None) + + +class Reference2VideoInputField(BaseModel): + prompt: str = Field(...) + negative_prompt: str | None = Field(None) + reference_video_urls: list[str] = Field(...) + + +class Txt2ImageParametersField(BaseModel): + size: str = Field(...) + n: int = Field(1, description="Number of images to generate.") # we support only value=1 + seed: int = Field(..., ge=0, le=2147483647) + prompt_extend: bool = Field(True) + watermark: bool = Field(False) + + +class Image2ImageParametersField(BaseModel): + size: str | None = Field(None) + n: int = Field(1, description="Number of images to generate.") # we support only value=1 + seed: int = Field(..., ge=0, le=2147483647) + watermark: bool = Field(False) + + +class Text2VideoParametersField(BaseModel): + size: str = Field(...) + seed: int = Field(..., ge=0, le=2147483647) + duration: int = Field(5, ge=5, le=15) + prompt_extend: bool = Field(True) + watermark: bool = Field(False) + audio: bool = Field(False, description="Whether to generate audio automatically.") + shot_type: str = Field("single") + + +class Image2VideoParametersField(BaseModel): + resolution: str = Field(...) 
+ seed: int = Field(..., ge=0, le=2147483647) + duration: int = Field(5, ge=5, le=15) + prompt_extend: bool = Field(True) + watermark: bool = Field(False) + audio: bool = Field(False, description="Whether to generate audio automatically.") + shot_type: str = Field("single") + + +class Reference2VideoParametersField(BaseModel): + size: str = Field(...) + duration: int = Field(5, ge=5, le=15) + shot_type: str = Field("single") + seed: int = Field(..., ge=0, le=2147483647) + watermark: bool = Field(False) + + +class Text2ImageTaskCreationRequest(BaseModel): + model: str = Field(...) + input: Text2ImageInputField = Field(...) + parameters: Txt2ImageParametersField = Field(...) + + +class Image2ImageTaskCreationRequest(BaseModel): + model: str = Field(...) + input: Image2ImageInputField = Field(...) + parameters: Image2ImageParametersField = Field(...) + + +class Text2VideoTaskCreationRequest(BaseModel): + model: str = Field(...) + input: Text2VideoInputField = Field(...) + parameters: Text2VideoParametersField = Field(...) + + +class Image2VideoTaskCreationRequest(BaseModel): + model: str = Field(...) + input: Image2VideoInputField = Field(...) + parameters: Image2VideoParametersField = Field(...) + + +class Reference2VideoTaskCreationRequest(BaseModel): + model: str = Field(...) + input: Reference2VideoInputField = Field(...) + parameters: Reference2VideoParametersField = Field(...) + + +class Wan27MediaItem(BaseModel): + type: str = Field(...) + url: str = Field(...) + + +class Wan27ReferenceVideoInputField(BaseModel): + prompt: str = Field(...) + negative_prompt: str | None = Field(None) + media: list[Wan27MediaItem] = Field(...) + + +class Wan27ReferenceVideoParametersField(BaseModel): + resolution: str = Field(...) + ratio: str | None = Field(None) + duration: int = Field(5, ge=2, le=10) + watermark: bool = Field(False) + seed: int = Field(..., ge=0, le=2147483647) + + +class Wan27ReferenceVideoTaskCreationRequest(BaseModel): + model: str = Field(...) + input: Wan27ReferenceVideoInputField = Field(...) + parameters: Wan27ReferenceVideoParametersField = Field(...) + + +class Wan27ImageToVideoInputField(BaseModel): + prompt: str | None = Field(None) + negative_prompt: str | None = Field(None) + media: list[Wan27MediaItem] = Field(...) + + +class Wan27ImageToVideoParametersField(BaseModel): + resolution: str = Field(...) + duration: int = Field(5, ge=2, le=15) + prompt_extend: bool = Field(True) + watermark: bool = Field(False) + seed: int = Field(..., ge=0, le=2147483647) + + +class Wan27ImageToVideoTaskCreationRequest(BaseModel): + model: str = Field(...) + input: Wan27ImageToVideoInputField = Field(...) + parameters: Wan27ImageToVideoParametersField = Field(...) + + +class Wan27VideoEditInputField(BaseModel): + prompt: str = Field(...) + media: list[Wan27MediaItem] = Field(...) + + +class Wan27VideoEditParametersField(BaseModel): + resolution: str = Field(...) + ratio: str | None = Field(None) + duration: int = Field(0) + audio_setting: str = Field("auto") + watermark: bool = Field(False) + seed: int = Field(..., ge=0, le=2147483647) + + +class Wan27VideoEditTaskCreationRequest(BaseModel): + model: str = Field(...) + input: Wan27VideoEditInputField = Field(...) + parameters: Wan27VideoEditParametersField = Field(...) + + +class Wan27Text2VideoParametersField(BaseModel): + resolution: str = Field(...) 
+ ratio: str | None = Field(None) + duration: int = Field(5, ge=2, le=15) + prompt_extend: bool = Field(True) + watermark: bool = Field(False) + seed: int = Field(..., ge=0, le=2147483647) + + +class Wan27Text2VideoTaskCreationRequest(BaseModel): + model: str = Field(...) + input: Text2VideoInputField = Field(...) + parameters: Wan27Text2VideoParametersField = Field(...) + + +class TaskCreationOutputField(BaseModel): + task_id: str = Field(...) + task_status: str = Field(...) + + +class TaskCreationResponse(BaseModel): + output: TaskCreationOutputField | None = Field(None) + request_id: str = Field(...) + code: str | None = Field(None, description="Error code for the failed request.") + message: str | None = Field(None, description="Details about the failed request.") + + +class TaskResult(BaseModel): + url: str | None = Field(None) + code: str | None = Field(None) + message: str | None = Field(None) + + +class ImageTaskStatusOutputField(TaskCreationOutputField): + task_id: str = Field(...) + task_status: str = Field(...) + results: list[TaskResult] | None = Field(None) + + +class VideoTaskStatusOutputField(TaskCreationOutputField): + task_id: str = Field(...) + task_status: str = Field(...) + video_url: str | None = Field(None) + code: str | None = Field(None) + message: str | None = Field(None) + + +class ImageTaskStatusResponse(BaseModel): + output: ImageTaskStatusOutputField | None = Field(None) + request_id: str = Field(...) + + +class VideoTaskStatusResponse(BaseModel): + output: VideoTaskStatusOutputField | None = Field(None) + request_id: str = Field(...) diff --git a/comfy_api_nodes/nodes_wan.py b/comfy_api_nodes/nodes_wan.py index e2afe7f9c..d1470894a 100644 --- a/comfy_api_nodes/nodes_wan.py +++ b/comfy_api_nodes/nodes_wan.py @@ -1,9 +1,40 @@ import re -from pydantic import BaseModel, Field from typing_extensions import override from comfy_api.latest import IO, ComfyExtension, Input +from comfy_api_nodes.apis.wan import ( + Image2ImageInputField, + Image2ImageParametersField, + Image2ImageTaskCreationRequest, + Image2VideoInputField, + Image2VideoParametersField, + Image2VideoTaskCreationRequest, + ImageTaskStatusResponse, + Reference2VideoInputField, + Reference2VideoParametersField, + Reference2VideoTaskCreationRequest, + TaskCreationResponse, + Text2ImageInputField, + Text2ImageTaskCreationRequest, + Text2VideoInputField, + Text2VideoParametersField, + Text2VideoTaskCreationRequest, + Txt2ImageParametersField, + VideoTaskStatusResponse, + Wan27ImageToVideoInputField, + Wan27ImageToVideoParametersField, + Wan27ImageToVideoTaskCreationRequest, + Wan27MediaItem, + Wan27ReferenceVideoInputField, + Wan27ReferenceVideoParametersField, + Wan27ReferenceVideoTaskCreationRequest, + Wan27Text2VideoParametersField, + Wan27Text2VideoTaskCreationRequest, + Wan27VideoEditInputField, + Wan27VideoEditParametersField, + Wan27VideoEditTaskCreationRequest, +) from comfy_api_nodes.util import ( ApiEndpoint, audio_to_base64_string, @@ -13,157 +44,14 @@ from comfy_api_nodes.util import ( poll_op, sync_op, tensor_to_base64_string, + upload_audio_to_comfyapi, + upload_image_to_comfyapi, upload_video_to_comfyapi, validate_audio_duration, + validate_string, validate_video_duration, ) - -class Text2ImageInputField(BaseModel): - prompt: str = Field(...) - negative_prompt: str | None = Field(None) - - -class Image2ImageInputField(BaseModel): - prompt: str = Field(...) 
- negative_prompt: str | None = Field(None) - images: list[str] = Field(..., min_length=1, max_length=2) - - -class Text2VideoInputField(BaseModel): - prompt: str = Field(...) - negative_prompt: str | None = Field(None) - audio_url: str | None = Field(None) - - -class Image2VideoInputField(BaseModel): - prompt: str = Field(...) - negative_prompt: str | None = Field(None) - img_url: str = Field(...) - audio_url: str | None = Field(None) - - -class Reference2VideoInputField(BaseModel): - prompt: str = Field(...) - negative_prompt: str | None = Field(None) - reference_video_urls: list[str] = Field(...) - - -class Txt2ImageParametersField(BaseModel): - size: str = Field(...) - n: int = Field(1, description="Number of images to generate.") # we support only value=1 - seed: int = Field(..., ge=0, le=2147483647) - prompt_extend: bool = Field(True) - watermark: bool = Field(False) - - -class Image2ImageParametersField(BaseModel): - size: str | None = Field(None) - n: int = Field(1, description="Number of images to generate.") # we support only value=1 - seed: int = Field(..., ge=0, le=2147483647) - watermark: bool = Field(False) - - -class Text2VideoParametersField(BaseModel): - size: str = Field(...) - seed: int = Field(..., ge=0, le=2147483647) - duration: int = Field(5, ge=5, le=15) - prompt_extend: bool = Field(True) - watermark: bool = Field(False) - audio: bool = Field(False, description="Whether to generate audio automatically.") - shot_type: str = Field("single") - - -class Image2VideoParametersField(BaseModel): - resolution: str = Field(...) - seed: int = Field(..., ge=0, le=2147483647) - duration: int = Field(5, ge=5, le=15) - prompt_extend: bool = Field(True) - watermark: bool = Field(False) - audio: bool = Field(False, description="Whether to generate audio automatically.") - shot_type: str = Field("single") - - -class Reference2VideoParametersField(BaseModel): - size: str = Field(...) - duration: int = Field(5, ge=5, le=15) - shot_type: str = Field("single") - seed: int = Field(..., ge=0, le=2147483647) - watermark: bool = Field(False) - - -class Text2ImageTaskCreationRequest(BaseModel): - model: str = Field(...) - input: Text2ImageInputField = Field(...) - parameters: Txt2ImageParametersField = Field(...) - - -class Image2ImageTaskCreationRequest(BaseModel): - model: str = Field(...) - input: Image2ImageInputField = Field(...) - parameters: Image2ImageParametersField = Field(...) - - -class Text2VideoTaskCreationRequest(BaseModel): - model: str = Field(...) - input: Text2VideoInputField = Field(...) - parameters: Text2VideoParametersField = Field(...) - - -class Image2VideoTaskCreationRequest(BaseModel): - model: str = Field(...) - input: Image2VideoInputField = Field(...) - parameters: Image2VideoParametersField = Field(...) - - -class Reference2VideoTaskCreationRequest(BaseModel): - model: str = Field(...) - input: Reference2VideoInputField = Field(...) - parameters: Reference2VideoParametersField = Field(...) - - -class TaskCreationOutputField(BaseModel): - task_id: str = Field(...) - task_status: str = Field(...) - - -class TaskCreationResponse(BaseModel): - output: TaskCreationOutputField | None = Field(None) - request_id: str = Field(...) 
- code: str | None = Field(None, description="Error code for the failed request.") - message: str | None = Field(None, description="Details about the failed request.") - - -class TaskResult(BaseModel): - url: str | None = Field(None) - code: str | None = Field(None) - message: str | None = Field(None) - - -class ImageTaskStatusOutputField(TaskCreationOutputField): - task_id: str = Field(...) - task_status: str = Field(...) - results: list[TaskResult] | None = Field(None) - - -class VideoTaskStatusOutputField(TaskCreationOutputField): - task_id: str = Field(...) - task_status: str = Field(...) - video_url: str | None = Field(None) - code: str | None = Field(None) - message: str | None = Field(None) - - -class ImageTaskStatusResponse(BaseModel): - output: ImageTaskStatusOutputField | None = Field(None) - request_id: str = Field(...) - - -class VideoTaskStatusResponse(BaseModel): - output: VideoTaskStatusOutputField | None = Field(None) - request_id: str = Field(...) - - RES_IN_PARENS = re.compile(r"\((\d+)\s*[x×]\s*(\d+)\)") @@ -179,7 +67,6 @@ class WanTextToImageApi(IO.ComfyNode): IO.Combo.Input( "model", options=["wan2.5-t2i-preview"], - default="wan2.5-t2i-preview", tooltip="Model to use.", ), IO.String.Input( @@ -936,6 +823,829 @@ class WanReferenceVideoApi(IO.ComfyNode): return IO.NodeOutput(await download_url_to_video_output(response.output.video_url)) +class Wan2TextToVideoApi(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="Wan2TextToVideoApi", + display_name="Wan 2.7 Text to Video", + category="api node/video/Wan", + description="Generates a video based on a text prompt using the Wan 2.7 model.", + inputs=[ + IO.DynamicCombo.Input( + "model", + options=[ + IO.DynamicCombo.Option( + "wan2.7-t2v", + [ + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt describing the elements and visual features. " + "Supports English and Chinese.", + ), + IO.String.Input( + "negative_prompt", + multiline=True, + default="", + tooltip="Negative prompt describing what to avoid.", + ), + IO.Combo.Input( + "resolution", + options=["720P", "1080P"], + ), + IO.Combo.Input( + "ratio", + options=["16:9", "9:16", "1:1", "4:3", "3:4"], + ), + IO.Int.Input( + "duration", + default=5, + min=2, + max=15, + step=1, + display_mode=IO.NumberDisplay.number, + ), + ], + ), + ], + ), + IO.Audio.Input( + "audio", + optional=True, + tooltip="Audio for driving video generation (e.g., lip sync, beat-matched motion). " + "Duration: 3s-30s. 
If not provided, the model automatically generates matching " + "background music or sound effects.", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to use for generation.", + ), + IO.Boolean.Input( + "prompt_extend", + default=True, + tooltip="Whether to enhance the prompt with AI assistance.", + advanced=True, + ), + IO.Boolean.Input( + "watermark", + default=False, + tooltip="Whether to add an AI-generated watermark to the result.", + advanced=True, + ), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["model", "model.resolution", "model.duration"]), + expr=""" + ( + $res := $lookup(widgets, "model.resolution"); + $dur := $lookup(widgets, "model.duration"); + $ppsTable := { "720p": 0.1, "1080p": 0.15 }; + $pps := $lookup($ppsTable, $res); + { "type": "usd", "usd": $pps * $dur } + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + model: dict, + seed: int, + prompt_extend: bool, + watermark: bool, + audio: Input.Audio | None = None, + ): + validate_string(model["prompt"], strip_whitespace=False, min_length=1) + audio_url = None + if audio is not None: + validate_audio_duration(audio, 1.5, 60.0) + audio_url = await upload_audio_to_comfyapi( + cls, audio, container_format="mp3", codec_name="libmp3lame", mime_type="audio/mpeg" + ) + initial_response = await sync_op( + cls, + ApiEndpoint( + path="/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis", + method="POST", + ), + response_model=TaskCreationResponse, + data=Wan27Text2VideoTaskCreationRequest( + model=model["model"], + input=Text2VideoInputField( + prompt=model["prompt"], + negative_prompt=model["negative_prompt"] or None, + audio_url=audio_url, + ), + parameters=Wan27Text2VideoParametersField( + resolution=model["resolution"], + ratio=model["ratio"], + duration=model["duration"], + seed=seed, + prompt_extend=prompt_extend, + watermark=watermark, + ), + ), + ) + if not initial_response.output: + raise Exception(f"An unknown error occurred: {initial_response.code} - {initial_response.message}") + response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}"), + response_model=VideoTaskStatusResponse, + status_extractor=lambda x: x.output.task_status, + poll_interval=7, + ) + return IO.NodeOutput(await download_url_to_video_output(response.output.video_url)) + + +class Wan2ImageToVideoApi(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="Wan2ImageToVideoApi", + display_name="Wan 2.7 Image to Video", + category="api node/video/Wan", + description="Generate a video from a first-frame image, with optional last-frame image and audio.", + inputs=[ + IO.DynamicCombo.Input( + "model", + options=[ + IO.DynamicCombo.Option( + "wan2.7-i2v", + [ + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt describing the elements and visual features. 
" + "Supports English and Chinese.", + ), + IO.String.Input( + "negative_prompt", + multiline=True, + default="", + tooltip="Negative prompt describing what to avoid.", + ), + IO.Combo.Input( + "resolution", + options=["720P", "1080P"], + ), + IO.Int.Input( + "duration", + default=5, + min=2, + max=15, + step=1, + display_mode=IO.NumberDisplay.number, + ), + ], + ), + ], + ), + IO.Image.Input( + "first_frame", + tooltip="First frame image. The output aspect ratio is derived from this image.", + ), + IO.Image.Input( + "last_frame", + optional=True, + tooltip="Last frame image. The model generates a video transitioning from first to last frame.", + ), + IO.Audio.Input( + "audio", + optional=True, + tooltip="Audio for driving video generation (e.g., lip sync, beat-matched motion). " + "Duration: 2s-30s. If not provided, the model automatically generates matching " + "background music or sound effects.", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to use for generation.", + ), + IO.Boolean.Input( + "prompt_extend", + default=True, + tooltip="Whether to enhance the prompt with AI assistance.", + advanced=True, + ), + IO.Boolean.Input( + "watermark", + default=False, + tooltip="Whether to add an AI-generated watermark to the result.", + advanced=True, + ), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["model", "model.resolution", "model.duration"]), + expr=""" + ( + $res := $lookup(widgets, "model.resolution"); + $dur := $lookup(widgets, "model.duration"); + $ppsTable := { "720p": 0.1, "1080p": 0.15 }; + $pps := $lookup($ppsTable, $res); + { "type": "usd", "usd": $pps * $dur } + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + model: dict, + first_frame: Input.Image, + seed: int, + prompt_extend: bool, + watermark: bool, + last_frame: Input.Image | None = None, + audio: Input.Audio | None = None, + ): + media = [ + Wan27MediaItem( + type="first_frame", + url=await upload_image_to_comfyapi(cls, image=first_frame), + ) + ] + if last_frame is not None: + media.append( + Wan27MediaItem( + type="last_frame", + url=await upload_image_to_comfyapi(cls, image=last_frame), + ) + ) + if audio is not None: + validate_audio_duration(audio, 2.0, 30.0) + audio_url = await upload_audio_to_comfyapi( + cls, audio, container_format="mp3", codec_name="libmp3lame", mime_type="audio/mpeg" + ) + media.append(Wan27MediaItem(type="driving_audio", url=audio_url)) + initial_response = await sync_op( + cls, + ApiEndpoint( + path="/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis", + method="POST", + ), + response_model=TaskCreationResponse, + data=Wan27ImageToVideoTaskCreationRequest( + model=model["model"], + input=Wan27ImageToVideoInputField( + prompt=model["prompt"] or None, + negative_prompt=model["negative_prompt"] or None, + media=media, + ), + parameters=Wan27ImageToVideoParametersField( + resolution=model["resolution"], + duration=model["duration"], + seed=seed, + prompt_extend=prompt_extend, + watermark=watermark, + ), + ), + ) + if not initial_response.output: + raise Exception(f"An unknown error occurred: {initial_response.code} - {initial_response.message}") + response = await poll_op( + cls, + 
ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}"), + response_model=VideoTaskStatusResponse, + status_extractor=lambda x: x.output.task_status, + poll_interval=7, + ) + return IO.NodeOutput(await download_url_to_video_output(response.output.video_url)) + + +class Wan2VideoContinuationApi(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="Wan2VideoContinuationApi", + display_name="Wan 2.7 Video Continuation", + category="api node/video/Wan", + description="Continue a video from where it left off, with optional last-frame control.", + inputs=[ + IO.DynamicCombo.Input( + "model", + options=[ + IO.DynamicCombo.Option( + "wan2.7-i2v", + [ + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt describing the elements and visual features. Supports English and Chinese.", + ), + IO.String.Input( + "negative_prompt", + multiline=True, + default="", + tooltip="Negative prompt describing what to avoid.", + ), + IO.Combo.Input( + "resolution", + options=["720P", "1080P"], + ), + IO.Int.Input( + "duration", + default=5, + min=2, + max=15, + step=1, + display_mode=IO.NumberDisplay.number, + tooltip="Total output duration in seconds. The model generates continuation " + "to fill the remaining time after the input clip.", + ), + ], + ), + ], + ), + IO.Video.Input( + "first_clip", + tooltip="Input video to continue from. Duration: 2s-10s. " + "The output aspect ratio is derived from this video.", + ), + IO.Image.Input( + "last_frame", + optional=True, + tooltip="Last frame image. The continuation will transition towards this frame.", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to use for generation.", + ), + IO.Boolean.Input( + "prompt_extend", + default=True, + tooltip="Whether to enhance the prompt with AI assistance.", + advanced=True, + ), + IO.Boolean.Input( + "watermark", + default=False, + tooltip="Whether to add an AI-generated watermark to the result.", + advanced=True, + ), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["model", "model.resolution", "model.duration"]), + expr=""" + ( + $res := $lookup(widgets, "model.resolution"); + $dur := $lookup(widgets, "model.duration"); + $ppsTable := { "720p": 0.1, "1080p": 0.15 }; + $pps := $lookup($ppsTable, $res); + $outputPrice := $pps * $dur; + { + "type": "range_usd", + "min_usd": 2 * $pps + $outputPrice, + "max_usd": 5 * $pps + $outputPrice + } + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + model: dict, + first_clip: Input.Video, + prompt: str = "", + negative_prompt: str = "", + last_frame: Input.Image | None = None, + seed: int = 0, + prompt_extend: bool = True, + watermark: bool = False, + ): + validate_video_duration(first_clip, min_duration=2, max_duration=10) + media = [ + Wan27MediaItem( + type="first_clip", + url=await upload_video_to_comfyapi(cls, first_clip), + ) + ] + if last_frame is not None: + media.append( + Wan27MediaItem( + type="last_frame", + url=await upload_image_to_comfyapi(cls, image=last_frame), + ) + ) + initial_response = await sync_op( + cls, + ApiEndpoint( + path="/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis", + method="POST", + ), + response_model=TaskCreationResponse, + 
data=Wan27ImageToVideoTaskCreationRequest( + model=model["model"], + input=Wan27ImageToVideoInputField( + prompt=model["prompt"] or None, + negative_prompt=model["negative_prompt"] or None, + media=media, + ), + parameters=Wan27ImageToVideoParametersField( + resolution=model["resolution"], + duration=model["duration"], + seed=seed, + prompt_extend=prompt_extend, + watermark=watermark, + ), + ), + ) + if not initial_response.output: + raise Exception(f"An unknown error occurred: {initial_response.code} - {initial_response.message}") + response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}"), + response_model=VideoTaskStatusResponse, + status_extractor=lambda x: x.output.task_status, + poll_interval=7, + ) + return IO.NodeOutput(await download_url_to_video_output(response.output.video_url)) + + +class Wan2VideoEditApi(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="Wan2VideoEditApi", + display_name="Wan 2.7 Video Edit", + category="api node/video/Wan", + description="Edit a video using text instructions, reference images, or style transfer.", + inputs=[ + IO.DynamicCombo.Input( + "model", + options=[ + IO.DynamicCombo.Option( + "wan2.7-videoedit", + [ + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Editing instructions or style transfer requirements.", + ), + IO.Combo.Input( + "resolution", + options=["720P", "1080P"], + ), + IO.Combo.Input( + "ratio", + options=["16:9", "9:16", "1:1", "4:3", "3:4"], + tooltip="Aspect ratio. If not changed, approximates the input video ratio.", + ), + IO.Combo.Input( + "duration", + options=["auto", "2", "3", "4", "5", "6", "7", "8", "9", "10"], + default="auto", + tooltip="Output duration in seconds. 'auto' matches the input video duration. " + "A specific value truncates from the start of the video.", + ), + IO.Autogrow.Input( + "reference_images", + template=IO.Autogrow.TemplateNames( + IO.Image.Input("reference_image"), + names=[ + "image1", + "image2", + "image3", + "image4", + ], + min=0, + ), + ), + ], + ), + ], + ), + IO.Video.Input( + "video", + tooltip="The video to edit.", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to use for generation.", + ), + IO.Combo.Input( + "audio_setting", + options=["auto", "origin"], + default="auto", + tooltip="'auto': model decides whether to regenerate audio based on the prompt. 
" + "'origin': preserve the original audio from the input video.", + advanced=True, + ), + IO.Boolean.Input( + "watermark", + default=False, + tooltip="Whether to add an AI-generated watermark to the result.", + advanced=True, + ), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["model", "model.resolution", "model.duration"]), + expr=""" + ( + $res := $lookup(widgets, "model.resolution"); + $dur := $lookup(widgets, "model.duration"); + $ppsTable := { "720p": 0.1, "1080p": 0.15 }; + $pps := $lookup($ppsTable, $res); + { "type": "usd", "usd": $pps, "format": { "suffix": "/second", "note": "(input + output)" } } + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + model: dict, + video: Input.Video, + seed: int, + audio_setting: str, + watermark: bool, + ): + validate_string(model["prompt"], strip_whitespace=False, min_length=1) + validate_video_duration(video, min_duration=2, max_duration=10) + duration = 0 if model["duration"] == "auto" else int(model["duration"]) + media = [Wan27MediaItem(type="video", url=await upload_video_to_comfyapi(cls, video))] + reference_images = model.get("reference_images", {}) + for key in reference_images: + media.append( + Wan27MediaItem( + type="reference_image", url=await upload_image_to_comfyapi(cls, image=reference_images[key]) + ) + ) + initial_response = await sync_op( + cls, + ApiEndpoint( + path="/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis", + method="POST", + ), + response_model=TaskCreationResponse, + data=Wan27VideoEditTaskCreationRequest( + model=model["model"], + input=Wan27VideoEditInputField(prompt=model["prompt"], media=media), + parameters=Wan27VideoEditParametersField( + resolution=model["resolution"], + ratio=model["ratio"], + duration=duration, + audio_setting=audio_setting, + watermark=watermark, + seed=seed, + ), + ), + ) + if not initial_response.output: + raise Exception(f"An unknown error occurred: {initial_response.code} - {initial_response.message}") + response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}"), + response_model=VideoTaskStatusResponse, + status_extractor=lambda x: x.output.task_status, + poll_interval=7, + ) + return IO.NodeOutput(await download_url_to_video_output(response.output.video_url)) + + +class Wan2ReferenceVideoApi(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="Wan2ReferenceVideoApi", + display_name="Wan 2.7 Reference to Video", + category="api node/video/Wan", + description="Generate a video featuring a person or object from reference materials. " + "Supports single-character performances and multi-character interactions.", + inputs=[ + IO.DynamicCombo.Input( + "model", + options=[ + IO.DynamicCombo.Option( + "wan2.7-r2v", + [ + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt describing the video. 
+                                    "'character2' to refer to the reference characters.",
+                                ),
+                                IO.String.Input(
+                                    "negative_prompt",
+                                    multiline=True,
+                                    default="",
+                                    tooltip="Negative prompt describing what to avoid.",
+                                ),
+                                IO.Combo.Input(
+                                    "resolution",
+                                    options=["720P", "1080P"],
+                                ),
+                                IO.Combo.Input(
+                                    "ratio",
+                                    options=["16:9", "9:16", "1:1", "4:3", "3:4"],
+                                ),
+                                IO.Int.Input(
+                                    "duration",
+                                    default=5,
+                                    min=2,
+                                    max=10,
+                                    step=1,
+                                    display_mode=IO.NumberDisplay.number,
+                                ),
+                                IO.Autogrow.Input(
+                                    "reference_videos",
+                                    template=IO.Autogrow.TemplateNames(
+                                        IO.Video.Input("reference_video"),
+                                        names=["video1", "video2", "video3"],
+                                        min=0,
+                                    ),
+                                ),
+                                IO.Autogrow.Input(
+                                    "reference_images",
+                                    template=IO.Autogrow.TemplateNames(
+                                        IO.Image.Input("reference_image"),
+                                        names=["image1", "image2", "image3", "image4", "image5"],
+                                        min=0,
+                                    ),
+                                ),
+                            ],
+                        ),
+                    ],
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    step=1,
+                    display_mode=IO.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="Seed to use for generation.",
+                ),
+                IO.Boolean.Input(
+                    "watermark",
+                    default=False,
+                    tooltip="Whether to add an AI-generated watermark to the result.",
+                    advanced=True,
+                ),
+            ],
+            outputs=[
+                IO.Video.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(widgets=["model", "model.resolution", "model.duration"]),
+                expr="""
+                (
+                    $res := $lookup(widgets, "model.resolution");
+                    $dur := $lookup(widgets, "model.duration");
+                    $ppsTable := { "720p": 0.1, "1080p": 0.15 };
+                    $pps := $lookup($ppsTable, $lowercase($res));
+                    $outputPrice := $pps * $dur;
+                    {
+                        "type": "range_usd",
+                        "min_usd": $outputPrice,
+                        "max_usd": 5 * $pps + $outputPrice
+                    }
+                )
+                """,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model: dict,
+        seed: int,
+        watermark: bool,
+    ):
+        validate_string(model["prompt"], strip_whitespace=False, min_length=1)
+        media = []
+        reference_videos = model.get("reference_videos", {})
+        for key in reference_videos:
+            media.append(
+                Wan27MediaItem(type="reference_video", url=await upload_video_to_comfyapi(cls, reference_videos[key]))
+            )
+        reference_images = model.get("reference_images", {})
+        for key in reference_images:
+            media.append(
+                Wan27MediaItem(
+                    type="reference_image",
+                    url=await upload_image_to_comfyapi(cls, image=reference_images[key]),
+                )
+            )
+        if not media:
+            raise ValueError("At least one reference video or reference image must be provided.")
+        if len(media) > 5:
+            raise ValueError(
+                f"Too many references ({len(media)}). The maximum total of reference videos and images is 5."
+            )
+
+        initial_response = await sync_op(
+            cls,
+            ApiEndpoint(
+                path="/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis",
+                method="POST",
+            ),
+            response_model=TaskCreationResponse,
+            data=Wan27ReferenceVideoTaskCreationRequest(
+                model=model["model"],
+                input=Wan27ReferenceVideoInputField(
+                    prompt=model["prompt"],
+                    negative_prompt=model["negative_prompt"] or None,
+                    media=media,
+                ),
+                parameters=Wan27ReferenceVideoParametersField(
+                    resolution=model["resolution"],
+                    ratio=model["ratio"],
+                    duration=model["duration"],
+                    watermark=watermark,
+                    seed=seed,
+                ),
+            ),
+        )
+        if not initial_response.output:
+            raise Exception(f"An unknown error occurred: {initial_response.code} - {initial_response.message}")
+        response = await poll_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}"),
+            response_model=VideoTaskStatusResponse,
+            status_extractor=lambda x: x.output.task_status,
+            poll_interval=7,
+        )
+        return IO.NodeOutput(await download_url_to_video_output(response.output.video_url))
+
+
 class WanApiExtension(ComfyExtension):
     @override
     async def get_node_list(self) -> list[type[IO.ComfyNode]]:
@@ -945,6 +1655,11 @@ class WanApiExtension(ComfyExtension):
             WanTextToVideoApi,
             WanImageToVideoApi,
             WanReferenceVideoApi,
+            Wan2TextToVideoApi,
+            Wan2ImageToVideoApi,
+            Wan2VideoContinuationApi,
+            Wan2VideoEditApi,
+            Wan2ReferenceVideoApi,
         ]
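All five Wan 2.7 nodes added above share one control flow: a single `sync_op` POST that yields `output.task_id`, followed by `poll_op` against `/proxy/wan/api/v1/tasks/{task_id}` every 7 seconds until the task settles. Below is a minimal, self-contained sketch of that submit-then-poll shape; the endpoint stubs and the `PENDING`/`RUNNING`/`SUCCEEDED` status names are illustrative stand-ins, not the actual ComfyUI or Wan API.

```python
import asyncio
import itertools

# Simulated task lifecycle: two in-flight polls, then success.
_STATUSES = itertools.chain(["PENDING", "RUNNING"], itertools.repeat("SUCCEEDED"))


async def create_task(payload: dict) -> str:
    # Stand-in for the initial POST that returns a task id.
    return "task-123"


async def get_task_status(task_id: str) -> str:
    # Stand-in for the polling GET against /tasks/{task_id}.
    return next(_STATUSES)


async def run_generation(payload: dict, poll_interval: float = 7.0) -> str:
    task_id = await create_task(payload)
    while True:
        status = await get_task_status(task_id)
        if status == "SUCCEEDED":
            return f"video-url-for-{task_id}"  # the real nodes download this URL
        if status in ("FAILED", "CANCELED"):
            raise RuntimeError(f"task {task_id} ended with status {status}")
        await asyncio.sleep(poll_interval)


print(asyncio.run(run_generation({"prompt": "a cat"}, poll_interval=0.01)))
```

The 7-second interval used by the nodes trades API load against result latency; the sketch shortens it only so the demo finishes quickly.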
From eb0686bbb60c83e44c3a3e4f7defd0f589cfef10 Mon Sep 17 00:00:00 2001
From: "Daxiong (Lin)"
Date: Fri, 3 Apr 2026 14:52:10 +0800
Subject: [PATCH 25/29] Update template to 0.9.43 (#13265)

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 0d88fdcfb..1031ffa88 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 comfyui-frontend-package==1.42.8
-comfyui-workflow-templates==0.9.41
+comfyui-workflow-templates==0.9.43
 comfyui-embedded-docs==0.4.3
 torch
 torchsde

From f21f6b22125a80d714be6f2a3a0f3a58850daee5 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Fri, 3 Apr 2026 12:29:06 -0700
Subject: [PATCH 26/29] Add portable release for intel XPU. (#13272)

---
 .github/workflows/release-stable-all.yml | 36 ++++++++++++------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/release-stable-all.yml b/.github/workflows/release-stable-all.yml
index 8f07a7b1c..d7cf69fe2 100644
--- a/.github/workflows/release-stable-all.yml
+++ b/.github/workflows/release-stable-all.yml
@@ -20,29 +20,12 @@ jobs:
       git_tag: ${{ inputs.git_tag }}
       cache_tag: "cu130"
       python_minor: "13"
-      python_patch: "11"
+      python_patch: "12"
       rel_name: "nvidia"
       rel_extra_name: ""
       test_release: true
     secrets: inherit
 
-  release_nvidia_cu128:
-    permissions:
-      contents: "write"
-      packages: "write"
-      pull-requests: "read"
-    name: "Release NVIDIA cu128"
-    uses: ./.github/workflows/stable-release.yml
-    with:
-      git_tag: ${{ inputs.git_tag }}
-      cache_tag: "cu128"
-      python_minor: "12"
-      python_patch: "10"
-      rel_name: "nvidia"
-      rel_extra_name: "_cu128"
-      test_release: true
-    secrets: inherit
-
   release_nvidia_cu126:
     permissions:
       contents: "write"
@@ -76,3 +59,20 @@
       rel_extra_name: ""
       test_release: false
     secrets: inherit
+
+  release_xpu:
+    permissions:
+      contents: "write"
+      packages: "write"
+      pull-requests: "read"
+    name: "Release Intel XPU"
+    uses: ./.github/workflows/stable-release.yml
+    with:
+      git_tag: ${{ inputs.git_tag }}
+      cache_tag: "xpu"
+      python_minor: "13"
+      python_patch: "12"
+      rel_name: "intel"
+      rel_extra_name: ""
+      test_release: true
+    secrets: inherit
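The new `release_xpu` job reuses the same `stable-release.yml` reusable workflow as the NVIDIA jobs, varying only inputs such as `cache_tag` and `rel_name`. For anyone checking which accelerator a downloaded portable build actually exposes at runtime, a best-effort probe might look like the sketch below; the `torch.xpu` attribute is guarded with `getattr` because it is absent from older PyTorch builds.

```python
import torch


def detect_accelerator() -> str:
    # Best-effort probe of the backend a portable build ships with.
    if torch.cuda.is_available():
        # NVIDIA portables; torch.version.cuda reports the bundled toolkit.
        return f"cuda {torch.version.cuda}"
    xpu = getattr(torch, "xpu", None)  # Intel portable builds
    if xpu is not None and xpu.is_available():
        return "xpu"
    return "cpu"


print(detect_accelerator())
```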
From 13917b388028aee922efce6b4714db96e8cfea36 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Sat, 4 Apr 2026 16:02:47 -0700
Subject: [PATCH 27/29] Nightly Nvidia pytorch is now cu132 (#13288)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a47506fc8..7a7dd5614 100644
--- a/README.md
+++ b/README.md
@@ -276,7 +276,7 @@ Nvidia users should install stable pytorch using this command:
 
 This is the command to install pytorch nightly instead which might have performance improvements.
 
-```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu130```
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu132```
 
 #### Troubleshooting
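After switching to the cu132 nightly index it is easy to end up with a cached CPU or older-toolkit wheel instead, so confirming what pip actually installed is worth two lines; the version strings in the comments are examples of the expected shape, not guaranteed formats.

```python
import torch

print("torch:", torch.__version__)          # e.g. "2.x.0.dev20260404+cu132"
print("cuda toolkit:", torch.version.cuda)  # e.g. "13.2"; None on CPU-only wheels
```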
From 8cbbea8f6a571d8d2a859608bb9434103de769d7 Mon Sep 17 00:00:00 2001
From: "Daxiong (Lin)"
Date: Sun, 5 Apr 2026 13:31:11 +0800
Subject: [PATCH 28/29] chore: update workflow templates to v0.9.44 (#13290)

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 1031ffa88..1a8e1ea1c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 comfyui-frontend-package==1.42.8
-comfyui-workflow-templates==0.9.43
+comfyui-workflow-templates==0.9.44
 comfyui-embedded-docs==0.4.3
 torch
 torchsde

From 4b1444fc7a7d1dc542020f509dab2e2b90a4f16a Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Sun, 5 Apr 2026 16:37:27 -0700
Subject: [PATCH 29/29] Update README.md with new frontend release cycle. (#13301)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7a7dd5614..1eeb810de 100644
--- a/README.md
+++ b/README.md
@@ -137,7 +137,7 @@ ComfyUI follows a weekly release cycle targeting Monday but this regularly chang
    - Builds a new release using the latest stable core version
 
 3. **[ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend)**
-   - Weekly frontend updates are merged into the core repository
+   - Every 2+ weeks frontend updates are merged into the core repository
    - Features are frozen for the upcoming core release
    - Development continues for the next release cycle
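Across the series, `comfyui-workflow-templates` steps from 0.9.41 to 0.9.43 to 0.9.44 while the neighboring pins stay fixed. A standard-library-only sketch for checking a local environment against those pins follows; the package names and versions below are taken from the requirements.txt hunks above.

```python
from importlib import metadata

PINS = {
    "comfyui-frontend-package": "1.42.8",
    "comfyui-workflow-templates": "0.9.44",
    "comfyui-embedded-docs": "0.4.3",
}

for pkg, pinned in PINS.items():
    try:
        installed = metadata.version(pkg)
    except metadata.PackageNotFoundError:
        installed = "not installed"
    status = "ok" if installed == pinned else "MISMATCH"
    print(f"{pkg}: pinned {pinned}, installed {installed} [{status}]")
```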