From e2a800e7ef225260c078ce484c75bb40161d9d94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com> Date: Tue, 9 Dec 2025 23:59:16 +0200 Subject: [PATCH 01/21] Fix for HunyuanVideo1.5 meanflow distil (#11212) --- comfy/ldm/hunyuan_video/model.py | 3 ++- comfy/model_detection.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/comfy/ldm/hunyuan_video/model.py b/comfy/ldm/hunyuan_video/model.py index 2749c53f5..55ab550f8 100644 --- a/comfy/ldm/hunyuan_video/model.py +++ b/comfy/ldm/hunyuan_video/model.py @@ -43,6 +43,7 @@ class HunyuanVideoParams: meanflow: bool use_cond_type_embedding: bool vision_in_dim: int + meanflow_sum: bool class SelfAttentionRef(nn.Module): @@ -317,7 +318,7 @@ class HunyuanVideo(nn.Module): timesteps_r = transformer_options['sample_sigmas'][w[0] + 1] timesteps_r = timesteps_r.unsqueeze(0).to(device=timesteps.device, dtype=timesteps.dtype) vec_r = self.time_r_in(timestep_embedding(timesteps_r, 256, time_factor=1000.0).to(img.dtype)) - vec = (vec + vec_r) / 2 + vec = (vec + vec_r) if self.params.meanflow_sum else (vec + vec_r) / 2 if ref_latent is not None: ref_latent_ids = self.img_ids(ref_latent) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 19e6aa954..1f5d34bdd 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -180,8 +180,10 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["use_cond_type_embedding"] = False if '{}vision_in.proj.0.weight'.format(key_prefix) in state_dict_keys: dit_config["vision_in_dim"] = state_dict['{}vision_in.proj.0.weight'.format(key_prefix)].shape[0] + dit_config["meanflow_sum"] = True else: dit_config["vision_in_dim"] = None + dit_config["meanflow_sum"] = False return dit_config if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in state_dict_keys and ('{}img_in.weight'.format(key_prefix) in state_dict_keys or f"{key_prefix}distilled_guidance_layer.norms.0.scale" in state_dict_keys): #Flux, Chroma or Chroma Radiance (has no img_in.weight) From 791e30ff5037fa5e7aa4e1396099ea8d6bfb020b Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 9 Dec 2025 14:03:21 -0800 Subject: [PATCH 02/21] Fix nan issue when quantizing fp16 tensor. (#11213) --- comfy/quant_ops.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py index 571d3f760..cd96541d7 100644 --- a/comfy/quant_ops.py +++ b/comfy/quant_ops.py @@ -399,7 +399,10 @@ class TensorCoreFP8Layout(QuantizedLayout): orig_dtype = tensor.dtype if isinstance(scale, str) and scale == "recalculate": - scale = torch.amax(tensor.abs()) / torch.finfo(dtype).max + scale = torch.amax(tensor.abs()).to(dtype=torch.float32) / torch.finfo(dtype).max + if tensor.dtype not in [torch.float32, torch.bfloat16]: # Prevent scale from being too small + tensor_info = torch.finfo(tensor.dtype) + scale = (1.0 / torch.clamp((1.0 / scale), min=tensor_info.min, max=tensor_info.max)) if scale is not None: if not isinstance(scale, torch.Tensor): From fc657f471a29d07696ca16b566000e8e555d67d1 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Tue, 9 Dec 2025 18:22:09 -0500 Subject: [PATCH 03/21] ComfyUI version v0.4.0 From now on ComfyUI will do version numbers a bit differently, every stable off the master branch will increment the minor version. Anytime a fix needs to be backported onto a stable version the patch version will be incremented. Example: We release v0.6.0 off the master branch then a day later a bug is discovered and we decide to backport the fix onto the v0.6.0 stable, this will be done in a separate branch in the main repository and this new stable will be tagged v0.6.1 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 4b039356e..2f083edaf 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.76" +__version__ = "0.4.0" diff --git a/pyproject.toml b/pyproject.toml index 02b94a0ce..e4d3d616a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.76" +version = "0.4.0" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From f668c2e3c99df40561b416cf62b0fd9eec96007a Mon Sep 17 00:00:00 2001 From: Benjamin Lu Date: Tue, 9 Dec 2025 19:27:07 -0800 Subject: [PATCH 04/21] bump comfyui-frontend-package to 1.34.8 (#11220) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 11a7ac245..9e9b25328 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.33.13 +comfyui-frontend-package==1.34.8 comfyui-workflow-templates==0.7.54 comfyui-embedded-docs==0.3.1 torch From 36357bbcc3c515e37a742457a2b2ab4b7ccc17a8 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 10 Dec 2025 21:55:09 +0200 Subject: [PATCH 05/21] process the NodeV1 dict results correctly (#11237) --- comfy_api/latest/_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index 313a5af20..79217c813 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -1815,7 +1815,7 @@ class NodeOutput(_NodeOutputInternal): ui = data["ui"] if "expand" in data: expand = data["expand"] - return cls(args=args, ui=ui, expand=expand) + return cls(*args, ui=ui, expand=expand) def __getitem__(self, index) -> Any: return self.args[index] From 17c92a9f2843d7b9b727531066be2378b350a6ae Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 10 Dec 2025 16:59:48 -0800 Subject: [PATCH 06/21] Tweak Z Image memory estimation. (#11254) --- comfy/supported_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 383c82c3e..dd0f09f32 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1026,7 +1026,7 @@ class ZImage(Lumina2): "shift": 3.0, } - memory_usage_factor = 1.7 + memory_usage_factor = 2.0 supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32] From 57ddb7fd13d817e7259c2c992a852832b6b0f07a Mon Sep 17 00:00:00 2001 From: Johnpaul Chiwetelu <49923152+Myestery@users.noreply.github.com> Date: Thu, 11 Dec 2025 03:49:49 +0100 Subject: [PATCH 07/21] Fix: filter hidden files from /internal/files endpoint (#11191) --- api_server/routes/internal/internal_routes.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/api_server/routes/internal/internal_routes.py b/api_server/routes/internal/internal_routes.py index 613b0f7c7..b224306da 100644 --- a/api_server/routes/internal/internal_routes.py +++ b/api_server/routes/internal/internal_routes.py @@ -58,8 +58,13 @@ class InternalRoutes: return web.json_response({"error": "Invalid directory type"}, status=400) directory = get_directory_by_type(directory_type) + + def is_visible_file(entry: os.DirEntry) -> bool: + """Filter out hidden files (e.g., .DS_Store on macOS).""" + return entry.is_file() and not entry.name.startswith('.') + sorted_files = sorted( - (entry for entry in os.scandir(directory) if entry.is_file()), + (entry for entry in os.scandir(directory) if is_visible_file(entry)), key=lambda entry: -entry.stat().st_mtime ) return web.json_response([entry.name for entry in sorted_files], status=200) From e711aaf1a75120195c56ebd1f1ce829c6b7b84db Mon Sep 17 00:00:00 2001 From: Farshore <168402472+jiangchengchengark@users.noreply.github.com> Date: Thu, 11 Dec 2025 11:02:26 +0800 Subject: [PATCH 08/21] =?UTF-8?q?Lower=20VAE=20loading=20requirements?= =?UTF-8?q?=EF=BC=9ACreate=20a=20new=20branch=20for=20GPU=20memory=20calcu?= =?UTF-8?q?lations=20in=20qwen-image=20vae=20(#11199)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- comfy/sd.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/comfy/sd.py b/comfy/sd.py index a16f2d14f..1cad98aef 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -549,8 +549,10 @@ class VAE: ddconfig = {"dim": dim, "z_dim": self.latent_channels, "dim_mult": [1, 2, 4, 4], "num_res_blocks": 2, "attn_scales": [], "temperal_downsample": [False, True, True], "dropout": 0.0} self.first_stage_model = comfy.ldm.wan.vae.WanVAE(**ddconfig) self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32] - self.memory_used_encode = lambda shape, dtype: 6000 * shape[3] * shape[4] * model_management.dtype_size(dtype) - self.memory_used_decode = lambda shape, dtype: 7000 * shape[3] * shape[4] * (8 * 8) * model_management.dtype_size(dtype) + self.memory_used_encode = lambda shape, dtype: (1500 if shape[2]<=4 else 6000) * shape[3] * shape[4] * model_management.dtype_size(dtype) + self.memory_used_decode = lambda shape, dtype: (2200 if shape[2]<=4 else 7000) * shape[3] * shape[4] * (8*8) * model_management.dtype_size(dtype) + + # Hunyuan 3d v2 2.0 & 2.1 elif "geo_decoder.cross_attn_decoder.ln_1.bias" in sd: From 93948e3fc598c14082f744fe82fae056b64ff481 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Thu, 11 Dec 2025 08:11:12 +0200 Subject: [PATCH 09/21] feat(api-nodes): enable Kling Omni O1 node (#11229) --- comfy_api_nodes/nodes_kling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 6c840dc47..a2cc87d84 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -2056,7 +2056,7 @@ class KlingExtension(ComfyExtension): OmniProImageToVideoNode, OmniProVideoToVideoNode, OmniProEditVideoNode, - # OmniProImageNode, # need support from backend + OmniProImageNode, ] From f8321eb57b29a4b34cecd27d5d6365adf5e6e601 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 10 Dec 2025 22:30:31 -0800 Subject: [PATCH 10/21] Adjust memory usage factor. (#11257) --- comfy/supported_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index dd0f09f32..ef8c75c09 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -541,7 +541,7 @@ class SD3(supported_models_base.BASE): unet_extra_config = {} latent_format = latent_formats.SD3 - memory_usage_factor = 1.2 + memory_usage_factor = 1.6 text_encoder_key_prefix = ["text_encoders."] From fdebe182966d1dd9bee3138264937137bd2302d8 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 11 Dec 2025 14:09:35 -0800 Subject: [PATCH 11/21] Fix regular chroma radiance (#11276) --- comfy/model_detection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 1f5d34bdd..94b54b7c2 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -261,6 +261,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["nerf_embedder_dtype"] = torch.float32 if "__x0__" in state_dict_keys: # x0 pred dit_config["use_x0"] = True + else: + dit_config["use_x0"] = False else: dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys dit_config["yak_mlp"] = '{}double_blocks.0.img_mlp.gate_proj.weight'.format(key_prefix) in state_dict_keys From ae65433a602470eea271df47af0eb871d146a002 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 11 Dec 2025 14:15:00 -0800 Subject: [PATCH 12/21] This only works on radiance. (#11277) --- comfy/model_detection.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 94b54b7c2..dd6a703f6 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -259,10 +259,10 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["nerf_tile_size"] = 512 dit_config["nerf_final_head_type"] = "conv" if f"{key_prefix}nerf_final_layer_conv.norm.scale" in state_dict_keys else "linear" dit_config["nerf_embedder_dtype"] = torch.float32 - if "__x0__" in state_dict_keys: # x0 pred - dit_config["use_x0"] = True - else: - dit_config["use_x0"] = False + if "__x0__" in state_dict_keys: # x0 pred + dit_config["use_x0"] = True + else: + dit_config["use_x0"] = False else: dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys dit_config["yak_mlp"] = '{}double_blocks.0.img_mlp.gate_proj.weight'.format(key_prefix) in state_dict_keys From eeb020b9b77e1f3c0c2806bc1e38c7ba9576439e Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 11 Dec 2025 14:33:09 -0800 Subject: [PATCH 13/21] Better chroma radiance and other models vram estimation. (#11278) --- comfy/supported_models.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index ef8c75c09..834dfcffc 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -965,7 +965,7 @@ class CosmosT2IPredict2(supported_models_base.BASE): def __init__(self, unet_config): super().__init__(unet_config) - self.memory_usage_factor = (unet_config.get("model_channels", 2048) / 2048) * 0.9 + self.memory_usage_factor = (unet_config.get("model_channels", 2048) / 2048) * 0.95 def get_model(self, state_dict, prefix="", device=None): out = model_base.CosmosPredict2(self, device=device) @@ -1289,7 +1289,7 @@ class ChromaRadiance(Chroma): latent_format = comfy.latent_formats.ChromaRadiance # Pixel-space model, no spatial compression for model input. - memory_usage_factor = 0.038 + memory_usage_factor = 0.044 def get_model(self, state_dict, prefix="", device=None): return model_base.ChromaRadiance(self, device=device) @@ -1332,7 +1332,7 @@ class Omnigen2(supported_models_base.BASE): "shift": 2.6, } - memory_usage_factor = 1.65 #TODO + memory_usage_factor = 1.95 #TODO unet_extra_config = {} latent_format = latent_formats.Flux @@ -1397,7 +1397,7 @@ class HunyuanImage21(HunyuanVideo): latent_format = latent_formats.HunyuanImage21 - memory_usage_factor = 7.7 + memory_usage_factor = 8.7 supported_inference_dtypes = [torch.bfloat16, torch.float32] @@ -1488,7 +1488,7 @@ class Kandinsky5(supported_models_base.BASE): unet_extra_config = {} latent_format = latent_formats.HunyuanVideo - memory_usage_factor = 1.1 #TODO + memory_usage_factor = 1.25 #TODO supported_inference_dtypes = [torch.bfloat16, torch.float32] @@ -1517,7 +1517,7 @@ class Kandinsky5Image(Kandinsky5): } latent_format = latent_formats.Flux - memory_usage_factor = 1.1 #TODO + memory_usage_factor = 1.25 #TODO def get_model(self, state_dict, prefix="", device=None): out = model_base.Kandinsky5Image(self, device=device) From 338d9ae3bbf24a9a06996cdf1c2f228acc65fd96 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 11 Dec 2025 15:56:33 -0800 Subject: [PATCH 14/21] Make portable updater work with repos in unmerged state. (#11281) --- .ci/update_windows/update.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.ci/update_windows/update.py b/.ci/update_windows/update.py index 59ece5130..fe646a6ed 100755 --- a/.ci/update_windows/update.py +++ b/.ci/update_windows/update.py @@ -53,6 +53,16 @@ try: repo.stash(ident) except KeyError: print("nothing to stash") # noqa: T201 +except: + print("Could not stash, cleaning index and trying again.") # noqa: T201 + repo.state_cleanup() + repo.index.read_tree(repo.head.peel().tree) + repo.index.write() + try: + repo.stash(ident) + except KeyError: + print("nothing to stash.") # noqa: T201 + backup_branch_name = 'backup_branch_{}'.format(datetime.today().strftime('%Y-%m-%d_%H_%M_%S')) print("creating backup branch: {}".format(backup_branch_name)) # noqa: T201 try: From 982876d59a659adb085be5e236aacc4f2c54c19c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com> Date: Fri, 12 Dec 2025 05:29:34 +0200 Subject: [PATCH 15/21] WanMove support (#11247) --- comfy_api/latest/_io.py | 8 + comfy_extras/nodes_wanmove.py | 535 ++++++++++++++++++++++++++++++++++ nodes.py | 1 + 3 files changed, 544 insertions(+) create mode 100644 comfy_extras/nodes_wanmove.py diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index 79217c813..2b634d172 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -774,6 +774,13 @@ class AudioEncoder(ComfyTypeIO): class AudioEncoderOutput(ComfyTypeIO): Type = Any +@comfytype(io_type="TRACKS") +class Tracks(ComfyTypeIO): + class TrackDict(TypedDict): + track_path: torch.Tensor + track_visibility: torch.Tensor + Type = TrackDict + @comfytype(io_type="COMFY_MULTITYPED_V3") class MultiType: Type = Any @@ -1894,6 +1901,7 @@ __all__ = [ "SEGS", "AnyType", "MultiType", + "Tracks", # Dynamic Types "MatchType", # "DynamicCombo", diff --git a/comfy_extras/nodes_wanmove.py b/comfy_extras/nodes_wanmove.py new file mode 100644 index 000000000..5f39afa46 --- /dev/null +++ b/comfy_extras/nodes_wanmove.py @@ -0,0 +1,535 @@ +import nodes +import node_helpers +import torch +import torchvision.transforms.functional as TF +import comfy.model_management +import comfy.utils +import numpy as np +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io +from comfy_extras.nodes_wan import parse_json_tracks + +# https://github.com/ali-vilab/Wan-Move/blob/main/wan/modules/trajectory.py +from PIL import Image, ImageDraw + +SKIP_ZERO = False + +def get_pos_emb( + pos_k: torch.Tensor, # A 1D tensor containing positions for which to generate embeddings. + pos_emb_dim: int, + theta_func: callable = lambda i, d: torch.pow(10000, torch.mul(2, torch.div(i.to(torch.float32), d))), #Function to compute thetas based on position and embedding dimensions. + device: torch.device = torch.device("cpu"), + dtype: torch.dtype = torch.float32, +) -> torch.Tensor: # The position embeddings (batch_size, pos_emb_dim) + + assert pos_emb_dim % 2 == 0, "The dimension of position embeddings must be even." + pos_k = pos_k.to(device, dtype) + if SKIP_ZERO: + pos_k = pos_k + 1 + batch_size = pos_k.size(0) + + denominator = torch.arange(0, pos_emb_dim // 2, device=device, dtype=dtype) + # Expand denominator to match the shape needed for broadcasting + denominator_expanded = denominator.view(1, -1).expand(batch_size, -1) + + thetas = theta_func(denominator_expanded, pos_emb_dim) + + # Ensure pos_k is in the correct shape for broadcasting + pos_k_expanded = pos_k.view(-1, 1).to(dtype) + sin_thetas = torch.sin(torch.div(pos_k_expanded, thetas)) + cos_thetas = torch.cos(torch.div(pos_k_expanded, thetas)) + + # Concatenate sine and cosine embeddings along the last dimension + pos_emb = torch.cat([sin_thetas, cos_thetas], dim=-1) + + return pos_emb + +def create_pos_embeddings( + pred_tracks: torch.Tensor, # the predicted tracks, [T, N, 2] + pred_visibility: torch.Tensor, # the predicted visibility [T, N] + downsample_ratios: list[int], # the ratios for downsampling time, height, and width + height: int, # the height of the feature map + width: int, # the width of the feature map + track_num: int = -1, # the number of tracks to use + t_down_strategy: str = "sample", # the strategy for downsampling time dimension +): + assert t_down_strategy in ["sample", "average"], "Invalid strategy for downsampling time dimension." + + t, n, _ = pred_tracks.shape + t_down, h_down, w_down = downsample_ratios + track_pos = - torch.ones(n, (t-1) // t_down + 1, 2, dtype=torch.long) + + if track_num == -1: + track_num = n + + tracks_idx = torch.randperm(n)[:track_num] + tracks = pred_tracks[:, tracks_idx] + visibility = pred_visibility[:, tracks_idx] + + for t_idx in range(0, t, t_down): + if t_down_strategy == "sample" or t_idx == 0: + cur_tracks = tracks[t_idx] # [N, 2] + cur_visibility = visibility[t_idx] # [N] + else: + cur_tracks = tracks[t_idx:t_idx+t_down].mean(dim=0) + cur_visibility = torch.any(visibility[t_idx:t_idx+t_down], dim=0) + + for i in range(track_num): + if not cur_visibility[i] or cur_tracks[i][0] < 0 or cur_tracks[i][1] < 0 or cur_tracks[i][0] >= width or cur_tracks[i][1] >= height: + continue + x, y = cur_tracks[i] + x, y = int(x // w_down), int(y // h_down) + track_pos[i, t_idx // t_down, 0], track_pos[i, t_idx // t_down, 1] = y, x + + return track_pos # the position embeddings, [N, T', 2], 2 = height, width + +def replace_feature( + vae_feature: torch.Tensor, # [B, C', T', H', W'] + track_pos: torch.Tensor, # [B, N, T', 2] + strength: float = 1.0 +) -> torch.Tensor: + b, _, t, h, w = vae_feature.shape + assert b == track_pos.shape[0], "Batch size mismatch." + n = track_pos.shape[1] + + # Shuffle the trajectory order + track_pos = track_pos[:, torch.randperm(n), :, :] + + # Extract coordinates at time steps ≥ 1 and generate a valid mask + current_pos = track_pos[:, :, 1:, :] # [B, N, T-1, 2] + mask = (current_pos[..., 0] >= 0) & (current_pos[..., 1] >= 0) # [B, N, T-1] + + # Get all valid indices + valid_indices = mask.nonzero(as_tuple=False) # [num_valid, 3] + num_valid = valid_indices.shape[0] + + if num_valid == 0: + return vae_feature + + # Decompose valid indices into each dimension + batch_idx = valid_indices[:, 0] + track_idx = valid_indices[:, 1] + t_rel = valid_indices[:, 2] + t_target = t_rel + 1 # Convert to original time step indices + + # Extract target position coordinates + h_target = current_pos[batch_idx, track_idx, t_rel, 0].long() # Ensure integer indices + w_target = current_pos[batch_idx, track_idx, t_rel, 1].long() + + # Extract source position coordinates (t=0) + h_source = track_pos[batch_idx, track_idx, 0, 0].long() + w_source = track_pos[batch_idx, track_idx, 0, 1].long() + + # Get source features and assign to target positions + src_features = vae_feature[batch_idx, :, 0, h_source, w_source] + dst_features = vae_feature[batch_idx, :, t_target, h_target, w_target] + + vae_feature[batch_idx, :, t_target, h_target, w_target] = dst_features + (src_features - dst_features) * strength + + + return vae_feature + +# Visualize functions + +def _draw_gradient_polyline_on_overlay(overlay, line_width, points, start_color, opacity=1.0): + draw = ImageDraw.Draw(overlay, 'RGBA') + points = points[::-1] + + # Compute total length + total_length = 0 + segment_lengths = [] + for i in range(len(points) - 1): + dx = points[i + 1][0] - points[i][0] + dy = points[i + 1][1] - points[i][1] + length = (dx * dx + dy * dy) ** 0.5 + segment_lengths.append(length) + total_length += length + + if total_length == 0: + return + + accumulated_length = 0 + + # Draw the gradient polyline + for idx, (start_point, end_point) in enumerate(zip(points[:-1], points[1:])): + segment_length = segment_lengths[idx] + steps = max(int(segment_length), 1) + + for i in range(steps): + current_length = accumulated_length + (i / steps) * segment_length + ratio = current_length / total_length + + alpha = int(255 * (1 - ratio) * opacity) + color = (*start_color, alpha) + + x = int(start_point[0] + (end_point[0] - start_point[0]) * i / steps) + y = int(start_point[1] + (end_point[1] - start_point[1]) * i / steps) + + dynamic_line_width = max(int(line_width * (1 - ratio)), 1) + draw.line([(x, y), (x + 1, y)], fill=color, width=dynamic_line_width) + + accumulated_length += segment_length + + +def add_weighted(rgb, track): + rgb = np.array(rgb) # [H, W, C] "RGB" + track = np.array(track) # [H, W, C] "RGBA" + + alpha = track[:, :, 3] / 255.0 + alpha = np.stack([alpha] * 3, axis=-1) + blend_img = track[:, :, :3] * alpha + rgb * (1 - alpha) + + return Image.fromarray(blend_img.astype(np.uint8)) + +def draw_tracks_on_video(video, tracks, visibility=None, track_frame=24, circle_size=12, opacity=0.5, line_width=16): + color_map = [(102, 153, 255), (0, 255, 255), (255, 255, 0), (255, 102, 204), (0, 255, 0)] + + video = video.byte().cpu().numpy() # (81, 480, 832, 3) + tracks = tracks[0].long().detach().cpu().numpy() + if visibility is not None: + visibility = visibility[0].detach().cpu().numpy() + + num_frames, height, width = video.shape[:3] + num_tracks = tracks.shape[1] + alpha_opacity = int(255 * opacity) + + output_frames = [] + for t in range(num_frames): + frame_rgb = video[t].astype(np.float32) + + # Create a single RGBA overlay for all tracks in this frame + overlay = Image.new("RGBA", (width, height), (0, 0, 0, 0)) + draw_overlay = ImageDraw.Draw(overlay) + + polyline_data = [] + + # Draw all circles on a single overlay + for n in range(num_tracks): + if visibility is not None and visibility[t, n] == 0: + continue + + track_coord = tracks[t, n] + color = color_map[n % len(color_map)] + circle_color = color + (alpha_opacity,) + + draw_overlay.ellipse((track_coord[0] - circle_size, track_coord[1] - circle_size, track_coord[0] + circle_size, track_coord[1] + circle_size), + fill=circle_color + ) + + # Store polyline data for batch processing + tracks_coord = tracks[max(t - track_frame, 0):t + 1, n] + if len(tracks_coord) > 1: + polyline_data.append((tracks_coord, color)) + + # Blend circles overlay once + overlay_np = np.array(overlay) + alpha = overlay_np[:, :, 3:4] / 255.0 + frame_rgb = overlay_np[:, :, :3] * alpha + frame_rgb * (1 - alpha) + + # Draw all polylines on a single overlay + if polyline_data: + polyline_overlay = Image.new("RGBA", (width, height), (0, 0, 0, 0)) + for tracks_coord, color in polyline_data: + _draw_gradient_polyline_on_overlay(polyline_overlay, line_width, tracks_coord, color, opacity) + + # Blend polylines overlay once + polyline_np = np.array(polyline_overlay) + alpha = polyline_np[:, :, 3:4] / 255.0 + frame_rgb = polyline_np[:, :, :3] * alpha + frame_rgb * (1 - alpha) + + output_frames.append(Image.fromarray(frame_rgb.astype(np.uint8))) + + return output_frames + + +class WanMoveVisualizeTracks(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="WanMoveVisualizeTracks", + category="conditioning/video_models", + inputs=[ + io.Image.Input("images"), + io.Tracks.Input("tracks", optional=True), + io.Int.Input("line_resolution", default=24, min=1, max=1024), + io.Int.Input("circle_size", default=12, min=1, max=128), + io.Float.Input("opacity", default=0.75, min=0.0, max=1.0, step=0.01), + io.Int.Input("line_width", default=16, min=1, max=128), + ], + outputs=[ + io.Image.Output(), + ], + ) + + @classmethod + def execute(cls, images, line_resolution, circle_size, opacity, line_width, tracks=None) -> io.NodeOutput: + if tracks is None: + return io.NodeOutput(images) + + track_path = tracks["track_path"].unsqueeze(0) + track_visibility = tracks["track_visibility"].unsqueeze(0) + images_in = images * 255.0 + if images_in.shape[0] != track_path.shape[1]: + repeat_count = track_path.shape[1] // images.shape[0] + images_in = images_in.repeat(repeat_count, 1, 1, 1) + track_video = draw_tracks_on_video(images_in, track_path, track_visibility, track_frame=line_resolution, circle_size=circle_size, opacity=opacity, line_width=line_width) + track_video = torch.stack([TF.to_tensor(frame) for frame in track_video], dim=0).movedim(1, -1).float() + + return io.NodeOutput(track_video.to(comfy.model_management.intermediate_device())) + + +class WanMoveTracksFromCoords(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="WanMoveTracksFromCoords", + category="conditioning/video_models", + inputs=[ + io.String.Input("track_coords", force_input=True, default="[]", optional=True), + io.Mask.Input("track_mask", optional=True), + ], + outputs=[ + io.Tracks.Output(), + io.Int.Output(display_name="track_length"), + ], + ) + + @classmethod + def execute(cls, track_coords, track_mask=None) -> io.NodeOutput: + device=comfy.model_management.intermediate_device() + + tracks_data = parse_json_tracks(track_coords) + track_length = len(tracks_data[0]) + + track_list = [ + [[track[frame]['x'], track[frame]['y']] for track in tracks_data] + for frame in range(len(tracks_data[0])) + ] + tracks = torch.tensor(track_list, dtype=torch.float32, device=device) # [frames, num_tracks, 2] + + num_tracks = tracks.shape[-2] + if track_mask is None: + track_visibility = torch.ones((track_length, num_tracks), dtype=torch.bool, device=device) + else: + track_visibility = (track_mask > 0).any(dim=(1, 2)).unsqueeze(-1) + + out_track_info = {} + out_track_info["track_path"] = tracks + out_track_info["track_visibility"] = track_visibility + return io.NodeOutput(out_track_info, track_length) + + +class GenerateTracks(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="GenerateTracks", + category="conditioning/video_models", + inputs=[ + io.Int.Input("width", default=832, min=16, max=4096, step=16), + io.Int.Input("height", default=480, min=16, max=4096, step=16), + io.Float.Input("start_x", default=0.0, min=0.0, max=1.0, step=0.01, tooltip="Normalized X coordinate (0-1) for start position."), + io.Float.Input("start_y", default=0.0, min=0.0, max=1.0, step=0.01, tooltip="Normalized Y coordinate (0-1) for start position."), + io.Float.Input("end_x", default=1.0, min=0.0, max=1.0, step=0.01, tooltip="Normalized X coordinate (0-1) for end position."), + io.Float.Input("end_y", default=1.0, min=0.0, max=1.0, step=0.01, tooltip="Normalized Y coordinate (0-1) for end position."), + io.Int.Input("num_frames", default=81, min=1, max=1024), + io.Int.Input("num_tracks", default=5, min=1, max=100), + io.Float.Input("track_spread", default=0.025, min=0.0, max=1.0, step=0.001, tooltip="Normalized distance between tracks. Tracks are spread perpendicular to the motion direction."), + io.Boolean.Input("bezier", default=False, tooltip="Enable Bezier curve path using the mid point as control point."), + io.Float.Input("mid_x", default=0.5, min=0.0, max=1.0, step=0.01, tooltip="Normalized X control point for Bezier curve. Only used when 'bezier' is enabled."), + io.Float.Input("mid_y", default=0.5, min=0.0, max=1.0, step=0.01, tooltip="Normalized Y control point for Bezier curve. Only used when 'bezier' is enabled."), + io.Combo.Input( + "interpolation", + options=["linear", "ease_in", "ease_out", "ease_in_out", "constant"], + tooltip="Controls the timing/speed of movement along the path.", + ), + io.Mask.Input("track_mask", optional=True, tooltip="Optional mask to indicate visible frames."), + ], + outputs=[ + io.Tracks.Output(), + io.Int.Output(display_name="track_length"), + ], + ) + + @classmethod + def execute(cls, width, height, start_x, start_y, mid_x, mid_y, end_x, end_y, num_frames, num_tracks, + track_spread, bezier=False, interpolation="linear", track_mask=None) -> io.NodeOutput: + device = comfy.model_management.intermediate_device() + track_length = num_frames + + # normalized coordinates to pixel coordinates + start_x_px = start_x * width + start_y_px = start_y * height + mid_x_px = mid_x * width + mid_y_px = mid_y * height + end_x_px = end_x * width + end_y_px = end_y * height + + track_spread_px = track_spread * (width + height) / 2 # Use average of width/height for spread to keep it proportional + + t = torch.linspace(0, 1, num_frames, device=device) + if interpolation == "constant": # All points stay at start position + interp_values = torch.zeros_like(t) + elif interpolation == "linear": + interp_values = t + elif interpolation == "ease_in": + interp_values = t ** 2 + elif interpolation == "ease_out": + interp_values = 1 - (1 - t) ** 2 + elif interpolation == "ease_in_out": + interp_values = t * t * (3 - 2 * t) + + if bezier: # apply interpolation to t for timing control along the bezier path + t_interp = interp_values + one_minus_t = 1 - t_interp + x_positions = one_minus_t ** 2 * start_x_px + 2 * one_minus_t * t_interp * mid_x_px + t_interp ** 2 * end_x_px + y_positions = one_minus_t ** 2 * start_y_px + 2 * one_minus_t * t_interp * mid_y_px + t_interp ** 2 * end_y_px + tangent_x = 2 * one_minus_t * (mid_x_px - start_x_px) + 2 * t_interp * (end_x_px - mid_x_px) + tangent_y = 2 * one_minus_t * (mid_y_px - start_y_px) + 2 * t_interp * (end_y_px - mid_y_px) + else: # calculate base x and y positions for each frame (center track) + x_positions = start_x_px + (end_x_px - start_x_px) * interp_values + y_positions = start_y_px + (end_y_px - start_y_px) * interp_values + # For non-bezier, tangent is constant (direction from start to end) + tangent_x = torch.full_like(t, end_x_px - start_x_px) + tangent_y = torch.full_like(t, end_y_px - start_y_px) + + track_list = [] + for frame_idx in range(num_frames): + # Calculate perpendicular direction at this frame + tx = tangent_x[frame_idx].item() + ty = tangent_y[frame_idx].item() + length = (tx ** 2 + ty ** 2) ** 0.5 + + if length > 0: # Perpendicular unit vector (rotate 90 degrees) + perp_x = -ty / length + perp_y = tx / length + else: # If tangent is zero, spread horizontally + perp_x = 1.0 + perp_y = 0.0 + + frame_tracks = [] + for track_idx in range(num_tracks): # center tracks around the main path offset ranges from -(num_tracks-1)/2 to +(num_tracks-1)/2 + offset = (track_idx - (num_tracks - 1) / 2) * track_spread_px + track_x = x_positions[frame_idx].item() + perp_x * offset + track_y = y_positions[frame_idx].item() + perp_y * offset + frame_tracks.append([track_x, track_y]) + track_list.append(frame_tracks) + + tracks = torch.tensor(track_list, dtype=torch.float32, device=device) # [frames, num_tracks, 2] + + if track_mask is None: + track_visibility = torch.ones((track_length, num_tracks), dtype=torch.bool, device=device) + else: + track_visibility = (track_mask > 0).any(dim=(1, 2)).unsqueeze(-1) + + out_track_info = {} + out_track_info["track_path"] = tracks + out_track_info["track_visibility"] = track_visibility + return io.NodeOutput(out_track_info, track_length) + + +class WanMoveConcatTrack(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="WanMoveConcatTrack", + category="conditioning/video_models", + inputs=[ + io.Tracks.Input("tracks_1"), + io.Tracks.Input("tracks_2", optional=True), + ], + outputs=[ + io.Tracks.Output(), + ], + ) + + @classmethod + def execute(cls, tracks_1=None, tracks_2=None) -> io.NodeOutput: + if tracks_2 is None: + return io.NodeOutput(tracks_1) + + tracks_out = torch.cat([tracks_1["track_path"], tracks_2["track_path"]], dim=1) # Concatenate along the track dimension + mask_out = torch.cat([tracks_1["track_visibility"], tracks_2["track_visibility"]], dim=-1) + + out_track_info = {} + out_track_info["track_path"] = tracks_out + out_track_info["track_visibility"] = mask_out + return io.NodeOutput(out_track_info) + + +class WanMoveTrackToVideo(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="WanMoveTrackToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Tracks.Input("tracks", optional=True), + io.Float.Input("strength", default=1.0, min=0.0, max=100.0, step=0.01, tooltip="Strength of the track conditioning."), + io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Image.Input("start_image"), + io.ClipVisionOutput.Input("clip_vision_output", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) + + @classmethod + def execute(cls, positive, negative, vae, width, height, length, batch_size, strength, tracks=None, start_image=None, clip_vision_output=None) -> io.NodeOutput: + device=comfy.model_management.intermediate_device() + latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=device) + if start_image is not None: + start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) + image = torch.ones((length, height, width, start_image.shape[-1]), device=start_image.device, dtype=start_image.dtype) * 0.5 + image[:start_image.shape[0]] = start_image + + concat_latent_image = vae.encode(image[:, :, :, :3]) + mask = torch.ones((1, 1, latent.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=start_image.device, dtype=start_image.dtype) + mask[:, :, :((start_image.shape[0] - 1) // 4) + 1] = 0.0 + + if tracks is not None and strength > 0.0: + tracks_path = tracks["track_path"][:length] # [T, N, 2] + num_tracks = tracks_path.shape[-2] + + track_visibility = tracks.get("track_visibility", torch.ones((length, num_tracks), dtype=torch.bool, device=device)) + + track_pos = create_pos_embeddings(tracks_path, track_visibility, [4, 8, 8], height, width, track_num=num_tracks) + track_pos = comfy.utils.resize_to_batch_size(track_pos.unsqueeze(0), batch_size) + concat_latent_image_pos = replace_feature(concat_latent_image, track_pos, strength) + else: + concat_latent_image_pos = concat_latent_image + + positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image_pos, "concat_mask": mask}) + negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent_image, "concat_mask": mask}) + + if clip_vision_output is not None: + positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output}) + negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output}) + + out_latent = {} + out_latent["samples"] = latent + return io.NodeOutput(positive, negative, out_latent) + + +class WanMoveExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + WanMoveTrackToVideo, + WanMoveTracksFromCoords, + WanMoveConcatTrack, + WanMoveVisualizeTracks, + GenerateTracks, + ] + +async def comfy_entrypoint() -> WanMoveExtension: + return WanMoveExtension() diff --git a/nodes.py b/nodes.py index 8d28a725d..8678f510a 100644 --- a/nodes.py +++ b/nodes.py @@ -2358,6 +2358,7 @@ async def init_builtin_extra_nodes(): "nodes_logic.py", "nodes_nop.py", "nodes_kandinsky5.py", + "nodes_wanmove.py", ] import_failed = [] From 5495589db38409353a85b06df7d10f8de2f9c78d Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 11 Dec 2025 20:32:27 -0800 Subject: [PATCH 16/21] Respect the dtype the op was initialized in for non quant mixed op. (#11282) --- comfy/ops.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/comfy/ops.py b/comfy/ops.py index 6f34d50fc..6ae6e791a 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -497,8 +497,10 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec ) -> None: super().__init__() - self.factory_kwargs = {"device": device, "dtype": MixedPrecisionOps._compute_dtype} - # self.factory_kwargs = {"device": device, "dtype": dtype} + if dtype is None: + dtype = MixedPrecisionOps._compute_dtype + + self.factory_kwargs = {"device": device, "dtype": dtype} self.in_features = in_features self.out_features = out_features @@ -530,7 +532,10 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec layer_conf = json.loads(layer_conf.numpy().tobytes()) if layer_conf is None: - self.weight = torch.nn.Parameter(weight.to(device=device, dtype=MixedPrecisionOps._compute_dtype), requires_grad=False) + dtype = self.factory_kwargs["dtype"] + self.weight = torch.nn.Parameter(weight.to(device=device, dtype=dtype), requires_grad=False) + if dtype != MixedPrecisionOps._compute_dtype: + self.comfy_cast_weights = True else: self.quant_format = layer_conf.get("format", None) if not self._full_precision_mm: From 908fd7d7496f6de88722263e1e00fcd3d22e584f Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Fri, 12 Dec 2025 10:18:31 +0200 Subject: [PATCH 17/21] feat(api-nodes): new TextToVideoWithAudio and ImageToVideoWithAudio nodes (#11267) --- comfy_api_nodes/apis/kling_api.py | 28 ++++- comfy_api_nodes/nodes_kling.py | 169 ++++++++++++++++++++++++++---- 2 files changed, 174 insertions(+), 23 deletions(-) diff --git a/comfy_api_nodes/apis/kling_api.py b/comfy_api_nodes/apis/kling_api.py index d8949f8ac..80a758466 100644 --- a/comfy_api_nodes/apis/kling_api.py +++ b/comfy_api_nodes/apis/kling_api.py @@ -51,25 +51,25 @@ class TaskStatusImageResult(BaseModel): url: str = Field(..., description="URL for generated image") -class OmniTaskStatusResults(BaseModel): +class TaskStatusResults(BaseModel): videos: list[TaskStatusVideoResult] | None = Field(None) images: list[TaskStatusImageResult] | None = Field(None) -class OmniTaskStatusResponseData(BaseModel): +class TaskStatusResponseData(BaseModel): created_at: int | None = Field(None, description="Task creation time") updated_at: int | None = Field(None, description="Task update time") task_status: str | None = None task_status_msg: str | None = Field(None, description="Additional failure reason. Only for polling endpoint.") task_id: str | None = Field(None, description="Task ID") - task_result: OmniTaskStatusResults | None = Field(None) + task_result: TaskStatusResults | None = Field(None) -class OmniTaskStatusResponse(BaseModel): +class TaskStatusResponse(BaseModel): code: int | None = Field(None, description="Error code") message: str | None = Field(None, description="Error message") request_id: str | None = Field(None, description="Request ID") - data: OmniTaskStatusResponseData | None = Field(None) + data: TaskStatusResponseData | None = Field(None) class OmniImageParamImage(BaseModel): @@ -84,3 +84,21 @@ class OmniProImageRequest(BaseModel): mode: str = Field("pro") n: int | None = Field(1, le=9) image_list: list[OmniImageParamImage] | None = Field(..., max_length=10) + + +class TextToVideoWithAudioRequest(BaseModel): + model_name: str = Field(..., description="kling-v2-6") + aspect_ratio: str = Field(..., description="'16:9', '9:16' or '1:1'") + duration: str = Field(..., description="'5' or '10'") + prompt: str = Field(...) + mode: str = Field("pro") + sound: str = Field(..., description="'on' or 'off'") + + +class ImageToVideoWithAudioRequest(BaseModel): + model_name: str = Field(..., description="kling-v2-6") + image: str = Field(...) + duration: str = Field(..., description="'5' or '10'") + prompt: str = Field(...) + mode: str = Field("pro") + sound: str = Field(..., description="'on' or 'off'") diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index a2cc87d84..e545fe490 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -50,6 +50,7 @@ from comfy_api_nodes.apis import ( KlingSingleImageEffectModelName, ) from comfy_api_nodes.apis.kling_api import ( + ImageToVideoWithAudioRequest, OmniImageParamImage, OmniParamImage, OmniParamVideo, @@ -57,7 +58,8 @@ from comfy_api_nodes.apis.kling_api import ( OmniProImageRequest, OmniProReferences2VideoRequest, OmniProText2VideoRequest, - OmniTaskStatusResponse, + TaskStatusResponse, + TextToVideoWithAudioRequest, ) from comfy_api_nodes.util import ( ApiEndpoint, @@ -242,7 +244,7 @@ def normalize_omni_prompt_references(prompt: str) -> str: return re.sub(r"(?\d*)(?!\w)", _video_repl, prompt) -async def finish_omni_video_task(cls: type[IO.ComfyNode], response: OmniTaskStatusResponse) -> IO.NodeOutput: +async def finish_omni_video_task(cls: type[IO.ComfyNode], response: TaskStatusResponse) -> IO.NodeOutput: if response.code: raise RuntimeError( f"Kling request failed. Code: {response.code}, Message: {response.message}, Data: {response.data}" @@ -250,7 +252,7 @@ async def finish_omni_video_task(cls: type[IO.ComfyNode], response: OmniTaskStat final_response = await poll_op( cls, ApiEndpoint(path=f"/proxy/kling/v1/videos/omni-video/{response.data.task_id}"), - response_model=OmniTaskStatusResponse, + response_model=TaskStatusResponse, status_extractor=lambda r: (r.data.task_status if r.data else None), max_poll_attempts=160, ) @@ -483,12 +485,12 @@ async def execute_image2video( task_id = task_creation_response.data.task_id final_response = await poll_op( - cls, - ApiEndpoint(path=f"{PATH_IMAGE_TO_VIDEO}/{task_id}"), - response_model=KlingImage2VideoResponse, - estimated_duration=AVERAGE_DURATION_I2V, - status_extractor=lambda r: (r.data.task_status.value if r.data and r.data.task_status else None), - ) + cls, + ApiEndpoint(path=f"{PATH_IMAGE_TO_VIDEO}/{task_id}"), + response_model=KlingImage2VideoResponse, + estimated_duration=AVERAGE_DURATION_I2V, + status_extractor=lambda r: (r.data.task_status.value if r.data and r.data.task_status else None), + ) validate_video_result_response(final_response) video = get_video_from_response(final_response) @@ -834,7 +836,7 @@ class OmniProTextToVideoNode(IO.ComfyNode): response = await sync_op( cls, ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"), - response_model=OmniTaskStatusResponse, + response_model=TaskStatusResponse, data=OmniProText2VideoRequest( model_name=model_name, prompt=prompt, @@ -929,7 +931,7 @@ class OmniProFirstLastFrameNode(IO.ComfyNode): response = await sync_op( cls, ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"), - response_model=OmniTaskStatusResponse, + response_model=TaskStatusResponse, data=OmniProFirstLastFrameRequest( model_name=model_name, prompt=prompt, @@ -997,7 +999,7 @@ class OmniProImageToVideoNode(IO.ComfyNode): response = await sync_op( cls, ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"), - response_model=OmniTaskStatusResponse, + response_model=TaskStatusResponse, data=OmniProReferences2VideoRequest( model_name=model_name, prompt=prompt, @@ -1081,7 +1083,7 @@ class OmniProVideoToVideoNode(IO.ComfyNode): response = await sync_op( cls, ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"), - response_model=OmniTaskStatusResponse, + response_model=TaskStatusResponse, data=OmniProReferences2VideoRequest( model_name=model_name, prompt=prompt, @@ -1162,7 +1164,7 @@ class OmniProEditVideoNode(IO.ComfyNode): response = await sync_op( cls, ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"), - response_model=OmniTaskStatusResponse, + response_model=TaskStatusResponse, data=OmniProReferences2VideoRequest( model_name=model_name, prompt=prompt, @@ -1237,7 +1239,7 @@ class OmniProImageNode(IO.ComfyNode): response = await sync_op( cls, ApiEndpoint(path="/proxy/kling/v1/images/omni-image", method="POST"), - response_model=OmniTaskStatusResponse, + response_model=TaskStatusResponse, data=OmniProImageRequest( model_name=model_name, prompt=prompt, @@ -1253,7 +1255,7 @@ class OmniProImageNode(IO.ComfyNode): final_response = await poll_op( cls, ApiEndpoint(path=f"/proxy/kling/v1/images/omni-image/{response.data.task_id}"), - response_model=OmniTaskStatusResponse, + response_model=TaskStatusResponse, status_extractor=lambda r: (r.data.task_status if r.data else None), ) return IO.NodeOutput(await download_url_to_image_tensor(final_response.data.task_result.images[0].url)) @@ -1328,9 +1330,8 @@ class KlingImage2VideoNode(IO.ComfyNode): def define_schema(cls) -> IO.Schema: return IO.Schema( node_id="KlingImage2VideoNode", - display_name="Kling Image to Video", + display_name="Kling Image(First Frame) to Video", category="api node/video/Kling", - description="Kling Image to Video Node", inputs=[ IO.Image.Input("start_frame", tooltip="The reference image used to generate the video."), IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), @@ -2034,6 +2035,136 @@ class KlingImageGenerationNode(IO.ComfyNode): return IO.NodeOutput(await image_result_to_node_output(images)) +class TextToVideoWithAudio(IO.ComfyNode): + + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="KlingTextToVideoWithAudio", + display_name="Kling Text to Video with Audio", + category="api node/video/Kling", + inputs=[ + IO.Combo.Input("model_name", options=["kling-v2-6"]), + IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt."), + IO.Combo.Input("mode", options=["pro"]), + IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "1:1"]), + IO.Combo.Input("duration", options=[5, 10]), + IO.Boolean.Input("generate_audio", default=True), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model_name: str, + prompt: str, + mode: str, + aspect_ratio: str, + duration: int, + generate_audio: bool, + ) -> IO.NodeOutput: + validate_string(prompt, min_length=1, max_length=2500) + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/kling/v1/videos/text2video", method="POST"), + response_model=TaskStatusResponse, + data=TextToVideoWithAudioRequest( + model_name=model_name, + prompt=prompt, + mode=mode, + aspect_ratio=aspect_ratio, + duration=str(duration), + sound="on" if generate_audio else "off", + ), + ) + if response.code: + raise RuntimeError( + f"Kling request failed. Code: {response.code}, Message: {response.message}, Data: {response.data}" + ) + final_response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/kling/v1/videos/text2video/{response.data.task_id}"), + response_model=TaskStatusResponse, + status_extractor=lambda r: (r.data.task_status if r.data else None), + ) + return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url)) + + +class ImageToVideoWithAudio(IO.ComfyNode): + + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="KlingImageToVideoWithAudio", + display_name="Kling Image(First Frame) to Video with Audio", + category="api node/video/Kling", + inputs=[ + IO.Combo.Input("model_name", options=["kling-v2-6"]), + IO.Image.Input("start_frame"), + IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt."), + IO.Combo.Input("mode", options=["pro"]), + IO.Combo.Input("duration", options=[5, 10]), + IO.Boolean.Input("generate_audio", default=True), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model_name: str, + start_frame: Input.Image, + prompt: str, + mode: str, + duration: int, + generate_audio: bool, + ) -> IO.NodeOutput: + validate_string(prompt, min_length=1, max_length=2500) + validate_image_dimensions(start_frame, min_width=300, min_height=300) + validate_image_aspect_ratio(start_frame, (1, 2.5), (2.5, 1)) + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/kling/v1/videos/image2video", method="POST"), + response_model=TaskStatusResponse, + data=ImageToVideoWithAudioRequest( + model_name=model_name, + image=(await upload_images_to_comfyapi(cls, start_frame))[0], + prompt=prompt, + mode=mode, + duration=str(duration), + sound="on" if generate_audio else "off", + ), + ) + if response.code: + raise RuntimeError( + f"Kling request failed. Code: {response.code}, Message: {response.message}, Data: {response.data}" + ) + final_response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/kling/v1/videos/image2video/{response.data.task_id}"), + response_model=TaskStatusResponse, + status_extractor=lambda r: (r.data.task_status if r.data else None), + ) + return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url)) + + class KlingExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[IO.ComfyNode]]: @@ -2057,6 +2188,8 @@ class KlingExtension(ComfyExtension): OmniProVideoToVideoNode, OmniProEditVideoNode, OmniProImageNode, + TextToVideoWithAudio, + ImageToVideoWithAudio, ] From c5a47a16924e1be96241553a1448b298e57e50a1 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 12 Dec 2025 08:49:35 -0800 Subject: [PATCH 18/21] Fix bias dtype issue in mixed ops. (#11293) --- comfy/ops.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/comfy/ops.py b/comfy/ops.py index 6ae6e791a..0384c8717 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -504,10 +504,7 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec self.in_features = in_features self.out_features = out_features - if bias: - self.bias = torch.nn.Parameter(torch.empty(out_features, **self.factory_kwargs)) - else: - self.register_parameter("bias", None) + self._has_bias = bias self.tensor_class = None self._full_precision_mm = MixedPrecisionOps._full_precision_mm @@ -536,6 +533,10 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec self.weight = torch.nn.Parameter(weight.to(device=device, dtype=dtype), requires_grad=False) if dtype != MixedPrecisionOps._compute_dtype: self.comfy_cast_weights = True + if self._has_bias: + self.bias = torch.nn.Parameter(torch.empty(self.out_features, device=device, dtype=dtype)) + else: + self.register_parameter("bias", None) else: self.quant_format = layer_conf.get("format", None) if not self._full_precision_mm: @@ -565,6 +566,11 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec requires_grad=False ) + if self._has_bias: + self.bias = torch.nn.Parameter(torch.empty(self.out_features, device=device, dtype=MixedPrecisionOps._compute_dtype)) + else: + self.register_parameter("bias", None) + for param_name in qconfig["parameters"]: param_key = f"{prefix}{param_name}" _v = state_dict.pop(param_key, None) From da2bfb5b0af26c7a1c44ec951dbd0fffe413c793 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 12 Dec 2025 22:39:11 -0800 Subject: [PATCH 19/21] Basic implementation of z image fun control union 2.0 (#11304) The inpaint part is currently missing and will be implemented later. I think they messed up this model pretty bad. They added some control_noise_refiner blocks but don't actually use them. There is a typo in their code so instead of doing control_noise_refiner -> control_layers it runs the whole control_layers twice. Unfortunately they trained with this typo so the model works but is kind of slow and would probably perform a lot better if they corrected their code and trained it again. --- comfy/ldm/lumina/controlnet.py | 95 +++++++++++++++++++++++-------- comfy/ldm/lumina/model.py | 16 +++++- comfy/model_patcher.py | 3 + comfy_extras/nodes_model_patch.py | 72 +++++++++++++++++------ 4 files changed, 142 insertions(+), 44 deletions(-) diff --git a/comfy/ldm/lumina/controlnet.py b/comfy/ldm/lumina/controlnet.py index fd7ce3b5c..8e2de7977 100644 --- a/comfy/ldm/lumina/controlnet.py +++ b/comfy/ldm/lumina/controlnet.py @@ -41,6 +41,11 @@ class ZImage_Control(torch.nn.Module): ffn_dim_multiplier: float = (8.0 / 3.0), norm_eps: float = 1e-5, qk_norm: bool = True, + n_control_layers=6, + control_in_dim=16, + additional_in_dim=0, + broken=False, + refiner_control=False, dtype=None, device=None, operations=None, @@ -49,10 +54,11 @@ class ZImage_Control(torch.nn.Module): super().__init__() operation_settings = {"operations": operations, "device": device, "dtype": dtype} - self.additional_in_dim = 0 - self.control_in_dim = 16 + self.broken = broken + self.additional_in_dim = additional_in_dim + self.control_in_dim = control_in_dim n_refiner_layers = 2 - self.n_control_layers = 6 + self.n_control_layers = n_control_layers self.control_layers = nn.ModuleList( [ ZImageControlTransformerBlock( @@ -74,28 +80,49 @@ class ZImage_Control(torch.nn.Module): all_x_embedder = {} patch_size = 2 f_patch_size = 1 - x_embedder = operations.Linear(f_patch_size * patch_size * patch_size * self.control_in_dim, dim, bias=True, device=device, dtype=dtype) + x_embedder = operations.Linear(f_patch_size * patch_size * patch_size * (self.control_in_dim + self.additional_in_dim), dim, bias=True, device=device, dtype=dtype) all_x_embedder[f"{patch_size}-{f_patch_size}"] = x_embedder + self.refiner_control = refiner_control + self.control_all_x_embedder = nn.ModuleDict(all_x_embedder) - self.control_noise_refiner = nn.ModuleList( - [ - JointTransformerBlock( - layer_id, - dim, - n_heads, - n_kv_heads, - multiple_of, - ffn_dim_multiplier, - norm_eps, - qk_norm, - modulation=True, - z_image_modulation=True, - operation_settings=operation_settings, - ) - for layer_id in range(n_refiner_layers) - ] - ) + if self.refiner_control: + self.control_noise_refiner = nn.ModuleList( + [ + ZImageControlTransformerBlock( + layer_id, + dim, + n_heads, + n_kv_heads, + multiple_of, + ffn_dim_multiplier, + norm_eps, + qk_norm, + block_id=layer_id, + operation_settings=operation_settings, + ) + for layer_id in range(n_refiner_layers) + ] + ) + else: + self.control_noise_refiner = nn.ModuleList( + [ + JointTransformerBlock( + layer_id, + dim, + n_heads, + n_kv_heads, + multiple_of, + ffn_dim_multiplier, + norm_eps, + qk_norm, + modulation=True, + z_image_modulation=True, + operation_settings=operation_settings, + ) + for layer_id in range(n_refiner_layers) + ] + ) def forward(self, cap_feats, control_context, x_freqs_cis, adaln_input): patch_size = 2 @@ -105,9 +132,29 @@ class ZImage_Control(torch.nn.Module): control_context = self.control_all_x_embedder[f"{patch_size}-{f_patch_size}"](control_context.view(B, C, H // pH, pH, W // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2)) x_attn_mask = None - for layer in self.control_noise_refiner: - control_context = layer(control_context, x_attn_mask, x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input) + if not self.refiner_control: + for layer in self.control_noise_refiner: + control_context = layer(control_context, x_attn_mask, x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input) + return control_context + def forward_noise_refiner_block(self, layer_id, control_context, x, x_attn_mask, x_freqs_cis, adaln_input): + if self.refiner_control: + if self.broken: + if layer_id == 0: + return self.control_layers[layer_id](control_context, x, x_mask=x_attn_mask, freqs_cis=x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input=adaln_input) + if layer_id > 0: + out = None + for i in range(1, len(self.control_layers)): + o, control_context = self.control_layers[i](control_context, x, x_mask=x_attn_mask, freqs_cis=x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input=adaln_input) + if out is None: + out = o + + return (out, control_context) + else: + return self.control_noise_refiner[layer_id](control_context, x, x_mask=x_attn_mask, freqs_cis=x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input=adaln_input) + else: + return (None, control_context) + def forward_control_block(self, layer_id, control_context, x, x_attn_mask, x_freqs_cis, adaln_input): return self.control_layers[layer_id](control_context, x, x_mask=x_attn_mask, freqs_cis=x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input=adaln_input) diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py index c47df49ca..96cb37fa6 100644 --- a/comfy/ldm/lumina/model.py +++ b/comfy/ldm/lumina/model.py @@ -536,6 +536,7 @@ class NextDiT(nn.Module): bsz = len(x) pH = pW = self.patch_size device = x[0].device + orig_x = x if self.pad_tokens_multiple is not None: pad_extra = (-cap_feats.shape[1]) % self.pad_tokens_multiple @@ -572,13 +573,21 @@ class NextDiT(nn.Module): freqs_cis = self.rope_embedder(torch.cat((cap_pos_ids, x_pos_ids), dim=1)).movedim(1, 2) + patches = transformer_options.get("patches", {}) + # refine context for layer in self.context_refiner: cap_feats = layer(cap_feats, cap_mask, freqs_cis[:, :cap_pos_ids.shape[1]], transformer_options=transformer_options) padded_img_mask = None - for layer in self.noise_refiner: + x_input = x + for i, layer in enumerate(self.noise_refiner): x = layer(x, padded_img_mask, freqs_cis[:, cap_pos_ids.shape[1]:], t, transformer_options=transformer_options) + if "noise_refiner" in patches: + for p in patches["noise_refiner"]: + out = p({"img": x, "img_input": x_input, "txt": cap_feats, "pe": freqs_cis[:, cap_pos_ids.shape[1]:], "vec": t, "x": orig_x, "block_index": i, "transformer_options": transformer_options, "block_type": "noise_refiner"}) + if "img" in out: + x = out["img"] padded_full_embed = torch.cat((cap_feats, x), dim=1) mask = None @@ -622,14 +631,15 @@ class NextDiT(nn.Module): patches = transformer_options.get("patches", {}) x_is_tensor = isinstance(x, torch.Tensor) - img, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, t, num_tokens, transformer_options=transformer_options) + img, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, adaln_input, num_tokens, transformer_options=transformer_options) freqs_cis = freqs_cis.to(img.device) + img_input = img for i, layer in enumerate(self.layers): img = layer(img, mask, freqs_cis, adaln_input, transformer_options=transformer_options) if "double_block" in patches: for p in patches["double_block"]: - out = p({"img": img[:, cap_size[0]:], "txt": img[:, :cap_size[0]], "pe": freqs_cis[:, cap_size[0]:], "vec": adaln_input, "x": x, "block_index": i, "transformer_options": transformer_options}) + out = p({"img": img[:, cap_size[0]:], "img_input": img_input[:, cap_size[0]:], "txt": img[:, :cap_size[0]], "pe": freqs_cis[:, cap_size[0]:], "vec": adaln_input, "x": x, "block_index": i, "transformer_options": transformer_options}) if "img" in out: img[:, cap_size[0]:] = out["img"] if "txt" in out: diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index a486c2723..93d26c690 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -454,6 +454,9 @@ class ModelPatcher: def set_model_post_input_patch(self, patch): self.set_model_patch(patch, "post_input") + def set_model_noise_refiner_patch(self, patch): + self.set_model_patch(patch, "noise_refiner") + def set_model_rope_options(self, scale_x, shift_x, scale_y, shift_y, scale_t, shift_t, **kwargs): rope_options = self.model_options["transformer_options"].get("rope_options", {}) rope_options["scale_x"] = scale_x diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py index c61810dbf..ec0e790dc 100644 --- a/comfy_extras/nodes_model_patch.py +++ b/comfy_extras/nodes_model_patch.py @@ -243,7 +243,13 @@ class ModelPatchLoader: model = SigLIPMultiFeatProjModel(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast) elif 'control_all_x_embedder.2-1.weight' in sd: # alipai z image fun controlnet sd = z_image_convert(sd) - model = comfy.ldm.lumina.controlnet.ZImage_Control(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast) + config = {} + if 'control_layers.14.adaLN_modulation.0.weight' in sd: + config['n_control_layers'] = 15 + config['additional_in_dim'] = 17 + config['refiner_control'] = True + config['broken'] = True + model = comfy.ldm.lumina.controlnet.ZImage_Control(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast, **config) model.load_state_dict(sd) model = comfy.model_patcher.ModelPatcher(model, load_device=comfy.model_management.get_torch_device(), offload_device=comfy.model_management.unet_offload_device()) @@ -297,56 +303,86 @@ class DiffSynthCnetPatch: return [self.model_patch] class ZImageControlPatch: - def __init__(self, model_patch, vae, image, strength): + def __init__(self, model_patch, vae, image, strength, inpaint_image=None, mask=None): self.model_patch = model_patch self.vae = vae self.image = image + self.inpaint_image = inpaint_image + self.mask = mask self.strength = strength self.encoded_image = self.encode_latent_cond(image) self.encoded_image_size = (image.shape[1], image.shape[2]) self.temp_data = None - def encode_latent_cond(self, image): - latent_image = comfy.latent_formats.Flux().process_in(self.vae.encode(image)) - return latent_image + def encode_latent_cond(self, control_image, inpaint_image=None): + latent_image = comfy.latent_formats.Flux().process_in(self.vae.encode(control_image)) + if self.model_patch.model.additional_in_dim > 0: + if self.mask is None: + mask_ = torch.zeros_like(latent_image)[:, :1] + else: + mask_ = comfy.utils.common_upscale(self.mask.mean(dim=1, keepdim=True), latent_image.shape[-1], latent_image.shape[-2], "bilinear", "none") + if inpaint_image is None: + inpaint_image = torch.ones_like(control_image) * 0.5 + + inpaint_image_latent = comfy.latent_formats.Flux().process_in(self.vae.encode(inpaint_image)) + + return torch.cat([latent_image, mask_, inpaint_image_latent], dim=1) + else: + return latent_image def __call__(self, kwargs): x = kwargs.get("x") img = kwargs.get("img") + img_input = kwargs.get("img_input") txt = kwargs.get("txt") pe = kwargs.get("pe") vec = kwargs.get("vec") block_index = kwargs.get("block_index") + block_type = kwargs.get("block_type", "") spacial_compression = self.vae.spacial_compression_encode() if self.encoded_image is None or self.encoded_image_size != (x.shape[-2] * spacial_compression, x.shape[-1] * spacial_compression): image_scaled = comfy.utils.common_upscale(self.image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center") + inpaint_scaled = None + if self.inpaint_image is not None: + inpaint_scaled = comfy.utils.common_upscale(self.inpaint_image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center").movedim(1, -1) loaded_models = comfy.model_management.loaded_models(only_currently_used=True) - self.encoded_image = self.encode_latent_cond(image_scaled.movedim(1, -1)) + self.encoded_image = self.encode_latent_cond(image_scaled.movedim(1, -1), inpaint_scaled) self.encoded_image_size = (image_scaled.shape[-2], image_scaled.shape[-1]) comfy.model_management.load_models_gpu(loaded_models) - cnet_index = (block_index // 5) - cnet_index_float = (block_index / 5) + cnet_blocks = self.model_patch.model.n_control_layers + div = round(30 / cnet_blocks) + + cnet_index = (block_index // div) + cnet_index_float = (block_index / div) kwargs.pop("img") # we do ops in place kwargs.pop("txt") - cnet_blocks = self.model_patch.model.n_control_layers if cnet_index_float > (cnet_blocks - 1): self.temp_data = None return kwargs if self.temp_data is None or self.temp_data[0] > cnet_index: - self.temp_data = (-1, (None, self.model_patch.model(txt, self.encoded_image.to(img.dtype), pe, vec))) + if block_type == "noise_refiner": + self.temp_data = (-3, (None, self.model_patch.model(txt, self.encoded_image.to(img.dtype), pe, vec))) + else: + self.temp_data = (-1, (None, self.model_patch.model(txt, self.encoded_image.to(img.dtype), pe, vec))) - while self.temp_data[0] < cnet_index and (self.temp_data[0] + 1) < cnet_blocks: + if block_type == "noise_refiner": next_layer = self.temp_data[0] + 1 - self.temp_data = (next_layer, self.model_patch.model.forward_control_block(next_layer, self.temp_data[1][1], img[:, :self.temp_data[1][1].shape[1]], None, pe, vec)) + self.temp_data = (next_layer, self.model_patch.model.forward_noise_refiner_block(block_index, self.temp_data[1][1], img_input[:, :self.temp_data[1][1].shape[1]], None, pe, vec)) + if self.temp_data[1][0] is not None: + img[:, :self.temp_data[1][0].shape[1]] += (self.temp_data[1][0] * self.strength) + else: + while self.temp_data[0] < cnet_index and (self.temp_data[0] + 1) < cnet_blocks: + next_layer = self.temp_data[0] + 1 + self.temp_data = (next_layer, self.model_patch.model.forward_control_block(next_layer, self.temp_data[1][1], img_input[:, :self.temp_data[1][1].shape[1]], None, pe, vec)) - if cnet_index_float == self.temp_data[0]: - img[:, :self.temp_data[1][0].shape[1]] += (self.temp_data[1][0] * self.strength) - if cnet_blocks == self.temp_data[0] + 1: - self.temp_data = None + if cnet_index_float == self.temp_data[0]: + img[:, :self.temp_data[1][0].shape[1]] += (self.temp_data[1][0] * self.strength) + if cnet_blocks == self.temp_data[0] + 1: + self.temp_data = None return kwargs @@ -386,7 +422,9 @@ class QwenImageDiffsynthControlnet: mask = 1.0 - mask if isinstance(model_patch.model, comfy.ldm.lumina.controlnet.ZImage_Control): - model_patched.set_model_double_block_patch(ZImageControlPatch(model_patch, vae, image, strength)) + patch = ZImageControlPatch(model_patch, vae, image, strength, mask=mask) + model_patched.set_model_noise_refiner_patch(patch) + model_patched.set_model_double_block_patch(patch) else: model_patched.set_model_double_block_patch(DiffSynthCnetPatch(model_patch, vae, image, strength, mask)) return (model_patched,) From 971cefe7d4ca15c949d5d901a663cb66562a4f10 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 13 Dec 2025 15:45:23 -0800 Subject: [PATCH 20/21] Fix pytorch warnings. (#11314) --- comfy/ops.py | 2 +- comfy/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/ops.py b/comfy/ops.py index 0384c8717..16889bb82 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -592,7 +592,7 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec quant_conf = {"format": self.quant_format} if self._full_precision_mm: quant_conf["full_precision_matrix_mult"] = True - sd["{}comfy_quant".format(prefix)] = torch.frombuffer(json.dumps(quant_conf).encode('utf-8'), dtype=torch.uint8) + sd["{}comfy_quant".format(prefix)] = torch.tensor(list(json.dumps(quant_conf).encode('utf-8')), dtype=torch.uint8) return sd def _forward(self, input, weight, bias): diff --git a/comfy/utils.py b/comfy/utils.py index 9dc0d76ac..3866cda2e 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -1262,6 +1262,6 @@ def convert_old_quants(state_dict, model_prefix="", metadata={}): if quant_metadata is not None: layers = quant_metadata["layers"] for k, v in layers.items(): - state_dict["{}.comfy_quant".format(k)] = torch.frombuffer(json.dumps(v).encode('utf-8'), dtype=torch.uint8) + state_dict["{}.comfy_quant".format(k)] = torch.tensor(list(json.dumps(v).encode('utf-8')), dtype=torch.uint8) return state_dict, metadata From 6592bffc609da4738b111dbffca1f473972f3574 Mon Sep 17 00:00:00 2001 From: chaObserv <154517000+chaObserv@users.noreply.github.com> Date: Sun, 14 Dec 2025 13:03:29 +0800 Subject: [PATCH 21/21] seeds_2: add phi_2 variant and sampler node (#11309) * Add phi_2 solver type to seeds_2 * Add sampler node of seeds_2 --- comfy/k_diffusion/sampling.py | 15 ++++++++++++--- comfy_extras/nodes_custom_sampler.py | 26 ++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/comfy/k_diffusion/sampling.py b/comfy/k_diffusion/sampling.py index 0e2cda291..753c66afa 100644 --- a/comfy/k_diffusion/sampling.py +++ b/comfy/k_diffusion/sampling.py @@ -1557,10 +1557,13 @@ def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None @torch.no_grad() -def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=0.5): +def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=0.5, solver_type="phi_1"): """SEEDS-2 - Stochastic Explicit Exponential Derivative-free Solvers (VP Data Prediction) stage 2. arXiv: https://arxiv.org/abs/2305.14267 (NeurIPS 2023) """ + if solver_type not in {"phi_1", "phi_2"}: + raise ValueError("solver_type must be 'phi_1' or 'phi_2'") + extra_args = {} if extra_args is None else extra_args seed = extra_args.get("seed", None) noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler @@ -1600,8 +1603,14 @@ def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=Non denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args) # Step 2 - denoised_d = torch.lerp(denoised, denoised_2, fac) - x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * ei_h_phi_1(-h_eta) * denoised_d + if solver_type == "phi_1": + denoised_d = torch.lerp(denoised, denoised_2, fac) + x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * ei_h_phi_1(-h_eta) * denoised_d + elif solver_type == "phi_2": + b2 = ei_h_phi_2(-h_eta) / r + b1 = ei_h_phi_1(-h_eta) - b2 + x = sigmas[i + 1] / sigmas[i] * (-h * eta).exp() * x - alpha_t * (b1 * denoised + b2 * denoised_2) + if inject_noise: segment_factor = (r - 1) * h * eta sde_noise = sde_noise * segment_factor.exp() diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index fbb080886..71ea4e9ec 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -659,6 +659,31 @@ class SamplerSASolver(io.ComfyNode): get_sampler = execute +class SamplerSEEDS2(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="SamplerSEEDS2", + category="sampling/custom_sampling/samplers", + inputs=[ + io.Combo.Input("solver_type", options=["phi_1", "phi_2"]), + io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False, tooltip="Stochastic strength"), + io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False, tooltip="SDE noise multiplier"), + io.Float.Input("r", default=0.5, min=0.01, max=1.0, step=0.01, round=False, tooltip="Relative step size for the intermediate stage (c2 node)"), + ], + outputs=[io.Sampler.Output()] + ) + + @classmethod + def execute(cls, solver_type, eta, s_noise, r) -> io.NodeOutput: + sampler_name = "seeds_2" + sampler = comfy.samplers.ksampler( + sampler_name, + {"eta": eta, "s_noise": s_noise, "r": r, "solver_type": solver_type}, + ) + return io.NodeOutput(sampler) + + class Noise_EmptyNoise: def __init__(self): self.seed = 0 @@ -996,6 +1021,7 @@ class CustomSamplersExtension(ComfyExtension): SamplerDPMAdaptative, SamplerER_SDE, SamplerSASolver, + SamplerSEEDS2, SplitSigmas, SplitSigmasDenoise, FlipSigmas,