Merge branch 'master' into dr-support-pip-cm

2026-07-11 17:07:14 +08:00 · 2025-10-14 07:36:42 +09:00 · 2025-10-14 07:36:42 +09:00 · 2b47f4a38e
commit 2b47f4a38e
parent a3af8f35c2 3dfdcf66b6
5 changed files with 189 additions and 137 deletions
--- a/comfy/ldm/wan/vae2_2.py
+++ b/comfy/ldm/wan/vae2_2.py
@@ -657,51 +657,51 @@ class WanVAE(nn.Module):
        )
    def encode(self, x):
-        self.clear_cache()
+        conv_idx = [0]
        feat_map = [None] * count_conv3d(self.encoder)
        x = patchify(x, patch_size=2)
        t = x.shape[2]
        iter_ = 1 + (t - 1) // 4
        for i in range(iter_):
-            self._enc_conv_idx = [0]
+            conv_idx = [0]
            if i == 0:
                out = self.encoder(
                    x[:, :, :1, :, :],
-                    feat_cache=self._enc_feat_map,
+                    feat_cache=feat_map,
-                    feat_idx=self._enc_conv_idx,
+                    feat_idx=conv_idx,
                )
            else:
                out_ = self.encoder(
                    x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :],
-                    feat_cache=self._enc_feat_map,
+                    feat_cache=feat_map,
-                    feat_idx=self._enc_conv_idx,
+                    feat_idx=conv_idx,
                )
                out = torch.cat([out, out_], 2)
        mu, log_var = self.conv1(out).chunk(2, dim=1)
        self.clear_cache()
        return mu
    def decode(self, z):
-        self.clear_cache()
+        conv_idx = [0]
        feat_map = [None] * count_conv3d(self.decoder)
        iter_ = z.shape[2]
        x = self.conv2(z)
        for i in range(iter_):
-            self._conv_idx = [0]
+            conv_idx = [0]
            if i == 0:
                out = self.decoder(
                    x[:, :, i:i + 1, :, :],
-                    feat_cache=self._feat_map,
+                    feat_cache=feat_map,
-                    feat_idx=self._conv_idx,
+                    feat_idx=conv_idx,
                    first_chunk=True,
                )
            else:
                out_ = self.decoder(
                    x[:, :, i:i + 1, :, :],
-                    feat_cache=self._feat_map,
+                    feat_cache=feat_map,
-                    feat_idx=self._conv_idx,
+                    feat_idx=conv_idx,
                )
                out = torch.cat([out, out_], 2)
        out = unpatchify(out, patch_size=2)
        self.clear_cache()
        return out
    def reparameterize(self, mu, log_var):
@@ -715,12 +715,3 @@ class WanVAE(nn.Module):
            return mu
        std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
        return mu + std * torch.randn_like(std)
    def clear_cache(self):
        self._conv_num = count_conv3d(self.decoder)
        self._conv_idx = [0]
        self._feat_map = [None] * self._conv_num
        # cache encode
        self._enc_conv_num = count_conv3d(self.encoder)
        self._enc_conv_idx = [0]
        self._enc_feat_map = [None] * self._enc_conv_num
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -138,6 +138,7 @@ class BaseModel(torch.nn.Module):
            else:
                operations = model_config.custom_operations
            self.diffusion_model = unet_model(**unet_config, device=device, operations=operations)
            self.diffusion_model.eval()
            if comfy.model_management.force_channels_last():
                self.diffusion_model.to(memory_format=torch.channels_last)
                logging.debug("using channels last mode for diffusion model")
@@ -669,7 +670,6 @@ class Lotus(BaseModel):
 class StableCascade_C(BaseModel):
    def __init__(self, model_config, model_type=ModelType.STABLE_CASCADE, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=StageC)
        self.diffusion_model.eval().requires_grad_(False)
    def extra_conds(self, **kwargs):
        out = {}
@@ -698,7 +698,6 @@ class StableCascade_C(BaseModel):
 class StableCascade_B(BaseModel):
    def __init__(self, model_config, model_type=ModelType.STABLE_CASCADE, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=StageB)
        self.diffusion_model.eval().requires_grad_(False)
    def extra_conds(self, **kwargs):
        out = {}
--- a/comfy_extras/nodes_hunyuan.py
+++ b/comfy_extras/nodes_hunyuan.py
@@ -2,42 +2,60 @@ import nodes
 import node_helpers
 import torch
 import comfy.model_management
 from typing_extensions import override
 from comfy_api.latest import ComfyExtension, io
-class CLIPTextEncodeHunyuanDiT:
+class CLIPTextEncodeHunyuanDiT(io.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
+    def define_schema(cls):
-        return {"required": {
+        return io.Schema(
-            "clip": ("CLIP", ),
+            node_id="CLIPTextEncodeHunyuanDiT",
-            "bert": ("STRING", {"multiline": True, "dynamicPrompts": True}),
+            category="advanced/conditioning",
-            "mt5xl": ("STRING", {"multiline": True, "dynamicPrompts": True}),
+            inputs=[
-            }}
+                io.Clip.Input("clip"),
-    RETURN_TYPES = ("CONDITIONING",)
+                io.String.Input("bert", multiline=True, dynamic_prompts=True),
-    FUNCTION = "encode"
+                io.String.Input("mt5xl", multiline=True, dynamic_prompts=True),
            ],
            outputs=[
                io.Conditioning.Output(),
            ],
        )
-    CATEGORY = "advanced/conditioning"
+    @classmethod
-
+    def execute(cls, clip, bert, mt5xl) -> io.NodeOutput:
    def encode(self, clip, bert, mt5xl):
        tokens = clip.tokenize(bert)
        tokens["mt5xl"] = clip.tokenize(mt5xl)["mt5xl"]
-        return (clip.encode_from_tokens_scheduled(tokens), )
+        return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens))
-class EmptyHunyuanLatentVideo:
+    encode = execute  # TODO: remove
 class EmptyHunyuanLatentVideo(io.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
+    def define_schema(cls):
-        return {"required": { "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
+        return io.Schema(
-                              "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
+            node_id="EmptyHunyuanLatentVideo",
-                              "length": ("INT", {"default": 25, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
+            category="latent/video",
-                              "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}}
+            inputs=[
-    RETURN_TYPES = ("LATENT",)
+                io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
-    FUNCTION = "generate"
+                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
                io.Int.Input("length", default=25, min=1, max=nodes.MAX_RESOLUTION, step=4),
                io.Int.Input("batch_size", default=1, min=1, max=4096),
            ],
            outputs=[
                io.Latent.Output(),
            ],
        )
-    CATEGORY = "latent/video"
+    @classmethod
-
+    def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput:
    def generate(self, width, height, length, batch_size=1):
        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
-        return ({"samples":latent}, )
+        return io.NodeOutput({"samples":latent})
    generate = execute  # TODO: remove
 PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = (
    "<|start_header_id|>system<|end_header_id|>\n\n<image>\nDescribe the video by detailing the following aspects according to the reference image: "
@@ -50,45 +68,61 @@ PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = (
    "<|start_header_id|>assistant<|end_header_id|>\n\n"
 )
-class TextEncodeHunyuanVideo_ImageToVideo:
+class TextEncodeHunyuanVideo_ImageToVideo(io.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
+    def define_schema(cls):
-        return {"required": {
+        return io.Schema(
-            "clip": ("CLIP", ),
+            node_id="TextEncodeHunyuanVideo_ImageToVideo",
-            "clip_vision_output": ("CLIP_VISION_OUTPUT", ),
+            category="advanced/conditioning",
-            "prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}),
+            inputs=[
-            "image_interleave": ("INT", {"default": 2, "min": 1, "max": 512, "tooltip": "How much the image influences things vs the text prompt. Higher number means more influence from the text prompt."}),
+                io.Clip.Input("clip"),
-            }}
+                io.ClipVisionOutput.Input("clip_vision_output"),
-    RETURN_TYPES = ("CONDITIONING",)
+                io.String.Input("prompt", multiline=True, dynamic_prompts=True),
-    FUNCTION = "encode"
+                io.Int.Input(
                    "image_interleave",
                    default=2,
                    min=1,
                    max=512,
                    tooltip="How much the image influences things vs the text prompt. Higher number means more influence from the text prompt.",
                ),
            ],
            outputs=[
                io.Conditioning.Output(),
            ],
        )
-    CATEGORY = "advanced/conditioning"
+    @classmethod
-
+    def execute(cls, clip, clip_vision_output, prompt, image_interleave) -> io.NodeOutput:
    def encode(self, clip, clip_vision_output, prompt, image_interleave):
        tokens = clip.tokenize(prompt, llama_template=PROMPT_TEMPLATE_ENCODE_VIDEO_I2V, image_embeds=clip_vision_output.mm_projected, image_interleave=image_interleave)
-        return (clip.encode_from_tokens_scheduled(tokens), )
+        return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens))
-class HunyuanImageToVideo:
+    encode = execute  # TODO: remove
 class HunyuanImageToVideo(io.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
+    def define_schema(cls):
-        return {"required": {"positive": ("CONDITIONING", ),
+        return io.Schema(
-                             "vae": ("VAE", ),
+            node_id="HunyuanImageToVideo",
-                             "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
+            category="conditioning/video_models",
-                             "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
+            inputs=[
-                             "length": ("INT", {"default": 53, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
+                io.Conditioning.Input("positive"),
-                             "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
+                io.Vae.Input("vae"),
-                             "guidance_type": (["v1 (concat)", "v2 (replace)", "custom"], )
+                io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
-                },
+                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
-                "optional": {"start_image": ("IMAGE", ),
+                io.Int.Input("length", default=53, min=1, max=nodes.MAX_RESOLUTION, step=4),
-                }}
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
                io.Combo.Input("guidance_type", options=["v1 (concat)", "v2 (replace)", "custom"]),
                io.Image.Input("start_image", optional=True),
            ],
            outputs=[
                io.Conditioning.Output(display_name="positive"),
                io.Latent.Output(display_name="latent"),
            ],
        )
-    RETURN_TYPES = ("CONDITIONING", "LATENT")
+    @classmethod
-    RETURN_NAMES = ("positive", "latent")
+    def execute(cls, positive, vae, width, height, length, batch_size, guidance_type, start_image=None) -> io.NodeOutput:
    FUNCTION = "encode"
    CATEGORY = "conditioning/video_models"
    def encode(self, positive, vae, width, height, length, batch_size, guidance_type, start_image=None):
        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        out_latent = {}
@@ -111,51 +145,76 @@ class HunyuanImageToVideo:
            positive = node_helpers.conditioning_set_values(positive, cond)
        out_latent["samples"] = latent
-        return (positive, out_latent)
+        return io.NodeOutput(positive, out_latent)
-class EmptyHunyuanImageLatent:
+    encode = execute  # TODO: remove
 class EmptyHunyuanImageLatent(io.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
+    def define_schema(cls):
-        return {"required": { "width": ("INT", {"default": 2048, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}),
+        return io.Schema(
-                              "height": ("INT", {"default": 2048, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}),
+            node_id="EmptyHunyuanImageLatent",
-                              "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}}
+            category="latent",
-    RETURN_TYPES = ("LATENT",)
+            inputs=[
-    FUNCTION = "generate"
+                io.Int.Input("width", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32),
                io.Int.Input("height", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32),
                io.Int.Input("batch_size", default=1, min=1, max=4096),
            ],
            outputs=[
                io.Latent.Output(),
            ],
        )
-    CATEGORY = "latent"
+    @classmethod
-
+    def execute(cls, width, height, batch_size=1) -> io.NodeOutput:
    def generate(self, width, height, batch_size=1):
        latent = torch.zeros([batch_size, 64, height // 32, width // 32], device=comfy.model_management.intermediate_device())
-        return ({"samples":latent}, )
+        return io.NodeOutput({"samples":latent})
-class HunyuanRefinerLatent:
+    generate = execute  # TODO: remove
 class HunyuanRefinerLatent(io.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
+    def define_schema(cls):
-        return {"required": {"positive": ("CONDITIONING", ),
+        return io.Schema(
-                             "negative": ("CONDITIONING", ),
+            node_id="HunyuanRefinerLatent",
-                             "latent": ("LATENT", ),
+            inputs=[
-                             "noise_augmentation": ("FLOAT", {"default": 0.10, "min": 0.0, "max": 1.0, "step": 0.01}),
+                io.Conditioning.Input("positive"),
-                             }}
+                io.Conditioning.Input("negative"),
                io.Latent.Input("latent"),
                io.Float.Input("noise_augmentation", default=0.10, min=0.0, max=1.0, step=0.01),
-    RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
+            ],
-    RETURN_NAMES = ("positive", "negative", "latent")
+            outputs=[
                io.Conditioning.Output(display_name="positive"),
                io.Conditioning.Output(display_name="negative"),
                io.Latent.Output(display_name="latent"),
            ],
        )
-    FUNCTION = "execute"
+    @classmethod
-
+    def execute(cls, positive, negative, latent, noise_augmentation) -> io.NodeOutput:
    def execute(self, positive, negative, latent, noise_augmentation):
        latent = latent["samples"]
        positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": latent, "noise_augmentation": noise_augmentation})
        negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": latent, "noise_augmentation": noise_augmentation})
        out_latent = {}
        out_latent["samples"] = torch.zeros([latent.shape[0], 32, latent.shape[-3], latent.shape[-2], latent.shape[-1]], device=comfy.model_management.intermediate_device())
-        return (positive, negative, out_latent)
+        return io.NodeOutput(positive, negative, out_latent)
-NODE_CLASS_MAPPINGS = {
+class HunyuanExtension(ComfyExtension):
-    "CLIPTextEncodeHunyuanDiT": CLIPTextEncodeHunyuanDiT,
+    @override
-    "TextEncodeHunyuanVideo_ImageToVideo": TextEncodeHunyuanVideo_ImageToVideo,
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
-    "EmptyHunyuanLatentVideo": EmptyHunyuanLatentVideo,
+        return [
-    "HunyuanImageToVideo": HunyuanImageToVideo,
+            CLIPTextEncodeHunyuanDiT,
-    "EmptyHunyuanImageLatent": EmptyHunyuanImageLatent,
+            TextEncodeHunyuanVideo_ImageToVideo,
-    "HunyuanRefinerLatent": HunyuanRefinerLatent,
+            EmptyHunyuanLatentVideo,
-}
+            HunyuanImageToVideo,
            EmptyHunyuanImageLatent,
            HunyuanRefinerLatent,
        ]
 async def comfy_entrypoint() -> HunyuanExtension:
    return HunyuanExtension()
--- a/comfy_extras/nodes_preview_any.py
+++ b/comfy_extras/nodes_preview_any.py
@@ -25,7 +25,7 @@ class PreviewAny():
            value = str(source)
        elif source is not None:
            try:
-                value = json.dumps(source)
+                value = json.dumps(source, indent=4)
            except Exception:
                try:
                    value = str(source)
--- a/extra_model_paths.yaml.example
+++ b/extra_model_paths.yaml.example
@@ -1,25 +1,5 @@
 #Rename this to extra_model_paths.yaml and ComfyUI will load it
 #config for a1111 ui
 #all you have to do is change the base_path to where yours is installed
 a111:
    base_path: path/to/stable-diffusion-webui/
    checkpoints: models/Stable-diffusion
    configs: models/Stable-diffusion
    vae: models/VAE
    loras: |
         models/Lora
         models/LyCORIS
    upscale_models: |
                  models/ESRGAN
                  models/RealESRGAN
                  models/SwinIR
    embeddings: embeddings
    hypernetworks: models/hypernetworks
    controlnet: models/ControlNet
 #config for comfyui
 #your base path should be either an existing comfy install or a central folder where you store all of your models, loras, etc.
@@ -41,6 +21,29 @@ a111:
 #     loras: models/loras/
 #     upscale_models: models/upscale_models/
 #     vae: models/vae/
 #     audio_encoders: models/audio_encoders/
 #     model_patches: models/model_patches/
 #config for a1111 ui
 #all you have to do is uncomment this (remove the #) and change the base_path to where yours is installed
 #a111:
 #     base_path: path/to/stable-diffusion-webui/
 #     checkpoints: models/Stable-diffusion
 #     configs: models/Stable-diffusion
 #     vae: models/VAE
 #     loras: |
 #          models/Lora
 #          models/LyCORIS
 #     upscale_models: |
 #                   models/ESRGAN
 #                   models/RealESRGAN
 #                   models/SwinIR
 #     embeddings: embeddings
 #     hypernetworks: models/hypernetworks
 #     controlnet: models/ControlNet
 # For a full list of supported keys (style_models, vae_approx, hypernetworks, photomaker,
 # model_patches, audio_encoders, classifiers, etc.) see folder_paths.py.