From 8d7c930246bd33c32eb957b01ab0d364af6e81c0 Mon Sep 17 00:00:00 2001
From: comfyanonymous <comfyanonymous@protonmail.com>
Date: Wed, 10 Sep 2025 10:51:02 -0400
Subject: [PATCH 01/10] ComfyUI version v0.3.58

---
 comfyui_version.py | 2 +-
 pyproject.toml     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/comfyui_version.py b/comfyui_version.py
index 4cc3c8647..37361bd75 100644
--- a/comfyui_version.py
+++ b/comfyui_version.py
@@ -1,3 +1,3 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.3.57"
+__version__ = "0.3.58"
diff --git a/pyproject.toml b/pyproject.toml
index d75cd04a2..f02ab9126 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "ComfyUI"
-version = "0.3.57"
+version = "0.3.58"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.9"

From 9b0553809cbac084aac0576892aca3e448eb07c7 Mon Sep 17 00:00:00 2001
From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com>
Date: Thu, 11 Sep 2025 00:13:18 +0300
Subject: [PATCH 02/10] add new ByteDanceSeedream (4.0) node (#9802)

---
 comfy_api_nodes/nodes_bytedance.py | 208 ++++++++++++++++++++++++++++-
 1 file changed, 207 insertions(+), 1 deletion(-)

diff --git a/comfy_api_nodes/nodes_bytedance.py b/comfy_api_nodes/nodes_bytedance.py
index 064df2d10..369a3a4fe 100644
--- a/comfy_api_nodes/nodes_bytedance.py
+++ b/comfy_api_nodes/nodes_bytedance.py
@@ -77,6 +77,22 @@ class Image2ImageTaskCreationRequest(BaseModel):
     watermark: Optional[bool] = Field(True)
 
 
+class Seedream4Options(BaseModel):
+    max_images: int = Field(15)
+
+
+class Seedream4TaskCreationRequest(BaseModel):
+    model: str = Field("seedream-4-0-250828")
+    prompt: str = Field(...)
+    response_format: str = Field("url")
+    image: Optional[list[str]] = Field(None, description="Image URLs")
+    size: str = Field(...)
+    seed: int = Field(..., ge=0, le=2147483647)
+    sequential_image_generation: str = Field("disabled")
+    sequential_image_generation_options: Seedream4Options = Field(Seedream4Options(max_images=15))
+    watermark: bool = Field(True)
+
+
 class ImageTaskCreationResponse(BaseModel):
     model: str = Field(...)
     created: int = Field(..., description="Unix timestamp (in seconds) indicating time when the request was created.")
@@ -143,6 +159,19 @@ RECOMMENDED_PRESETS = [
     ("Custom", None, None),
 ]
 
+RECOMMENDED_PRESETS_SEEDREAM_4 = [
+    ("2048x2048 (1:1)", 2048, 2048),
+    ("2304x1728 (4:3)", 2304, 1728),
+    ("1728x2304 (3:4)", 1728, 2304),
+    ("2560x1440 (16:9)", 2560, 1440),
+    ("1440x2560 (9:16)", 1440, 2560),
+    ("2496x1664 (3:2)", 2496, 1664),
+    ("1664x2496 (2:3)", 1664, 2496),
+    ("3024x1296 (21:9)", 3024, 1296),
+    ("4096x4096 (1:1)", 4096, 4096),
+    ("Custom", None, None),
+]
+
 # The time in this dictionary are given for 10 seconds duration.
 VIDEO_TASKS_EXECUTION_TIME = {
     "seedance-1-0-lite-t2v-250428": {
@@ -348,7 +377,7 @@ class ByteDanceImageEditNode(comfy_io.ComfyNode):
         return comfy_io.Schema(
             node_id="ByteDanceImageEditNode",
             display_name="ByteDance Image Edit",
-            category="api node/video/ByteDance",
+            category="api node/image/ByteDance",
             description="Edit images using ByteDance models via api based on prompt",
             inputs=[
                 comfy_io.Combo.Input(
@@ -451,6 +480,182 @@ class ByteDanceImageEditNode(comfy_io.ComfyNode):
         return comfy_io.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response)))
 
 
+class ByteDanceSeedreamNode(comfy_io.ComfyNode):
+
+    @classmethod
+    def define_schema(cls):
+        return comfy_io.Schema(
+            node_id="ByteDanceSeedreamNode",
+            display_name="ByteDance Seedream 4",
+            category="api node/image/ByteDance",
+            description="Unified text-to-image generation and precise single-sentence editing at up to 4K resolution.",
+            inputs=[
+                comfy_io.Combo.Input(
+                    "model",
+                    options=["seedream-4-0-250828"],
+                    tooltip="Model name",
+                ),
+                comfy_io.String.Input(
+                    "prompt",
+                    multiline=True,
+                    default="",
+                    tooltip="Text prompt for creating or editing an image.",
+                ),
+                comfy_io.Image.Input(
+                    "image",
+                    tooltip="Input image(s) for image-to-image generation. "
+                            "List of 1-10 images for single or multi-reference generation.",
+                    optional=True,
+                ),
+                comfy_io.Combo.Input(
+                    "size_preset",
+                    options=[label for label, _, _ in RECOMMENDED_PRESETS_SEEDREAM_4],
+                    tooltip="Pick a recommended size. Select Custom to use the width and height below.",
+                ),
+                comfy_io.Int.Input(
+                    "width",
+                    default=2048,
+                    min=1024,
+                    max=4096,
+                    step=64,
+                    tooltip="Custom width for image. Value is working only if `size_preset` is set to `Custom`",
+                    optional=True,
+                ),
+                comfy_io.Int.Input(
+                    "height",
+                    default=2048,
+                    min=1024,
+                    max=4096,
+                    step=64,
+                    tooltip="Custom height for image. Value is working only if `size_preset` is set to `Custom`",
+                    optional=True,
+                ),
+                comfy_io.Combo.Input(
+                    "sequential_image_generation",
+                    options=["disabled", "auto"],
+                    tooltip="Group image generation mode. "
+                            "'disabled' generates a single image. "
+                            "'auto' lets the model decide whether to generate multiple related images "
+                            "(e.g., story scenes, character variations).",
+                    optional=True,
+                ),
+                comfy_io.Int.Input(
+                    "max_images",
+                    default=1,
+                    min=1,
+                    max=15,
+                    step=1,
+                    display_mode=comfy_io.NumberDisplay.number,
+                    tooltip="Maximum number of images to generate when sequential_image_generation='auto'. "
+                            "Total images (input + generated) cannot exceed 15.",
+                    optional=True,
+                ),
+                comfy_io.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    step=1,
+                    display_mode=comfy_io.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="Seed to use for generation.",
+                    optional=True,
+                ),
+                comfy_io.Boolean.Input(
+                    "watermark",
+                    default=True,
+                    tooltip="Whether to add an \"AI generated\" watermark to the image.",
+                    optional=True,
+                ),
+            ],
+            outputs=[
+                comfy_io.Image.Output(),
+            ],
+            hidden=[
+                comfy_io.Hidden.auth_token_comfy_org,
+                comfy_io.Hidden.api_key_comfy_org,
+                comfy_io.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model: str,
+        prompt: str,
+        image: torch.Tensor = None,
+        size_preset: str = RECOMMENDED_PRESETS_SEEDREAM_4[0][0],
+        width: int = 2048,
+        height: int = 2048,
+        sequential_image_generation: str = "disabled",
+        max_images: int = 1,
+        seed: int = 0,
+        watermark: bool = True,
+    ) -> comfy_io.NodeOutput:
+        validate_string(prompt, strip_whitespace=True, min_length=1)
+        w = h = None
+        for label, tw, th in RECOMMENDED_PRESETS_SEEDREAM_4:
+            if label == size_preset:
+                w, h = tw, th
+                break
+
+        if w is None or h is None:
+            w, h = width, height
+            if not (1024 <= w <= 4096) or not (1024 <= h <= 4096):
+                raise ValueError(
+                    f"Custom size out of range: {w}x{h}. "
+                    "Both width and height must be between 1024 and 4096 pixels."
+                )
+        n_input_images = get_number_of_images(image) if image is not None else 0
+        if n_input_images > 10:
+            raise ValueError(f"Maximum of 10 reference images are supported, but {n_input_images} received.")
+        if sequential_image_generation == "auto" and n_input_images + max_images > 15:
+            raise ValueError(
+                "The maximum number of generated images plus the number of reference images cannot exceed 15."
+            )
+        auth_kwargs = {
+            "auth_token": cls.hidden.auth_token_comfy_org,
+            "comfy_api_key": cls.hidden.api_key_comfy_org,
+        }
+        reference_images_urls = []
+        if n_input_images:
+            for i in image:
+                validate_image_aspect_ratio_range(i, (1, 3), (3, 1))
+            reference_images_urls = (await upload_images_to_comfyapi(
+                image,
+                max_images=n_input_images,
+                mime_type="image/png",
+                auth_kwargs=auth_kwargs,
+            ))
+        payload = Seedream4TaskCreationRequest(
+            model=model,
+            prompt=prompt,
+            image=reference_images_urls,
+            size=f"{w}x{h}",
+            seed=seed,
+            sequential_image_generation=sequential_image_generation,
+            sequential_image_generation_options=Seedream4Options(max_images=max_images),
+            watermark=watermark,
+        )
+        response = await SynchronousOperation(
+            endpoint=ApiEndpoint(
+                path=BYTEPLUS_IMAGE_ENDPOINT,
+                method=HttpMethod.POST,
+                request_model=Seedream4TaskCreationRequest,
+                response_model=ImageTaskCreationResponse,
+            ),
+            request=payload,
+            auth_kwargs=auth_kwargs,
+        ).execute()
+
+        if len(response.data) == 1:
+            return comfy_io.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response)))
+        return comfy_io.NodeOutput(
+            torch.cat([await download_url_to_image_tensor(str(i["url"])) for i in response.data])
+        )
+
+
 class ByteDanceTextToVideoNode(comfy_io.ComfyNode):
 
     @classmethod
@@ -1001,6 +1206,7 @@ class ByteDanceExtension(ComfyExtension):
         return [
             ByteDanceImageNode,
             ByteDanceImageEditNode,
+            ByteDanceSeedreamNode,
             ByteDanceTextToVideoNode,
             ByteDanceImageToVideoNode,
             ByteDanceFirstLastFrameNode,

From df34f1549a431c85a6326e87075a206228697cde Mon Sep 17 00:00:00 2001
From: ComfyUI Wiki <contact@comfyui-wiki.com>
Date: Thu, 11 Sep 2025 05:16:41 +0800
Subject: [PATCH 03/10] Update template to 0.1.78 (#9806)

* Update template to 0.1.77

* Update template to 0.1.78
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index ea1931d78..d31df0fec 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 comfyui-frontend-package==1.25.11
-comfyui-workflow-templates==0.1.76
+comfyui-workflow-templates==0.1.78
 comfyui-embedded-docs==0.2.6
 torch
 torchsde

From 72212fef660bcd7d9702fa52011d089c027a64d8 Mon Sep 17 00:00:00 2001
From: comfyanonymous <comfyanonymous@protonmail.com>
Date: Wed, 10 Sep 2025 17:25:41 -0400
Subject: [PATCH 04/10] ComfyUI version 0.3.59

---
 comfyui_version.py | 2 +-
 pyproject.toml     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/comfyui_version.py b/comfyui_version.py
index 37361bd75..ee58205f5 100644
--- a/comfyui_version.py
+++ b/comfyui_version.py
@@ -1,3 +1,3 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.3.58"
+__version__ = "0.3.59"
diff --git a/pyproject.toml b/pyproject.toml
index f02ab9126..a7fc1a5a6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "ComfyUI"
-version = "0.3.58"
+version = "0.3.59"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.9"

From e01e99d075852b94e93f27ea64ab862a49a7d2cc Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Wed, 10 Sep 2025 20:17:34 -0700
Subject: [PATCH 05/10] Support hunyuan image distilled model. (#9807)

---
 comfy/ldm/hunyuan_video/model.py | 14 ++++++++++++++
 comfy/model_detection.py         | 12 ++++++++++--
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/comfy/ldm/hunyuan_video/model.py b/comfy/ldm/hunyuan_video/model.py
index ca289c5bd..7732182a4 100644
--- a/comfy/ldm/hunyuan_video/model.py
+++ b/comfy/ldm/hunyuan_video/model.py
@@ -41,6 +41,7 @@ class HunyuanVideoParams:
     qkv_bias: bool
     guidance_embed: bool
     byt5: bool
+    meanflow: bool
 
 
 class SelfAttentionRef(nn.Module):
@@ -256,6 +257,11 @@ class HunyuanVideo(nn.Module):
         else:
             self.byt5_in = None
 
+        if params.meanflow:
+            self.time_r_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations)
+        else:
+            self.time_r_in = None
+
         if final_layer:
             self.final_layer = LastLayer(self.hidden_size, self.patch_size[-1], self.out_channels, dtype=dtype, device=device, operations=operations)
 
@@ -282,6 +288,14 @@ class HunyuanVideo(nn.Module):
         img = self.img_in(img)
         vec = self.time_in(timestep_embedding(timesteps, 256, time_factor=1.0).to(img.dtype))
 
+        if self.time_r_in is not None:
+            w = torch.where(transformer_options['sigmas'][0] == transformer_options['sample_sigmas'])[0]  # This most likely could be improved
+            if len(w) > 0:
+                timesteps_r = transformer_options['sample_sigmas'][w[0] + 1]
+                timesteps_r = timesteps_r.unsqueeze(0).to(device=timesteps.device, dtype=timesteps.dtype)
+                vec_r = self.time_r_in(timestep_embedding(timesteps_r, 256, time_factor=1000.0).to(img.dtype))
+                vec = (vec + vec_r) / 2
+
         if ref_latent is not None:
             ref_latent_ids = self.img_ids(ref_latent)
             ref_latent = self.img_in(ref_latent)
diff --git a/comfy/model_detection.py b/comfy/model_detection.py
index dbcbe5f5a..fe983cede 100644
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@@ -142,12 +142,20 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
         dit_config["in_channels"] = in_w.shape[1] #SkyReels img2video has 32 input channels
         dit_config["patch_size"] = list(in_w.shape[2:])
         dit_config["out_channels"] = out_w.shape[0] // math.prod(dit_config["patch_size"])
-        if '{}vector_in.in_layer.weight'.format(key_prefix) in state_dict:
+        if any(s.startswith('{}vector_in.'.format(key_prefix)) for s in state_dict_keys):
             dit_config["vec_in_dim"] = 768
-            dit_config["axes_dim"] = [16, 56, 56]
         else:
             dit_config["vec_in_dim"] = None
+
+        if len(dit_config["patch_size"]) == 2:
             dit_config["axes_dim"] = [64, 64]
+        else:
+            dit_config["axes_dim"] = [16, 56, 56]
+
+        if any(s.startswith('{}time_r_in.'.format(key_prefix)) for s in state_dict_keys):
+            dit_config["meanflow"] = True
+        else:
+            dit_config["meanflow"] = False
 
         dit_config["context_in_dim"] = state_dict['{}txt_in.input_embedder.weight'.format(key_prefix)].shape[1]
         dit_config["hidden_size"] = in_w.shape[0]

From df6850fae8a75126cb7a645e38d58cebcfd51096 Mon Sep 17 00:00:00 2001
From: ComfyUI Wiki <contact@comfyui-wiki.com>
Date: Fri, 12 Sep 2025 02:59:26 +0800
Subject: [PATCH 06/10] Update template to 0.1.81 (#9811)

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index d31df0fec..0e21967ef 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 comfyui-frontend-package==1.25.11
-comfyui-workflow-templates==0.1.78
+comfyui-workflow-templates==0.1.81
 comfyui-embedded-docs==0.2.6
 torch
 torchsde

From 18de0b28305fd8bf002d74e91c0630bd76b01d6b Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Thu, 11 Sep 2025 16:33:02 -0700
Subject: [PATCH 07/10] Fast preview for hunyuan image. (#9814)

---
 comfy/latent_formats.py | 68 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py
index 859ae8421..f975b5e11 100644
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@@ -538,6 +538,74 @@ class HunyuanImage21(LatentFormat):
     latent_dimensions = 2
     scale_factor = 0.75289
 
+    latent_rgb_factors = [
+        [-0.0154, -0.0397, -0.0521],
+        [ 0.0005,  0.0093,  0.0006],
+        [-0.0805, -0.0773, -0.0586],
+        [-0.0494, -0.0487, -0.0498],
+        [-0.0212, -0.0076, -0.0261],
+        [-0.0179, -0.0417, -0.0505],
+        [ 0.0158,  0.0310,  0.0239],
+        [ 0.0409,  0.0516,  0.0201],
+        [ 0.0350,  0.0553,  0.0036],
+        [-0.0447, -0.0327, -0.0479],
+        [-0.0038, -0.0221, -0.0365],
+        [-0.0423, -0.0718, -0.0654],
+        [ 0.0039,  0.0368,  0.0104],
+        [ 0.0655,  0.0217,  0.0122],
+        [ 0.0490,  0.1638,  0.2053],
+        [ 0.0932,  0.0829,  0.0650],
+        [-0.0186, -0.0209, -0.0135],
+        [-0.0080, -0.0076, -0.0148],
+        [-0.0284, -0.0201,  0.0011],
+        [-0.0642, -0.0294, -0.0777],
+        [-0.0035,  0.0076, -0.0140],
+        [ 0.0519,  0.0731,  0.0887],
+        [-0.0102,  0.0095,  0.0704],
+        [ 0.0068,  0.0218, -0.0023],
+        [-0.0726, -0.0486, -0.0519],
+        [ 0.0260,  0.0295,  0.0263],
+        [ 0.0250,  0.0333,  0.0341],
+        [ 0.0168, -0.0120, -0.0174],
+        [ 0.0226,  0.1037,  0.0114],
+        [ 0.2577,  0.1906,  0.1604],
+        [-0.0646, -0.0137, -0.0018],
+        [-0.0112,  0.0309,  0.0358],
+        [-0.0347,  0.0146, -0.0481],
+        [ 0.0234,  0.0179,  0.0201],
+        [ 0.0157,  0.0313,  0.0225],
+        [ 0.0423,  0.0675,  0.0524],
+        [-0.0031,  0.0027, -0.0255],
+        [ 0.0447,  0.0555,  0.0330],
+        [-0.0152,  0.0103,  0.0299],
+        [-0.0755, -0.0489, -0.0635],
+        [ 0.0853,  0.0788,  0.1017],
+        [-0.0272, -0.0294, -0.0471],
+        [ 0.0440,  0.0400, -0.0137],
+        [ 0.0335,  0.0317, -0.0036],
+        [-0.0344, -0.0621, -0.0984],
+        [-0.0127, -0.0630, -0.0620],
+        [-0.0648,  0.0360,  0.0924],
+        [-0.0781, -0.0801, -0.0409],
+        [ 0.0363,  0.0613,  0.0499],
+        [ 0.0238,  0.0034,  0.0041],
+        [-0.0135,  0.0258,  0.0310],
+        [ 0.0614,  0.1086,  0.0589],
+        [ 0.0428,  0.0350,  0.0205],
+        [ 0.0153,  0.0173, -0.0018],
+        [-0.0288, -0.0455, -0.0091],
+        [ 0.0344,  0.0109, -0.0157],
+        [-0.0205, -0.0247, -0.0187],
+        [ 0.0487,  0.0126,  0.0064],
+        [-0.0220, -0.0013,  0.0074],
+        [-0.0203, -0.0094, -0.0048],
+        [-0.0719,  0.0429, -0.0442],
+        [ 0.1042,  0.0497,  0.0356],
+        [-0.0659, -0.0578, -0.0280],
+        [-0.0060, -0.0322, -0.0234]]
+
+    latent_rgb_factors_bias = [0.0007, -0.0256, -0.0206]
+
 class Hunyuan3Dv2(LatentFormat):
     latent_channels = 64
     latent_dimensions = 1

From 33bd9ed9cb941127b335244c6cc0a8cdc1ac1696 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Thu, 11 Sep 2025 21:43:20 -0700
Subject: [PATCH 08/10] Implement hunyuan image refiner model. (#9817)

---
 comfy/latent_formats.py                     |   5 +
 comfy/ldm/hunyuan_video/model.py            |  11 +-
 comfy/ldm/hunyuan_video/vae_refiner.py      | 268 ++++++++++++++++++++
 comfy/ldm/models/autoencoder.py             |   6 +
 comfy/ldm/modules/diffusionmodules/model.py |  10 +-
 comfy/model_base.py                         |  20 ++
 comfy/sd.py                                 |  17 +-
 comfy/supported_models.py                   |  19 +-
 comfy_extras/nodes_hunyuan.py               |  23 ++
 9 files changed, 367 insertions(+), 12 deletions(-)
 create mode 100644 comfy/ldm/hunyuan_video/vae_refiner.py

diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py
index f975b5e11..894540879 100644
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@@ -606,6 +606,11 @@ class HunyuanImage21(LatentFormat):
 
     latent_rgb_factors_bias = [0.0007, -0.0256, -0.0206]
 
+class HunyuanImage21Refiner(LatentFormat):
+    latent_channels = 64
+    latent_dimensions = 3
+    scale_factor = 1.03682
+
 class Hunyuan3Dv2(LatentFormat):
     latent_channels = 64
     latent_dimensions = 1
diff --git a/comfy/ldm/hunyuan_video/model.py b/comfy/ldm/hunyuan_video/model.py
index 7732182a4..ca86b8bb1 100644
--- a/comfy/ldm/hunyuan_video/model.py
+++ b/comfy/ldm/hunyuan_video/model.py
@@ -278,6 +278,7 @@ class HunyuanVideo(nn.Module):
         guidance: Tensor = None,
         guiding_frame_index=None,
         ref_latent=None,
+        disable_time_r=False,
         control=None,
         transformer_options={},
     ) -> Tensor:
@@ -288,7 +289,7 @@ class HunyuanVideo(nn.Module):
         img = self.img_in(img)
         vec = self.time_in(timestep_embedding(timesteps, 256, time_factor=1.0).to(img.dtype))
 
-        if self.time_r_in is not None:
+        if (self.time_r_in is not None) and (not disable_time_r):
             w = torch.where(transformer_options['sigmas'][0] == transformer_options['sample_sigmas'])[0]  # This most likely could be improved
             if len(w) > 0:
                 timesteps_r = transformer_options['sample_sigmas'][w[0] + 1]
@@ -428,14 +429,14 @@ class HunyuanVideo(nn.Module):
         img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
         return repeat(img_ids, "h w c -> b (h w) c", b=bs)
 
-    def forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, control=None, transformer_options={}, **kwargs):
+    def forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs):
         return comfy.patcher_extension.WrapperExecutor.new_class_executor(
             self._forward,
             self,
             comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
-        ).execute(x, timestep, context, y, txt_byt5, guidance, attention_mask, guiding_frame_index, ref_latent, control, transformer_options, **kwargs)
+        ).execute(x, timestep, context, y, txt_byt5, guidance, attention_mask, guiding_frame_index, ref_latent, disable_time_r, control, transformer_options, **kwargs)
 
-    def _forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, control=None, transformer_options={}, **kwargs):
+    def _forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs):
         bs = x.shape[0]
         if len(self.patch_size) == 3:
             img_ids = self.img_ids(x)
@@ -443,5 +444,5 @@ class HunyuanVideo(nn.Module):
         else:
             img_ids = self.img_ids_2d(x)
             txt_ids = torch.zeros((bs, context.shape[1], 2), device=x.device, dtype=x.dtype)
-        out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, txt_byt5, guidance, guiding_frame_index, ref_latent, control=control, transformer_options=transformer_options)
+        out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, txt_byt5, guidance, guiding_frame_index, ref_latent, disable_time_r=disable_time_r, control=control, transformer_options=transformer_options)
         return out
diff --git a/comfy/ldm/hunyuan_video/vae_refiner.py b/comfy/ldm/hunyuan_video/vae_refiner.py
new file mode 100644
index 000000000..e3fff9bbe
--- /dev/null
+++ b/comfy/ldm/hunyuan_video/vae_refiner.py
@@ -0,0 +1,268 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock, VideoConv3d
+import comfy.ops
+import comfy.ldm.models.autoencoder
+ops = comfy.ops.disable_weight_init
+
+class RMS_norm(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        shape = (dim, 1, 1, 1)
+        self.scale = dim**0.5
+        self.gamma = nn.Parameter(torch.empty(shape))
+
+    def forward(self, x):
+        return F.normalize(x, dim=1) * self.scale * self.gamma
+
+class DnSmpl(nn.Module):
+    def __init__(self, ic, oc, tds=True):
+        super().__init__()
+        fct = 2 * 2 * 2 if tds else 1 * 2 * 2
+        assert oc % fct == 0
+        self.conv = VideoConv3d(ic, oc // fct, kernel_size=3)
+
+        self.tds = tds
+        self.gs = fct * ic // oc
+
+    def forward(self, x):
+        r1 = 2 if self.tds else 1
+        h = self.conv(x)
+
+        if self.tds:
+            hf = h[:, :, :1, :, :]
+            b, c, f, ht, wd = hf.shape
+            hf = hf.reshape(b, c, f, ht // 2, 2, wd // 2, 2)
+            hf = hf.permute(0, 4, 6, 1, 2, 3, 5)
+            hf = hf.reshape(b, 2 * 2 * c, f, ht // 2, wd // 2)
+            hf = torch.cat([hf, hf], dim=1)
+
+            hn = h[:, :, 1:, :, :]
+            b, c, frms, ht, wd = hn.shape
+            nf = frms // r1
+            hn = hn.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2)
+            hn = hn.permute(0, 3, 5, 7, 1, 2, 4, 6)
+            hn = hn.reshape(b, r1 * 2 * 2 * c, nf, ht // 2, wd // 2)
+
+            h = torch.cat([hf, hn], dim=2)
+
+            xf = x[:, :, :1, :, :]
+            b, ci, f, ht, wd = xf.shape
+            xf = xf.reshape(b, ci, f, ht // 2, 2, wd // 2, 2)
+            xf = xf.permute(0, 4, 6, 1, 2, 3, 5)
+            xf = xf.reshape(b, 2 * 2 * ci, f, ht // 2, wd // 2)
+            B, C, T, H, W = xf.shape
+            xf = xf.view(B, h.shape[1], self.gs // 2, T, H, W).mean(dim=2)
+
+            xn = x[:, :, 1:, :, :]
+            b, ci, frms, ht, wd = xn.shape
+            nf = frms // r1
+            xn = xn.reshape(b, ci, nf, r1, ht // 2, 2, wd // 2, 2)
+            xn = xn.permute(0, 3, 5, 7, 1, 2, 4, 6)
+            xn = xn.reshape(b, r1 * 2 * 2 * ci, nf, ht // 2, wd // 2)
+            B, C, T, H, W = xn.shape
+            xn = xn.view(B, h.shape[1], self.gs, T, H, W).mean(dim=2)
+            sc = torch.cat([xf, xn], dim=2)
+        else:
+            b, c, frms, ht, wd = h.shape
+            nf = frms // r1
+            h = h.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2)
+            h = h.permute(0, 3, 5, 7, 1, 2, 4, 6)
+            h = h.reshape(b, r1 * 2 * 2 * c, nf, ht // 2, wd // 2)
+
+            b, ci, frms, ht, wd = x.shape
+            nf = frms // r1
+            sc = x.reshape(b, ci, nf, r1, ht // 2, 2, wd // 2, 2)
+            sc = sc.permute(0, 3, 5, 7, 1, 2, 4, 6)
+            sc = sc.reshape(b, r1 * 2 * 2 * ci, nf, ht // 2, wd // 2)
+            B, C, T, H, W = sc.shape
+            sc = sc.view(B, h.shape[1], self.gs, T, H, W).mean(dim=2)
+
+        return h + sc
+
+
+class UpSmpl(nn.Module):
+    def __init__(self, ic, oc, tus=True):
+        super().__init__()
+        fct = 2 * 2 * 2 if tus else 1 * 2 * 2
+        self.conv = VideoConv3d(ic, oc * fct, kernel_size=3)
+
+        self.tus = tus
+        self.rp = fct * oc // ic
+
+    def forward(self, x):
+        r1 = 2 if self.tus else 1
+        h = self.conv(x)
+
+        if self.tus:
+            hf = h[:, :, :1, :, :]
+            b, c, f, ht, wd = hf.shape
+            nc = c // (2 * 2)
+            hf = hf.reshape(b, 2, 2, nc, f, ht, wd)
+            hf = hf.permute(0, 3, 4, 5, 1, 6, 2)
+            hf = hf.reshape(b, nc, f, ht * 2, wd * 2)
+            hf = hf[:, : hf.shape[1] // 2]
+
+            hn = h[:, :, 1:, :, :]
+            b, c, frms, ht, wd = hn.shape
+            nc = c // (r1 * 2 * 2)
+            hn = hn.reshape(b, r1, 2, 2, nc, frms, ht, wd)
+            hn = hn.permute(0, 4, 5, 1, 6, 2, 7, 3)
+            hn = hn.reshape(b, nc, frms * r1, ht * 2, wd * 2)
+
+            h = torch.cat([hf, hn], dim=2)
+
+            xf = x[:, :, :1, :, :]
+            b, ci, f, ht, wd = xf.shape
+            xf = xf.repeat_interleave(repeats=self.rp // 2, dim=1)
+            b, c, f, ht, wd = xf.shape
+            nc = c // (2 * 2)
+            xf = xf.reshape(b, 2, 2, nc, f, ht, wd)
+            xf = xf.permute(0, 3, 4, 5, 1, 6, 2)
+            xf = xf.reshape(b, nc, f, ht * 2, wd * 2)
+
+            xn = x[:, :, 1:, :, :]
+            xn = xn.repeat_interleave(repeats=self.rp, dim=1)
+            b, c, frms, ht, wd = xn.shape
+            nc = c // (r1 * 2 * 2)
+            xn = xn.reshape(b, r1, 2, 2, nc, frms, ht, wd)
+            xn = xn.permute(0, 4, 5, 1, 6, 2, 7, 3)
+            xn = xn.reshape(b, nc, frms * r1, ht * 2, wd * 2)
+            sc = torch.cat([xf, xn], dim=2)
+        else:
+            b, c, frms, ht, wd = h.shape
+            nc = c // (r1 * 2 * 2)
+            h = h.reshape(b, r1, 2, 2, nc, frms, ht, wd)
+            h = h.permute(0, 4, 5, 1, 6, 2, 7, 3)
+            h = h.reshape(b, nc, frms * r1, ht * 2, wd * 2)
+
+            sc = x.repeat_interleave(repeats=self.rp, dim=1)
+            b, c, frms, ht, wd = sc.shape
+            nc = c // (r1 * 2 * 2)
+            sc = sc.reshape(b, r1, 2, 2, nc, frms, ht, wd)
+            sc = sc.permute(0, 4, 5, 1, 6, 2, 7, 3)
+            sc = sc.reshape(b, nc, frms * r1, ht * 2, wd * 2)
+
+        return h + sc
+
+class Encoder(nn.Module):  # Conv/ResNet encoder; forward() inserts a singleton axis at dim 2 and folds pairs of dim-2 slices into channels on output
+    def __init__(self, in_channels, z_channels, block_out_channels, num_res_blocks,
+                 ffactor_spatial, ffactor_temporal, downsample_match_channel=True, **_):  # **_ accepts and ignores any extra config keys
+        super().__init__()
+        self.z_channels = z_channels
+        self.block_out_channels = block_out_channels
+        self.num_res_blocks = num_res_blocks
+        self.conv_in = VideoConv3d(in_channels, block_out_channels[0], 3, 1, 1)
+
+        self.down = nn.ModuleList()
+        ch = block_out_channels[0]  # running channel count fed into the next layer being built
+        depth = (ffactor_spatial >> 1).bit_length()  # stages with index < depth get a downsample layer; e.g. 16 -> 4
+        depth_temporal = ((ffactor_spatial // ffactor_temporal) >> 1).bit_length()  # stages with index >= this are built with tds=True; e.g. 16 // 4 -> 2
+
+        for i, tgt in enumerate(block_out_channels):
+            stage = nn.Module()  # bare container; submodules are attached as attributes
+            stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt,  # only the first block of a stage changes the channel count
+                                                     out_channels=tgt,
+                                                     temb_channels=0,  # 0 means ResnetBlock creates no temb_proj layer
+                                                     conv_op=VideoConv3d, norm_op=RMS_norm)
+                                        for j in range(num_res_blocks)])
+            ch = tgt
+            if i < depth:
+                nxt = block_out_channels[i + 1] if i + 1 < len(block_out_channels) and downsample_match_channel else ch  # widen to the next stage's channels when allowed, else keep ch
+                stage.downsample = DnSmpl(ch, nxt, tds=i >= depth_temporal)  # presumably tds toggles temporal downsampling - DnSmpl is defined outside this view, TODO confirm
+                ch = nxt
+            self.down.append(stage)
+
+        self.mid = nn.Module()  # forward() applies these as block_1 -> attn_1 -> block_2
+        self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm)
+        self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=RMS_norm)
+        self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm)
+
+        self.norm_out = RMS_norm(ch)
+        self.conv_out = VideoConv3d(ch, z_channels << 1, 3, 1, 1)  # emits 2 * z_channels channels, which are handed to the regularizer in forward()
+
+        self.regul = comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer()
+
+    def forward(self, x):
+        x = x.unsqueeze(2)  # insert a length-1 axis at dim 2 so x is 5-D below
+        x = self.conv_in(x)
+
+        for stage in self.down:
+            for blk in stage.block:
+                x = blk(x)
+            if hasattr(stage, 'downsample'):  # only stages with index < depth were given one in __init__
+                x = stage.downsample(x)
+
+        x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(x)))
+
+        b, c, t, h, w = x.shape
+        grp = c // (self.z_channels << 1)  # how many feature channels map onto each of the 2 * z_channels outputs
+        skip = x.view(b, c // grp, grp, t, h, w).mean(2)  # parameter-free shortcut: average each run of grp consecutive channels
+
+        out = self.conv_out(F.silu(self.norm_out(x))) + skip
+        out = self.regul(out)[0]  # keep only the first element of the regularizer's returned tuple
+
+        out = torch.cat((out[:, :, :1], out), dim=2)  # prepend a copy of the first slice along dim 2
+        out = out.permute(0, 2, 1, 3, 4)  # swap dims 1 and 2 so adjacent dim-2 slices can be merged by the reshape
+        b, f_times_2, c, h, w = out.shape  # f_times_2 is the slice count after the prepend; the reshape below needs it to be even
+        out = out.reshape(b, f_times_2 // 2, 2 * c, h, w)  # fold each pair of consecutive slices into the channel axis
+        out = out.permute(0, 2, 1, 3, 4).contiguous()  # swap dims 1 and 2 back; result is (b, 2 * c, f_times_2 // 2, h, w)
+        return out
+
+class Decoder(nn.Module):  # Conv/ResNet decoder; forward() first undoes the pair-into-channels packing done at the end of Encoder.forward
+    def __init__(self, z_channels, out_channels, block_out_channels, num_res_blocks,
+                 ffactor_spatial, ffactor_temporal, upsample_match_channel=True, **_):  # **_ accepts and ignores any extra config keys
+        super().__init__()
+        block_out_channels = block_out_channels[::-1]  # reversed copy: the decoder walks the channel list in the opposite order
+        self.z_channels = z_channels
+        self.block_out_channels = block_out_channels
+        self.num_res_blocks = num_res_blocks
+
+        ch = block_out_channels[0]  # running channel count fed into the next layer being built
+        self.conv_in = VideoConv3d(z_channels, ch, 3)
+
+        self.mid = nn.Module()  # forward() applies these as block_1 -> attn_1 -> block_2
+        self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm)
+        self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=RMS_norm)
+        self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm)
+
+        self.up = nn.ModuleList()
+        depth = (ffactor_spatial >> 1).bit_length()  # stages with index < depth get an upsample layer; e.g. 16 -> 4
+        depth_temporal = (ffactor_temporal >> 1).bit_length()  # stages with index < this are built with tus=True; e.g. 4 -> 2
+
+        for i, tgt in enumerate(block_out_channels):
+            stage = nn.Module()  # bare container; submodules are attached as attributes
+            stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt,  # only the first block of a stage changes the channel count
+                                                     out_channels=tgt,
+                                                     temb_channels=0,  # 0 means ResnetBlock creates no temb_proj layer
+                                                     conv_op=VideoConv3d, norm_op=RMS_norm)
+                                        for j in range(num_res_blocks + 1)])  # one more block per stage than the Encoder builds
+            ch = tgt
+            if i < depth:
+                nxt = block_out_channels[i + 1] if i + 1 < len(block_out_channels) and upsample_match_channel else ch  # move to the next stage's channels when allowed, else keep ch
+                stage.upsample = UpSmpl(ch, nxt, tus=i < depth_temporal)  # presumably tus toggles temporal upsampling - TODO confirm against UpSmpl
+                ch = nxt
+            self.up.append(stage)
+
+        self.norm_out = RMS_norm(ch)
+        self.conv_out = VideoConv3d(ch, out_channels, 3)
+
+    def forward(self, z):
+        z = z.permute(0, 2, 1, 3, 4)  # swap dims 1 and 2 so the channel axis can be split per slice
+        b, f, c, h, w = z.shape
+        z = z.reshape(b, f, 2, c // 2, h, w)  # split the channel axis into 2 halves of c // 2 channels each
+        z = z.permute(0, 1, 2, 3, 4, 5).reshape(b, f * 2, c // 2, h, w)  # permute uses the identity order (no reordering); the reshape turns each half into its own slice
+        z = z.permute(0, 2, 1, 3, 4)  # swap dims 1 and 2 back: (b, c // 2, f * 2, h, w)
+        z = z[:, :, 1:]  # drop the first slice along dim 2 (Encoder.forward prepends a duplicate of the first slice)
+
+        x = self.conv_in(z) + z.repeat_interleave(self.block_out_channels[0] // self.z_channels, 1)  # shortcut: each input channel repeated along dim 1 to match conv_in's width
+        x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(x)))
+
+        for stage in self.up:
+            for blk in stage.block:
+                x = blk(x)
+            if hasattr(stage, 'upsample'):  # only stages with index < depth were given one in __init__
+                x = stage.upsample(x)
+
+        return self.conv_out(F.silu(self.norm_out(x)))
diff --git a/comfy/ldm/models/autoencoder.py b/comfy/ldm/models/autoencoder.py
index 13bd6e16b..611d36a1b 100644
--- a/comfy/ldm/models/autoencoder.py
+++ b/comfy/ldm/models/autoencoder.py
@@ -26,6 +26,12 @@ class DiagonalGaussianRegularizer(torch.nn.Module):
             z = posterior.mode()
         return z, None
 
+class EmptyRegularizer(torch.nn.Module):  # Pass-through regularizer: hands the latent back unchanged
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, dict]:
+        return z, None  # same (latent, extra) pair as DiagonalGaussianRegularizer returns; the second item is None despite the dict annotation
 
 class AbstractAutoencoder(torch.nn.Module):
     """
diff --git a/comfy/ldm/modules/diffusionmodules/model.py b/comfy/ldm/modules/diffusionmodules/model.py
index 8f598a848..4245eedca 100644
--- a/comfy/ldm/modules/diffusionmodules/model.py
+++ b/comfy/ldm/modules/diffusionmodules/model.py
@@ -145,7 +145,7 @@ class Downsample(nn.Module):
 
 class ResnetBlock(nn.Module):
     def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
-                 dropout=0.0, temb_channels=512, conv_op=ops.Conv2d):
+                 dropout=0.0, temb_channels=512, conv_op=ops.Conv2d, norm_op=Normalize):
         super().__init__()
         self.in_channels = in_channels
         out_channels = in_channels if out_channels is None else out_channels
@@ -153,7 +153,7 @@ class ResnetBlock(nn.Module):
         self.use_conv_shortcut = conv_shortcut
 
         self.swish = torch.nn.SiLU(inplace=True)
-        self.norm1 = Normalize(in_channels)
+        self.norm1 = norm_op(in_channels)
         self.conv1 = conv_op(in_channels,
                                      out_channels,
                                      kernel_size=3,
@@ -162,7 +162,7 @@ class ResnetBlock(nn.Module):
         if temb_channels > 0:
             self.temb_proj = ops.Linear(temb_channels,
                                              out_channels)
-        self.norm2 = Normalize(out_channels)
+        self.norm2 = norm_op(out_channels)
         self.dropout = torch.nn.Dropout(dropout, inplace=True)
         self.conv2 = conv_op(out_channels,
                                      out_channels,
@@ -305,11 +305,11 @@ def vae_attention():
         return normal_attention
 
 class AttnBlock(nn.Module):
-    def __init__(self, in_channels, conv_op=ops.Conv2d):
+    def __init__(self, in_channels, conv_op=ops.Conv2d, norm_op=Normalize):
         super().__init__()
         self.in_channels = in_channels
 
-        self.norm = Normalize(in_channels)
+        self.norm = norm_op(in_channels)
         self.q = conv_op(in_channels,
                                  in_channels,
                                  kernel_size=1,
diff --git a/comfy/model_base.py b/comfy/model_base.py
index 993ff65e6..c69a9d1ad 100644
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -1432,3 +1432,23 @@ class HunyuanImage21(BaseModel):
             out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance]))
 
         return out
+
+class HunyuanImage21Refiner(HunyuanImage21):  # Refiner variant: supplies a concat latent and adds a disable_time_r cond
+    def concat_cond(self, **kwargs):  # returns the latent to concatenate: the supplied one resized to the noise, or zeros
+        noise = kwargs.get("noise", None)  # defaulted to None here, but noise.shape is read on every path below
+        image = kwargs.get("concat_latent_image", None)
+        device = kwargs["device"]  # required key: raises KeyError when absent
+
+        if image is None:
+            shape_image = list(noise.shape)
+            image = torch.zeros(shape_image, dtype=noise.dtype, layout=noise.layout, device=noise.device)  # all-zero stand-in matching noise exactly
+        else:
+            image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")  # resize to the noise's last two dims
+            image = self.process_latent_in(image)
+            image = utils.resize_to_batch_size(image, noise.shape[0])  # match the noise batch size
+        return image
+
+    def extra_conds(self, **kwargs):
+        out = super().extra_conds(**kwargs)
+        out['disable_time_r'] = comfy.conds.CONDConstant(True)  # always set for the refiner, on top of the parent's conds
+        return out
diff --git a/comfy/sd.py b/comfy/sd.py
index 9dd9a74d4..02ddc7239 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -285,6 +285,7 @@ class VAE:
         self.process_output = lambda image: torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0)
         self.working_dtypes = [torch.bfloat16, torch.float32]
         self.disable_offload = False
+        self.not_video = False
 
         self.downscale_index_formula = None
         self.upscale_index_formula = None
@@ -409,6 +410,20 @@ class VAE:
                 self.downscale_ratio = (lambda a: max(0, math.floor((a + 7) / 8)), 32, 32)
                 self.downscale_index_formula = (8, 32, 32)
                 self.working_dtypes = [torch.bfloat16, torch.float32]
+            elif "decoder.conv_in.conv.weight" in sd and sd['decoder.conv_in.conv.weight'].shape[1] == 32:  # checked before the generic "decoder.conv_in.conv.weight" branch that follows
+                ddconfig = {"block_out_channels": [128, 256, 512, 1024, 1024], "in_channels": 3, "out_channels": 3, "num_res_blocks": 2, "ffactor_spatial": 16, "ffactor_temporal": 4, "downsample_match_channel": True, "upsample_match_channel": True}  # one dict is passed to both Encoder and Decoder
+                self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.conv.weight"].shape[1]  # z_channels read from the checkpoint weight (32 in this branch)
+                self.downscale_ratio = 16
+                self.upscale_ratio = 16
+                self.latent_dim = 3
+                self.not_video = True  # makes encode() skip the movedim/unsqueeze it otherwise applies when latent_dim == 3
+                self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32]
+                self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.EmptyRegularizer"},  # pass-through here; the refiner Encoder applies its own regularizer
+                                                            encoder_config={'target': "comfy.ldm.hunyuan_video.vae_refiner.Encoder", 'params': ddconfig},
+                                                            decoder_config={'target': "comfy.ldm.hunyuan_video.vae_refiner.Decoder", 'params': ddconfig})
+
+                self.memory_used_encode = lambda shape, dtype: (1400 * shape[-2] * shape[-1]) * model_management.dtype_size(dtype)  # estimate scales with the last two dims of the input shape
+                self.memory_used_decode = lambda shape, dtype: (1400 * shape[-3] * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype)  # estimate scales with the last three dims times 16 * 16
             elif "decoder.conv_in.conv.weight" in sd:
                 ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
                 ddconfig["conv3d"] = True
@@ -669,7 +684,7 @@ class VAE:
         self.throw_exception_if_invalid()
         pixel_samples = self.vae_encode_crop_pixels(pixel_samples)
         pixel_samples = pixel_samples.movedim(-1, 1)
-        if self.latent_dim == 3 and pixel_samples.ndim < 5:
+        if not self.not_video and self.latent_dim == 3 and pixel_samples.ndim < 5:
             pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0)
         try:
             memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype)
diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index aa953b462..ba1b8c313 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -1321,6 +1321,23 @@ class HunyuanImage21(HunyuanVideo):
         hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
         return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImageTokenizer, comfy.text_encoders.hunyuan_image.te(**hunyuan_detect))
 
-models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ACEStep, Omnigen2, QwenImage]
+class HunyuanImage21Refiner(HunyuanVideo):  # Model config for the refiner; overrides unet_config, sampling_settings, latent_format and get_model
+    unet_config = {
+        "image_model": "hunyuan_video",
+        "patch_size": [1, 1, 1],
+        "vec_in_dim": None,
+    }
+
+    sampling_settings = {
+        "shift": 1.0,
+    }
+
+    latent_format = latent_formats.HunyuanImage21Refiner
+
+    def get_model(self, state_dict, prefix="", device=None):  # state_dict and prefix are accepted but not used here
+        out = model_base.HunyuanImage21Refiner(self, device=device)
+        return out
+
+models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ACEStep, Omnigen2, QwenImage]
 
 models += [SVD_img2vid]
diff --git a/comfy_extras/nodes_hunyuan.py b/comfy_extras/nodes_hunyuan.py
index ce031ceb2..351a7e2cb 100644
--- a/comfy_extras/nodes_hunyuan.py
+++ b/comfy_extras/nodes_hunyuan.py
@@ -128,6 +128,28 @@ class EmptyHunyuanImageLatent:
         latent = torch.zeros([batch_size, 64, height // 32, width // 32], device=comfy.model_management.intermediate_device())
         return ({"samples":latent}, )
 
+class HunyuanRefinerLatent:  # Node: attaches a latent to both conditionings and returns a zero latent for sampling
+    @classmethod
+    def INPUT_TYPES(s):
+        return {"required": {"positive": ("CONDITIONING", ),
+                             "negative": ("CONDITIONING", ),
+                             "latent": ("LATENT", ),
+                             }}
+
+    RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
+    RETURN_NAMES = ("positive", "negative", "latent")
+
+    FUNCTION = "execute"
+
+    def execute(self, positive, negative, latent):
+        latent = latent["samples"]  # unwrap the tensor from the LATENT dict
+
+        positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": latent})  # same key that model_base.HunyuanImage21Refiner.concat_cond reads
+        negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": latent})
+        out_latent = {}
+        out_latent["samples"] = torch.zeros([latent.shape[0], 32, latent.shape[-3], latent.shape[-2], latent.shape[-1]], device=comfy.model_management.intermediate_device())  # zeros with 32 channels; batch and last three dims copied from the input, so it needs at least three trailing dims
+        return (positive, negative, out_latent)
+
 
 NODE_CLASS_MAPPINGS = {
     "CLIPTextEncodeHunyuanDiT": CLIPTextEncodeHunyuanDiT,
@@ -135,4 +157,5 @@ NODE_CLASS_MAPPINGS = {
     "EmptyHunyuanLatentVideo": EmptyHunyuanLatentVideo,
     "HunyuanImageToVideo": HunyuanImageToVideo,
     "EmptyHunyuanImageLatent": EmptyHunyuanImageLatent,
+    "HunyuanRefinerLatent": HunyuanRefinerLatent,
 }

From 15ec9ea958d1c5d374add598b571a585541d4863 Mon Sep 17 00:00:00 2001
From: Jedrzej Kosinski <kosinkadink1@gmail.com>
Date: Thu, 11 Sep 2025 21:44:20 -0700
Subject: [PATCH 09/10] Add Output to V3 Combo type to match what is possible
 with V1 (#9813)

---
 comfy_api/latest/_io.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py
index f770109d5..4826818df 100644
--- a/comfy_api/latest/_io.py
+++ b/comfy_api/latest/_io.py
@@ -331,7 +331,7 @@ class String(ComfyTypeIO):
             })
 
 @comfytype(io_type="COMBO")
-class Combo(ComfyTypeI):
+class Combo(ComfyTypeIO):
     Type = str
     class Input(WidgetInput):
         """Combo input (dropdown)."""
@@ -360,6 +360,14 @@ class Combo(ComfyTypeI):
                 "remote": self.remote.as_dict() if self.remote else None,
             })
 
+    class Output(Output):  # Combo output whose io_type is its list of options
+        def __init__(self, id: str=None, display_name: str=None, options: list[str]=None, tooltip: str=None, is_output_list=False):
+            super().__init__(id, display_name, tooltip, is_output_list)  # base arguments are passed positionally; options is kept on this subclass only
+            self.options = options if options is not None else []  # a fresh list per instance when no options are given
+
+        @property
+        def io_type(self):
+            return self.options  # the options list itself is returned as the io_type
 
 @comfytype(io_type="COMBO")
 class MultiCombo(ComfyTypeI):

From d6b977b2e680e98ad18a37ee13783da4f30e15f4 Mon Sep 17 00:00:00 2001
From: Benjamin Lu <benjaminlu1107@gmail.com>
Date: Thu, 11 Sep 2025 21:46:01 -0700
Subject: [PATCH 10/10] Bump frontend to 1.26.11 (#9809)

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 0e21967ef..de5af5fac 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-comfyui-frontend-package==1.25.11
+comfyui-frontend-package==1.26.11
 comfyui-workflow-templates==0.1.81
 comfyui-embedded-docs==0.2.6
 torch