From c4a8cf60ab5d6eaf052b7a08f5ee97104acf7a2f Mon Sep 17 00:00:00 2001
From: AustinMroz
Date: Tue, 30 Sep 2025 22:12:32 -0700
Subject: [PATCH 01/10] Bump frontend to 1.27.7 (#10133)

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 45d3e1607..588c5dcf0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-comfyui-frontend-package==1.26.13
+comfyui-frontend-package==1.27.7
 comfyui-workflow-templates==0.1.91
 comfyui-embedded-docs==0.2.6
 torch

From 638097829d2352a1c78ab4fbb1e028d1e7cff012 Mon Sep 17 00:00:00 2001
From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com>
Date: Wed, 1 Oct 2025 09:00:22 +0300
Subject: [PATCH 02/10] convert nodes_audio_encoder.py to V3 schema (#10123)

---
 comfy_api/latest/_io.py             |  1 +
 comfy_extras/nodes_audio_encoder.py | 68 ++++++++++++++++++-----------
 2 files changed, 44 insertions(+), 25 deletions(-)

diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py
index 4826818df..2d95cffd6 100644
--- a/comfy_api/latest/_io.py
+++ b/comfy_api/latest/_io.py
@@ -1605,6 +1605,7 @@ class _IO:
     Model = Model
     ClipVision = ClipVision
     ClipVisionOutput = ClipVisionOutput
+    AudioEncoder = AudioEncoder
     AudioEncoderOutput = AudioEncoderOutput
     StyleModel = StyleModel
     Gligen = Gligen
diff --git a/comfy_extras/nodes_audio_encoder.py b/comfy_extras/nodes_audio_encoder.py
index 39a140fef..13aacd41a 100644
--- a/comfy_extras/nodes_audio_encoder.py
+++ b/comfy_extras/nodes_audio_encoder.py
@@ -1,44 +1,62 @@
 import folder_paths
 import comfy.audio_encoders.audio_encoders
 import comfy.utils
+from typing_extensions import override
+from comfy_api.latest import ComfyExtension, io
 
 
-class AudioEncoderLoader:
+class AudioEncoderLoader(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required": { "audio_encoder_name": (folder_paths.get_filename_list("audio_encoders"), ),
-                             }}
-    RETURN_TYPES = ("AUDIO_ENCODER",)
-    FUNCTION = "load_model"
+    def define_schema(cls) -> io.Schema:
+        return io.Schema(
+            node_id="AudioEncoderLoader",
+            category="loaders",
+            inputs=[
+                io.Combo.Input(
+                    "audio_encoder_name",
+                    options=folder_paths.get_filename_list("audio_encoders"),
+                ),
+            ],
+            outputs=[io.AudioEncoder.Output()],
+        )
 
-    CATEGORY = "loaders"
-
-    def load_model(self, audio_encoder_name):
+    @classmethod
+    def execute(cls, audio_encoder_name) -> io.NodeOutput:
         audio_encoder_name = folder_paths.get_full_path_or_raise("audio_encoders", audio_encoder_name)
         sd = comfy.utils.load_torch_file(audio_encoder_name, safe_load=True)
         audio_encoder = comfy.audio_encoders.audio_encoders.load_audio_encoder_from_sd(sd)
         if audio_encoder is None:
             raise RuntimeError("ERROR: audio encoder file is invalid and does not contain a valid model.")
-        return (audio_encoder,)
+        return io.NodeOutput(audio_encoder)
 
 
-class AudioEncoderEncode:
+class AudioEncoderEncode(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required": { "audio_encoder": ("AUDIO_ENCODER",),
-                              "audio": ("AUDIO",),
-                             }}
-    RETURN_TYPES = ("AUDIO_ENCODER_OUTPUT",)
-    FUNCTION = "encode"
+    def define_schema(cls) -> io.Schema:
+        return io.Schema(
+            node_id="AudioEncoderEncode",
+            category="conditioning",
+            inputs=[
+                io.AudioEncoder.Input("audio_encoder"),
+                io.Audio.Input("audio"),
+            ],
+            outputs=[io.AudioEncoderOutput.Output()],
+        )
 
-    CATEGORY = "conditioning"
-
-    def encode(self, audio_encoder, audio):
+    @classmethod
+    def execute(cls, audio_encoder, audio) -> io.NodeOutput:
         output = audio_encoder.encode_audio(audio["waveform"], audio["sample_rate"])
-        return (output,)
+        return io.NodeOutput(output)
 
 
-NODE_CLASS_MAPPINGS = {
-    "AudioEncoderLoader": AudioEncoderLoader,
-    "AudioEncoderEncode": AudioEncoderEncode,
-}
+class AudioEncoder(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [
+            AudioEncoderLoader,
+            AudioEncoderEncode,
+        ]
+
+
+async def comfy_entrypoint() -> AudioEncoder:
+    return AudioEncoder()

From 7eb7160db487feb891ceabdf985b09f9a8091869 Mon Sep 17 00:00:00 2001
From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com>
Date: Wed, 1 Oct 2025 22:16:59 +0300
Subject: [PATCH 03/10] convert nodes_gits.py to V3 schema (#9949)

---
 comfy_extras/nodes_gits.py | 49 ++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 18 deletions(-)

diff --git a/comfy_extras/nodes_gits.py b/comfy_extras/nodes_gits.py
index 47b1dd049..25367560a 100644
--- a/comfy_extras/nodes_gits.py
+++ b/comfy_extras/nodes_gits.py
@@ -1,6 +1,8 @@
 # from https://github.com/zju-pi/diff-sampler/tree/main/gits-main
 import numpy as np
 import torch
+from typing_extensions import override
+from comfy_api.latest import ComfyExtension, io
 
 def loglinear_interp(t_steps, num_steps):
     """
@@ -333,25 +335,28 @@ NOISE_LEVELS = {
     ],
 }
 
-class GITSScheduler:
+class GITSScheduler(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"coeff": ("FLOAT", {"default": 1.20, "min": 0.80, "max": 1.50, "step": 0.05}),
-                     "steps": ("INT", {"default": 10, "min": 2, "max": 1000}),
-                     "denoise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
-                     }
-                }
-    RETURN_TYPES = ("SIGMAS",)
-    CATEGORY = "sampling/custom_sampling/schedulers"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="GITSScheduler",
+            category="sampling/custom_sampling/schedulers",
+            inputs=[
+                io.Float.Input("coeff", default=1.20, min=0.80, max=1.50, step=0.05),
+                io.Int.Input("steps", default=10, min=2, max=1000),
+                io.Float.Input("denoise", default=1.0, min=0.0, max=1.0, step=0.01),
+            ],
+            outputs=[
+                io.Sigmas.Output(),
+            ],
+        )
 
-    FUNCTION = "get_sigmas"
-
-    def get_sigmas(self, coeff, steps, denoise):
+    @classmethod
+    def execute(cls, coeff, steps, denoise):
         total_steps = steps
         if denoise < 1.0:
             if denoise <= 0.0:
-                return (torch.FloatTensor([]),)
+                return io.NodeOutput(torch.FloatTensor([]))
             total_steps = round(steps * denoise)
 
         if steps <= 20:
@@ -362,8 +367,16 @@ class GITSScheduler:
         sigmas = sigmas[-(total_steps + 1):]
         sigmas[-1] = 0
 
-        return (torch.FloatTensor(sigmas), )
+        return io.NodeOutput(torch.FloatTensor(sigmas))
 
-NODE_CLASS_MAPPINGS = {
-    "GITSScheduler": GITSScheduler,
-}
+
+class GITSSchedulerExtension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [
+            GITSScheduler,
+        ]
+
+
+async def comfy_entrypoint() -> GITSSchedulerExtension:
+    return GITSSchedulerExtension()

From e0210ce0a7140e0c61bce7fdb964b5e5e8d31619 Mon Sep 17 00:00:00 2001
From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com>
Date: Wed, 1 Oct 2025 22:17:33 +0300
Subject: [PATCH 04/10] convert nodes_differential_diffusion.py to V3 schema (#10056)

---
 comfy_extras/nodes_differential_diffusion.py | 69 ++++++++++++--------
 1 file changed, 40 insertions(+), 29 deletions(-)

diff --git a/comfy_extras/nodes_differential_diffusion.py b/comfy_extras/nodes_differential_diffusion.py
index 255ac420d..6dfdf466c 100644
--- a/comfy_extras/nodes_differential_diffusion.py
+++ b/comfy_extras/nodes_differential_diffusion.py
@@ -1,34 +1,41 @@
 # code adapted from https://github.com/exx8/differential-diffusion
+from typing_extensions import override
+
 import torch
 
+from comfy_api.latest import ComfyExtension, io
+
 
-class DifferentialDiffusion():
+class DifferentialDiffusion(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {
-            "required": {
-                "model": ("MODEL", ),
-            },
-            "optional": {
-                "strength": ("FLOAT", {
-                    "default": 1.0,
-                    "min": 0.0,
-                    "max": 1.0,
-                    "step": 0.01,
-                }),
-            }
-        }
-    RETURN_TYPES = ("MODEL",)
-    FUNCTION = "apply"
-    CATEGORY = "_for_testing"
-    INIT = False
+    def define_schema(cls):
+        return io.Schema(
+            node_id="DifferentialDiffusion",
+            display_name="Differential Diffusion",
+            category="_for_testing",
+            inputs=[
+                io.Model.Input("model"),
+                io.Float.Input(
+                    "strength",
+                    default=1.0,
+                    min=0.0,
+                    max=1.0,
+                    step=0.01,
+                    optional=True,
+                ),
+            ],
+            outputs=[io.Model.Output()],
+            is_experimental=True,
+        )
 
-    def apply(self, model, strength=1.0):
+    @classmethod
+    def execute(cls, model, strength=1.0) -> io.NodeOutput:
         model = model.clone()
-        model.set_model_denoise_mask_function(lambda *args, **kwargs: self.forward(*args, **kwargs, strength=strength))
-        return (model, )
+        model.set_model_denoise_mask_function(lambda *args, **kwargs: cls.forward(*args, **kwargs, strength=strength))
+        return io.NodeOutput(model)
 
-    def forward(self, sigma: torch.Tensor, denoise_mask: torch.Tensor, extra_options: dict, strength: float):
+    @classmethod
+    def forward(cls, sigma: torch.Tensor, denoise_mask: torch.Tensor, extra_options: dict, strength: float):
         model = extra_options["model"]
         step_sigmas = extra_options["sigmas"]
         sigma_to = model.inner_model.model_sampling.sigma_min
@@ -53,9 +60,13 @@ class DifferentialDiffusion():
         return binary_mask
 
 
-NODE_CLASS_MAPPINGS = {
-    "DifferentialDiffusion": DifferentialDiffusion,
-}
-NODE_DISPLAY_NAME_MAPPINGS = {
-    "DifferentialDiffusion": "Differential Diffusion",
-}
+class DifferentialDiffusionExtension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [
+            DifferentialDiffusion,
+        ]
+
+
+async def comfy_entrypoint() -> DifferentialDiffusionExtension:
+    return DifferentialDiffusionExtension()

From 3af1881455fb0c44c3030b2d61b79302933386d2 Mon Sep 17 00:00:00 2001
From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com>
Date: Wed, 1 Oct 2025 22:18:04 +0300
Subject: [PATCH 05/10] convert nodes_optimalsteps.py to V3 schema (#10074)

---
 comfy_extras/nodes_optimalsteps.py | 52 +++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 19 deletions(-)

diff --git a/comfy_extras/nodes_optimalsteps.py b/comfy_extras/nodes_optimalsteps.py
index e7c851ca2..73f0104d8 100644
--- a/comfy_extras/nodes_optimalsteps.py
+++ b/comfy_extras/nodes_optimalsteps.py
@@ -1,9 +1,12 @@
 # from https://github.com/bebebe666/OptimalSteps
-
 import numpy as np
 import torch
 
+from typing_extensions import override
+
+from comfy_api.latest import ComfyExtension, io
+
+
 def loglinear_interp(t_steps, num_steps):
     """
     Performs log-linear interpolation of a given array of decreasing numbers.
@@ -23,25 +26,28 @@ NOISE_LEVELS = {"FLUX": [0.9968, 0.9886, 0.9819, 0.975, 0.966, 0.9471, 0.9158, 0
                 "Chroma": [0.992, 0.99, 0.988, 0.985, 0.982, 0.978, 0.973, 0.968, 0.961, 0.953, 0.943, 0.931, 0.917, 0.9, 0.881, 0.858, 0.832, 0.802, 0.769, 0.731, 0.69, 0.646, 0.599, 0.55, 0.501, 0.451, 0.402, 0.355, 0.311, 0.27, 0.232, 0.199, 0.169, 0.143, 0.12, 0.101, 0.084, 0.07, 0.058, 0.048, 0.001],
 }
 
-class OptimalStepsScheduler:
+class OptimalStepsScheduler(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"model_type": (["FLUX", "Wan", "Chroma"], ),
-                     "steps": ("INT", {"default": 20, "min": 3, "max": 1000}),
-                     "denoise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
-                     }
-                }
-    RETURN_TYPES = ("SIGMAS",)
-    CATEGORY = "sampling/custom_sampling/schedulers"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="OptimalStepsScheduler",
+            category="sampling/custom_sampling/schedulers",
+            inputs=[
+                io.Combo.Input("model_type", options=["FLUX", "Wan", "Chroma"]),
+                io.Int.Input("steps", default=20, min=3, max=1000),
+                io.Float.Input("denoise", default=1.0, min=0.0, max=1.0, step=0.01),
+            ],
+            outputs=[
+                io.Sigmas.Output(),
+            ],
+        )
 
-    FUNCTION = "get_sigmas"
-
-    def get_sigmas(self, model_type, steps, denoise):
+    @classmethod
+    def execute(cls, model_type, steps, denoise) -> io.NodeOutput:
         total_steps = steps
         if denoise < 1.0:
             if denoise <= 0.0:
-                return (torch.FloatTensor([]),)
+                return io.NodeOutput(torch.FloatTensor([]))
             total_steps = round(steps * denoise)
 
         sigmas = NOISE_LEVELS[model_type][:]
@@ -50,8 +56,16 @@ class OptimalStepsScheduler:
         sigmas = sigmas[-(total_steps + 1):]
         sigmas[-1] = 0
 
-        return (torch.FloatTensor(sigmas), )
+        return io.NodeOutput(torch.FloatTensor(sigmas))
 
-NODE_CLASS_MAPPINGS = {
-    "OptimalStepsScheduler": OptimalStepsScheduler,
-}
+
+class OptimalStepsExtension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [
+            OptimalStepsScheduler,
+        ]
+
+
+async def comfy_entrypoint() -> OptimalStepsExtension:
+    return OptimalStepsExtension()

From 11bab7be76d0bfdb326e8aea53cdfebd99b42cc5 Mon Sep 17 00:00:00 2001
From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com>
Date: Wed, 1 Oct 2025 22:18:49 +0300
Subject: [PATCH 06/10] convert nodes_pag.py to V3 schema (#10080)

---
 comfy_extras/nodes_pag.py | 49 +++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 18 deletions(-)

diff --git a/comfy_extras/nodes_pag.py b/comfy_extras/nodes_pag.py
index eb28196f4..79fea5f0c 100644
--- a/comfy_extras/nodes_pag.py
+++ b/comfy_extras/nodes_pag.py
@@ -3,25 +3,30 @@
 #My modified one here is more basic but has less chances of breaking with ComfyUI updates.
+from typing_extensions import override
+
 import comfy.model_patcher
 import comfy.samplers
+from comfy_api.latest import ComfyExtension, io
 
-class PerturbedAttentionGuidance:
+
+class PerturbedAttentionGuidance(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {
-            "required": {
-                "model": ("MODEL",),
-                "scale": ("FLOAT", {"default": 3.0, "min": 0.0, "max": 100.0, "step": 0.01, "round": 0.01}),
-            }
-        }
+    def define_schema(cls):
+        return io.Schema(
+            node_id="PerturbedAttentionGuidance",
+            category="model_patches/unet",
+            inputs=[
+                io.Model.Input("model"),
+                io.Float.Input("scale", default=3.0, min=0.0, max=100.0, step=0.01, round=0.01),
+            ],
+            outputs=[
+                io.Model.Output(),
+            ],
+        )
 
-    RETURN_TYPES = ("MODEL",)
-    FUNCTION = "patch"
-
-    CATEGORY = "model_patches/unet"
-
-    def patch(self, model, scale):
+    @classmethod
+    def execute(cls, model, scale) -> io.NodeOutput:
         unet_block = "middle"
         unet_block_id = 0
         m = model.clone()
@@ -49,8 +54,16 @@ class PerturbedAttentionGuidance:
 
         m.set_model_sampler_post_cfg_function(post_cfg_function)
 
-        return (m,)
+        return io.NodeOutput(m)
 
-NODE_CLASS_MAPPINGS = {
-    "PerturbedAttentionGuidance": PerturbedAttentionGuidance,
-}
+
+class PAGExtension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [
+            PerturbedAttentionGuidance,
+        ]
+
+
+async def comfy_entrypoint() -> PAGExtension:
+    return PAGExtension()

From d9c0a4053d955c7fd3400be07001bc4e774591e1 Mon Sep 17 00:00:00 2001
From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com>
Date: Wed, 1 Oct 2025 22:19:56 +0300
Subject: [PATCH 07/10] convert nodes_lt.py to V3 schema (#10084)

---
 comfy_extras/nodes_lt.py | 412 ++++++++++++++++++++++-----------------
 1 file changed, 228 insertions(+), 184 deletions(-)

diff --git a/comfy_extras/nodes_lt.py b/comfy_extras/nodes_lt.py
index f82337a67..b51d15804 100644
--- a/comfy_extras/nodes_lt.py
+++ b/comfy_extras/nodes_lt.py
@@ -1,4 +1,3 @@
-import io
 import nodes
 import node_helpers
 import torch
@@ -8,46 +7,60 @@ import comfy.utils
 import math
 import numpy as np
 import av
+from io import BytesIO
+from typing_extensions import override
 from comfy.ldm.lightricks.symmetric_patchifier import SymmetricPatchifier, latent_to_pixel_coords
+from comfy_api.latest import ComfyExtension, io
 
-class EmptyLTXVLatentVideo:
+
+class EmptyLTXVLatentVideo(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required": { "width": ("INT", {"default": 768, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}),
-                              "height": ("INT", {"default": 512, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}),
-                              "length": ("INT", {"default": 97, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 8}),
-                              "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}}
-    RETURN_TYPES = ("LATENT",)
-    FUNCTION = "generate"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="EmptyLTXVLatentVideo",
+            category="latent/video/ltxv",
+            inputs=[
+                io.Int.Input("width", default=768, min=64, max=nodes.MAX_RESOLUTION, step=32),
+                io.Int.Input("height", default=512, min=64, max=nodes.MAX_RESOLUTION, step=32),
+                io.Int.Input("length", default=97, min=1, max=nodes.MAX_RESOLUTION, step=8),
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+            ],
+            outputs=[
+                io.Latent.Output(),
+            ],
+        )
 
-    CATEGORY = "latent/video/ltxv"
-
-    def generate(self, width, height, length, batch_size=1):
+    @classmethod
+    def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput:
         latent = torch.zeros([batch_size, 128, ((length - 1) // 8) + 1, height // 32, width // 32], device=comfy.model_management.intermediate_device())
-        return ({"samples": latent}, )
+        return io.NodeOutput({"samples": latent})
 
 
-class LTXVImgToVideo:
+class LTXVImgToVideo(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"positive": ("CONDITIONING", ),
-                             "negative": ("CONDITIONING", ),
-                             "vae": ("VAE",),
-                             "image": ("IMAGE",),
-                             "width": ("INT", {"default": 768, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}),
-                             "height": ("INT", {"default": 512, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}),
-                             "length": ("INT", {"default": 97, "min": 9, "max": nodes.MAX_RESOLUTION, "step": 8}),
-                             "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
-                             "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0}),
-                             }}
+    def define_schema(cls):
+        return io.Schema(
+            node_id="LTXVImgToVideo",
+            category="conditioning/video_models",
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Vae.Input("vae"),
+                io.Image.Input("image"),
+                io.Int.Input("width", default=768, min=64, max=nodes.MAX_RESOLUTION, step=32),
+                io.Int.Input("height", default=512, min=64, max=nodes.MAX_RESOLUTION, step=32),
+                io.Int.Input("length", default=97, min=9, max=nodes.MAX_RESOLUTION, step=8),
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+                io.Float.Input("strength", default=1.0, min=0.0, max=1.0),
+            ],
+            outputs=[
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
+                io.Latent.Output(display_name="latent"),
+            ],
+        )
 
-    RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
-    RETURN_NAMES = ("positive", "negative", "latent")
-
-    CATEGORY = "conditioning/video_models"
-    FUNCTION = "generate"
-
-    def generate(self, positive, negative, image, vae, width, height, length, batch_size, strength):
+    @classmethod
+    def execute(cls, positive, negative, image, vae, width, height, length, batch_size, strength) -> io.NodeOutput:
         pixels = comfy.utils.common_upscale(image.movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
         encode_pixels = pixels[:, :, :, :3]
         t = vae.encode(encode_pixels)
@@ -62,7 +75,7 @@ class LTXVImgToVideo:
         )
         conditioning_latent_frames_mask[:, :, :t.shape[2]] = 1.0 - strength
 
-        return (positive, negative, {"samples": latent, "noise_mask": conditioning_latent_frames_mask}, )
+        return io.NodeOutput(positive, negative, {"samples": latent, "noise_mask": conditioning_latent_frames_mask})
 
 
 def conditioning_get_any_value(conditioning, key, default=None):
@@ -93,35 +106,46 @@ def get_keyframe_idxs(cond):
     num_keyframes = torch.unique(keyframe_idxs[:, 0]).shape[0]
     return keyframe_idxs, num_keyframes
 
-class LTXVAddGuide:
+class LTXVAddGuide(io.ComfyNode):
+    NUM_PREFIX_FRAMES = 2
+    PATCHIFIER = SymmetricPatchifier(1)
+
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"positive": ("CONDITIONING", ),
-                             "negative": ("CONDITIONING", ),
-                             "vae": ("VAE",),
-                             "latent": ("LATENT",),
-                             "image": ("IMAGE", {"tooltip": "Image or video to condition the latent video on. Must be 8*n + 1 frames."
-                                                 "If the video is not 8*n + 1 frames, it will be cropped to the nearest 8*n + 1 frames."}),
-                             "frame_idx": ("INT", {"default": 0, "min": -9999, "max": 9999,
-                                                   "tooltip": "Frame index to start the conditioning at. For single-frame images or "
-                                                              "videos with 1-8 frames, any frame_idx value is acceptable. For videos with 9+ "
-                                                              "frames, frame_idx must be divisible by 8, otherwise it will be rounded down to "
-                                                              "the nearest multiple of 8. Negative values are counted from the end of the video."}),
-                             "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
-                             }
-                }
+    def define_schema(cls):
+        return io.Schema(
+            node_id="LTXVAddGuide",
+            category="conditioning/video_models",
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Vae.Input("vae"),
+                io.Latent.Input("latent"),
+                io.Image.Input(
+                    "image",
+                    tooltip="Image or video to condition the latent video on. Must be 8*n + 1 frames. "
+                    "If the video is not 8*n + 1 frames, it will be cropped to the nearest 8*n + 1 frames.",
+                ),
+                io.Int.Input(
+                    "frame_idx",
+                    default=0,
+                    min=-9999,
+                    max=9999,
+                    tooltip="Frame index to start the conditioning at. "
+                    "For single-frame images or videos with 1-8 frames, any frame_idx value is acceptable. "
+                    "For videos with 9+ frames, frame_idx must be divisible by 8, otherwise it will be rounded "
+                    "down to the nearest multiple of 8. Negative values are counted from the end of the video.",
+                ),
+                io.Float.Input("strength", default=1.0, min=0.0, max=1.0, step=0.01),
+            ],
+            outputs=[
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
+                io.Latent.Output(display_name="latent"),
+            ],
+        )
 
-    RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
-    RETURN_NAMES = ("positive", "negative", "latent")
-
-    CATEGORY = "conditioning/video_models"
-    FUNCTION = "generate"
-
-    def __init__(self):
-        self._num_prefix_frames = 2
-        self._patchifier = SymmetricPatchifier(1)
-
-    def encode(self, vae, latent_width, latent_height, images, scale_factors):
+    @classmethod
+    def encode(cls, vae, latent_width, latent_height, images, scale_factors):
         time_scale_factor, width_scale_factor, height_scale_factor = scale_factors
         images = images[:(images.shape[0] - 1) // time_scale_factor * time_scale_factor + 1]
         pixels = comfy.utils.common_upscale(images.movedim(-1, 1), latent_width * width_scale_factor, latent_height * height_scale_factor, "bilinear", crop="disabled").movedim(1, -1)
@@ -129,7 +153,8 @@ class LTXVAddGuide:
         t = vae.encode(encode_pixels)
         return encode_pixels, t
 
-    def get_latent_index(self, cond, latent_length, guide_length, frame_idx, scale_factors):
+    @classmethod
+    def get_latent_index(cls, cond, latent_length, guide_length, frame_idx, scale_factors):
         time_scale_factor, _, _ = scale_factors
         _, num_keyframes = get_keyframe_idxs(cond)
         latent_count = latent_length - num_keyframes
@@ -141,9 +166,10 @@ class LTXVAddGuide:
 
         return frame_idx, latent_idx
 
-    def add_keyframe_index(self, cond, frame_idx, guiding_latent, scale_factors):
+    @classmethod
+    def add_keyframe_index(cls, cond, frame_idx, guiding_latent, scale_factors):
         keyframe_idxs, _ = get_keyframe_idxs(cond)
-        _, latent_coords = self._patchifier.patchify(guiding_latent)
+        _, latent_coords = cls.PATCHIFIER.patchify(guiding_latent)
         pixel_coords = latent_to_pixel_coords(latent_coords, scale_factors, causal_fix=frame_idx == 0)  # we need the causal fix only if we're placing the new latents at index 0
         pixel_coords[:, 0] += frame_idx
         if keyframe_idxs is None:
@@ -152,8 +178,9 @@ class LTXVAddGuide:
             keyframe_idxs = torch.cat([keyframe_idxs, pixel_coords], dim=2)
         return node_helpers.conditioning_set_values(cond, {"keyframe_idxs": keyframe_idxs})
 
-    def append_keyframe(self, positive, negative, frame_idx, latent_image, noise_mask, guiding_latent, strength, scale_factors):
-        _, latent_idx = self.get_latent_index(
+    @classmethod
+    def append_keyframe(cls, positive, negative, frame_idx, latent_image, noise_mask, guiding_latent, strength, scale_factors):
+        _, latent_idx = cls.get_latent_index(
             cond=positive,
             latent_length=latent_image.shape[2],
             guide_length=guiding_latent.shape[2],
@@ -162,8 +189,8 @@ class LTXVAddGuide:
         )
 
         noise_mask[:, :, latent_idx:latent_idx + guiding_latent.shape[2]] = 1.0
-        positive = self.add_keyframe_index(positive, frame_idx, guiding_latent, scale_factors)
-        negative = self.add_keyframe_index(negative, frame_idx, guiding_latent, scale_factors)
+        positive = cls.add_keyframe_index(positive, frame_idx, guiding_latent, scale_factors)
+        negative = cls.add_keyframe_index(negative, frame_idx, guiding_latent, scale_factors)
 
         mask = torch.full(
             (noise_mask.shape[0], 1, guiding_latent.shape[2], noise_mask.shape[3], noise_mask.shape[4]),
@@ -176,7 +203,8 @@ class LTXVAddGuide:
         noise_mask = torch.cat([noise_mask, mask], dim=2)
         return positive, negative, latent_image, noise_mask
 
-    def replace_latent_frames(self, latent_image, noise_mask, guiding_latent, latent_idx, strength):
+    @classmethod
+    def replace_latent_frames(cls, latent_image, noise_mask, guiding_latent, latent_idx, strength):
         cond_length = guiding_latent.shape[2]
         assert latent_image.shape[2] >= latent_idx + cond_length, "Conditioning frames exceed the length of the latent sequence."
@@ -195,20 +223,21 @@ class LTXVAddGuide:
 
         return latent_image, noise_mask
 
-    def generate(self, positive, negative, vae, latent, image, frame_idx, strength):
+    @classmethod
+    def execute(cls, positive, negative, vae, latent, image, frame_idx, strength) -> io.NodeOutput:
         scale_factors = vae.downscale_index_formula
         latent_image = latent["samples"]
         noise_mask = get_noise_mask(latent)
 
         _, _, latent_length, latent_height, latent_width = latent_image.shape
-        image, t = self.encode(vae, latent_width, latent_height, image, scale_factors)
+        image, t = cls.encode(vae, latent_width, latent_height, image, scale_factors)
 
-        frame_idx, latent_idx = self.get_latent_index(positive, latent_length, len(image), frame_idx, scale_factors)
+        frame_idx, latent_idx = cls.get_latent_index(positive, latent_length, len(image), frame_idx, scale_factors)
         assert latent_idx + t.shape[2] <= latent_length, "Conditioning frames exceed the length of the latent sequence."
-        num_prefix_frames = min(self._num_prefix_frames, t.shape[2])
+        num_prefix_frames = min(cls.NUM_PREFIX_FRAMES, t.shape[2])
 
-        positive, negative, latent_image, noise_mask = self.append_keyframe(
+        positive, negative, latent_image, noise_mask = cls.append_keyframe(
             positive,
             negative,
             frame_idx,
@@ -223,9 +252,9 @@ class LTXVAddGuide:
         t = t[:, :, num_prefix_frames:]
         if t.shape[2] == 0:
-            return (positive, negative, {"samples": latent_image, "noise_mask": noise_mask},)
+            return io.NodeOutput(positive, negative, {"samples": latent_image, "noise_mask": noise_mask})
 
-        latent_image, noise_mask = self.replace_latent_frames(
+        latent_image, noise_mask = cls.replace_latent_frames(
             latent_image,
             noise_mask,
             t,
@@ -233,34 +262,35 @@ class LTXVAddGuide:
             strength,
         )
 
-        return (positive, negative, {"samples": latent_image, "noise_mask": noise_mask},)
+        return io.NodeOutput(positive, negative, {"samples": latent_image, "noise_mask": noise_mask})
 
 
-class LTXVCropGuides:
+class LTXVCropGuides(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"positive": ("CONDITIONING", ),
-                             "negative": ("CONDITIONING", ),
-                             "latent": ("LATENT",),
-                             }
-                }
+    def define_schema(cls):
+        return io.Schema(
+            node_id="LTXVCropGuides",
+            category="conditioning/video_models",
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Latent.Input("latent"),
+            ],
+            outputs=[
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
+                io.Latent.Output(display_name="latent"),
+            ],
+        )
 
-    RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
-    RETURN_NAMES = ("positive", "negative", "latent")
-
-    CATEGORY = "conditioning/video_models"
-    FUNCTION = "crop"
-
-    def __init__(self):
-        self._patchifier = SymmetricPatchifier(1)
-
-    def crop(self, positive, negative, latent):
+    @classmethod
+    def execute(cls, positive, negative, latent) -> io.NodeOutput:
         latent_image = latent["samples"].clone()
         noise_mask = get_noise_mask(latent)
 
         _, num_keyframes = get_keyframe_idxs(positive)
         if num_keyframes == 0:
-            return (positive, negative, {"samples": latent_image, "noise_mask": noise_mask},)
+            return io.NodeOutput(positive, negative, {"samples": latent_image, "noise_mask": noise_mask},)
 
         latent_image = latent_image[:, :, :-num_keyframes]
         noise_mask = noise_mask[:, :, :-num_keyframes]
@@ -268,44 +298,52 @@ class LTXVCropGuides:
         positive = node_helpers.conditioning_set_values(positive, {"keyframe_idxs": None})
         negative = node_helpers.conditioning_set_values(negative, {"keyframe_idxs": None})
 
-        return (positive, negative, {"samples": latent_image, "noise_mask": noise_mask},)
+        return io.NodeOutput(positive, negative, {"samples": latent_image, "noise_mask": noise_mask})
 
 
-class LTXVConditioning:
+class LTXVConditioning(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"positive": ("CONDITIONING", ),
-                             "negative": ("CONDITIONING", ),
-                             "frame_rate": ("FLOAT", {"default": 25.0, "min": 0.0, "max": 1000.0, "step": 0.01}),
-                             }}
-    RETURN_TYPES = ("CONDITIONING", "CONDITIONING")
-    RETURN_NAMES = ("positive", "negative")
-    FUNCTION = "append"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="LTXVConditioning",
+            category="conditioning/video_models",
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Float.Input("frame_rate", default=25.0, min=0.0, max=1000.0, step=0.01),
+            ],
+            outputs=[
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
+            ],
+        )
 
-    CATEGORY = "conditioning/video_models"
-
-    def append(self, positive, negative, frame_rate):
+    @classmethod
+    def execute(cls, positive, negative, frame_rate) -> io.NodeOutput:
         positive = node_helpers.conditioning_set_values(positive, {"frame_rate": frame_rate})
         negative = node_helpers.conditioning_set_values(negative, {"frame_rate": frame_rate})
-        return (positive, negative)
+        return io.NodeOutput(positive, negative)
 
 
-class ModelSamplingLTXV:
+class ModelSamplingLTXV(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required": { "model": ("MODEL",),
-                              "max_shift": ("FLOAT", {"default": 2.05, "min": 0.0, "max": 100.0, "step":0.01}),
-                              "base_shift": ("FLOAT", {"default": 0.95, "min": 0.0, "max": 100.0, "step":0.01}),
-                              },
-                "optional": {"latent": ("LATENT",), }
-                }
+    def define_schema(cls):
+        return io.Schema(
+            node_id="ModelSamplingLTXV",
+            category="advanced/model",
+            inputs=[
+                io.Model.Input("model"),
+                io.Float.Input("max_shift", default=2.05, min=0.0, max=100.0, step=0.01),
+                io.Float.Input("base_shift", default=0.95, min=0.0, max=100.0, step=0.01),
+                io.Latent.Input("latent", optional=True),
+            ],
+            outputs=[
+                io.Model.Output(),
+            ],
+        )
 
-    RETURN_TYPES = ("MODEL",)
-    FUNCTION = "patch"
-
-    CATEGORY = "advanced/model"
-
-    def patch(self, model, max_shift, base_shift, latent=None):
+    @classmethod
+    def execute(cls, model, max_shift, base_shift, latent=None) -> io.NodeOutput:
         m = model.clone()
 
         if latent is None:
@@ -329,37 +367,41 @@ class ModelSamplingLTXV:
         model_sampling.set_parameters(shift=shift)
         m.add_object_patch("model_sampling", model_sampling)
 
-        return (m, )
+        return io.NodeOutput(m)
 
 
-class LTXVScheduler:
+class LTXVScheduler(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"steps": ("INT", {"default": 20, "min": 1, "max": 10000}),
-                     "max_shift": ("FLOAT", {"default": 2.05, "min": 0.0, "max": 100.0, "step":0.01}),
-                     "base_shift": ("FLOAT", {"default": 0.95, "min": 0.0, "max": 100.0, "step":0.01}),
-                     "stretch": ("BOOLEAN", {
-                         "default": True,
-                         "tooltip": "Stretch the sigmas to be in the range [terminal, 1]."
-                     }),
-                     "terminal": (
-                         "FLOAT",
-                         {
-                             "default": 0.1, "min": 0.0, "max": 0.99, "step": 0.01,
-                             "tooltip": "The terminal value of the sigmas after stretching."
-                         },
-                     ),
-                     },
-                "optional": {"latent": ("LATENT",), }
-                }
+    def define_schema(cls):
+        return io.Schema(
+            node_id="LTXVScheduler",
+            category="sampling/custom_sampling/schedulers",
+            inputs=[
+                io.Int.Input("steps", default=20, min=1, max=10000),
+                io.Float.Input("max_shift", default=2.05, min=0.0, max=100.0, step=0.01),
+                io.Float.Input("base_shift", default=0.95, min=0.0, max=100.0, step=0.01),
+                io.Boolean.Input(
+                    id="stretch",
+                    default=True,
+                    tooltip="Stretch the sigmas to be in the range [terminal, 1].",
+                ),
+                io.Float.Input(
+                    id="terminal",
+                    default=0.1,
+                    min=0.0,
+                    max=0.99,
+                    step=0.01,
+                    tooltip="The terminal value of the sigmas after stretching.",
+                ),
+                io.Latent.Input("latent", optional=True),
+            ],
+            outputs=[
+                io.Sigmas.Output(),
+            ],
+        )
 
-    RETURN_TYPES = ("SIGMAS",)
-    CATEGORY = "sampling/custom_sampling/schedulers"
-
-    FUNCTION = "get_sigmas"
-
-    def get_sigmas(self, steps, max_shift, base_shift, stretch, terminal, latent=None):
+    @classmethod
+    def execute(cls, steps, max_shift, base_shift, stretch, terminal, latent=None) -> io.NodeOutput:
         if latent is None:
             tokens = 4096
         else:
@@ -389,7 +431,7 @@ class LTXVScheduler:
             stretched = 1.0 - (one_minus_z / scale_factor)
             sigmas[non_zero_mask] = stretched
 
-        return (sigmas,)
+        return io.NodeOutput(sigmas)
 
 def encode_single_frame(output_file, image_array: np.ndarray, crf):
     container = av.open(output_file, "w", format="mp4")
@@ -423,52 +465,54 @@ def preprocess(image: torch.Tensor, crf=29):
         return image
 
     image_array = (image[:(image.shape[0] // 2) * 2, :(image.shape[1] // 2) * 2] * 255.0).byte().cpu().numpy()
-    with io.BytesIO() as output_file:
+    with BytesIO() as output_file:
         encode_single_frame(output_file, image_array, crf)
         video_bytes = output_file.getvalue()
 
-    with io.BytesIO(video_bytes) as video_file:
+    with BytesIO(video_bytes) as video_file:
         image_array = decode_single_frame(video_file)
     tensor = torch.tensor(image_array, dtype=image.dtype, device=image.device) / 255.0
     return tensor
 
 
-class LTXVPreprocess:
+class LTXVPreprocess(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {
-            "required": {
-                "image": ("IMAGE",),
-                "img_compression": (
-                    "INT",
-                    {
-                        "default": 35,
-                        "min": 0,
-                        "max": 100,
-                        "tooltip": "Amount of compression to apply on image.",
-                    },
+    def define_schema(cls):
+        return io.Schema(
+            node_id="LTXVPreprocess",
+            category="image",
+            inputs=[
+                io.Image.Input("image"),
+                io.Int.Input(
+                    id="img_compression", default=35, min=0, max=100, tooltip="Amount of compression to apply on image."
                 ),
-            }
-        }
+            ],
+            outputs=[
+                io.Image.Output(display_name="output_image"),
+            ],
+        )
 
-    FUNCTION = "preprocess"
-    RETURN_TYPES = ("IMAGE",)
-    RETURN_NAMES = ("output_image",)
-    CATEGORY = "image"
-
-    def preprocess(self, image, img_compression):
+    @classmethod
+    def execute(cls, image, img_compression) -> io.NodeOutput:
         output_images = []
         for i in range(image.shape[0]):
             output_images.append(preprocess(image[i], img_compression))
-        return (torch.stack(output_images),)
+        return io.NodeOutput(torch.stack(output_images))
 
 
-NODE_CLASS_MAPPINGS = {
-    "EmptyLTXVLatentVideo": EmptyLTXVLatentVideo,
-    "LTXVImgToVideo": LTXVImgToVideo,
-    "ModelSamplingLTXV": ModelSamplingLTXV,
-    "LTXVConditioning": LTXVConditioning,
-    "LTXVScheduler": LTXVScheduler,
-    "LTXVAddGuide": LTXVAddGuide,
-    "LTXVPreprocess": LTXVPreprocess,
-    "LTXVCropGuides": LTXVCropGuides,
-}
+class LtxvExtension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [
+            EmptyLTXVLatentVideo,
+            LTXVImgToVideo,
+            ModelSamplingLTXV,
+            LTXVConditioning,
+            LTXVScheduler,
+            LTXVAddGuide,
+            LTXVPreprocess,
+            LTXVCropGuides,
+        ]
+
+
+async def comfy_entrypoint() -> LtxvExtension:
+    return LtxvExtension()

From e4f99b479a19730bea890567129f4032b4dd4787 Mon Sep 17 00:00:00 2001
From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com>
Date: Wed, 1 Oct 2025 22:20:30 +0300
Subject: [PATCH 08/10] convert nodes_ip2p.pt to V3 schema (#10097)

---
 comfy_extras/nodes_ip2p.py | 54 +++++++++++++++++++++++++-------------
 1 file changed, 36 insertions(+), 18 deletions(-)

diff --git a/comfy_extras/nodes_ip2p.py b/comfy_extras/nodes_ip2p.py
index c2e70a84c..78f29915d 100644
--- a/comfy_extras/nodes_ip2p.py
+++ b/comfy_extras/nodes_ip2p.py
@@ -1,21 +1,30 @@
 import torch
 
-class InstructPixToPixConditioning:
+from typing_extensions import override
+
+from comfy_api.latest import ComfyExtension, io
+
+
+class InstructPixToPixConditioning(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"positive": ("CONDITIONING", ),
-                             "negative": ("CONDITIONING", ),
-                             "vae": ("VAE", ),
-                             "pixels": ("IMAGE", ),
-                             }}
+    def define_schema(cls):
+        return io.Schema(
+            node_id="InstructPixToPixConditioning",
+            category="conditioning/instructpix2pix",
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Vae.Input("vae"),
+                io.Image.Input("pixels"),
+            ],
+            outputs=[
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
+                io.Latent.Output(display_name="latent"),
+            ],
+        )
 
-    RETURN_TYPES = ("CONDITIONING","CONDITIONING","LATENT")
-    RETURN_NAMES = ("positive", "negative", "latent")
-    FUNCTION = "encode"
-
-    CATEGORY = "conditioning/instructpix2pix"
-
-    def encode(self, positive, negative, pixels, vae):
+    @classmethod
+    def execute(cls, positive, negative, pixels, vae) -> io.NodeOutput:
         x = (pixels.shape[1] // 8) * 8
         y = (pixels.shape[2] // 8) * 8
 
@@ -38,8 +47,17 @@ class InstructPixToPixConditioning:
             n = [t[0], d]
             c.append(n)
         out.append(c)
-        return (out[0], out[1], out_latent)
+        return io.NodeOutput(out[0], out[1], out_latent)
+
+
+class InstructPix2PixExtension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [
+            InstructPixToPixConditioning,
+        ]
+
+
+async def comfy_entrypoint() -> InstructPix2PixExtension:
+    return InstructPix2PixExtension()
 
-NODE_CLASS_MAPPINGS = {
-    "InstructPixToPixConditioning": InstructPixToPixConditioning,
-}

From a6f83a4a1a70d720c16d66feb5d87fee5998acdf Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Wed, 1 Oct 2025 14:19:13 -0700
Subject: [PATCH 09/10] Support the new hunyuan vae. (#10150)

---
 comfy/ldm/hunyuan_video/vae_refiner.py | 112 ++++++++++++++++---------
 comfy/sd.py                            |  70 ++++++++++------
 2 files changed, 116 insertions(+), 66 deletions(-)

diff --git a/comfy/ldm/hunyuan_video/vae_refiner.py b/comfy/ldm/hunyuan_video/vae_refiner.py
index c6f742710..c2a0b507d 100644
--- a/comfy/ldm/hunyuan_video/vae_refiner.py
+++ b/comfy/ldm/hunyuan_video/vae_refiner.py
@@ -1,7 +1,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock, VideoConv3d
+from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock, VideoConv3d, Normalize
 import comfy.ops
 import comfy.ldm.models.autoencoder
 ops = comfy.ops.disable_weight_init
@@ -17,11 +17,12 @@ class RMS_norm(nn.Module):
         return F.normalize(x, dim=1) * self.scale * self.gamma
 
 class DnSmpl(nn.Module):
-    def __init__(self, ic, oc, tds=True):
+    def __init__(self, ic, oc, tds=True, refiner_vae=True, op=VideoConv3d):
         super().__init__()
         fct = 2 * 2 * 2 if tds else 1 * 2 * 2
         assert oc % fct == 0
-        self.conv = VideoConv3d(ic, oc // fct, kernel_size=3)
+        self.conv = op(ic, oc // fct, kernel_size=3, stride=1, padding=1)
+        self.refiner_vae = refiner_vae
         self.tds = tds
         self.gs = fct * ic // oc
 
@@ -30,7 +31,7 @@ class DnSmpl(nn.Module):
         r1 = 2 if self.tds else 1
         h = self.conv(x)
 
-        if self.tds:
+        if self.tds and self.refiner_vae:
             hf = h[:, :, :1, :, :]
             b, c, f, ht, wd = hf.shape
             hf = hf.reshape(b, c, f, ht // 2, 2, wd // 2, 2)
@@ -66,6 +67,7 @@ class DnSmpl(nn.Module):
             sc = torch.cat([xf, xn], dim=2)
         else:
             b, c, frms, ht, wd = h.shape
+
             nf = frms // r1
             h = h.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2)
             h = h.permute(0, 3, 5, 7, 1, 2, 4, 6)
@@ -83,10 +85,11 @@ class DnSmpl(nn.Module):
 
 
 class UpSmpl(nn.Module):
-    def __init__(self, ic, oc, tus=True):
+    def __init__(self, ic, oc, tus=True, refiner_vae=True, op=VideoConv3d):
         super().__init__()
         fct = 2 * 2 * 2 if tus else 1 * 2 * 2
-        self.conv = VideoConv3d(ic, oc * fct, kernel_size=3)
+        self.conv = op(ic, oc * fct, kernel_size=3, stride=1, padding=1)
+        self.refiner_vae = refiner_vae
         self.tus = tus
         self.rp = fct * oc // ic
 
@@ -95,7 +98,7 @@ class UpSmpl(nn.Module):
         r1 = 2 if self.tus else 1
         h = self.conv(x)
 
-        if self.tus:
+        if self.tus and self.refiner_vae:
             hf = h[:, :, :1, :, :]
             b, c, f, ht, wd = hf.shape
             nc = c // (2 * 2)
@@ -148,43 +151,56 @@ class UpSmpl(nn.Module):
 
 class Encoder(nn.Module):
     def __init__(self, in_channels, z_channels, block_out_channels, num_res_blocks,
-                 ffactor_spatial, ffactor_temporal, downsample_match_channel=True, **_):
+                 ffactor_spatial, ffactor_temporal, downsample_match_channel=True, refiner_vae=True, **_):
         super().__init__()
         self.z_channels = z_channels
         self.block_out_channels = block_out_channels
         self.num_res_blocks = num_res_blocks
+        self.ffactor_temporal = ffactor_temporal
+
+        self.refiner_vae = refiner_vae
+        if self.refiner_vae:
+            conv_op = VideoConv3d
+            norm_op = RMS_norm
+        else:
+            conv_op = ops.Conv3d
+            norm_op = Normalize
 
-        self.conv_in = VideoConv3d(in_channels, block_out_channels[0], 3, 1, 1)
+        self.conv_in = conv_op(in_channels, block_out_channels[0], 3, 1, 1)
 
         self.down = nn.ModuleList()
         ch = block_out_channels[0]
         depth = (ffactor_spatial >> 1).bit_length()
-        depth_temporal = ((ffactor_spatial // ffactor_temporal) >> 1).bit_length()
+        depth_temporal = ((ffactor_spatial // self.ffactor_temporal) >> 1).bit_length()
 
         for i, tgt in enumerate(block_out_channels):
             stage = nn.Module()
             stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt,
                                                      out_channels=tgt,
                                                      temb_channels=0,
-                                                     conv_op=VideoConv3d, norm_op=RMS_norm)
+                                                     conv_op=conv_op, norm_op=norm_op)
                                          for j in range(num_res_blocks)])
             ch = tgt
             if i < depth:
                 nxt = block_out_channels[i + 1] if i + 1 < len(block_out_channels) and downsample_match_channel else ch
-                stage.downsample = DnSmpl(ch, nxt, tds=i >= depth_temporal)
+                stage.downsample = DnSmpl(ch, nxt, tds=i >= depth_temporal, refiner_vae=self.refiner_vae, op=conv_op)
                 ch = nxt
             self.down.append(stage)
 
         self.mid = nn.Module()
-        self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm)
-        self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=RMS_norm)
-        self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm)
+        self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
+        self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op)
+        self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
 
-        self.norm_out = RMS_norm(ch)
-        self.conv_out = VideoConv3d(ch, z_channels << 1, 3, 1, 1)
+        self.norm_out = norm_op(ch)
+        self.conv_out = conv_op(ch, z_channels << 1, 3, 1, 1)
 
         self.regul = comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer()
 
     def forward(self, x):
+        if not self.refiner_vae and x.shape[2] == 1:
+            x = x.expand(-1, -1, self.ffactor_temporal, -1, -1)
+
         x = self.conv_in(x)
 
         for stage in self.down:
@@ -200,31 +216,42 @@ class Encoder(nn.Module):
         skip = x.view(b, c // grp, grp, t, h, w).mean(2)
 
         out = self.conv_out(F.silu(self.norm_out(x))) + skip
-        out = self.regul(out)[0]
 
-        out = torch.cat((out[:, :, :1], out), dim=2)
-        out = out.permute(0, 2, 1, 3, 4)
-        b, f_times_2, c, h, w = out.shape
-        out = out.reshape(b, f_times_2 // 2, 2 * c, h, w)
-        out = out.permute(0, 2, 1, 3, 4).contiguous()
+        if self.refiner_vae:
+            out = self.regul(out)[0]
+
+            out = torch.cat((out[:, :, :1], out), dim=2)
+            out = out.permute(0, 2, 1, 3, 4)
+            b, f_times_2, c, h, w = out.shape
+            out = out.reshape(b, f_times_2 // 2, 2 * c, h, w)
+            out = out.permute(0, 2, 1, 3, 4).contiguous()
+
         return out
 
 
 class Decoder(nn.Module):
     def __init__(self, z_channels, out_channels, block_out_channels, num_res_blocks,
-                 ffactor_spatial, ffactor_temporal, upsample_match_channel=True, **_):
+                 ffactor_spatial, ffactor_temporal, upsample_match_channel=True, refiner_vae=True, **_):
         super().__init__()
         block_out_channels = block_out_channels[::-1]
         self.z_channels = z_channels
         self.block_out_channels = block_out_channels
         self.num_res_blocks = num_res_blocks
 
+        self.refiner_vae = refiner_vae
+        if self.refiner_vae:
+            conv_op = VideoConv3d
+            norm_op = RMS_norm
+        else:
+            conv_op = ops.Conv3d
+            norm_op = Normalize
+
         ch = block_out_channels[0]
-        self.conv_in = VideoConv3d(z_channels, ch, 3)
+        self.conv_in = conv_op(z_channels, ch, kernel_size=3, stride=1, padding=1)
 
         self.mid = nn.Module()
-        self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm)
-        self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=RMS_norm)
-        self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm)
+        self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
+        self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op)
+        self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
 
         self.up = nn.ModuleList()
         depth = (ffactor_spatial >> 1).bit_length()
@@ -235,25 +262,26 @@ class Decoder(nn.Module):
             stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt,
                                                      out_channels=tgt,
                                                      temb_channels=0,
-                                                     conv_op=VideoConv3d, norm_op=RMS_norm)
+                                                     conv_op=conv_op, norm_op=norm_op)
                                          for j in range(num_res_blocks + 1)])
             ch = tgt
             if i < depth:
                 nxt = block_out_channels[i + 1] if i + 1 < len(block_out_channels) and upsample_match_channel else ch
-                stage.upsample = UpSmpl(ch, nxt, tus=i < depth_temporal)
+                stage.upsample = UpSmpl(ch, nxt, tus=i < depth_temporal, refiner_vae=self.refiner_vae, op=conv_op)
                 ch = nxt
             self.up.append(stage)
 
-        self.norm_out = RMS_norm(ch)
-        self.conv_out = VideoConv3d(ch, out_channels, 3)
+        self.norm_out = norm_op(ch)
+        self.conv_out = conv_op(ch, out_channels, 3, stride=1, padding=1)
 
     def forward(self, z):
-        z = z.permute(0, 2, 1, 3, 4)
-        b, f, c, h, w = z.shape
-        z = z.reshape(b, f, 2, c // 2, h, w)
-        z = z.permute(0, 1, 2, 3, 4, 5).reshape(b, f * 2, c // 2, h, w)
-        z = z.permute(0, 2, 1, 3, 4)
-        z = z[:, :, 1:]
+        if self.refiner_vae:
+            z = z.permute(0, 2, 1, 3, 4)
+            b, f, c, h, w = z.shape
+            z = z.reshape(b, f, 2, c // 2, h, w)
+            z = z.permute(0, 1, 2, 3, 4, 5).reshape(b, f * 2, c // 2, h, w)
+            z = z.permute(0, 2, 1, 3, 4)
+            z = z[:, :, 1:]
 
         x = self.conv_in(z) + z.repeat_interleave(self.block_out_channels[0] // self.z_channels, 1)
 
         x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(x)))
@@ -264,4 +292,10 @@ class Decoder(nn.Module):
             if hasattr(stage, 'upsample'):
                 x = stage.upsample(x)
 
-        return self.conv_out(F.silu(self.norm_out(x)))
+        out = self.conv_out(F.silu(self.norm_out(x)))
+
+        if not self.refiner_vae:
+            if z.shape[-3] == 1:
+                out = out[:, :, -1:]
+
+        return out
diff --git a/comfy/sd.py b/comfy/sd.py
index 2df340739..873ad20f2 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -332,35 +332,51 @@ class VAE:
             self.first_stage_model = StageC_coder()
             self.downscale_ratio = 32
             self.latent_channels = 16
-        elif "decoder.conv_in.weight" in sd and sd['decoder.conv_in.weight'].shape[1] == 64:
-            ddconfig = {"block_out_channels": [128, 256, 512, 512, 1024, 1024], "in_channels": 3, "out_channels": 3, "num_res_blocks": 2, "ffactor_spatial": 32, "downsample_match_channel": True, "upsample_match_channel": True}
-            self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1]
-            self.downscale_ratio = 32
-            self.upscale_ratio = 32
-            self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32]
-            self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer"},
-                                                        encoder_config={'target': "comfy.ldm.hunyuan_video.vae.Encoder", 'params': ddconfig},
-                                                        decoder_config={'target': "comfy.ldm.hunyuan_video.vae.Decoder", 'params': ddconfig})
-
-            self.memory_used_encode = lambda shape, dtype: (700 * shape[2] * shape[3]) * model_management.dtype_size(dtype)
-            self.memory_used_decode = lambda shape, dtype: (700 * shape[2] * shape[3] * 32 * 32) * model_management.dtype_size(dtype)
         elif "decoder.conv_in.weight" in sd:
-            #default SD1.x/SD2.x VAE parameters
-            ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
-
-            if 'encoder.down.2.downsample.conv.weight' not in sd and 'decoder.up.3.upsample.conv.weight' not in sd: #Stable diffusion x4 upscaler VAE
-                ddconfig['ch_mult'] = [1, 2, 4]
-                self.downscale_ratio = 4
-                self.upscale_ratio = 4
-
-            self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1]
-            if 'post_quant_conv.weight' in sd:
-                self.first_stage_model = AutoencoderKL(ddconfig=ddconfig, embed_dim=sd['post_quant_conv.weight'].shape[1])
-            else:
+            if sd['decoder.conv_in.weight'].shape[1] == 64:
+                ddconfig = {"block_out_channels": [128, 256, 512, 512, 1024, 1024], "in_channels": 3, "out_channels": 3, "num_res_blocks": 2, "ffactor_spatial": 32, "downsample_match_channel": True, "upsample_match_channel": True}
+                self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1]
+                self.downscale_ratio = 32
+                self.upscale_ratio = 32
+                self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32]
                 self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer"},
-                                                            encoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Encoder", 'params': ddconfig},
-                                                            decoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Decoder", 'params': ddconfig})
+                                                            encoder_config={'target': "comfy.ldm.hunyuan_video.vae.Encoder", 'params': ddconfig},
+                                                            decoder_config={'target': "comfy.ldm.hunyuan_video.vae.Decoder", 'params': ddconfig})
+
+                self.memory_used_encode = lambda shape, dtype: (700 * shape[2] * shape[3]) * model_management.dtype_size(dtype)
+                self.memory_used_decode = lambda shape, dtype: (700 * shape[2] * shape[3] * 32 * 32) * model_management.dtype_size(dtype)
+            elif sd['decoder.conv_in.weight'].shape[1] == 32:
+                ddconfig = {"block_out_channels": [128, 256, 512, 1024, 1024], "in_channels": 3, "out_channels": 3, "num_res_blocks": 2, "ffactor_spatial": 16, "ffactor_temporal": 4, "downsample_match_channel": True, "upsample_match_channel": True, "refiner_vae": False}
+                self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1]
+                self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32]
+                self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 16, 16)
+                self.upscale_index_formula = (4, 16, 16)
+                self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 16, 16)
+                self.downscale_index_formula = (4, 16, 16)
+                self.latent_dim = 3
+                self.not_video = True
+                self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer"},
+                                                            encoder_config={'target': "comfy.ldm.hunyuan_video.vae_refiner.Encoder", 'params': ddconfig},
+                                                            decoder_config={'target': "comfy.ldm.hunyuan_video.vae_refiner.Decoder", 'params': ddconfig})
+
+                self.memory_used_encode = lambda shape, dtype: (2800 * shape[-2] * shape[-1]) * model_management.dtype_size(dtype)
+                self.memory_used_decode = lambda shape, dtype: (2800 * shape[-3] * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype)
+            else:
+                #default SD1.x/SD2.x VAE parameters
+                ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
+
+                if 'encoder.down.2.downsample.conv.weight' not in sd and 'decoder.up.3.upsample.conv.weight' not in sd: #Stable diffusion x4 upscaler VAE
+                    ddconfig['ch_mult'] = [1, 2, 4]
+                    self.downscale_ratio = 4
+                    self.upscale_ratio = 4
+
+                self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1]
+                if 'post_quant_conv.weight' in sd:
+                    self.first_stage_model = AutoencoderKL(ddconfig=ddconfig, embed_dim=sd['post_quant_conv.weight'].shape[1])
+                else:
+                    self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer"},
+                                                                encoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Encoder", 'params': ddconfig},
+                                                                decoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Decoder", 'params': ddconfig})
         elif "decoder.layers.1.layers.0.beta" in sd:
             self.first_stage_model = AudioOobleckVAE()
             self.memory_used_encode = lambda shape, dtype: (1000 * shape[2]) * model_management.dtype_size(dtype)

From bb32d4ec3141333df26fcdaee0c3c08e41b7b249 Mon Sep 17 00:00:00 2001
From: Koratahiu
Date: Thu, 2 Oct 2025 00:59:07 +0300
Subject: [PATCH 10/10] feat: Add Epsilon Scaling node for exposure bias correction (#10132)

---
 comfy_extras/nodes_eps.py | 60 +++++++++++++++++++++++++++++++++++++++
 nodes.py                  |  1 +
 2 files changed, 61 insertions(+)
 create mode 100644 comfy_extras/nodes_eps.py

diff --git a/comfy_extras/nodes_eps.py b/comfy_extras/nodes_eps.py
new file mode 100644
index 000000000..c8818f096
--- /dev/null
+++ b/comfy_extras/nodes_eps.py
@@ -0,0 +1,60 @@
+class EpsilonScaling:
+    """
+    Implements the Epsilon Scaling method from 'Elucidating the Exposure Bias in Diffusion Models'
+    (https://arxiv.org/abs/2308.15321v6).
+
+    This method mitigates exposure bias by scaling the predicted noise during sampling,
+    which can significantly improve sample quality. This implementation uses the "uniform schedule"
+    recommended by the paper for its practicality and effectiveness.
+    """
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "model": ("MODEL",),
+                "scaling_factor": ("FLOAT", {
+                    "default": 1.005,
+                    "min": 0.5,
+                    "max": 1.5,
+                    "step": 0.001,
+                    "display": "number"
+                }),
+            }
+        }
+
+    RETURN_TYPES = ("MODEL",)
+    FUNCTION = "patch"
+
+    CATEGORY = "model_patches/unet"
+
+    def patch(self, model, scaling_factor):
+        # Prevent division by zero, though the UI's min value should prevent this.
+        if scaling_factor == 0:
+            scaling_factor = 1e-9
+
+        def epsilon_scaling_function(args):
+            """
+            This function is applied after the CFG guidance has been calculated.
+            It recalculates the denoised latent by scaling the predicted noise.
+            """
+            denoised = args["denoised"]
+            x = args["input"]
+
+            noise_pred = x - denoised
+
+            scaled_noise_pred = noise_pred / scaling_factor
+
+            new_denoised = x - scaled_noise_pred
+
+            return new_denoised
+
+        # Clone the model patcher to avoid modifying the original model in place
+        model_clone = model.clone()
+
+        model_clone.set_model_sampler_post_cfg_function(epsilon_scaling_function)
+
+        return (model_clone,)
+
+NODE_CLASS_MAPPINGS = {
+    "Epsilon Scaling": EpsilonScaling
+}
diff --git a/nodes.py b/nodes.py
index 1a6784b68..88d712993 100644
--- a/nodes.py
+++ b/nodes.py
@@ -2297,6 +2297,7 @@ async def init_builtin_extra_nodes():
         "nodes_gits.py",
         "nodes_controlnet.py",
         "nodes_hunyuan.py",
+        "nodes_eps.py",
         "nodes_flux.py",
         "nodes_lora_extract.py",
         "nodes_torch_compile.py",
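
A minimal standalone sketch of the transformation the PATCH 10 post-CFG hook applies, useful for checking the math outside of ComfyUI. The tensors here are toy stand-ins for real sampler state, not part of the patch itself:

import torch

def epsilon_scale(x: torch.Tensor, denoised: torch.Tensor, scaling_factor: float) -> torch.Tensor:
    # Mirrors epsilon_scaling_function above: recover eps = x - denoised,
    # shrink it by the scaling factor, and rebuild the denoised latent.
    # With scaling_factor > 1 the predicted noise is slightly reduced, the
    # "uniform schedule" correction the paper proposes for exposure bias.
    noise_pred = x - denoised
    return x - noise_pred / scaling_factor

# Sanity check: a factor of 1.0 leaves the denoised latent unchanged.
x = torch.randn(1, 4, 8, 8)
denoised = torch.randn(1, 4, 8, 8)
assert torch.allclose(epsilon_scale(x, denoised, 1.0), denoised)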