From ce0052c087cb1e81ba01e8afbe362bec54eeb665 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Thu, 28 Aug 2025 07:37:42 -0700
Subject: [PATCH 1/4] Fix diffsynth controlnet regression. (#9597)

---
 comfy_extras/nodes_model_patch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py
index 32c40ced3..65e766b52 100644
--- a/comfy_extras/nodes_model_patch.py
+++ b/comfy_extras/nodes_model_patch.py
@@ -108,7 +108,7 @@ class DiffSynthCnetPatch:
         img = kwargs.get("img")
         block_index = kwargs.get("block_index")
         spacial_compression = self.vae.spacial_compression_encode()
-        if self.encoded_image is None or self.encoded_image_size != (x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression):
+        if self.encoded_image is None or self.encoded_image_size != (x.shape[-2] * spacial_compression, x.shape[-1] * spacial_compression):
             image_scaled = comfy.utils.common_upscale(self.image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center")
             loaded_models = comfy.model_management.loaded_models(only_currently_used=True)
             self.encoded_image = self.model_patch.model.process_input_latent_image(self.encode_latent_cond(image_scaled.movedim(1, -1)))
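Note on PATCH 1/4: the latent x is laid out [batch, channels, height, width]
(the common_upscale call in the same hunk passes x.shape[-1] * c as width and
x.shape[-2] * c as height), so the old check built the comparison tuple in
(width, height) order. Assuming the cached encoded_image_size is stored as
(height, width) in pixels, as the fix implies, non-square inputs could never
match the cache and the image was re-encoded on every call. A minimal sketch
of the check; the shapes and compression factor are illustrative assumptions:

    import torch

    spacial_compression = 8                      # assumed VAE encode factor
    x = torch.zeros(1, 16, 60, 104)              # latent: [batch, ch, height, width]
    encoded_image_size = (480, 832)              # cached pixels, assumed (height, width)

    # Old check built (width, height) = (832, 480), which never equals the
    # cached (480, 832) for a non-square input, so the cache always looked stale:
    stale_old = encoded_image_size != (x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression)

    # Fixed check compares (height, width) against (height, width):
    stale_new = encoded_image_size != (x.shape[-2] * spacial_compression, x.shape[-1] * spacial_compression)

    print(stale_old, stale_new)                  # True False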
From 00636101771cb373354d6294cc6567deda2635f6 Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Thu, 28 Aug 2025 10:44:57 -0400
Subject: [PATCH 2/4] ComfyUI version 0.3.54

---
 comfyui_version.py | 2 +-
 pyproject.toml     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/comfyui_version.py b/comfyui_version.py
index d6fdc47fe..7034953fd 100644
--- a/comfyui_version.py
+++ b/comfyui_version.py
@@ -1,3 +1,3 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.3.53"
+__version__ = "0.3.54"
diff --git a/pyproject.toml b/pyproject.toml
index a71ad2bbf..9f9ac1e21 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "ComfyUI"
-version = "0.3.53"
+version = "0.3.54"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.9"

From edde0b50431e296f61f79205e25cb01f653013a2 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Thu, 28 Aug 2025 14:59:48 -0700
Subject: [PATCH 3/4] WanSoundImageToVideoExtend node to manually extend s2v video.
 (#9606)

---
 comfy_extras/nodes_wan.py | 145 +++++++++++++++++++++++++-------------
 1 file changed, 97 insertions(+), 48 deletions(-)

diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py
index 312260f00..0a55bd5d0 100644
--- a/comfy_extras/nodes_wan.py
+++ b/comfy_extras/nodes_wan.py
@@ -877,6 +877,67 @@ def get_audio_embed_bucket_fps(audio_embed, fps=16, batch_frames=81, m=0, video_
     return batch_audio_eb, min_batch_num
 
 
+def wan_sound_to_video(positive, negative, vae, width, height, length, batch_size, frame_offset=0, ref_image=None, audio_encoder_output=None, control_video=None, ref_motion=None, ref_motion_latent=None):
+    latent_t = ((length - 1) // 4) + 1
+    if audio_encoder_output is not None:
+        feat = torch.cat(audio_encoder_output["encoded_audio_all_layers"])
+        video_rate = 30
+        fps = 16
+        feat = linear_interpolation(feat, input_fps=50, output_fps=video_rate)
+        batch_frames = latent_t * 4
+        audio_embed_bucket, num_repeat = get_audio_embed_bucket_fps(feat, fps=fps, batch_frames=batch_frames, m=0, video_rate=video_rate)
+        audio_embed_bucket = audio_embed_bucket.unsqueeze(0)
+        if len(audio_embed_bucket.shape) == 3:
+            audio_embed_bucket = audio_embed_bucket.permute(0, 2, 1)
+        elif len(audio_embed_bucket.shape) == 4:
+            audio_embed_bucket = audio_embed_bucket.permute(0, 2, 3, 1)
+
+        audio_embed_bucket = audio_embed_bucket[:, :, :, frame_offset:frame_offset + batch_frames]
+        positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_embed_bucket})
+        negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket * 0.0})
+        frame_offset += batch_frames
+
+    if ref_image is not None:
+        ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
+        ref_latent = vae.encode(ref_image[:, :, :, :3])
+        positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True)
+        negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [ref_latent]}, append=True)
+
+    if ref_motion is not None:
+        if ref_motion.shape[0] > 73:
+            ref_motion = ref_motion[-73:]
+
+        ref_motion = comfy.utils.common_upscale(ref_motion.movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
+
+        if ref_motion.shape[0] < 73:
+            r = torch.ones([73, height, width, 3]) * 0.5
+            r[-ref_motion.shape[0]:] = ref_motion
+            ref_motion = r
+
+        ref_motion_latent = vae.encode(ref_motion[:, :, :, :3])
+
+    if ref_motion_latent is not None:
+        ref_motion_latent = ref_motion_latent[:, :, -19:]
+        positive = node_helpers.conditioning_set_values(positive, {"reference_motion": ref_motion_latent})
+        negative = node_helpers.conditioning_set_values(negative, {"reference_motion": ref_motion_latent})
+
+    latent = torch.zeros([batch_size, 16, latent_t, height // 8, width // 8], device=comfy.model_management.intermediate_device())
+
+    control_video_out = comfy.latent_formats.Wan21().process_out(torch.zeros_like(latent))
+    if control_video is not None:
+        control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
+        control_video = vae.encode(control_video[:, :, :, :3])
+        control_video_out[:, :, :control_video.shape[2]] = control_video
+
+    # TODO: check if zero is better than none if none provided
+    positive = node_helpers.conditioning_set_values(positive, {"control_video": control_video_out})
+    negative = node_helpers.conditioning_set_values(negative, {"control_video": control_video_out})
+
+    out_latent = {}
+    out_latent["samples"] = latent
+    return positive, negative, out_latent, frame_offset
+
+
 class WanSoundImageToVideo(io.ComfyNode):
     @classmethod
     def define_schema(cls):
@@ -906,57 +967,44 @@ class WanSoundImageToVideo(io.ComfyNode):
 
     @classmethod
     def execute(cls, positive, negative, vae, width, height, length, batch_size, ref_image=None, audio_encoder_output=None, control_video=None, ref_motion=None) -> io.NodeOutput:
-        latent_t = ((length - 1) // 4) + 1
-        if audio_encoder_output is not None:
-            feat = torch.cat(audio_encoder_output["encoded_audio_all_layers"])
-            video_rate = 30
-            fps = 16
-            feat = linear_interpolation(feat, input_fps=50, output_fps=video_rate)
-            audio_embed_bucket, num_repeat = get_audio_embed_bucket_fps(feat, fps=fps, batch_frames=latent_t * 4, m=0, video_rate=video_rate)
-            audio_embed_bucket = audio_embed_bucket.unsqueeze(0)
-            if len(audio_embed_bucket.shape) == 3:
-                audio_embed_bucket = audio_embed_bucket.permute(0, 2, 1)
-            elif len(audio_embed_bucket.shape) == 4:
-                audio_embed_bucket = audio_embed_bucket.permute(0, 2, 3, 1)
+        positive, negative, out_latent, frame_offset = wan_sound_to_video(positive, negative, vae, width, height, length, batch_size, ref_image=ref_image, audio_encoder_output=audio_encoder_output,
+                                                                          control_video=control_video, ref_motion=ref_motion)
+        return io.NodeOutput(positive, negative, out_latent)
 
-            positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_embed_bucket})
-            negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket * 0.0})
 
-        if ref_image is not None:
-            ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
-            ref_latent = vae.encode(ref_image[:, :, :, :3])
-            positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True)
-            negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [ref_latent]}, append=True)
+class WanSoundImageToVideoExtend(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="WanSoundImageToVideoExtend",
+            category="conditioning/video_models",
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Vae.Input("vae"),
+                io.Int.Input("length", default=77, min=1, max=nodes.MAX_RESOLUTION, step=4),
+                io.Latent.Input("video_latent"),
+                io.AudioEncoderOutput.Input("audio_encoder_output", optional=True),
+                io.Image.Input("ref_image", optional=True),
+                io.Image.Input("control_video", optional=True),
+            ],
+            outputs=[
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
+                io.Latent.Output(display_name="latent"),
+            ],
+            is_experimental=True,
+        )
 
-        if ref_motion is not None:
-            if ref_motion.shape[0] > 73:
-                ref_motion = ref_motion[-73:]
-
-            ref_motion = comfy.utils.common_upscale(ref_motion.movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
-
-            if ref_motion.shape[0] < 73:
-                r = torch.ones([73, height, width, 3]) * 0.5
-                r[-ref_motion.shape[0]:] = ref_motion
-                ref_motion = r
-
-            ref_motion = vae.encode(ref_motion[:, :, :, :3])
-            positive = node_helpers.conditioning_set_values(positive, {"reference_motion": ref_motion})
-            negative = node_helpers.conditioning_set_values(negative, {"reference_motion": ref_motion})
-
-        latent = torch.zeros([batch_size, 16, latent_t, height // 8, width // 8], device=comfy.model_management.intermediate_device())
-
-        control_video_out = comfy.latent_formats.Wan21().process_out(torch.zeros_like(latent))
-        if control_video is not None:
-            control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
-            control_video = vae.encode(control_video[:, :, :, :3])
-            control_video_out[:, :, :control_video.shape[2]] = control_video
-
-        # TODO: check if zero is better than none if none provided
-        positive = node_helpers.conditioning_set_values(positive, {"control_video": control_video_out})
-        negative = node_helpers.conditioning_set_values(negative, {"control_video": control_video_out})
-
-        out_latent = {}
-        out_latent["samples"] = latent
+    @classmethod
+    def execute(cls, positive, negative, vae, length, video_latent, ref_image=None, audio_encoder_output=None, control_video=None) -> io.NodeOutput:
+        video_latent = video_latent["samples"]
+        width = video_latent.shape[-1] * 8
+        height = video_latent.shape[-2] * 8
+        batch_size = video_latent.shape[0]
+        frame_offset = video_latent.shape[-3] * 4
+        positive, negative, out_latent, frame_offset = wan_sound_to_video(positive, negative, vae, width, height, length, batch_size, frame_offset=frame_offset, ref_image=ref_image, audio_encoder_output=audio_encoder_output,
+                                                                          control_video=control_video, ref_motion=None, ref_motion_latent=video_latent)
         return io.NodeOutput(positive, negative, out_latent)
@@ -1019,6 +1067,7 @@ class WanExtension(ComfyExtension):
             WanCameraImageToVideo,
             WanPhantomSubjectToVideo,
             WanSoundImageToVideo,
+            WanSoundImageToVideoExtend,
             Wan22ImageToVideoLatent,
         ]
 
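Note on PATCH 3/4: the old execute body is factored into wan_sound_to_video()
with a frame_offset parameter, and the new WanSoundImageToVideoExtend node
derives its geometry and offset from an existing video latent, then passes
that latent in as ref_motion_latent (the helper keeps only its last 19 latent
frames as reference motion and slices the audio embedding starting at the
offset). A sketch of that derivation; the latent shape is an illustrative
assumption for Wan's 8x spatial and 4x temporal compression:

    import torch

    video_latent = {"samples": torch.zeros(1, 16, 20, 60, 104)}  # [batch, ch, t, h, w]

    samples = video_latent["samples"]
    width = samples.shape[-1] * 8           # 104 * 8 = 832 px
    height = samples.shape[-2] * 8          # 60 * 8 = 480 px
    batch_size = samples.shape[0]           # 1
    frame_offset = samples.shape[-3] * 4    # 20 latent frames -> offset of 80 video frames

    # wan_sound_to_video(..., frame_offset=frame_offset, ref_motion_latent=samples)
    # then reads audio_embed_bucket[:, :, :, frame_offset:frame_offset + batch_frames]
    # and uses samples[:, :, -19:] as reference motion, so the new segment
    # continues from where the previous one left off.
    print(width, height, batch_size, frame_offset)  # 832 480 1 80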
From 1c184c29eb2a8f6fdd4e49f27347809090038e3f Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Thu, 28 Aug 2025 15:34:01 -0700
Subject: [PATCH 4/4] Fix issue with s2v node when extending past audio length. (#9608)

---
 comfy_extras/nodes_wan.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py
index 0a55bd5d0..2cbc93ceb 100644
--- a/comfy_extras/nodes_wan.py
+++ b/comfy_extras/nodes_wan.py
@@ -893,9 +893,10 @@ def wan_sound_to_video(positive, negative, vae, width, height, length, batch_siz
             audio_embed_bucket = audio_embed_bucket.permute(0, 2, 3, 1)
 
         audio_embed_bucket = audio_embed_bucket[:, :, :, frame_offset:frame_offset + batch_frames]
-        positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_embed_bucket})
-        negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket * 0.0})
-        frame_offset += batch_frames
+        if audio_embed_bucket.shape[3] > 0:
+            positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_embed_bucket})
+            negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_embed_bucket * 0.0})
+            frame_offset += batch_frames
 
     if ref_image is not None:
         ref_image = comfy.utils.common_upscale(ref_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
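Note on PATCH 4/4: when an extension starts at or past the end of the audio
embedding, the frame-axis slice comes back empty, and the patch skips
attaching it (and advancing frame_offset) rather than conditioning on a
zero-length embed. A sketch of the guard; the bucket dimensions are assumed,
only the frames-last layout matters:

    import torch

    audio_embed_bucket = torch.zeros(1, 32, 1024, 160)  # 160 embedded frames on the last axis
    frame_offset, batch_frames = 160, 80                # extension starts past the audio

    sliced = audio_embed_bucket[:, :, :, frame_offset:frame_offset + batch_frames]
    if sliced.shape[3] > 0:
        # audio remains: attach "audio_embed" conditioning and advance the offset
        frame_offset += batch_frames
    # otherwise the segment is generated without audio conditioning

    print(sliced.shape)                                 # torch.Size([1, 32, 1024, 0])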