From fd79d32f38fd24adca5a6e8214f05050f287c9db Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com>
Date: Thu, 25 Sep 2025 01:59:29 +0300
Subject: [PATCH 1/4] Add new audio nodes (#9908)

* Add new audio nodes

- TrimAudioDuration
- SplitAudioChannels
- AudioConcat
- AudioMerge
- AudioAdjustVolume

* Update nodes_audio.py

* Add EmptyAudio node

* Change duration to Float (allows sub-second durations)
---
 comfy_extras/nodes_audio.py | 223 ++++++++++++++++++++++++++++++++++++
 1 file changed, 223 insertions(+)

diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py
index 3b23f65d8..51c8b9dd9 100644
--- a/comfy_extras/nodes_audio.py
+++ b/comfy_extras/nodes_audio.py
@@ -11,6 +11,7 @@ import json
 import random
 import hashlib
 import node_helpers
+import logging
 from comfy.cli_args import args
 from comfy.comfy_types import FileLocator
 
@@ -364,6 +365,216 @@ class RecordAudio:
 
         return (audio, )
 
+class TrimAudioDuration:
+    @classmethod
+    def INPUT_TYPES(cls):
+        return {
+            "required": {
+                "audio": ("AUDIO",),
+                "start_index": ("FLOAT", {"default": 0.0, "min": -0xffffffffffffffff, "max": 0xffffffffffffffff, "step": 0.01, "tooltip": "Start time in seconds; can be negative to count from the end (supports sub-seconds)."}),
+                "duration": ("FLOAT", {"default": 60.0, "min": 0.0, "step": 0.01, "tooltip": "Duration in seconds."}),
+            },
+        }
+
+    FUNCTION = "trim"
+    RETURN_TYPES = ("AUDIO",)
+    CATEGORY = "audio"
+    DESCRIPTION = "Trims the audio to the chosen time range."
+
+    def trim(self, audio, start_index, duration):
+        waveform = audio["waveform"]
+        sample_rate = audio["sample_rate"]
+        audio_length = waveform.shape[-1]
+
+        if start_index < 0:
+            start_frame = audio_length + int(round(start_index * sample_rate))
+        else:
+            start_frame = int(round(start_index * sample_rate))
+        start_frame = max(0, min(start_frame, audio_length - 1))
+
+        end_frame = start_frame + int(round(duration * sample_rate))
+        end_frame = max(0, min(end_frame, audio_length))
+
+        if start_frame >= end_frame:
+            raise ValueError("TrimAudioDuration: Start time must be less than end time and within the audio length.")
+
+        return ({"waveform": waveform[..., start_frame:end_frame], "sample_rate": sample_rate},)
+
+
+class SplitAudioChannels:
+    @classmethod
+    def INPUT_TYPES(s):
+        return {"required": {
+            "audio": ("AUDIO",),
+        }}
+
+    RETURN_TYPES = ("AUDIO", "AUDIO")
+    RETURN_NAMES = ("left", "right")
+    FUNCTION = "separate"
+    CATEGORY = "audio"
+    DESCRIPTION = "Separates the audio into left and right channels."
+
+    def separate(self, audio):
+        waveform = audio["waveform"]
+        sample_rate = audio["sample_rate"]
+
+        if waveform.shape[1] != 2:
+            raise ValueError("SplitAudioChannels: Input audio must have exactly two channels.")
+
+        left_channel = waveform[..., 0:1, :]
+        right_channel = waveform[..., 1:2, :]
+
+        return ({"waveform": left_channel, "sample_rate": sample_rate}, {"waveform": right_channel, "sample_rate": sample_rate})
+
+
+def match_audio_sample_rates(waveform_1, sample_rate_1, waveform_2, sample_rate_2):
+    if sample_rate_1 != sample_rate_2:
+        if sample_rate_1 > sample_rate_2:
+            waveform_2 = torchaudio.functional.resample(waveform_2, sample_rate_2, sample_rate_1)
+            output_sample_rate = sample_rate_1
+            logging.info(f"Resampling audio2 from {sample_rate_2}Hz to {sample_rate_1}Hz for merging.")
+        else:
+            waveform_1 = torchaudio.functional.resample(waveform_1, sample_rate_1, sample_rate_2)
+            output_sample_rate = sample_rate_2
+            logging.info(f"Resampling audio1 from {sample_rate_1}Hz to {sample_rate_2}Hz for merging.")
+    else:
+        output_sample_rate = sample_rate_1
+    return waveform_1, waveform_2, output_sample_rate
+
+
+class AudioConcat:
+    @classmethod
+    def INPUT_TYPES(s):
+        return {"required": {
+            "audio1": ("AUDIO",),
+            "audio2": ("AUDIO",),
+            "direction": (['after', 'before'], {"default": 'after', "tooltip": "Whether to append audio2 after or before audio1."}),
+        }}
+
+    RETURN_TYPES = ("AUDIO",)
+    FUNCTION = "concat"
+    CATEGORY = "audio"
+    DESCRIPTION = "Concatenates audio2 after or before audio1."
+
+    def concat(self, audio1, audio2, direction):
+        waveform_1 = audio1["waveform"]
+        waveform_2 = audio2["waveform"]
+        sample_rate_1 = audio1["sample_rate"]
+        sample_rate_2 = audio2["sample_rate"]
+
+        if waveform_1.shape[1] == 1:
+            waveform_1 = waveform_1.repeat(1, 2, 1)
+            logging.info("AudioConcat: Converted mono audio1 to stereo by duplicating the channel.")
+        if waveform_2.shape[1] == 1:
+            waveform_2 = waveform_2.repeat(1, 2, 1)
+            logging.info("AudioConcat: Converted mono audio2 to stereo by duplicating the channel.")
+
+        waveform_1, waveform_2, output_sample_rate = match_audio_sample_rates(waveform_1, sample_rate_1, waveform_2, sample_rate_2)
+
+        if direction == 'after':
+            concatenated_audio = torch.cat((waveform_1, waveform_2), dim=2)
+        elif direction == 'before':
+            concatenated_audio = torch.cat((waveform_2, waveform_1), dim=2)
+
+        return ({"waveform": concatenated_audio, "sample_rate": output_sample_rate},)
+
+
+class AudioMerge:
+    @classmethod
+    def INPUT_TYPES(cls):
+        return {
+            "required": {
+                "audio1": ("AUDIO",),
+                "audio2": ("AUDIO",),
+                "merge_method": (["add", "mean", "subtract", "multiply"], {"tooltip": "The method used to combine the audio waveforms."}),
+            },
+        }
+
+    FUNCTION = "merge"
+    RETURN_TYPES = ("AUDIO",)
+    CATEGORY = "audio"
+    DESCRIPTION = "Combines two audio tracks by overlaying their waveforms."
+
+    def merge(self, audio1, audio2, merge_method):
+        waveform_1 = audio1["waveform"]
+        waveform_2 = audio2["waveform"]
+        sample_rate_1 = audio1["sample_rate"]
+        sample_rate_2 = audio2["sample_rate"]
+
+        waveform_1, waveform_2, output_sample_rate = match_audio_sample_rates(waveform_1, sample_rate_1, waveform_2, sample_rate_2)
+
+        length_1 = waveform_1.shape[-1]
+        length_2 = waveform_2.shape[-1]
+
+        if length_2 > length_1:
+            logging.info(f"AudioMerge: Trimming audio2 from {length_2} to {length_1} samples to match audio1 length.")
+            waveform_2 = waveform_2[..., :length_1]
+        elif length_2 < length_1:
+            logging.info(f"AudioMerge: Padding audio2 from {length_2} to {length_1} samples to match audio1 length.")
+            pad_shape = list(waveform_2.shape)
+            pad_shape[-1] = length_1 - length_2
+            pad_tensor = torch.zeros(pad_shape, dtype=waveform_2.dtype, device=waveform_2.device)
+            waveform_2 = torch.cat((waveform_2, pad_tensor), dim=-1)
+
+        if merge_method == "add":
+            waveform = waveform_1 + waveform_2
+        elif merge_method == "subtract":
+            waveform = waveform_1 - waveform_2
+        elif merge_method == "multiply":
+            waveform = waveform_1 * waveform_2
+        elif merge_method == "mean":
+            waveform = (waveform_1 + waveform_2) / 2
+
+        max_val = waveform.abs().max()
+        if max_val > 1.0:
+            waveform = waveform / max_val
+
+        return ({"waveform": waveform, "sample_rate": output_sample_rate},)
+
+
+class AudioAdjustVolume:
+    @classmethod
+    def INPUT_TYPES(s):
+        return {"required": {
+            "audio": ("AUDIO",),
+            "volume": ("INT", {"default": 0, "min": -100, "max": 100, "tooltip": "Volume adjustment in decibels (dB). 0 = no change, +6 roughly doubles the amplitude, -6 roughly halves it."}),
+        }}
+
+    RETURN_TYPES = ("AUDIO",)
+    FUNCTION = "adjust_volume"
+    CATEGORY = "audio"
+
+    def adjust_volume(self, audio, volume):
+        if volume == 0:
+            return (audio,)
+        waveform = audio["waveform"]
+        sample_rate = audio["sample_rate"]
+
+        gain = 10 ** (volume / 20)
+        waveform = waveform * gain
+
+        return ({"waveform": waveform, "sample_rate": sample_rate},)
+
+
+class EmptyAudio:
+    @classmethod
+    def INPUT_TYPES(s):
+        return {"required": {
+            "duration": ("FLOAT", {"default": 60.0, "min": 0.0, "max": 0xffffffffffffffff, "step": 0.01, "tooltip": "Duration of the empty audio clip in seconds."}),
+            "sample_rate": ("INT", {"default": 44100, "tooltip": "Sample rate of the empty audio clip."}),
+            "channels": ("INT", {"default": 2, "min": 1, "max": 2, "tooltip": "Number of audio channels (1 for mono, 2 for stereo)."}),
+        }}
+
+    RETURN_TYPES = ("AUDIO",)
+    FUNCTION = "create_empty_audio"
+    CATEGORY = "audio"
+
+    def create_empty_audio(self, duration, sample_rate, channels):
+        num_samples = int(round(duration * sample_rate))
+        waveform = torch.zeros((1, channels, num_samples), dtype=torch.float32)
+        return ({"waveform": waveform, "sample_rate": sample_rate},)
+
+
 NODE_CLASS_MAPPINGS = {
     "EmptyLatentAudio": EmptyLatentAudio,
     "VAEEncodeAudio": VAEEncodeAudio,
@@ -375,6 +586,12 @@ NODE_CLASS_MAPPINGS = {
     "PreviewAudio": PreviewAudio,
     "ConditioningStableAudio": ConditioningStableAudio,
     "RecordAudio": RecordAudio,
+    "TrimAudioDuration": TrimAudioDuration,
+    "SplitAudioChannels": SplitAudioChannels,
+    "AudioConcat": AudioConcat,
+    "AudioMerge": AudioMerge,
+    "AudioAdjustVolume": AudioAdjustVolume,
+    "EmptyAudio": EmptyAudio,
 }
 
 NODE_DISPLAY_NAME_MAPPINGS = {
@@ -387,4 +604,10 @@ NODE_DISPLAY_NAME_MAPPINGS = {
     "SaveAudioMP3": "Save Audio (MP3)",
     "SaveAudioOpus": "Save Audio (Opus)",
     "RecordAudio": "Record Audio",
+    "TrimAudioDuration": "Trim Audio Duration",
+    "SplitAudioChannels": "Split Audio Channels",
+    "AudioConcat": "Audio Concat",
+    "AudioMerge": "Audio Merge",
+    "AudioAdjustVolume": "Audio Adjust Volume",
+    "EmptyAudio": "Empty Audio",
 }

From fccab99ec0fcd13e80fa59bc73bccff31f9450ca Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Wed, 24 Sep 2025 17:09:42 -0700
Subject: [PATCH 2/4] Fix issue with .view() in HuMo. (#10014)

---
 comfy/ldm/wan/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py
index 2dac5980c..54616e6eb 100644
--- a/comfy/ldm/wan/model.py
+++ b/comfy/ldm/wan/model.py
@@ -1355,7 +1355,7 @@ class WanT2VCrossAttentionGather(WanSelfAttention):
 
         x = optimized_attention(q, k, v, heads=self.num_heads, skip_reshape=True, skip_output_reshape=True, transformer_options=transformer_options)
 
-        x = x.transpose(1, 2).view(b, -1, n, d).flatten(2)
+        x = x.transpose(1, 2).reshape(b, -1, n * d)
         x = self.o(x)
         return x
 

From c8d2117f02bcad6d8316ffd8273bdc27adf83b44 Mon Sep 17 00:00:00 2001
From: Guy Niv <43928922+guyniv@users.noreply.github.com>
Date: Thu, 25 Sep 2025 05:35:12 +0300
Subject: [PATCH 3/4] Fix memory leak by properly detaching model finalizer
 (#9979)

When unloading models in load_models_gpu(), the model finalizer was not
being explicitly detached, leading to a memory leak. This caused memory
consumption to grow linearly over time as models were repeatedly loaded
and unloaded.

This change prevents orphaned finalizer references from accumulating in
memory during model switching operations.
---
 comfy/model_management.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/comfy/model_management.py b/comfy/model_management.py
index d880f1970..c5b817b62 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -645,7 +645,9 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu
             if loaded_model.model.is_clone(current_loaded_models[i].model):
                 to_unload = [i] + to_unload
         for i in to_unload:
-            current_loaded_models.pop(i).model.detach(unpatch_all=False)
+            model_to_unload = current_loaded_models.pop(i)
+            model_to_unload.model.detach(unpatch_all=False)
+            model_to_unload.model_finalizer.detach()
 
     total_memory_required = {}
     for loaded_model in models_to_load:

From ce4cb2389c8ce63cf8735f200b8672a2c1be0950 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Thu, 25 Sep 2025 14:20:13 -0700
Subject: [PATCH 4/4] Make LatentCompositeMasked work with basic video latents.
 (#10023)

---
 comfy_extras/nodes_mask.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/comfy_extras/nodes_mask.py b/comfy_extras/nodes_mask.py
index 2b0f8dd5d..a5e405008 100644
--- a/comfy_extras/nodes_mask.py
+++ b/comfy_extras/nodes_mask.py
@@ -12,35 +12,38 @@ from nodes import MAX_RESOLUTION
 def composite(destination, source, x, y, mask = None, multiplier = 8, resize_source = False):
     source = source.to(destination.device)
     if resize_source:
-        source = torch.nn.functional.interpolate(source, size=(destination.shape[2], destination.shape[3]), mode="bilinear")
+        source = torch.nn.functional.interpolate(source, size=(destination.shape[-2], destination.shape[-1]), mode="bilinear")
 
     source = comfy.utils.repeat_to_batch_size(source, destination.shape[0])
 
-    x = max(-source.shape[3] * multiplier, min(x, destination.shape[3] * multiplier))
-    y = max(-source.shape[2] * multiplier, min(y, destination.shape[2] * multiplier))
+    x = max(-source.shape[-1] * multiplier, min(x, destination.shape[-1] * multiplier))
+    y = max(-source.shape[-2] * multiplier, min(y, destination.shape[-2] * multiplier))
 
     left, top = (x // multiplier, y // multiplier)
-    right, bottom = (left + source.shape[3], top + source.shape[2],)
+    right, bottom = (left + source.shape[-1], top + source.shape[-2],)
 
     if mask is None:
         mask = torch.ones_like(source)
     else:
         mask = mask.to(destination.device, copy=True)
-        mask = torch.nn.functional.interpolate(mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])), size=(source.shape[2], source.shape[3]), mode="bilinear")
+        mask = torch.nn.functional.interpolate(mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])), size=(source.shape[-2], source.shape[-1]), mode="bilinear")
         mask = comfy.utils.repeat_to_batch_size(mask, source.shape[0])
 
     # calculate the bounds of the source that will be overlapping the destination
     # this prevents the source trying to overwrite latent pixels that are out of bounds
     # of the destination
-    visible_width, visible_height = (destination.shape[3] - left + min(0, x), destination.shape[2] - top + min(0, y),)
+    visible_width, visible_height = (destination.shape[-1] - left + min(0, x), destination.shape[-2] - top + min(0, y),)
 
     mask = mask[:, :, :visible_height, :visible_width]
+    if mask.ndim < source.ndim:
+        mask = mask.unsqueeze(1)
+
     inverse_mask = torch.ones_like(mask) - mask
 
-    source_portion = mask * source[:, :, :visible_height, :visible_width]
-    destination_portion = inverse_mask * destination[:, :, top:bottom, left:right]
+    source_portion = mask * source[..., :visible_height, :visible_width]
+    destination_portion = inverse_mask * destination[..., top:bottom, left:right]
 
-    destination[:, :, top:bottom, left:right] = source_portion + destination_portion
+    destination[..., top:bottom, left:right] = source_portion + destination_portion
     return destination
 
 class LatentCompositeMasked:
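
The sketches below are illustrative notes on the patches above, not part of the diffs. First, the new audio nodes all operate on ComfyUI's AUDIO dict, {"waveform": tensor, "sample_rate": int}, where the waveform is shaped [batch, channels, samples]. A minimal sketch of driving two of the nodes directly, assuming the classes are importable from comfy_extras.nodes_audio and using a made-up input tensor:

import torch
from comfy_extras.nodes_audio import SplitAudioChannels, TrimAudioDuration

audio = {"waveform": torch.randn(1, 2, 44100 * 10), "sample_rate": 44100}  # 10 s of stereo noise

# A negative start_index counts back from the end, so this keeps one second
# starting 2.5 seconds before the end of the clip.
(trimmed,) = TrimAudioDuration().trim(audio, start_index=-2.5, duration=1.0)
assert trimmed["waveform"].shape[-1] == 44100

# separate() requires exactly two channels and returns one AUDIO dict per side.
left, right = SplitAudioChannels().separate(audio)
assert left["waveform"].shape[1] == 1 and right["waveform"].shape[1] == 1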
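
match_audio_sample_rates always resamples the lower-rate input up to the higher rate, so AudioConcat and AudioMerge never throw away bandwidth from the higher-rate input. A sketch of that contract, assuming the helper from the patch is in scope:

import torch

waveform_a = torch.randn(1, 2, 48000)  # 1 s at 48 kHz
waveform_b = torch.randn(1, 2, 22050)  # 1 s at 22.05 kHz

a, b, sr = match_audio_sample_rates(waveform_a, 48000, waveform_b, 22050)
print(sr, b.shape[-1])  # 48000 48000 -- the 22.05 kHz input was upsampled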
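
AudioAdjustVolume converts decibels to an amplitude gain with gain = 10 ** (dB / 20); the tooltip's "+6 doubles, -6 halves" is the usual engineering approximation (an exact doubling is about +6.02 dB). A quick check of the mapping:

for db in (-6, 0, 6, 20):
    print(db, round(10 ** (db / 20), 3))  # -6 -> 0.501, 0 -> 1.0, 6 -> 1.995, 20 -> 10.0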
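
On patch 2: Tensor.view() never copies, so it only succeeds when the requested shape is expressible with the tensor's existing strides, and a transpose() usually makes that impossible; reshape() silently falls back to a copy when needed. A standalone illustration of the rule the fix relies on (arbitrary shapes, not HuMo's actual ones):

import torch

x = torch.randn(2, 8, 16, 64).transpose(1, 2)  # (2, 16, 8, 64), non-contiguous

print(x.reshape(2, -1, 8 * 64).shape)  # torch.Size([2, 16, 512]); reshape copies as needed

try:
    x.view(2, -1, 8 * 64)  # fails: the strides cannot express this flattening
except RuntimeError as e:
    print("view() raised:", e)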
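
On patch 3: the fix assumes model_finalizer behaves like the standard library's weakref.finalize, whose detach() unregisters the callback immediately instead of leaving it (and everything it references) registered until the wrapped object is garbage collected. A toy sketch of that stdlib behavior:

import weakref

class Model:
    pass

m = Model()
fin = weakref.finalize(m, print, "finalizer ran")

# While registered, the finalizer keeps its callback and arguments alive.
fin.detach()  # unregister now; returns the (obj, func, args, kwargs) tuple
del m         # prints nothing; no callback reference lingers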
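
On patch 4: composite() becomes dimension-agnostic by taking height and width from the end of the shape (shape[-2], shape[-1]) and slicing with Ellipsis, so the same code serves 4-D image latents (B, C, H, W) and 5-D video latents (B, C, T, H, W). The core indexing pattern in isolation, with toy shapes:

import torch

def paste(dst, src, top, left):
    # Height and width always sit in the last two axes; `...` absorbs the
    # leading axes, whether (batch, channels) or (batch, channels, time).
    h, w = src.shape[-2], src.shape[-1]
    dst[..., top:top + h, left:left + w] = src
    return dst

image_latent = paste(torch.zeros(1, 4, 64, 64), torch.ones(1, 4, 8, 8), 0, 0)
video_latent = paste(torch.zeros(1, 4, 16, 64, 64), torch.ones(1, 4, 16, 8, 8), 0, 0)
print(image_latent.sum().item(), video_latent.sum().item())  # 256.0 4096.0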