Merge b947b5a4a3 into 79cdbc81cb

2026-02-06 19:42:34 +08:00 · 2026-01-23 17:03:13 +01:00
9 changed files with 159 additions and 139 deletions
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@ -8,7 +8,6 @@ class LatentFormat:
    latent_rgb_factors_bias = None
    latent_rgb_factors_reshape = None
    taesd_decoder_name = None
-    spacial_downscale_ratio = 8

    def process_in(self, latent):
        return latent * self.scale_factor
@ -182,7 +181,6 @@ class Flux(SD3):

 class Flux2(LatentFormat):
    latent_channels = 128
-    spacial_downscale_ratio = 16

    def __init__(self):
        self.latent_rgb_factors =[
@ -751,7 +749,6 @@ class ACEAudio(LatentFormat):

 class ChromaRadiance(LatentFormat):
    latent_channels = 3
-    spacial_downscale_ratio = 1

    def __init__(self):
        self.latent_rgb_factors = [
--- a/comfy/ldm/lightricks/av_model.py
+++ b/comfy/ldm/lightricks/av_model.py
@ -18,12 +18,12 @@ class CompressedTimestep:
    def __init__(self, tensor: torch.Tensor, patches_per_frame: int):
        """
        tensor: [batch_size, num_tokens, feature_dim] tensor where num_tokens = num_frames * patches_per_frame
-        patches_per_frame: Number of spatial patches per frame (height * width in latent space), or None to disable compression
+        patches_per_frame: Number of spatial patches per frame (height * width in latent space)
        """
        self.batch_size, num_tokens, self.feature_dim = tensor.shape

        # Check if compression is valid (num_tokens must be divisible by patches_per_frame)
-        if patches_per_frame is not None and num_tokens % patches_per_frame == 0 and num_tokens >= patches_per_frame:
+        if num_tokens % patches_per_frame == 0 and num_tokens >= patches_per_frame:
            self.patches_per_frame = patches_per_frame
            self.num_frames = num_tokens // patches_per_frame

@ -215,9 +215,22 @@ class BasicAVTransformerBlock(nn.Module):
        return (*scale_shift_ada_values, *gate_ada_values)

    def forward(
-        self, x: Tuple[torch.Tensor, torch.Tensor], v_context=None, a_context=None, attention_mask=None, v_timestep=None, a_timestep=None,
-        v_pe=None, a_pe=None, v_cross_pe=None, a_cross_pe=None, v_cross_scale_shift_timestep=None, a_cross_scale_shift_timestep=None,
-        v_cross_gate_timestep=None, a_cross_gate_timestep=None, transformer_options=None,
+        self,
+        x: Tuple[torch.Tensor, torch.Tensor],
+        v_context=None,
+        a_context=None,
+        attention_mask=None,
+        v_timestep=None,
+        a_timestep=None,
+        v_pe=None,
+        a_pe=None,
+        v_cross_pe=None,
+        a_cross_pe=None,
+        v_cross_scale_shift_timestep=None,
+        a_cross_scale_shift_timestep=None,
+        v_cross_gate_timestep=None,
+        a_cross_gate_timestep=None,
+        transformer_options=None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        run_vx = transformer_options.get("run_vx", True)
        run_ax = transformer_options.get("run_ax", True)
@ -227,102 +240,144 @@ class BasicAVTransformerBlock(nn.Module):
        run_a2v = run_vx and transformer_options.get("a2v_cross_attn", True) and ax.numel() > 0
        run_v2a = run_ax and transformer_options.get("v2a_cross_attn", True)

-        # video
        if run_vx:
-            # video self-attention
-            vshift_msa, vscale_msa = (self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(0, 2)))
+            vshift_msa, vscale_msa, vgate_msa = (
+                self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(0, 3))
+            )
+
            norm_vx = comfy.ldm.common_dit.rms_norm(vx) * (1 + vscale_msa) + vshift_msa
-            del vshift_msa, vscale_msa
-            attn1_out = self.attn1(norm_vx, pe=v_pe, transformer_options=transformer_options)
-            del norm_vx
-            # video cross-attention
-            vgate_msa = self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(2, 3))[0]
-            vx.addcmul_(attn1_out, vgate_msa)
-            del vgate_msa, attn1_out
-            vx.add_(self.attn2(comfy.ldm.common_dit.rms_norm(vx), context=v_context, mask=attention_mask, transformer_options=transformer_options))
+            vx += self.attn1(norm_vx, pe=v_pe, transformer_options=transformer_options) * vgate_msa
+            vx += self.attn2(
+                comfy.ldm.common_dit.rms_norm(vx),
+                context=v_context,
+                mask=attention_mask,
+                transformer_options=transformer_options,
+            )
+
+            del vshift_msa, vscale_msa, vgate_msa

-        # audio
        if run_ax:
-            # audio self-attention
-            ashift_msa, ascale_msa = (self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(0, 2)))
-            norm_ax = comfy.ldm.common_dit.rms_norm(ax) * (1 + ascale_msa) + ashift_msa
-            del ashift_msa, ascale_msa
-            attn1_out = self.audio_attn1(norm_ax, pe=a_pe, transformer_options=transformer_options)
-            del norm_ax
-            # audio cross-attention
-            agate_msa = self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(2, 3))[0]
-            ax.addcmul_(attn1_out, agate_msa)
-            del agate_msa, attn1_out
-            ax.add_(self.audio_attn2(comfy.ldm.common_dit.rms_norm(ax), context=a_context, mask=attention_mask, transformer_options=transformer_options))
+            ashift_msa, ascale_msa, agate_msa = (
+                self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(0, 3))
+            )

-        # video - audio cross attention.
+            norm_ax = comfy.ldm.common_dit.rms_norm(ax) * (1 + ascale_msa) + ashift_msa
+            ax += (
+                self.audio_attn1(norm_ax, pe=a_pe, transformer_options=transformer_options)
+                * agate_msa
+            )
+            ax += self.audio_attn2(
+                comfy.ldm.common_dit.rms_norm(ax),
+                context=a_context,
+                mask=attention_mask,
+                transformer_options=transformer_options,
+            )
+
+            del ashift_msa, ascale_msa, agate_msa
+
+        # Audio - Video cross attention.
        if run_a2v or run_v2a:
+            # norm3
            vx_norm3 = comfy.ldm.common_dit.rms_norm(vx)
            ax_norm3 = comfy.ldm.common_dit.rms_norm(ax)

-            # audio to video cross attention
+            (
+                scale_ca_audio_hidden_states_a2v,
+                shift_ca_audio_hidden_states_a2v,
+                scale_ca_audio_hidden_states_v2a,
+                shift_ca_audio_hidden_states_v2a,
+                gate_out_v2a,
+            ) = self.get_av_ca_ada_values(
+                self.scale_shift_table_a2v_ca_audio,
+                ax.shape[0],
+                a_cross_scale_shift_timestep,
+                a_cross_gate_timestep,
+            )
+
+            (
+                scale_ca_video_hidden_states_a2v,
+                shift_ca_video_hidden_states_a2v,
+                scale_ca_video_hidden_states_v2a,
+                shift_ca_video_hidden_states_v2a,
+                gate_out_a2v,
+            ) = self.get_av_ca_ada_values(
+                self.scale_shift_table_a2v_ca_video,
+                vx.shape[0],
+                v_cross_scale_shift_timestep,
+                v_cross_gate_timestep,
+            )
+
            if run_a2v:
-                scale_ca_audio_hidden_states_a2v, shift_ca_audio_hidden_states_a2v = self.get_ada_values(
-                    self.scale_shift_table_a2v_ca_audio[:4, :], ax.shape[0], a_cross_scale_shift_timestep)[:2]
-                scale_ca_video_hidden_states_a2v_v, shift_ca_video_hidden_states_a2v_v = self.get_ada_values(
-                    self.scale_shift_table_a2v_ca_video[:4, :], vx.shape[0], v_cross_scale_shift_timestep)[:2]
+                vx_scaled = (
+                    vx_norm3 * (1 + scale_ca_video_hidden_states_a2v)
+                    + shift_ca_video_hidden_states_a2v
+                )
+                ax_scaled = (
+                    ax_norm3 * (1 + scale_ca_audio_hidden_states_a2v)
+                    + shift_ca_audio_hidden_states_a2v
+                )
+                vx += (
+                    self.audio_to_video_attn(
+                        vx_scaled,
+                        context=ax_scaled,
+                        pe=v_cross_pe,
+                        k_pe=a_cross_pe,
+                        transformer_options=transformer_options,
+                    )
+                    * gate_out_a2v
+                )

-                vx_scaled = vx_norm3 * (1 + scale_ca_video_hidden_states_a2v_v) + shift_ca_video_hidden_states_a2v_v
-                ax_scaled = ax_norm3 * (1 + scale_ca_audio_hidden_states_a2v) + shift_ca_audio_hidden_states_a2v
-                del scale_ca_video_hidden_states_a2v_v, shift_ca_video_hidden_states_a2v_v, scale_ca_audio_hidden_states_a2v, shift_ca_audio_hidden_states_a2v
+                del gate_out_a2v
+                del scale_ca_video_hidden_states_a2v,\
+                    shift_ca_video_hidden_states_a2v,\
+                    scale_ca_audio_hidden_states_a2v,\
+                    shift_ca_audio_hidden_states_a2v,\

-                a2v_out = self.audio_to_video_attn(vx_scaled, context=ax_scaled, pe=v_cross_pe, k_pe=a_cross_pe, transformer_options=transformer_options)
-                del vx_scaled, ax_scaled
-
-                gate_out_a2v = self.get_ada_values(self.scale_shift_table_a2v_ca_video[4:, :], vx.shape[0], v_cross_gate_timestep)[0]
-                vx.addcmul_(a2v_out, gate_out_a2v)
-                del gate_out_a2v, a2v_out
-
-            # video to audio cross attention
            if run_v2a:
-                scale_ca_audio_hidden_states_v2a, shift_ca_audio_hidden_states_v2a = self.get_ada_values(
-                    self.scale_shift_table_a2v_ca_audio[:4, :], ax.shape[0], a_cross_scale_shift_timestep)[2:4]
-                scale_ca_video_hidden_states_v2a, shift_ca_video_hidden_states_v2a = self.get_ada_values(
-                    self.scale_shift_table_a2v_ca_video[:4, :], vx.shape[0], v_cross_scale_shift_timestep)[2:4]
+                ax_scaled = (
+                    ax_norm3 * (1 + scale_ca_audio_hidden_states_v2a)
+                    + shift_ca_audio_hidden_states_v2a
+                )
+                vx_scaled = (
+                    vx_norm3 * (1 + scale_ca_video_hidden_states_v2a)
+                    + shift_ca_video_hidden_states_v2a
+                )
+                ax += (
+                    self.video_to_audio_attn(
+                        ax_scaled,
+                        context=vx_scaled,
+                        pe=a_cross_pe,
+                        k_pe=v_cross_pe,
+                        transformer_options=transformer_options,
+                    )
+                    * gate_out_v2a
+                )

-                ax_scaled = ax_norm3 * (1 + scale_ca_audio_hidden_states_v2a) + shift_ca_audio_hidden_states_v2a
-                vx_scaled = vx_norm3 * (1 + scale_ca_video_hidden_states_v2a) + shift_ca_video_hidden_states_v2a
-                del scale_ca_video_hidden_states_v2a, shift_ca_video_hidden_states_v2a, scale_ca_audio_hidden_states_v2a, shift_ca_audio_hidden_states_v2a
+                del gate_out_v2a
+                del scale_ca_video_hidden_states_v2a,\
+                    shift_ca_video_hidden_states_v2a,\
+                    scale_ca_audio_hidden_states_v2a,\
+                    shift_ca_audio_hidden_states_v2a

-                v2a_out = self.video_to_audio_attn(ax_scaled, context=vx_scaled, pe=a_cross_pe, k_pe=v_cross_pe, transformer_options=transformer_options)
-                del ax_scaled, vx_scaled
-
-                gate_out_v2a = self.get_ada_values(self.scale_shift_table_a2v_ca_audio[4:, :], ax.shape[0], a_cross_gate_timestep)[0]
-                ax.addcmul_(v2a_out, gate_out_v2a)
-                del gate_out_v2a, v2a_out
-
-            del vx_norm3, ax_norm3
-
-        # video feedforward
        if run_vx:
-            vshift_mlp, vscale_mlp = self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(3, 5))
+            vshift_mlp, vscale_mlp, vgate_mlp = (
+                self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(3, None))
+            )
+
            vx_scaled = comfy.ldm.common_dit.rms_norm(vx) * (1 + vscale_mlp) + vshift_mlp
-            del vshift_mlp, vscale_mlp
+            vx += self.ff(vx_scaled) * vgate_mlp
+            del vshift_mlp, vscale_mlp, vgate_mlp

-            ff_out = self.ff(vx_scaled)
-            del vx_scaled
-
-            vgate_mlp = self.get_ada_values(self.scale_shift_table, vx.shape[0], v_timestep, slice(5, 6))[0]
-            vx.addcmul_(ff_out, vgate_mlp)
-            del vgate_mlp, ff_out
-
-        # audio feedforward
        if run_ax:
-            ashift_mlp, ascale_mlp = self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(3, 5))
+            ashift_mlp, ascale_mlp, agate_mlp = (
+                self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(3, None))
+            )
+
            ax_scaled = comfy.ldm.common_dit.rms_norm(ax) * (1 + ascale_mlp) + ashift_mlp
-            del ashift_mlp, ascale_mlp
+            ax += self.audio_ff(ax_scaled) * agate_mlp

-            ff_out = self.audio_ff(ax_scaled)
-            del ax_scaled
+            del ashift_mlp, ascale_mlp, agate_mlp

-            agate_mlp = self.get_ada_values(self.audio_scale_shift_table, ax.shape[0], a_timestep, slice(5, 6))[0]
-            ax.addcmul_(ff_out, agate_mlp)
-            del agate_mlp, ff_out

        return vx, ax

@ -534,20 +589,9 @@ class LTXAVModel(LTXVModel):
        audio_length = kwargs.get("audio_length", 0)
        # Separate audio and video latents
        vx, ax = self.separate_audio_and_video_latents(x, audio_length)
-
-        has_spatial_mask = False
-        if denoise_mask is not None:
-            # check if any frame has spatial variation (inpainting)
-            for frame_idx in range(denoise_mask.shape[2]):
-                frame_mask = denoise_mask[0, 0, frame_idx]
-                if frame_mask.numel() > 0 and frame_mask.min() != frame_mask.max():
-                    has_spatial_mask = True
-                    break
-
        [vx, v_pixel_coords, additional_args] = super()._process_input(
            vx, keyframe_idxs, denoise_mask, **kwargs
        )
-        additional_args["has_spatial_mask"] = has_spatial_mask

        ax, a_latent_coords = self.a_patchifier.patchify(ax)
        ax = self.audio_patchify_proj(ax)
@ -574,9 +618,8 @@ class LTXAVModel(LTXVModel):
        # Calculate patches_per_frame from orig_shape: [batch, channels, frames, height, width]
        # Video tokens are arranged as (frames * height * width), so patches_per_frame = height * width
        orig_shape = kwargs.get("orig_shape")
-        has_spatial_mask = kwargs.get("has_spatial_mask", None)
        v_patches_per_frame = None
-        if not has_spatial_mask and orig_shape is not None and len(orig_shape) == 5:
+        if orig_shape is not None and len(orig_shape) == 5:
            # orig_shape[3] = height, orig_shape[4] = width (in latent space)
            v_patches_per_frame = orig_shape[3] * orig_shape[4]

@ -619,11 +662,10 @@ class LTXAVModel(LTXVModel):
            )

            # Compress cross-attention timesteps (only video side, audio is too small to benefit)
-            # v_patches_per_frame is None for spatial masks, set for temporal masks or no mask
            cross_av_timestep_ss = [
                av_ca_audio_scale_shift_timestep.view(batch_size, -1, av_ca_audio_scale_shift_timestep.shape[-1]),
-                CompressedTimestep(av_ca_video_scale_shift_timestep.view(batch_size, -1, av_ca_video_scale_shift_timestep.shape[-1]), v_patches_per_frame),  # video - compressed if possible
-                CompressedTimestep(av_ca_a2v_gate_noise_timestep.view(batch_size, -1, av_ca_a2v_gate_noise_timestep.shape[-1]), v_patches_per_frame),  # video - compressed if possible
+                CompressedTimestep(av_ca_video_scale_shift_timestep.view(batch_size, -1, av_ca_video_scale_shift_timestep.shape[-1]), v_patches_per_frame),  # video - compressed
+                CompressedTimestep(av_ca_a2v_gate_noise_timestep.view(batch_size, -1, av_ca_a2v_gate_noise_timestep.shape[-1]), v_patches_per_frame),  # video - compressed
                av_ca_v2a_gate_noise_timestep.view(batch_size, -1, av_ca_v2a_gate_noise_timestep.shape[-1]),
            ]

--- a/comfy/ldm/wan/vae.py
+++ b/comfy/ldm/wan/vae.py
@ -5,7 +5,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
-from comfy.ldm.modules.diffusionmodules.model import vae_attention, torch_cat_if_needed
+from comfy.ldm.modules.diffusionmodules.model import vae_attention

 import comfy.ops
 ops = comfy.ops.disable_weight_init
@ -20,29 +20,22 @@ class CausalConv3d(ops.Conv3d):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
-        self._padding = 2 * self.padding[0]
-        self.padding = (0, self.padding[1], self.padding[2])
+        self._padding = (self.padding[2], self.padding[2], self.padding[1],
+                         self.padding[1], 2 * self.padding[0], 0)
+        self.padding = (0, 0, 0)

    def forward(self, x, cache_x=None, cache_list=None, cache_idx=None):
        if cache_list is not None:
            cache_x = cache_list[cache_idx]
            cache_list[cache_idx] = None

-        if cache_x is None and x.shape[2] == 1:
-            #Fast path - the op will pad for use by truncating the weight
-            #and save math on a pile of zeros.
-            return super().forward(x, autopad="causal_zero")
-
-        if self._padding > 0:
-            padding_needed = self._padding
-            if cache_x is not None:
-                cache_x = cache_x.to(x.device)
-                padding_needed = max(0, padding_needed - cache_x.shape[2])
-            padding_shape = list(x.shape)
-            padding_shape[2] = padding_needed
-            padding = torch.zeros(padding_shape, device=x.device, dtype=x.dtype)
-            x = torch_cat_if_needed([padding, cache_x, x], dim=2)
+        padding = list(self._padding)
+        if cache_x is not None and self._padding[4] > 0:
+            cache_x = cache_x.to(x.device)
+            x = torch.cat([cache_x, x], dim=2)
+            padding[4] -= cache_x.shape[2]
            del cache_x
+        x = F.pad(x, padding)

        return super().forward(x)

--- a/comfy/lora.py
+++ b/comfy/lora.py
@ -260,7 +260,6 @@ def model_lora_keys_unet(model, key_map={}):
                key_map["transformer.{}".format(k[:-len(".weight")])] = to #simpletrainer and probably regular diffusers flux lora format
                key_map["lycoris_{}".format(k[:-len(".weight")].replace(".", "_"))] = to #simpletrainer lycoris
                key_map["lora_transformer_{}".format(k[:-len(".weight")].replace(".", "_"))] = to #onetrainer
-                key_map[k[:-len(".weight")]] = to #DiffSynth lora format
        for k in sdk:
            hidden_size = model.model_config.unet_config.get("hidden_size", 0)
            if k.endswith(".weight") and ".linear1." in k:
--- a/comfy/ops.py
+++ b/comfy/ops.py
@ -203,9 +203,7 @@ class disable_weight_init:
        def reset_parameters(self):
            return None

-        def _conv_forward(self, input, weight, bias, autopad=None, *args, **kwargs):
-            if autopad == "causal_zero":
-                weight = weight[:, :, -input.shape[2]:, :, :]
+        def _conv_forward(self, input, weight, bias, *args, **kwargs):
            if NVIDIA_MEMORY_CONV_BUG_WORKAROUND and weight.dtype in (torch.float16, torch.bfloat16):
                out = torch.cudnn_convolution(input, weight, self.padding, self.stride, self.dilation, self.groups, benchmark=False, deterministic=False, allow_tf32=True)
                if bias is not None:
@ -214,15 +212,15 @@ class disable_weight_init:
            else:
                return super()._conv_forward(input, weight, bias, *args, **kwargs)

-        def forward_comfy_cast_weights(self, input, autopad=None):
+        def forward_comfy_cast_weights(self, input):
            weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)
-            x = self._conv_forward(input, weight, bias, autopad=autopad)
+            x = self._conv_forward(input, weight, bias)
            uncast_bias_weight(self, weight, bias, offload_stream)
            return x

        def forward(self, *args, **kwargs):
            run_every_op()
-            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0 or "autopad" in kwargs:
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
                return self.forward_comfy_cast_weights(*args, **kwargs)
            else:
                return super().forward(*args, **kwargs)
--- a/comfy/sample.py
+++ b/comfy/sample.py
@ -37,18 +37,12 @@ def prepare_noise(latent_image, seed, noise_inds=None):

    return noises

-def fix_empty_latent_channels(model, latent_image, downscale_ratio_spacial=None):
+def fix_empty_latent_channels(model, latent_image):
    if latent_image.is_nested:
        return latent_image
    latent_format = model.get_model_object("latent_format") #Resize the empty latent image so it has the right number of channels
-    if torch.count_nonzero(latent_image) == 0:
-        if latent_format.latent_channels != latent_image.shape[1]:
-            latent_image = comfy.utils.repeat_to_batch_size(latent_image, latent_format.latent_channels, dim=1)
-        if downscale_ratio_spacial is not None:
-            if downscale_ratio_spacial != latent_format.spacial_downscale_ratio:
-                ratio = downscale_ratio_spacial / latent_format.spacial_downscale_ratio
-                latent_image = comfy.utils.common_upscale(latent_image, round(latent_image.shape[-1] * ratio), round(latent_image.shape[-2] * ratio), "nearest-exact", crop="disabled")
-
+    if latent_format.latent_channels != latent_image.shape[1] and torch.count_nonzero(latent_image) == 0:
+        latent_image = comfy.utils.repeat_to_batch_size(latent_image, latent_format.latent_channels, dim=1)
    if latent_format.latent_dimensions == 3 and latent_image.ndim == 4:
        latent_image = latent_image.unsqueeze(2)
    return latent_image
--- a/comfy_extras/nodes_custom_sampler.py
+++ b/comfy_extras/nodes_custom_sampler.py
@ -741,7 +741,7 @@ class SamplerCustom(io.ComfyNode):
        latent = latent_image
        latent_image = latent["samples"]
        latent = latent.copy()
-        latent_image = comfy.sample.fix_empty_latent_channels(model, latent_image, latent.get("downscale_ratio_spacial", None))
+        latent_image = comfy.sample.fix_empty_latent_channels(model, latent_image)
        latent["samples"] = latent_image

        if not add_noise:
@ -760,7 +760,6 @@ class SamplerCustom(io.ComfyNode):
        samples = comfy.sample.sample_custom(model, noise, cfg, sampler, sigmas, positive, negative, latent_image, noise_mask=noise_mask, callback=callback, disable_pbar=disable_pbar, seed=noise_seed)

        out = latent.copy()
-        out.pop("downscale_ratio_spacial", None)
        out["samples"] = samples
        if "x0" in x0_output:
            x0_out = model.model.process_latent_out(x0_output["x0"].cpu())
@ -940,7 +939,7 @@ class SamplerCustomAdvanced(io.ComfyNode):
        latent = latent_image
        latent_image = latent["samples"]
        latent = latent.copy()
-        latent_image = comfy.sample.fix_empty_latent_channels(guider.model_patcher, latent_image, latent.get("downscale_ratio_spacial", None))
+        latent_image = comfy.sample.fix_empty_latent_channels(guider.model_patcher, latent_image)
        latent["samples"] = latent_image

        noise_mask = None
@ -955,7 +954,6 @@ class SamplerCustomAdvanced(io.ComfyNode):
        samples = samples.to(comfy.model_management.intermediate_device())

        out = latent.copy()
-        out.pop("downscale_ratio_spacial", None)
        out["samples"] = samples
        if "x0" in x0_output:
            x0_out = guider.model_patcher.model.process_latent_out(x0_output["x0"].cpu())
--- a/comfy_extras/nodes_sd3.py
+++ b/comfy_extras/nodes_sd3.py
@ -55,7 +55,7 @@ class EmptySD3LatentImage(io.ComfyNode):
    @classmethod
    def execute(cls, width, height, batch_size=1) -> io.NodeOutput:
        latent = torch.zeros([batch_size, 16, height // 8, width // 8], device=comfy.model_management.intermediate_device())
-        return io.NodeOutput({"samples": latent, "downscale_ratio_spacial": 8})
+        return io.NodeOutput({"samples":latent})

    generate = execute  # TODO: remove

--- a/nodes.py
+++ b/nodes.py
@ -1230,7 +1230,7 @@ class EmptyLatentImage:

    def generate(self, width, height, batch_size=1):
        latent = torch.zeros([batch_size, 4, height // 8, width // 8], device=self.device)
-        return ({"samples": latent, "downscale_ratio_spacial": 8}, )
+        return ({"samples":latent}, )


 class LatentFromBatch:
@ -1538,7 +1538,7 @@ class SetLatentNoiseMask:

 def common_ksampler(model, seed, steps, cfg, sampler_name, scheduler, positive, negative, latent, denoise=1.0, disable_noise=False, start_step=None, last_step=None, force_full_denoise=False):
    latent_image = latent["samples"]
-    latent_image = comfy.sample.fix_empty_latent_channels(model, latent_image, latent.get("downscale_ratio_spacial", None))
+    latent_image = comfy.sample.fix_empty_latent_channels(model, latent_image)

    if disable_noise:
        noise = torch.zeros(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, device="cpu")
@ -1556,7 +1556,6 @@ def common_ksampler(model, seed, steps, cfg, sampler_name, scheduler, positive,
                                  denoise=denoise, disable_noise=disable_noise, start_step=start_step, last_step=last_step,
                                  force_full_denoise=force_full_denoise, noise_mask=noise_mask, callback=callback, disable_pbar=disable_pbar, seed=seed)
    out = latent.copy()
-    out.pop("downscale_ratio_spacial", None)
    out["samples"] = samples
    return (out, )