Merge 9ee905bc47 into 6165c38cb5

Optimize nvfp4 lora applying. (#11866 )
This changes results a bit but it also speeds up things a lot.
2026-01-29 07:40:21 +08:00 · 2026-01-14 11:48:16 +01:00 · 2026-01-14 00:49:38 -05:00 · 2026-01-13 22:41:44 -05:00 · 2025-12-31 07:37:07 -07:00 · 2025-12-30 18:29:00 -07:00
5 changed files with 94 additions and 26 deletions
--- a/comfy/float.py
+++ b/comfy/float.py
@ -137,10 +137,44 @@ def to_blocked(input_matrix, flatten: bool = True) -> torch.Tensor:
    return rearranged.reshape(padded_rows, padded_cols)


-def stochastic_round_quantize_nvfp4(x, per_tensor_scale, pad_16x, seed=0):
+def stochastic_round_quantize_nvfp4_block(x, per_tensor_scale, generator):
    F4_E2M1_MAX = 6.0
    F8_E4M3_MAX = 448.0

+    orig_shape = x.shape
+
+    block_size = 16
+
+    x = x.reshape(orig_shape[0], -1, block_size)
+    scaled_block_scales_fp8 = torch.clamp(((torch.amax(torch.abs(x), dim=-1)) / F4_E2M1_MAX) / per_tensor_scale.to(x.dtype), max=F8_E4M3_MAX).to(torch.float8_e4m3fn)
+    x = x / (per_tensor_scale.to(x.dtype) * scaled_block_scales_fp8.to(x.dtype)).unsqueeze(-1)
+
+    x = x.view(orig_shape).nan_to_num()
+    data_lp = stochastic_float_to_fp4_e2m1(x, generator=generator)
+    return data_lp, scaled_block_scales_fp8
+
+
+def stochastic_round_quantize_nvfp4(x, per_tensor_scale, pad_16x, seed=0):
+    def roundup(x: int, multiple: int) -> int:
+        """Round up x to the nearest multiple."""
+        return ((x + multiple - 1) // multiple) * multiple
+
+    generator = torch.Generator(device=x.device)
+    generator.manual_seed(seed)
+
+    # Handle padding
+    if pad_16x:
+        rows, cols = x.shape
+        padded_rows = roundup(rows, 16)
+        padded_cols = roundup(cols, 16)
+        if padded_rows != rows or padded_cols != cols:
+            x = torch.nn.functional.pad(x, (0, padded_cols - cols, 0, padded_rows - rows))
+
+    x, blocked_scaled = stochastic_round_quantize_nvfp4_block(x, per_tensor_scale, generator)
+    return x, to_blocked(blocked_scaled, flatten=False)
+
+
+def stochastic_round_quantize_nvfp4_by_block(x, per_tensor_scale, pad_16x, seed=0, block_size=4096 * 4096):
    def roundup(x: int, multiple: int) -> int:
        """Round up x to the nearest multiple."""
        return ((x + multiple - 1) // multiple) * multiple
@ -158,16 +192,20 @@ def stochastic_round_quantize_nvfp4(x, per_tensor_scale, pad_16x, seed=0):
            # what we want to produce. If we pad here, we want the padded output.
            orig_shape = x.shape

-    block_size = 16
+    orig_shape = list(orig_shape)

-    x = x.reshape(orig_shape[0], -1, block_size)
-    scaled_block_scales_fp8 = torch.clamp(((torch.amax(torch.abs(x), dim=-1)) / F4_E2M1_MAX) / per_tensor_scale.to(x.dtype), max=F8_E4M3_MAX).to(torch.float8_e4m3fn)
-    x /= (per_tensor_scale.to(x.dtype) * scaled_block_scales_fp8.to(x.dtype)).unsqueeze(-1)
+    output_fp4 = torch.empty(orig_shape[:-1] + [orig_shape[-1] // 2], dtype=torch.uint8, device=x.device)
+    output_block = torch.empty(orig_shape[:-1] + [orig_shape[-1] // 16], dtype=torch.float8_e4m3fn, device=x.device)

    generator = torch.Generator(device=x.device)
    generator.manual_seed(seed)

-    x = x.view(orig_shape).nan_to_num()
-    data_lp = stochastic_float_to_fp4_e2m1(x, generator=generator)
-    blocked_scales = to_blocked(scaled_block_scales_fp8, flatten=False)
-    return data_lp, blocked_scales
+    num_slices = max(1, (x.numel() / block_size))
+    slice_size = max(1, (round(x.shape[0] / num_slices)))
+
+    for i in range(0, x.shape[0], slice_size):
+        fp4, block = stochastic_round_quantize_nvfp4_block(x[i: i + slice_size], per_tensor_scale, generator=generator)
+        output_fp4[i:i + slice_size].copy_(fp4)
+        output_block[i:i + slice_size].copy_(block)
+
+    return output_fp4, to_blocked(output_block, flatten=False)
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@ -1303,22 +1303,23 @@ class WAN21_HuMo(WAN21):
        if audio_embed is not None:
            out['audio_embed'] = comfy.conds.CONDRegular(audio_embed)

-        if "c_concat" not in out:  # 1.7B model
            reference_latents = kwargs.get("reference_latents", None)
-            if reference_latents is not None:
+
+            if "c_concat" not in out and reference_latents is not None and reference_latents[0].shape[1] == 16:  # 1.7B model
                out['reference_latent'] = comfy.conds.CONDRegular(self.process_latent_in(reference_latents[-1]))
-        else:
-            noise_shape = list(noise.shape)
-            noise_shape[1] += 4
-            concat_latent = torch.zeros(noise_shape, device=noise.device, dtype=noise.dtype)
-            zero_vae_values_first = torch.tensor([0.8660, -0.4326, -0.0017, -0.4884, -0.5283, 0.9207, -0.9896, 0.4433, -0.5543, -0.0113, 0.5753, -0.6000, -0.8346, -0.3497, -0.1926, -0.6938]).view(1, 16, 1, 1, 1)
-            zero_vae_values_second = torch.tensor([1.0869, -1.2370, 0.0206, -0.4357, -0.6411, 2.0307, -1.5972, 1.2659, -0.8595, -0.4654, 0.9638, -1.6330, -1.4310, -0.1098, -0.3856, -1.4583]).view(1, 16, 1, 1, 1)
-            zero_vae_values = torch.tensor([0.8642, -1.8583, 0.1577, 0.1350, -0.3641, 2.5863, -1.9670, 1.6065, -1.0475, -0.8678, 1.1734, -1.8138, -1.5933, -0.7721, -0.3289, -1.3745]).view(1, 16, 1, 1, 1)
-            concat_latent[:, 4:] = zero_vae_values
-            concat_latent[:, 4:, :1] = zero_vae_values_first
-            concat_latent[:, 4:, 1:2] = zero_vae_values_second
-            out['c_concat'] = comfy.conds.CONDNoiseShape(concat_latent)
-            reference_latents = kwargs.get("reference_latents", None)
+            else:
+                concat_latent_image = kwargs.get("concat_latent_image", None)
+                if concat_latent_image is None:
+                    noise_shape = list(noise.shape)
+                    noise_shape[1] += 4
+                    concat_latent = torch.zeros(noise_shape, device=noise.device, dtype=noise.dtype)
+                    zero_vae_values_first = torch.tensor([0.8660, -0.4326, -0.0017, -0.4884, -0.5283, 0.9207, -0.9896, 0.4433, -0.5543, -0.0113, 0.5753, -0.6000, -0.8346, -0.3497, -0.1926, -0.6938]).view(1, 16, 1, 1, 1)
+                    zero_vae_values_second = torch.tensor([1.0869, -1.2370, 0.0206, -0.4357, -0.6411, 2.0307, -1.5972, 1.2659, -0.8595, -0.4654, 0.9638, -1.6330, -1.4310, -0.1098, -0.3856, -1.4583]).view(1, 16, 1, 1, 1)
+                    zero_vae_values = torch.tensor([0.8642, -1.8583, 0.1577, 0.1350, -0.3641, 2.5863, -1.9670, 1.6065, -1.0475, -0.8678, 1.1734, -1.8138, -1.5933, -0.7721, -0.3289, -1.3745]).view(1, 16, 1, 1, 1)
+                    concat_latent[:, 4:] = zero_vae_values
+                    concat_latent[:, 4:, :1] = zero_vae_values_first
+                    concat_latent[:, 4:, 1:2] = zero_vae_values_second
+                    out['c_concat'] = comfy.conds.CONDNoiseShape(concat_latent)
            if reference_latents is not None:
                ref_latent = self.process_latent_in(reference_latents[-1])
                ref_latent_shape = list(ref_latent.shape)
--- a/comfy/quant_ops.py
+++ b/comfy/quant_ops.py
@ -104,7 +104,7 @@ class TensorCoreNVFP4Layout(_CKNvfp4Layout):
        needs_padding = padded_shape != orig_shape

        if stochastic_rounding > 0:
-            qdata, block_scale = comfy.float.stochastic_round_quantize_nvfp4(tensor, scale, pad_16x=needs_padding, seed=stochastic_rounding)
+            qdata, block_scale = comfy.float.stochastic_round_quantize_nvfp4_by_block(tensor, scale, pad_16x=needs_padding, seed=stochastic_rounding)
        else:
            qdata, block_scale = ck.quantize_nvfp4(tensor, scale, pad_16x=needs_padding)

--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@ -1042,7 +1042,7 @@ class ZImage(Lumina2):
        "shift": 3.0,
    }

-    memory_usage_factor = 2.0
+    memory_usage_factor = 2.8

    supported_inference_dtypes = [torch.bfloat16, torch.float32]

--- a/comfy/utils.py
+++ b/comfy/utils.py
@ -30,6 +30,7 @@ from torch.nn.functional import interpolate
 from einops import rearrange
 from comfy.cli_args import args
 import json
+import time

 MMAP_TORCH_FILES = args.mmap_torch_files
 DISABLE_MMAP = args.disable_mmap
@ -1097,6 +1098,10 @@ def set_progress_bar_global_hook(function):
    global PROGRESS_BAR_HOOK
    PROGRESS_BAR_HOOK = function

+# Throttle settings for progress bar updates to reduce WebSocket flooding
+PROGRESS_THROTTLE_MIN_INTERVAL = 0.1  # 100ms minimum between updates
+PROGRESS_THROTTLE_MIN_PERCENT = 0.5   # 0.5% minimum progress change
+
 class ProgressBar:
    def __init__(self, total, node_id=None):
        global PROGRESS_BAR_HOOK
@ -1104,6 +1109,8 @@ class ProgressBar:
        self.current = 0
        self.hook = PROGRESS_BAR_HOOK
        self.node_id = node_id
+        self._last_update_time = 0.0
+        self._last_sent_value = -1

    def update_absolute(self, value, total=None, preview=None):
        if total is not None:
@ -1112,7 +1119,29 @@ class ProgressBar:
            value = self.total
        self.current = value
        if self.hook is not None:
-            self.hook(self.current, self.total, preview, node_id=self.node_id)
+            current_time = time.perf_counter()
+            is_first = (self._last_sent_value < 0)
+            is_final = (value >= self.total)
+            has_preview = (preview is not None)
+
+            # Always send immediately for previews, first update, or final update
+            if has_preview or is_first or is_final:
+                self.hook(self.current, self.total, preview, node_id=self.node_id)
+                self._last_update_time = current_time
+                self._last_sent_value = value
+                return
+
+            # Apply throttling for regular progress updates
+            if self.total > 0:
+                percent_changed = ((value - max(0, self._last_sent_value)) / self.total) * 100
+            else:
+                percent_changed = 100
+            time_elapsed = current_time - self._last_update_time
+
+            if time_elapsed >= PROGRESS_THROTTLE_MIN_INTERVAL and percent_changed >= PROGRESS_THROTTLE_MIN_PERCENT:
+                self.hook(self.current, self.total, preview, node_id=self.node_id)
+                self._last_update_time = current_time
+                self._last_sent_value = value

    def update(self, value):
        self.update_absolute(self.current + value)
Author	SHA1	Message	Date
drozbay	abfc0b01e6	Merge `9ee905bc47` into `6165c38cb5`	2026-01-14 11:48:16 +01:00
comfyanonymous	6165c38cb5	Optimize nvfp4 lora applying. (#11866 ) Some checks are pending Python Linting / Run Ruff (push) Waiting to run Details Python Linting / Run Pylint (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.10, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.11, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.12, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-unix-nightly (12.1, , linux, 3.11, [self-hosted Linux], nightly) (push) Waiting to run Details Execution Tests / test (macos-latest) (push) Waiting to run Details Execution Tests / test (ubuntu-latest) (push) Waiting to run Details Execution Tests / test (windows-latest) (push) Waiting to run Details Test server launches without errors / test (push) Waiting to run Details Unit Tests / test (macos-latest) (push) Waiting to run Details Unit Tests / test (ubuntu-latest) (push) Waiting to run Details Unit Tests / test (windows-2022) (push) Waiting to run Details This changes results a bit but it also speeds up things a lot.	2026-01-14 00:49:38 -05:00
Silver	712cca36a1	feat: throttle ProgressBar updates to reduce WebSocket flooding (#11504 )	2026-01-13 22:41:44 -05:00
drozbay	9ee905bc47	Merge branch 'master' into humo_i2v	2025-12-31 07:37:07 -07:00
ozbayb	d56d374c96	Allow Wan21_HuMo extra_conds to pass concat_latent_image through if present	2025-12-30 18:29:00 -07:00