Merge 9ee905bc47 into ac4d8ea9b3

2026-01-30 00:00:26 +08:00 · 2026-01-13 19:40:00 -08:00
4 changed files with 12 additions and 79 deletions
--- a/comfy/float.py
+++ b/comfy/float.py
@@ -137,44 +137,10 @@ def to_blocked(input_matrix, flatten: bool = True) -> torch.Tensor:
    return rearranged.reshape(padded_rows, padded_cols)


-def stochastic_round_quantize_nvfp4_block(x, per_tensor_scale, generator):
+def stochastic_round_quantize_nvfp4(x, per_tensor_scale, pad_16x, seed=0):
    F4_E2M1_MAX = 6.0
    F8_E4M3_MAX = 448.0

-    orig_shape = x.shape
-
-    block_size = 16
-
-    x = x.reshape(orig_shape[0], -1, block_size)
-    scaled_block_scales_fp8 = torch.clamp(((torch.amax(torch.abs(x), dim=-1)) / F4_E2M1_MAX) / per_tensor_scale.to(x.dtype), max=F8_E4M3_MAX).to(torch.float8_e4m3fn)
-    x = x / (per_tensor_scale.to(x.dtype) * scaled_block_scales_fp8.to(x.dtype)).unsqueeze(-1)
-
-    x = x.view(orig_shape).nan_to_num()
-    data_lp = stochastic_float_to_fp4_e2m1(x, generator=generator)
-    return data_lp, scaled_block_scales_fp8
-
-
-def stochastic_round_quantize_nvfp4(x, per_tensor_scale, pad_16x, seed=0):
-    def roundup(x: int, multiple: int) -> int:
-        """Round up x to the nearest multiple."""
-        return ((x + multiple - 1) // multiple) * multiple
-
-    generator = torch.Generator(device=x.device)
-    generator.manual_seed(seed)
-
-    # Handle padding
-    if pad_16x:
-        rows, cols = x.shape
-        padded_rows = roundup(rows, 16)
-        padded_cols = roundup(cols, 16)
-        if padded_rows != rows or padded_cols != cols:
-            x = torch.nn.functional.pad(x, (0, padded_cols - cols, 0, padded_rows - rows))
-
-    x, blocked_scaled = stochastic_round_quantize_nvfp4_block(x, per_tensor_scale, generator)
-    return x, to_blocked(blocked_scaled, flatten=False)
-
-
-def stochastic_round_quantize_nvfp4_by_block(x, per_tensor_scale, pad_16x, seed=0, block_size=4096 * 4096):
    def roundup(x: int, multiple: int) -> int:
        """Round up x to the nearest multiple."""
        return ((x + multiple - 1) // multiple) * multiple
@@ -192,20 +158,16 @@ def stochastic_round_quantize_nvfp4_by_block(x, per_tensor_scale, pad_16x, seed=
            # what we want to produce. If we pad here, we want the padded output.
            orig_shape = x.shape

-    orig_shape = list(orig_shape)
+    block_size = 16

-    output_fp4 = torch.empty(orig_shape[:-1] + [orig_shape[-1] // 2], dtype=torch.uint8, device=x.device)
-    output_block = torch.empty(orig_shape[:-1] + [orig_shape[-1] // 16], dtype=torch.float8_e4m3fn, device=x.device)
+    x = x.reshape(orig_shape[0], -1, block_size)
+    scaled_block_scales_fp8 = torch.clamp(((torch.amax(torch.abs(x), dim=-1)) / F4_E2M1_MAX) / per_tensor_scale.to(x.dtype), max=F8_E4M3_MAX).to(torch.float8_e4m3fn)
+    x /= (per_tensor_scale.to(x.dtype) * scaled_block_scales_fp8.to(x.dtype)).unsqueeze(-1)

    generator = torch.Generator(device=x.device)
    generator.manual_seed(seed)

-    num_slices = max(1, (x.numel() / block_size))
-    slice_size = max(1, (round(x.shape[0] / num_slices)))
-
-    for i in range(0, x.shape[0], slice_size):
-        fp4, block = stochastic_round_quantize_nvfp4_block(x[i: i + slice_size], per_tensor_scale, generator=generator)
-        output_fp4[i:i + slice_size].copy_(fp4)
-        output_block[i:i + slice_size].copy_(block)
-
-    return output_fp4, to_blocked(output_block, flatten=False)
+    x = x.view(orig_shape).nan_to_num()
+    data_lp = stochastic_float_to_fp4_e2m1(x, generator=generator)
+    blocked_scales = to_blocked(scaled_block_scales_fp8, flatten=False)
+    return data_lp, blocked_scales
--- a/comfy/quant_ops.py
+++ b/comfy/quant_ops.py
@@ -104,7 +104,7 @@ class TensorCoreNVFP4Layout(_CKNvfp4Layout):
        needs_padding = padded_shape != orig_shape

        if stochastic_rounding > 0:
-            qdata, block_scale = comfy.float.stochastic_round_quantize_nvfp4_by_block(tensor, scale, pad_16x=needs_padding, seed=stochastic_rounding)
+            qdata, block_scale = comfy.float.stochastic_round_quantize_nvfp4(tensor, scale, pad_16x=needs_padding, seed=stochastic_rounding)
        else:
            qdata, block_scale = ck.quantize_nvfp4(tensor, scale, pad_16x=needs_padding)

--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -1042,7 +1042,7 @@ class ZImage(Lumina2):
        "shift": 3.0,
    }

-    memory_usage_factor = 2.8
+    memory_usage_factor = 2.0

    supported_inference_dtypes = [torch.bfloat16, torch.float32]

--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -30,7 +30,6 @@ from torch.nn.functional import interpolate
 from einops import rearrange
 from comfy.cli_args import args
 import json
-import time

 MMAP_TORCH_FILES = args.mmap_torch_files
 DISABLE_MMAP = args.disable_mmap
@@ -1098,10 +1097,6 @@ def set_progress_bar_global_hook(function):
    global PROGRESS_BAR_HOOK
    PROGRESS_BAR_HOOK = function

-# Throttle settings for progress bar updates to reduce WebSocket flooding
-PROGRESS_THROTTLE_MIN_INTERVAL = 0.1  # 100ms minimum between updates
-PROGRESS_THROTTLE_MIN_PERCENT = 0.5   # 0.5% minimum progress change
-
 class ProgressBar:
    def __init__(self, total, node_id=None):
        global PROGRESS_BAR_HOOK
@@ -1109,8 +1104,6 @@ class ProgressBar:
        self.current = 0
        self.hook = PROGRESS_BAR_HOOK
        self.node_id = node_id
-        self._last_update_time = 0.0
-        self._last_sent_value = -1

    def update_absolute(self, value, total=None, preview=None):
        if total is not None:
@@ -1119,29 +1112,7 @@ class ProgressBar:
            value = self.total
        self.current = value
        if self.hook is not None:
-            current_time = time.perf_counter()
-            is_first = (self._last_sent_value < 0)
-            is_final = (value >= self.total)
-            has_preview = (preview is not None)
-
-            # Always send immediately for previews, first update, or final update
-            if has_preview or is_first or is_final:
-                self.hook(self.current, self.total, preview, node_id=self.node_id)
-                self._last_update_time = current_time
-                self._last_sent_value = value
-                return
-
-            # Apply throttling for regular progress updates
-            if self.total > 0:
-                percent_changed = ((value - max(0, self._last_sent_value)) / self.total) * 100
-            else:
-                percent_changed = 100
-            time_elapsed = current_time - self._last_update_time
-
-            if time_elapsed >= PROGRESS_THROTTLE_MIN_INTERVAL and percent_changed >= PROGRESS_THROTTLE_MIN_PERCENT:
-                self.hook(self.current, self.total, preview, node_id=self.node_id)
-                self._last_update_time = current_time
-                self._last_sent_value = value
+            self.hook(self.current, self.total, preview, node_id=self.node_id)

    def update(self, value):
        self.update_absolute(self.current + value)