Compare commits

..

1 Commits

Author SHA1 Message Date
drozbay
04ab8412ab
Merge 9ee905bc47 into ac4d8ea9b3 2026-01-13 19:40:00 -08:00
4 changed files with 12 additions and 79 deletions

View File

@ -137,44 +137,10 @@ def to_blocked(input_matrix, flatten: bool = True) -> torch.Tensor:
return rearranged.reshape(padded_rows, padded_cols)
def stochastic_round_quantize_nvfp4_block(x, per_tensor_scale, generator):
def stochastic_round_quantize_nvfp4(x, per_tensor_scale, pad_16x, seed=0):
F4_E2M1_MAX = 6.0
F8_E4M3_MAX = 448.0
orig_shape = x.shape
block_size = 16
x = x.reshape(orig_shape[0], -1, block_size)
scaled_block_scales_fp8 = torch.clamp(((torch.amax(torch.abs(x), dim=-1)) / F4_E2M1_MAX) / per_tensor_scale.to(x.dtype), max=F8_E4M3_MAX).to(torch.float8_e4m3fn)
x = x / (per_tensor_scale.to(x.dtype) * scaled_block_scales_fp8.to(x.dtype)).unsqueeze(-1)
x = x.view(orig_shape).nan_to_num()
data_lp = stochastic_float_to_fp4_e2m1(x, generator=generator)
return data_lp, scaled_block_scales_fp8
def stochastic_round_quantize_nvfp4(x, per_tensor_scale, pad_16x, seed=0):
def roundup(x: int, multiple: int) -> int:
"""Round up x to the nearest multiple."""
return ((x + multiple - 1) // multiple) * multiple
generator = torch.Generator(device=x.device)
generator.manual_seed(seed)
# Handle padding
if pad_16x:
rows, cols = x.shape
padded_rows = roundup(rows, 16)
padded_cols = roundup(cols, 16)
if padded_rows != rows or padded_cols != cols:
x = torch.nn.functional.pad(x, (0, padded_cols - cols, 0, padded_rows - rows))
x, blocked_scaled = stochastic_round_quantize_nvfp4_block(x, per_tensor_scale, generator)
return x, to_blocked(blocked_scaled, flatten=False)
def stochastic_round_quantize_nvfp4_by_block(x, per_tensor_scale, pad_16x, seed=0, block_size=4096 * 4096):
def roundup(x: int, multiple: int) -> int:
"""Round up x to the nearest multiple."""
return ((x + multiple - 1) // multiple) * multiple
@ -192,20 +158,16 @@ def stochastic_round_quantize_nvfp4_by_block(x, per_tensor_scale, pad_16x, seed=
# what we want to produce. If we pad here, we want the padded output.
orig_shape = x.shape
orig_shape = list(orig_shape)
block_size = 16
output_fp4 = torch.empty(orig_shape[:-1] + [orig_shape[-1] // 2], dtype=torch.uint8, device=x.device)
output_block = torch.empty(orig_shape[:-1] + [orig_shape[-1] // 16], dtype=torch.float8_e4m3fn, device=x.device)
x = x.reshape(orig_shape[0], -1, block_size)
scaled_block_scales_fp8 = torch.clamp(((torch.amax(torch.abs(x), dim=-1)) / F4_E2M1_MAX) / per_tensor_scale.to(x.dtype), max=F8_E4M3_MAX).to(torch.float8_e4m3fn)
x /= (per_tensor_scale.to(x.dtype) * scaled_block_scales_fp8.to(x.dtype)).unsqueeze(-1)
generator = torch.Generator(device=x.device)
generator.manual_seed(seed)
num_slices = max(1, (x.numel() / block_size))
slice_size = max(1, (round(x.shape[0] / num_slices)))
for i in range(0, x.shape[0], slice_size):
fp4, block = stochastic_round_quantize_nvfp4_block(x[i: i + slice_size], per_tensor_scale, generator=generator)
output_fp4[i:i + slice_size].copy_(fp4)
output_block[i:i + slice_size].copy_(block)
return output_fp4, to_blocked(output_block, flatten=False)
x = x.view(orig_shape).nan_to_num()
data_lp = stochastic_float_to_fp4_e2m1(x, generator=generator)
blocked_scales = to_blocked(scaled_block_scales_fp8, flatten=False)
return data_lp, blocked_scales

View File

@ -104,7 +104,7 @@ class TensorCoreNVFP4Layout(_CKNvfp4Layout):
needs_padding = padded_shape != orig_shape
if stochastic_rounding > 0:
qdata, block_scale = comfy.float.stochastic_round_quantize_nvfp4_by_block(tensor, scale, pad_16x=needs_padding, seed=stochastic_rounding)
qdata, block_scale = comfy.float.stochastic_round_quantize_nvfp4(tensor, scale, pad_16x=needs_padding, seed=stochastic_rounding)
else:
qdata, block_scale = ck.quantize_nvfp4(tensor, scale, pad_16x=needs_padding)

View File

@ -1042,7 +1042,7 @@ class ZImage(Lumina2):
"shift": 3.0,
}
memory_usage_factor = 2.8
memory_usage_factor = 2.0
supported_inference_dtypes = [torch.bfloat16, torch.float32]

View File

@ -30,7 +30,6 @@ from torch.nn.functional import interpolate
from einops import rearrange
from comfy.cli_args import args
import json
import time
MMAP_TORCH_FILES = args.mmap_torch_files
DISABLE_MMAP = args.disable_mmap
@ -1098,10 +1097,6 @@ def set_progress_bar_global_hook(function):
global PROGRESS_BAR_HOOK
PROGRESS_BAR_HOOK = function
# Throttle settings for progress bar updates to reduce WebSocket flooding
PROGRESS_THROTTLE_MIN_INTERVAL = 0.1 # 100ms minimum between updates
PROGRESS_THROTTLE_MIN_PERCENT = 0.5 # 0.5% minimum progress change
class ProgressBar:
def __init__(self, total, node_id=None):
global PROGRESS_BAR_HOOK
@ -1109,8 +1104,6 @@ class ProgressBar:
self.current = 0
self.hook = PROGRESS_BAR_HOOK
self.node_id = node_id
self._last_update_time = 0.0
self._last_sent_value = -1
def update_absolute(self, value, total=None, preview=None):
if total is not None:
@ -1119,29 +1112,7 @@ class ProgressBar:
value = self.total
self.current = value
if self.hook is not None:
current_time = time.perf_counter()
is_first = (self._last_sent_value < 0)
is_final = (value >= self.total)
has_preview = (preview is not None)
# Always send immediately for previews, first update, or final update
if has_preview or is_first or is_final:
self.hook(self.current, self.total, preview, node_id=self.node_id)
self._last_update_time = current_time
self._last_sent_value = value
return
# Apply throttling for regular progress updates
if self.total > 0:
percent_changed = ((value - max(0, self._last_sent_value)) / self.total) * 100
else:
percent_changed = 100
time_elapsed = current_time - self._last_update_time
if time_elapsed >= PROGRESS_THROTTLE_MIN_INTERVAL and percent_changed >= PROGRESS_THROTTLE_MIN_PERCENT:
self.hook(self.current, self.total, preview, node_id=self.node_id)
self._last_update_time = current_time
self._last_sent_value = value
self.hook(self.current, self.total, preview, node_id=self.node_id)
def update(self, value):
self.update_absolute(self.current + value)