Merge 69d3bfa391 into e84a200a3c

ops: opt out of deferred weight init if subclassed (#12967 )
If a subclass BYO _load_from_state_dict and doesnt call the super() the needed default init of these weights is missed and can lead to problems for uninitialized weights.
2026-03-22 17:43:33 +08:00 · 2026-03-16 04:00:09 +09:00 · 2026-03-15 11:49:49 -07:00 · 2026-03-15 11:48:56 -07:00 · 2026-03-14 18:09:09 -07:00 · 2026-03-14 19:53:31 -04:00
19 changed files with 255 additions and 41 deletions
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@ -83,6 +83,8 @@ fpte_group.add_argument("--fp16-text-enc", action="store_true", help="Store text
 fpte_group.add_argument("--fp32-text-enc", action="store_true", help="Store text encoder weights in fp32.")
 fpte_group.add_argument("--bf16-text-enc", action="store_true", help="Store text encoder weights in bf16.")

+parser.add_argument("--fp16-intermediates", action="store_true", help="Experimental: Use fp16 for intermediate tensors between nodes instead of fp32.")
+
 parser.add_argument("--force-channels-last", action="store_true", help="Force channels last format when inferencing the models.")

 parser.add_argument("--directml", type=int, nargs="?", metavar="DIRECTML_DEVICE", const=-1, help="Use torch-directml.")
--- a/comfy/comfy_types/node_typing.py
+++ b/comfy/comfy_types/node_typing.py
@ -193,6 +193,8 @@ class HiddenInputTypeDict(TypedDict):
    """EXTRA_PNGINFO is a dictionary that will be copied into the metadata of any .png files saved. Custom nodes can store additional information in this dictionary for saving (or as a way to communicate with a downstream node)."""
    dynprompt: NotRequired[Literal["DYNPROMPT"]]
    """DYNPROMPT is an instance of comfy_execution.graph.DynamicPrompt. It differs from PROMPT in that it may mutate during the course of execution in response to Node Expansion."""
+    prompt_id: NotRequired[Literal["PROMPT_ID"]]
+    """PROMPT_ID is the unique identifier of the current prompt/job being executed. Useful for associating progress updates with specific jobs."""


 class InputTypeDict(TypedDict):
--- a/comfy/float.py
+++ b/comfy/float.py
@ -209,3 +209,39 @@ def stochastic_round_quantize_nvfp4_by_block(x, per_tensor_scale, pad_16x, seed=
        output_block[i:i + slice_size].copy_(block)

    return output_fp4, to_blocked(output_block, flatten=False)
+
+
+def stochastic_round_quantize_mxfp8_by_block(x, pad_32x, seed=0):
+    def roundup(x_val, multiple):
+        return ((x_val + multiple - 1) // multiple) * multiple
+
+    if pad_32x:
+        rows, cols = x.shape
+        padded_rows = roundup(rows, 32)
+        padded_cols = roundup(cols, 32)
+        if padded_rows != rows or padded_cols != cols:
+            x = torch.nn.functional.pad(x, (0, padded_cols - cols, 0, padded_rows - rows))
+
+    F8_E4M3_MAX = 448.0
+    E8M0_BIAS = 127
+    BLOCK_SIZE = 32
+
+    rows, cols = x.shape
+    x_blocked = x.reshape(rows, -1, BLOCK_SIZE)
+    max_abs = torch.amax(torch.abs(x_blocked), dim=-1)
+
+    # E8M0 block scales (power-of-2 exponents)
+    scale_needed = torch.clamp(max_abs.float() / F8_E4M3_MAX, min=2**(-127))
+    exp_biased = torch.clamp(torch.ceil(torch.log2(scale_needed)).to(torch.int32) + E8M0_BIAS, 0, 254)
+    block_scales_e8m0 = exp_biased.to(torch.uint8)
+
+    zero_mask = (max_abs == 0)
+    block_scales_f32 = (block_scales_e8m0.to(torch.int32) << 23).view(torch.float32)
+    block_scales_f32 = torch.where(zero_mask, torch.ones_like(block_scales_f32), block_scales_f32)
+
+    # Scale per-block then stochastic round
+    data_scaled = (x_blocked.float() / block_scales_f32.unsqueeze(-1)).reshape(rows, cols)
+    output_fp8 = stochastic_rounding(data_scaled, torch.float8_e4m3fn, seed=seed)
+
+    block_scales_e8m0 = torch.where(zero_mask, torch.zeros_like(block_scales_e8m0), block_scales_e8m0)
+    return output_fp8, to_blocked(block_scales_e8m0, flatten=False).view(torch.float8_e8m0fnu)
--- a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
+++ b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
@ -11,6 +11,7 @@ from .causal_conv3d import CausalConv3d
 from .pixel_norm import PixelNorm
 from ..model import PixArtAlphaCombinedTimestepSizeEmbeddings
 import comfy.ops
+import comfy.model_management
 from comfy.ldm.modules.diffusionmodules.model import torch_cat_if_needed

 ops = comfy.ops.disable_weight_init
@ -536,7 +537,7 @@ class Decoder(nn.Module):
                    mark_conv3d_ended(self.conv_out)
                sample = self.conv_out(sample, causal=self.causal)
                if sample is not None and sample.shape[2] > 0:
-                    output.append(sample)
+                    output.append(sample.to(comfy.model_management.intermediate_device()))
                return

            up_block = self.up_blocks[idx]
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@ -1050,6 +1050,12 @@ def intermediate_device():
    else:
        return torch.device("cpu")

+def intermediate_dtype():
+    if args.fp16_intermediates:
+        return torch.float16
+    else:
+        return torch.float32
+
 def vae_device():
    if args.cpu_vae:
        return torch.device("cpu")
@ -1712,6 +1718,19 @@ def supports_nvfp4_compute(device=None):

    return True

+def supports_mxfp8_compute(device=None):
+    if not is_nvidia():
+        return False
+
+    if torch_version_numeric < (2, 10):
+        return False
+
+    props = torch.cuda.get_device_properties(device)
+    if props.major < 10:
+        return False
+
+    return True
+
 def extended_fp16_support():
    # TODO: check why some models work with fp16 on newer torch versions but not on older
    if torch_version_numeric < (2, 7):
--- a/comfy/ops.py
+++ b/comfy/ops.py
@ -336,7 +336,10 @@ class disable_weight_init:
    class Linear(torch.nn.Linear, CastWeightBiasOp):

        def __init__(self, in_features, out_features, bias=True, device=None, dtype=None):
-            if not comfy.model_management.WINDOWS or not comfy.memory_management.aimdo_enabled:
+            # don't trust subclasses that BYO state dict loader to call us.
+            if (not comfy.model_management.WINDOWS
+                or not comfy.memory_management.aimdo_enabled
+                or type(self)._load_from_state_dict is not disable_weight_init.Linear._load_from_state_dict):
                super().__init__(in_features, out_features, bias, device, dtype)
                return

@ -357,7 +360,9 @@ class disable_weight_init:
        def _load_from_state_dict(self, state_dict, prefix, local_metadata,
                                strict, missing_keys, unexpected_keys, error_msgs):

-            if not comfy.model_management.WINDOWS or not comfy.memory_management.aimdo_enabled:
+            if (not comfy.model_management.WINDOWS
+                or not comfy.memory_management.aimdo_enabled
+                or type(self)._load_from_state_dict is not disable_weight_init.Linear._load_from_state_dict):
                return super()._load_from_state_dict(state_dict, prefix, local_metadata, strict,
                                                     missing_keys, unexpected_keys, error_msgs)
            disable_weight_init._lazy_load_from_state_dict(
@ -564,7 +569,10 @@ class disable_weight_init:
        def __init__(self, num_embeddings, embedding_dim, padding_idx=None, max_norm=None,
                     norm_type=2.0, scale_grad_by_freq=False, sparse=False, _weight=None,
                     _freeze=False, device=None, dtype=None):
-            if not comfy.model_management.WINDOWS or not comfy.memory_management.aimdo_enabled:
+            # don't trust subclasses that BYO state dict loader to call us.
+            if (not comfy.model_management.WINDOWS
+                or not comfy.memory_management.aimdo_enabled
+                or type(self)._load_from_state_dict is not disable_weight_init.Embedding._load_from_state_dict):
                super().__init__(num_embeddings, embedding_dim, padding_idx, max_norm,
                                 norm_type, scale_grad_by_freq, sparse, _weight,
                                 _freeze, device, dtype)
@ -590,7 +598,9 @@ class disable_weight_init:
        def _load_from_state_dict(self, state_dict, prefix, local_metadata,
                                strict, missing_keys, unexpected_keys, error_msgs):

-            if not comfy.model_management.WINDOWS or not comfy.memory_management.aimdo_enabled:
+            if (not comfy.model_management.WINDOWS
+                or not comfy.memory_management.aimdo_enabled
+                or type(self)._load_from_state_dict is not disable_weight_init.Embedding._load_from_state_dict):
                return super()._load_from_state_dict(state_dict, prefix, local_metadata, strict,
                                                     missing_keys, unexpected_keys, error_msgs)
            disable_weight_init._lazy_load_from_state_dict(
@ -857,6 +867,22 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec
                            orig_shape=(self.out_features, self.in_features),
                        )

+                    elif self.quant_format == "mxfp8":
+                        # MXFP8: E8M0 block scales stored as uint8 in safetensors
+                        block_scale = self._load_scale_param(state_dict, prefix, "weight_scale", device, manually_loaded_keys,
+                                                             dtype=torch.uint8)
+
+                        if block_scale is None:
+                            raise ValueError(f"Missing MXFP8 block scales for layer {layer_name}")
+
+                        block_scale = block_scale.view(torch.float8_e8m0fnu)
+
+                        params = layout_cls.Params(
+                            scale=block_scale,
+                            orig_dtype=MixedPrecisionOps._compute_dtype,
+                            orig_shape=(self.out_features, self.in_features),
+                        )
+
                    elif self.quant_format == "nvfp4":
                        # NVFP4: tensor_scale (weight_scale_2) + block_scale (weight_scale)
                        tensor_scale = self._load_scale_param(state_dict, prefix, "weight_scale_2", device, manually_loaded_keys)
@ -1006,12 +1032,15 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec
 def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, model_config=None):
    fp8_compute = comfy.model_management.supports_fp8_compute(load_device) # TODO: if we support more ops this needs to be more granular
    nvfp4_compute = comfy.model_management.supports_nvfp4_compute(load_device)
+    mxfp8_compute = comfy.model_management.supports_mxfp8_compute(load_device)

    if model_config and hasattr(model_config, 'quant_config') and model_config.quant_config:
        logging.info("Using mixed precision operations")
        disabled = set()
        if not nvfp4_compute:
            disabled.add("nvfp4")
+        if not mxfp8_compute:
+            disabled.add("mxfp8")
        if not fp8_compute:
            disabled.add("float8_e4m3fn")
            disabled.add("float8_e5m2")
--- a/comfy/quant_ops.py
+++ b/comfy/quant_ops.py
@ -43,6 +43,18 @@ except ImportError as e:
    def get_layout_class(name):
        return None

+_CK_MXFP8_AVAILABLE = False
+if _CK_AVAILABLE:
+    try:
+        from comfy_kitchen.tensor import TensorCoreMXFP8Layout as _CKMxfp8Layout
+        _CK_MXFP8_AVAILABLE = True
+    except ImportError:
+        logging.warning("comfy_kitchen does not support MXFP8, please update comfy_kitchen.")
+
+if not _CK_MXFP8_AVAILABLE:
+    class _CKMxfp8Layout:
+        pass
+
 import comfy.float

 # ==============================================================================
@ -84,6 +96,31 @@ class _TensorCoreFP8LayoutBase(_CKFp8Layout):
        return qdata, params


+class TensorCoreMXFP8Layout(_CKMxfp8Layout):
+    @classmethod
+    def quantize(cls, tensor, scale=None, stochastic_rounding=0, inplace_ops=False):
+        if tensor.dim() != 2:
+            raise ValueError(f"MXFP8 requires 2D tensor, got {tensor.dim()}D")
+
+        orig_dtype = tensor.dtype
+        orig_shape = tuple(tensor.shape)
+
+        padded_shape = cls.get_padded_shape(orig_shape)
+        needs_padding = padded_shape != orig_shape
+
+        if stochastic_rounding > 0:
+            qdata, block_scale = comfy.float.stochastic_round_quantize_mxfp8_by_block(tensor, pad_32x=needs_padding, seed=stochastic_rounding)
+        else:
+            qdata, block_scale = ck.quantize_mxfp8(tensor, pad_32x=needs_padding)
+
+        params = cls.Params(
+            scale=block_scale,
+            orig_dtype=orig_dtype,
+            orig_shape=orig_shape,
+        )
+        return qdata, params
+
+
 class TensorCoreNVFP4Layout(_CKNvfp4Layout):
    @classmethod
    def quantize(cls, tensor, scale=None, stochastic_rounding=0, inplace_ops=False):
@ -137,6 +174,8 @@ register_layout_class("TensorCoreFP8Layout", TensorCoreFP8Layout)
 register_layout_class("TensorCoreFP8E4M3Layout", TensorCoreFP8E4M3Layout)
 register_layout_class("TensorCoreFP8E5M2Layout", TensorCoreFP8E5M2Layout)
 register_layout_class("TensorCoreNVFP4Layout", TensorCoreNVFP4Layout)
+if _CK_MXFP8_AVAILABLE:
+    register_layout_class("TensorCoreMXFP8Layout", TensorCoreMXFP8Layout)

 QUANT_ALGOS = {
    "float8_e4m3fn": {
@ -157,6 +196,14 @@ QUANT_ALGOS = {
    },
 }

+if _CK_MXFP8_AVAILABLE:
+    QUANT_ALGOS["mxfp8"] = {
+        "storage_t": torch.float8_e4m3fn,
+        "parameters": {"weight_scale", "input_scale"},
+        "comfy_tensor_layout": "TensorCoreMXFP8Layout",
+        "group_size": 32,
+    }
+

 # ==============================================================================
 # Re-exports for backward compatibility
--- a/comfy/sd.py
+++ b/comfy/sd.py
@ -871,13 +871,16 @@ class VAE:
                pixels = torch.nn.functional.pad(pixels, (0, self.output_channels - pixels.shape[-1]), mode=mode, value=value)
        return pixels

+    def vae_output_dtype(self):
+        return model_management.intermediate_dtype()
+
    def decode_tiled_(self, samples, tile_x=64, tile_y=64, overlap = 16):
        steps = samples.shape[0] * comfy.utils.get_tiled_scale_steps(samples.shape[3], samples.shape[2], tile_x, tile_y, overlap)
        steps += samples.shape[0] * comfy.utils.get_tiled_scale_steps(samples.shape[3], samples.shape[2], tile_x // 2, tile_y * 2, overlap)
        steps += samples.shape[0] * comfy.utils.get_tiled_scale_steps(samples.shape[3], samples.shape[2], tile_x * 2, tile_y // 2, overlap)
        pbar = comfy.utils.ProgressBar(steps)

-        decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).float()
+        decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).to(dtype=self.vae_output_dtype())
        output = self.process_output(
            (comfy.utils.tiled_scale(samples, decode_fn, tile_x // 2, tile_y * 2, overlap, upscale_amount = self.upscale_ratio, output_device=self.output_device, pbar = pbar) +
            comfy.utils.tiled_scale(samples, decode_fn, tile_x * 2, tile_y // 2, overlap, upscale_amount = self.upscale_ratio, output_device=self.output_device, pbar = pbar) +
@ -887,16 +890,16 @@ class VAE:

    def decode_tiled_1d(self, samples, tile_x=256, overlap=32):
        if samples.ndim == 3:
-            decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).float()
+            decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).to(dtype=self.vae_output_dtype())
        else:
            og_shape = samples.shape
            samples = samples.reshape((og_shape[0], og_shape[1] * og_shape[2], -1))
-            decode_fn = lambda a: self.first_stage_model.decode(a.reshape((-1, og_shape[1], og_shape[2], a.shape[-1])).to(self.vae_dtype).to(self.device)).float()
+            decode_fn = lambda a: self.first_stage_model.decode(a.reshape((-1, og_shape[1], og_shape[2], a.shape[-1])).to(self.vae_dtype).to(self.device)).to(dtype=self.vae_output_dtype())

        return self.process_output(comfy.utils.tiled_scale_multidim(samples, decode_fn, tile=(tile_x,), overlap=overlap, upscale_amount=self.upscale_ratio, out_channels=self.output_channels, output_device=self.output_device))

    def decode_tiled_3d(self, samples, tile_t=999, tile_x=32, tile_y=32, overlap=(1, 8, 8)):
-        decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).float()
+        decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).to(dtype=self.vae_output_dtype())
        return self.process_output(comfy.utils.tiled_scale_multidim(samples, decode_fn, tile=(tile_t, tile_x, tile_y), overlap=overlap, upscale_amount=self.upscale_ratio, out_channels=self.output_channels, index_formulas=self.upscale_index_formula, output_device=self.output_device))

    def encode_tiled_(self, pixel_samples, tile_x=512, tile_y=512, overlap = 64):
@ -905,7 +908,7 @@ class VAE:
        steps += pixel_samples.shape[0] * comfy.utils.get_tiled_scale_steps(pixel_samples.shape[3], pixel_samples.shape[2], tile_x * 2, tile_y // 2, overlap)
        pbar = comfy.utils.ProgressBar(steps)

-        encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).float()
+        encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).to(dtype=self.vae_output_dtype())
        samples = comfy.utils.tiled_scale(pixel_samples, encode_fn, tile_x, tile_y, overlap, upscale_amount = (1/self.downscale_ratio), out_channels=self.latent_channels, output_device=self.output_device, pbar=pbar)
        samples += comfy.utils.tiled_scale(pixel_samples, encode_fn, tile_x * 2, tile_y // 2, overlap, upscale_amount = (1/self.downscale_ratio), out_channels=self.latent_channels, output_device=self.output_device, pbar=pbar)
        samples += comfy.utils.tiled_scale(pixel_samples, encode_fn, tile_x // 2, tile_y * 2, overlap, upscale_amount = (1/self.downscale_ratio), out_channels=self.latent_channels, output_device=self.output_device, pbar=pbar)
@ -914,7 +917,7 @@ class VAE:

    def encode_tiled_1d(self, samples, tile_x=256 * 2048, overlap=64 * 2048):
        if self.latent_dim == 1:
-            encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).float()
+            encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).to(dtype=self.vae_output_dtype())
            out_channels = self.latent_channels
            upscale_amount = 1 / self.downscale_ratio
        else:
@ -923,7 +926,7 @@ class VAE:
            tile_x = tile_x // extra_channel_size
            overlap = overlap // extra_channel_size
            upscale_amount = 1 / self.downscale_ratio
-            encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).reshape(1, out_channels, -1).float()
+            encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).reshape(1, out_channels, -1).to(dtype=self.vae_output_dtype())

        out = comfy.utils.tiled_scale_multidim(samples, encode_fn, tile=(tile_x,), overlap=overlap, upscale_amount=upscale_amount, out_channels=out_channels, output_device=self.output_device)
        if self.latent_dim == 1:
@ -932,7 +935,7 @@ class VAE:
            return out.reshape(samples.shape[0], self.latent_channels, extra_channel_size, -1)

    def encode_tiled_3d(self, samples, tile_t=9999, tile_x=512, tile_y=512, overlap=(1, 64, 64)):
-        encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).float()
+        encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).to(dtype=self.vae_output_dtype())
        return comfy.utils.tiled_scale_multidim(samples, encode_fn, tile=(tile_t, tile_x, tile_y), overlap=overlap, upscale_amount=self.downscale_ratio, out_channels=self.latent_channels, downscale=True, index_formulas=self.downscale_index_formula, output_device=self.output_device)

    def decode(self, samples_in, vae_options={}):
@ -950,9 +953,9 @@ class VAE:

            for x in range(0, samples_in.shape[0], batch_number):
                samples = samples_in[x:x+batch_number].to(self.vae_dtype).to(self.device)
-                out = self.process_output(self.first_stage_model.decode(samples, **vae_options).to(self.output_device).float())
+                out = self.process_output(self.first_stage_model.decode(samples, **vae_options).to(self.output_device).to(dtype=self.vae_output_dtype()))
                if pixel_samples is None:
-                    pixel_samples = torch.empty((samples_in.shape[0],) + tuple(out.shape[1:]), device=self.output_device)
+                    pixel_samples = torch.empty((samples_in.shape[0],) + tuple(out.shape[1:]), device=self.output_device, dtype=self.vae_output_dtype())
                pixel_samples[x:x+batch_number] = out
        except Exception as e:
            model_management.raise_non_oom(e)
@ -1025,9 +1028,9 @@ class VAE:
            samples = None
            for x in range(0, pixel_samples.shape[0], batch_number):
                pixels_in = self.process_input(pixel_samples[x:x + batch_number]).to(self.vae_dtype).to(self.device)
-                out = self.first_stage_model.encode(pixels_in).to(self.output_device).float()
+                out = self.first_stage_model.encode(pixels_in).to(self.output_device).to(dtype=self.vae_output_dtype())
                if samples is None:
-                    samples = torch.empty((pixel_samples.shape[0],) + tuple(out.shape[1:]), device=self.output_device)
+                    samples = torch.empty((pixel_samples.shape[0],) + tuple(out.shape[1:]), device=self.output_device, dtype=self.vae_output_dtype())
                samples[x:x + batch_number] = out

        except Exception as e:
--- a/comfy_api/feature_flags.py
+++ b/comfy_api/feature_flags.py
@ -12,6 +12,7 @@ from comfy.cli_args import args
 # Default server capabilities
 SERVER_FEATURE_FLAGS: dict[str, Any] = {
    "supports_preview_metadata": True,
+    "supports_progress_text_metadata": True,
    "max_upload_size": args.max_upload_size * 1024 * 1024, # Convert MB to bytes
    "extension": {"manager": {"supports_v4": True}},
    "node_replacements": True,
--- a/comfy_api/latest/_io.py
+++ b/comfy_api/latest/_io.py
@ -1282,9 +1282,16 @@ class V3Data(TypedDict):
    'When True, the value of the dynamic input will be in the format (value, path_key).'

 class HiddenHolder:
+    """Holds hidden input values resolved during node execution.
+
+    Hidden inputs are special values automatically provided by the execution
+    engine (e.g., node ID, prompt data, authentication tokens) rather than
+    being connected by the user in the graph.
+    """
    def __init__(self, unique_id: str, prompt: Any,
                 extra_pnginfo: Any, dynprompt: Any,
-                 auth_token_comfy_org: str, api_key_comfy_org: str, **kwargs):
+                 auth_token_comfy_org: str, api_key_comfy_org: str,
+                 prompt_id: str = None, **kwargs):
        self.unique_id = unique_id
        """UNIQUE_ID is the unique identifier of the node, and matches the id property of the node on the client side. It is commonly used in client-server communications (see messages)."""
        self.prompt = prompt
@ -1297,6 +1304,8 @@ class HiddenHolder:
        """AUTH_TOKEN_COMFY_ORG is a token acquired from signing into a ComfyOrg account on frontend."""
        self.api_key_comfy_org = api_key_comfy_org
        """API_KEY_COMFY_ORG is an API Key generated by ComfyOrg that allows skipping signing into a ComfyOrg account on frontend."""
+        self.prompt_id = prompt_id
+        """PROMPT_ID is the unique identifier of the current prompt/job being executed."""

    def __getattr__(self, key: str):
        '''If hidden variable not found, return None.'''
@ -1304,6 +1313,14 @@ class HiddenHolder:

    @classmethod
    def from_dict(cls, d: dict | None):
+        """Create a HiddenHolder from a dictionary of hidden input values.
+
+        Args:
+            d: Dictionary mapping Hidden enum values to their resolved values.
+
+        Returns:
+            A new HiddenHolder instance with values populated from the dict.
+        """
        if d is None:
            d = {}
        return cls(
@ -1313,6 +1330,7 @@ class HiddenHolder:
            dynprompt=d.get(Hidden.dynprompt, None),
            auth_token_comfy_org=d.get(Hidden.auth_token_comfy_org, None),
            api_key_comfy_org=d.get(Hidden.api_key_comfy_org, None),
+            prompt_id=d.get(Hidden.prompt_id, None),
        )

    @classmethod
@ -1335,6 +1353,8 @@ class Hidden(str, Enum):
    """AUTH_TOKEN_COMFY_ORG is a token acquired from signing into a ComfyOrg account on frontend."""
    api_key_comfy_org = "API_KEY_COMFY_ORG"
    """API_KEY_COMFY_ORG is an API Key generated by ComfyOrg that allows skipping signing into a ComfyOrg account on frontend."""
+    prompt_id = "PROMPT_ID"
+    """PROMPT_ID is the unique identifier of the current prompt/job being executed. Useful for associating progress updates with specific jobs."""


@dataclass
--- a/comfy_api_nodes/util/client.py
+++ b/comfy_api_nodes/util/client.py
@ -17,6 +17,7 @@ from pydantic import BaseModel

 from comfy import utils
 from comfy_api.latest import IO
+from comfy_execution.utils import get_executing_context
 from server import PromptServer

 from . import request_logger
@ -440,6 +441,17 @@ def _display_text(
    status: str | int | None = None,
    price: float | None = None,
 ) -> None:
+    """Send a progress text message to the client for display on a node.
+
+    Assembles status, price, and text lines, then sends them via WebSocket.
+    Automatically retrieves the current prompt_id from the execution context.
+
+    Args:
+        node_cls: The ComfyNode class sending the progress text.
+        text: Optional text content to display.
+        status: Optional status string or code to display.
+        price: Optional price in dollars to display as credits.
+    """
    display_lines: list[str] = []
    if status:
        display_lines.append(f"Status: {status.capitalize() if isinstance(status, str) else status}")
@ -450,7 +462,9 @@ def _display_text(
    if text is not None:
        display_lines.append(text)
    if display_lines:
-        PromptServer.instance.send_progress_text("\n".join(display_lines), get_node_id(node_cls))
+        ctx = get_executing_context()
+        prompt_id = ctx.prompt_id if ctx is not None else None
+        PromptServer.instance.send_progress_text("\n".join(display_lines), get_node_id(node_cls), prompt_id=prompt_id)


 def _display_time_progress(
--- a/comfy_extras/nodes_images.py
+++ b/comfy_extras/nodes_images.py
@ -566,7 +566,7 @@ class GetImageSize(IO.ComfyNode):
                IO.Int.Output(display_name="height"),
                IO.Int.Output(display_name="batch_size"),
            ],
-            hidden=[IO.Hidden.unique_id],
+            hidden=[IO.Hidden.unique_id, IO.Hidden.prompt_id],
        )

    @classmethod
@ -577,7 +577,7 @@ class GetImageSize(IO.ComfyNode):

        # Send progress text to display size on the node
        if cls.hidden.unique_id:
-            PromptServer.instance.send_progress_text(f"width: {width}, height: {height}\n batch size: {batch_size}", cls.hidden.unique_id)
+            PromptServer.instance.send_progress_text(f"width: {width}, height: {height}\n batch size: {batch_size}", cls.hidden.unique_id, prompt_id=cls.hidden.prompt_id)

        return IO.NodeOutput(width, height, batch_size)

--- a/execution.py
+++ b/execution.py
@ -150,7 +150,7 @@ class CacheSet:

 SENSITIVE_EXTRA_DATA_KEYS = ("auth_token_comfy_org", "api_key_comfy_org")

-def get_input_data(inputs, class_def, unique_id, execution_list=None, dynprompt=None, extra_data={}):
+def get_input_data(inputs, class_def, unique_id, execution_list=None, dynprompt=None, extra_data={}, prompt_id=None):
    is_v3 = issubclass(class_def, _ComfyNodeInternal)
    v3_data: io.V3Data = {}
    hidden_inputs_v3 = {}
@ -197,6 +197,8 @@ def get_input_data(inputs, class_def, unique_id, execution_list=None, dynprompt=
                hidden_inputs_v3[io.Hidden.auth_token_comfy_org] = extra_data.get("auth_token_comfy_org", None)
            if io.Hidden.api_key_comfy_org.name in hidden:
                hidden_inputs_v3[io.Hidden.api_key_comfy_org] = extra_data.get("api_key_comfy_org", None)
+            if io.Hidden.prompt_id.name in hidden:
+                hidden_inputs_v3[io.Hidden.prompt_id] = prompt_id
    else:
        if "hidden" in valid_inputs:
            h = valid_inputs["hidden"]
@ -213,6 +215,8 @@ def get_input_data(inputs, class_def, unique_id, execution_list=None, dynprompt=
                    input_data_all[x] = [extra_data.get("auth_token_comfy_org", None)]
                if h[x] == "API_KEY_COMFY_ORG":
                    input_data_all[x] = [extra_data.get("api_key_comfy_org", None)]
+                if h[x] == "PROMPT_ID":
+                    input_data_all[x] = [prompt_id]
    v3_data["hidden_inputs"] = hidden_inputs_v3
    return input_data_all, missing_keys, v3_data

@ -470,7 +474,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed,
            has_subgraph = False
        else:
            get_progress_state().start_progress(unique_id)
-            input_data_all, missing_keys, v3_data = get_input_data(inputs, class_def, unique_id, execution_list, dynprompt, extra_data)
+            input_data_all, missing_keys, v3_data = get_input_data(inputs, class_def, unique_id, execution_list, dynprompt, extra_data, prompt_id=prompt_id)
            if server.client_id is not None:
                server.last_node_id = display_node_id
                server.send_sync("executing", { "node": unique_id, "display_node": display_node_id, "prompt_id": prompt_id }, server.client_id)
--- a/manager_requirements.txt
+++ b/manager_requirements.txt
@ -1 +1 @@
-comfyui_manager==4.1b4
+comfyui_manager==4.1b5
--- a/middleware/cache_middleware.py
+++ b/middleware/cache_middleware.py
@ -32,7 +32,7 @@ async def cache_control(
    )

    if request.path.endswith(".js") or request.path.endswith(".css") or is_entry_point:
-        response.headers.setdefault("Cache-Control", "no-cache")
+        response.headers.setdefault("Cache-Control", "no-store")
        return response

    # Early return for non-image files - no cache headers needed
--- a/nodes.py
+++ b/nodes.py
@ -1724,6 +1724,8 @@ class LoadImage:
        output_masks = []
        w, h = None, None

+        dtype = comfy.model_management.intermediate_dtype()
+
        for i in ImageSequence.Iterator(img):
            i = node_helpers.pillow(ImageOps.exif_transpose, i)

@ -1748,8 +1750,8 @@ class LoadImage:
                mask = 1. - torch.from_numpy(mask)
            else:
                mask = torch.zeros((64,64), dtype=torch.float32, device="cpu")
-            output_images.append(image)
-            output_masks.append(mask.unsqueeze(0))
+            output_images.append(image.to(dtype=dtype))
+            output_masks.append(mask.unsqueeze(0).to(dtype=dtype))

            if img.format == "MPO":
                break  # ignore all frames except the first one for MPO format
--- a/requirements.txt
+++ b/requirements.txt
@ -1,4 +1,4 @@
-comfyui-frontend-package==1.41.19
+comfyui-frontend-package==1.41.20
 comfyui-workflow-templates==0.9.21
 comfyui-embedded-docs==0.4.3
 torch
--- a/server.py
+++ b/server.py
@ -310,7 +310,7 @@ class PromptServer():
        @routes.get("/")
        async def get_root(request):
            response = web.FileResponse(os.path.join(self.web_root, "index.html"))
-            response.headers['Cache-Control'] = 'no-cache'
+            response.headers['Cache-Control'] = 'no-store, must-revalidate'
            response.headers["Pragma"] = "no-cache"
            response.headers["Expires"] = "0"
            return response
@ -1230,13 +1230,47 @@ class PromptServer():
        return json_data

    def send_progress_text(
-        self, text: Union[bytes, bytearray, str], node_id: str, sid=None
+        self,
+        text: Union[bytes, bytearray, str],
+        node_id: str,
+        prompt_id: Optional[str] = None,
+        sid=None,
    ):
+        """Send a progress text message to the client via WebSocket.
+
+        Encodes the text as a binary message with length-prefixed node_id. When
+        the client supports the ``supports_progress_text_metadata`` feature flag,
+        the prompt_id is always prepended as a length-prefixed field (empty string
+        when None) to ensure consistent binary framing.
+
+        Args:
+            text: The progress text content to send.
+            node_id: The unique identifier of the node sending the progress.
+            prompt_id: Optional prompt/job identifier to associate the message with.
+            sid: Optional session ID to target a specific client.
+        """
        if isinstance(text, str):
            text = text.encode("utf-8")
        node_id_bytes = str(node_id).encode("utf-8")

-        # Pack the node_id length as a 4-byte unsigned integer, followed by the node_id bytes
-        message = struct.pack(">I", len(node_id_bytes)) + node_id_bytes + text
+        # Auto-resolve sid to the currently executing client
+        target_sid = sid if sid is not None else self.client_id

-        self.send_sync(BinaryEventTypes.TEXT, message, sid)
+        # When client supports the new format, always send
+        # [prompt_id_len][prompt_id][node_id_len][node_id][text]
+        # even when prompt_id is None (encoded as zero-length string)
+        if feature_flags.supports_feature(
+            self.sockets_metadata, target_sid, "supports_progress_text_metadata"
+        ):
+            prompt_id_bytes = (prompt_id or "").encode("utf-8")
+            message = (
+                struct.pack(">I", len(prompt_id_bytes))
+                + prompt_id_bytes
+                + struct.pack(">I", len(node_id_bytes))
+                + node_id_bytes
+                + text
+            )
+        else:
+            message = struct.pack(">I", len(node_id_bytes)) + node_id_bytes + text
+
+        self.send_sync(BinaryEventTypes.TEXT, message, target_sid)
--- a/tests-unit/server_test/test_cache_control.py
+++ b/tests-unit/server_test/test_cache_control.py
@ -28,31 +28,31 @@ CACHE_SCENARIOS = [
    },
    # JavaScript/CSS scenarios
    {
-        "name": "js_no_cache",
+        "name": "js_no_store",
        "path": "/script.js",
        "status": 200,
-        "expected_cache": "no-cache",
+        "expected_cache": "no-store",
        "should_have_header": True,
    },
    {
-        "name": "css_no_cache",
+        "name": "css_no_store",
        "path": "/styles.css",
        "status": 200,
-        "expected_cache": "no-cache",
+        "expected_cache": "no-store",
        "should_have_header": True,
    },
    {
-        "name": "index_json_no_cache",
+        "name": "index_json_no_store",
        "path": "/api/index.json",
        "status": 200,
-        "expected_cache": "no-cache",
+        "expected_cache": "no-store",
        "should_have_header": True,
    },
    {
-        "name": "localized_index_json_no_cache",
+        "name": "localized_index_json_no_store",
        "path": "/templates/index.zh.json",
        "status": 200,
-        "expected_cache": "no-cache",
+        "expected_cache": "no-store",
        "should_have_header": True,
    },
    # Non-matching files
Author	SHA1	Message	Date
Christian Byrne	d331f7c068	Merge `69d3bfa391` into `e84a200a3c`	2026-03-16 04:00:09 +09:00
rattus	e84a200a3c	ops: opt out of deferred weight init if subclassed (#12967 ) If a subclass BYO _load_from_state_dict and doesnt call the super() the needed default init of these weights is missed and can lead to problems for uninitialized weights.	2026-03-15 11:49:49 -07:00
Dr.Lt.Data	192cb8eeb9	bump manager version to 4.1b5 (#12957 )	2026-03-15 11:48:56 -07:00
Jukka Seppänen	0904cc3fe5	LTXV: Accumulate VAE decode results on intermediate_device (#12955 ) Some checks failed Python Linting / Run Ruff (push) Waiting to run Details Python Linting / Run Pylint (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.10, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.11, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.12, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-unix-nightly (12.1, , linux, 3.11, [self-hosted Linux], nightly) (push) Waiting to run Details Execution Tests / test (ubuntu-latest) (push) Waiting to run Details Execution Tests / test (windows-latest) (push) Waiting to run Details Execution Tests / test (macos-latest) (push) Waiting to run Details Test server launches without errors / test (push) Waiting to run Details Unit Tests / test (macos-latest) (push) Waiting to run Details Unit Tests / test (ubuntu-latest) (push) Waiting to run Details Unit Tests / test (windows-2022) (push) Waiting to run Details Build package / Build Test (3.10) (push) Has been cancelled Details Build package / Build Test (3.11) (push) Has been cancelled Details Build package / Build Test (3.12) (push) Has been cancelled Details Build package / Build Test (3.13) (push) Has been cancelled Details Build package / Build Test (3.14) (push) Has been cancelled Details Generate Pydantic Stubs from api.comfy.org / generate-models (push) Has been cancelled Details	2026-03-14 18:09:09 -07:00
comfyanonymous	4941cd046e	Update comfyui-frontend-package to version 1.41.20 (#12954 )	2026-03-14 19:53:31 -04:00
comfyanonymous	c711b8f437	Add --fp16-intermediates to use fp16 for intermediate values between nodes (#12953 ) This is an experimental WIP option that might not work in your workflow but should lower memory usage if it does. Currently only the VAE and the load image node will output in fp16 when this option is turned on.	2026-03-14 19:18:19 -04:00
Jukka Seppänen	1c5db7397d	feat: Support mxfp8 (#12907 )	2026-03-14 18:36:29 -04:00
Christian Byrne	e0982a7174	fix: use no-store cache headers to prevent stale frontend chunks (#12911 ) After a frontend update (e.g. nightly build), browsers could load outdated cached index.html and JS/CSS chunks, causing dynamically imported modules to fail with MIME type errors and vite:preloadError. Hard refresh (Ctrl+Shift+R) was insufficient to fix the issue because Cache-Control: no-cache still allows the browser to cache and revalidate via ETags. aiohttp's FileResponse auto-generates ETags based on file mtime+size, which may not change after pip reinstall, so the browser gets 304 Not Modified and serves stale content. Clearing ALL site data in DevTools did fix it, confirming the HTTP cache was the root cause. The fix changes: - index.html: no-cache -> no-store, must-revalidate - JS/CSS/JSON entry points: no-cache -> no-store no-store instructs browsers to never cache these responses, ensuring every page load fetches the current index.html with correct chunk references. This is a small tradeoff (~5KB re-download per page load) for guaranteed correctness after updates.	2026-03-14 18:25:09 -04:00
bymyself	69d3bfa391	fix: always send new binary format when client supports feature flag Some checks failed Python Linting / Run Ruff (push) Has been cancelled Details Python Linting / Run Pylint (push) Has been cancelled Details When prompt_id is None, encode as zero-length string instead of falling back to old format. Prevents binary parse corruption on the frontend. Addresses review feedback: https://github.com/Comfy-Org/ComfyUI/pull/12540#discussion_r2923412491	2026-03-12 09:20:52 -07:00
bymyself	09e9bdbcad	remove send_progress_text stub tests Some checks failed Python Linting / Run Ruff (push) Has been cancelled Details Python Linting / Run Pylint (push) Has been cancelled Details Copy-paste stub tests don't verify the real implementation and add maintenance burden without meaningful coverage. Amp-Thread-ID: https://ampcode.com/threads/T-019ca3ce-c530-75dd-8d68-349e745a022e	2026-03-04 20:50:01 +00:00
bymyself	d74dfd2570	fix: send_progress_text unicasts to client_id instead of broadcasting - Default sid to self.client_id when not explicitly provided, matching every other WS message dispatch (executing, executed, progress_state, etc.) - Previously sid=None caused broadcast to all connected clients - Format signature per ruff, remove redundant comments - Add unit tests for routing, legacy format, and new prompt_id format Amp-Thread-ID: https://ampcode.com/threads/T-019ca3ce-c530-75dd-8d68-349e745a022e	2026-03-04 20:40:46 +00:00
bymyself	83df2a88bd	refactor: add prompt_id as hidden type, fix imports, add docstrings Some checks failed Python Linting / Run Ruff (push) Has been cancelled Details Python Linting / Run Pylint (push) Has been cancelled Details Build package / Build Test (3.10) (push) Has been cancelled Details Build package / Build Test (3.11) (push) Has been cancelled Details Build package / Build Test (3.12) (push) Has been cancelled Details Build package / Build Test (3.13) (push) Has been cancelled Details Build package / Build Test (3.14) (push) Has been cancelled Details - Add PROMPT_ID as a new hidden type in the Hidden enum, HiddenHolder, HiddenInputTypeDict, and execution engine resolution (both V3 and legacy) - Refactor GetImageSize to use cls.hidden.prompt_id instead of manually calling get_executing_context() — addresses reviewer feedback - Remove lazy import of get_executing_context from nodes_images.py - Add docstrings to send_progress_text, _display_text, HiddenHolder, and HiddenHolder.from_dict Amp-Thread-ID: https://ampcode.com/threads/T-019ca1cb-0150-7549-8b1b-6713060d3408	2026-02-27 17:21:14 -08:00
bymyself	1c7e656eb4	Add prompt_id to progress_text binary WS messages Add supports_progress_text_metadata feature flag and extend send_progress_text() to accept optional prompt_id param. When prompt_id is provided and the client supports the new format, the binary wire format includes a length-prefixed prompt_id field: [4B event_type][4B prompt_id_len][prompt_id][4B node_id_len][node_id][text] Legacy format preserved for clients without the flag. Both callers (nodes_images.py, client.py) updated to pass prompt_id from get_executing_context(). Part of COM-12671: parallel workflow execution support. Amp-Thread-ID: https://ampcode.com/threads/T-019c79f7-f19b-70d9-b662-0687cc206282	2026-02-27 17:12:49 -08:00