Merge 45a2363e6a into 7810f49702

2026-03-17 07:05:12 +08:00 · 2026-03-14 10:18:12 +08:00
13 changed files with 29 additions and 165 deletions
--- a/README.md
+++ b/README.md
@ -38,8 +38,6 @@ ComfyUI lets you design and execute advanced stable diffusion pipelines using a

 ## Get Started

-### Local
-
 #### [Desktop Application](https://www.comfy.org/download)
 - The easiest way to get started.
 - Available on Windows & macOS.
@ -51,13 +49,8 @@ ComfyUI lets you design and execute advanced stable diffusion pipelines using a
 #### [Manual Install](#manual-install-windows-linux)
 Supports all operating systems and GPU types (NVIDIA, AMD, Intel, Apple Silicon, Ascend).

-### Cloud
-
-#### [Comfy Cloud](https://www.comfy.org/cloud)
- Our official paid cloud version for those who can't afford local hardware.
-
-## Examples
-See what ComfyUI can do with the [newer template workflows](https://comfy.org/workflows) or old [example workflows](https://comfyanonymous.github.io/ComfyUI_examples/).
+## [Examples](https://comfyanonymous.github.io/ComfyUI_examples/)
+See what ComfyUI can do with the [example workflows](https://comfyanonymous.github.io/ComfyUI_examples/).

 ## Features
 - Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything.
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@ -83,8 +83,6 @@ fpte_group.add_argument("--fp16-text-enc", action="store_true", help="Store text
 fpte_group.add_argument("--fp32-text-enc", action="store_true", help="Store text encoder weights in fp32.")
 fpte_group.add_argument("--bf16-text-enc", action="store_true", help="Store text encoder weights in bf16.")

-parser.add_argument("--fp16-intermediates", action="store_true", help="Experimental: Use fp16 for intermediate tensors between nodes instead of fp32.")
-
 parser.add_argument("--force-channels-last", action="store_true", help="Force channels last format when inferencing the models.")

 parser.add_argument("--directml", type=int, nargs="?", metavar="DIRECTML_DEVICE", const=-1, help="Use torch-directml.")
--- a/comfy/float.py
+++ b/comfy/float.py
@ -226,39 +226,3 @@ def stochastic_round_quantize_nvfp4_by_block(x, per_tensor_scale, pad_16x, seed=
        output_block[i:i + slice_size].copy_(block)

    return output_fp4, to_blocked(output_block, flatten=False)
-
-
-def stochastic_round_quantize_mxfp8_by_block(x, pad_32x, seed=0):
-    def roundup(x_val, multiple):
-        return ((x_val + multiple - 1) // multiple) * multiple
-
-    if pad_32x:
-        rows, cols = x.shape
-        padded_rows = roundup(rows, 32)
-        padded_cols = roundup(cols, 32)
-        if padded_rows != rows or padded_cols != cols:
-            x = torch.nn.functional.pad(x, (0, padded_cols - cols, 0, padded_rows - rows))
-
-    F8_E4M3_MAX = 448.0
-    E8M0_BIAS = 127
-    BLOCK_SIZE = 32
-
-    rows, cols = x.shape
-    x_blocked = x.reshape(rows, -1, BLOCK_SIZE)
-    max_abs = torch.amax(torch.abs(x_blocked), dim=-1)
-
-    # E8M0 block scales (power-of-2 exponents)
-    scale_needed = torch.clamp(max_abs.float() / F8_E4M3_MAX, min=2**(-127))
-    exp_biased = torch.clamp(torch.ceil(torch.log2(scale_needed)).to(torch.int32) + E8M0_BIAS, 0, 254)
-    block_scales_e8m0 = exp_biased.to(torch.uint8)
-
-    zero_mask = (max_abs == 0)
-    block_scales_f32 = (block_scales_e8m0.to(torch.int32) << 23).view(torch.float32)
-    block_scales_f32 = torch.where(zero_mask, torch.ones_like(block_scales_f32), block_scales_f32)
-
-    # Scale per-block then stochastic round
-    data_scaled = (x_blocked.float() / block_scales_f32.unsqueeze(-1)).reshape(rows, cols)
-    output_fp8 = stochastic_rounding(data_scaled, torch.float8_e4m3fn, seed=seed)
-
-    block_scales_e8m0 = torch.where(zero_mask, torch.zeros_like(block_scales_e8m0), block_scales_e8m0)
-    return output_fp8, to_blocked(block_scales_e8m0, flatten=False).view(torch.float8_e8m0fnu)
--- a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
+++ b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
@ -11,7 +11,6 @@ from .causal_conv3d import CausalConv3d
 from .pixel_norm import PixelNorm
 from ..model import PixArtAlphaCombinedTimestepSizeEmbeddings
 import comfy.ops
-import comfy.model_management
 from comfy.ldm.modules.diffusionmodules.model import torch_cat_if_needed

 ops = comfy.ops.disable_weight_init
@ -537,7 +536,7 @@ class Decoder(nn.Module):
                    mark_conv3d_ended(self.conv_out)
                sample = self.conv_out(sample, causal=self.causal)
                if sample is not None and sample.shape[2] > 0:
-                    output.append(sample.to(comfy.model_management.intermediate_device()))
+                    output.append(sample)
                return

            up_block = self.up_blocks[idx]
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@ -1050,12 +1050,6 @@ def intermediate_device():
    else:
        return torch.device("cpu")

-def intermediate_dtype():
-    if args.fp16_intermediates:
-        return torch.float16
-    else:
-        return torch.float32
-
 def vae_device():
    if args.cpu_vae:
        return torch.device("cpu")
@ -1718,19 +1712,6 @@ def supports_nvfp4_compute(device=None):

    return True

-def supports_mxfp8_compute(device=None):
-    if not is_nvidia():
-        return False
-
-    if torch_version_numeric < (2, 10):
-        return False
-
-    props = torch.cuda.get_device_properties(device)
-    if props.major < 10:
-        return False
-
-    return True
-
 def extended_fp16_support():
    # TODO: check why some models work with fp16 on newer torch versions but not on older
    if torch_version_numeric < (2, 7):
--- a/comfy/ops.py
+++ b/comfy/ops.py
@ -857,22 +857,6 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec
                            orig_shape=(self.out_features, self.in_features),
                        )

-                    elif self.quant_format == "mxfp8":
-                        # MXFP8: E8M0 block scales stored as uint8 in safetensors
-                        block_scale = self._load_scale_param(state_dict, prefix, "weight_scale", device, manually_loaded_keys,
-                                                             dtype=torch.uint8)
-
-                        if block_scale is None:
-                            raise ValueError(f"Missing MXFP8 block scales for layer {layer_name}")
-
-                        block_scale = block_scale.view(torch.float8_e8m0fnu)
-
-                        params = layout_cls.Params(
-                            scale=block_scale,
-                            orig_dtype=MixedPrecisionOps._compute_dtype,
-                            orig_shape=(self.out_features, self.in_features),
-                        )
-
                    elif self.quant_format == "nvfp4":
                        # NVFP4: tensor_scale (weight_scale_2) + block_scale (weight_scale)
                        tensor_scale = self._load_scale_param(state_dict, prefix, "weight_scale_2", device, manually_loaded_keys)
@ -1022,15 +1006,12 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec
 def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, model_config=None):
    fp8_compute = comfy.model_management.supports_fp8_compute(load_device) # TODO: if we support more ops this needs to be more granular
    nvfp4_compute = comfy.model_management.supports_nvfp4_compute(load_device)
-    mxfp8_compute = comfy.model_management.supports_mxfp8_compute(load_device)

    if model_config and hasattr(model_config, 'quant_config') and model_config.quant_config:
        logging.info("Using mixed precision operations")
        disabled = set()
        if not nvfp4_compute:
            disabled.add("nvfp4")
-        if not mxfp8_compute:
-            disabled.add("mxfp8")
        if not fp8_compute:
            disabled.add("float8_e4m3fn")
            disabled.add("float8_e5m2")
--- a/comfy/quant_ops.py
+++ b/comfy/quant_ops.py
@ -43,18 +43,6 @@ except ImportError as e:
    def get_layout_class(name):
        return None

-_CK_MXFP8_AVAILABLE = False
-if _CK_AVAILABLE:
-    try:
-        from comfy_kitchen.tensor import TensorCoreMXFP8Layout as _CKMxfp8Layout
-        _CK_MXFP8_AVAILABLE = True
-    except ImportError:
-        logging.warning("comfy_kitchen does not support MXFP8, please update comfy_kitchen.")
-
-if not _CK_MXFP8_AVAILABLE:
-    class _CKMxfp8Layout:
-        pass
-
 import comfy.float

 # ==============================================================================
@ -102,31 +90,6 @@ class _TensorCoreFP8LayoutBase(_CKFp8Layout):
        return qdata, params


-class TensorCoreMXFP8Layout(_CKMxfp8Layout):
-    @classmethod
-    def quantize(cls, tensor, scale=None, stochastic_rounding=0, inplace_ops=False):
-        if tensor.dim() != 2:
-            raise ValueError(f"MXFP8 requires 2D tensor, got {tensor.dim()}D")
-
-        orig_dtype = tensor.dtype
-        orig_shape = tuple(tensor.shape)
-
-        padded_shape = cls.get_padded_shape(orig_shape)
-        needs_padding = padded_shape != orig_shape
-
-        if stochastic_rounding > 0:
-            qdata, block_scale = comfy.float.stochastic_round_quantize_mxfp8_by_block(tensor, pad_32x=needs_padding, seed=stochastic_rounding)
-        else:
-            qdata, block_scale = ck.quantize_mxfp8(tensor, pad_32x=needs_padding)
-
-        params = cls.Params(
-            scale=block_scale,
-            orig_dtype=orig_dtype,
-            orig_shape=orig_shape,
-        )
-        return qdata, params
-
-
 class TensorCoreNVFP4Layout(_CKNvfp4Layout):
    @classmethod
    def quantize(cls, tensor, scale=None, stochastic_rounding=0, inplace_ops=False):
@ -180,8 +143,6 @@ register_layout_class("TensorCoreFP8Layout", TensorCoreFP8Layout)
 register_layout_class("TensorCoreFP8E4M3Layout", TensorCoreFP8E4M3Layout)
 register_layout_class("TensorCoreFP8E5M2Layout", TensorCoreFP8E5M2Layout)
 register_layout_class("TensorCoreNVFP4Layout", TensorCoreNVFP4Layout)
-if _CK_MXFP8_AVAILABLE:
-    register_layout_class("TensorCoreMXFP8Layout", TensorCoreMXFP8Layout)

 QUANT_ALGOS = {
    "float8_e4m3fn": {
@ -202,14 +163,6 @@ QUANT_ALGOS = {
    },
 }

-if _CK_MXFP8_AVAILABLE:
-    QUANT_ALGOS["mxfp8"] = {
-        "storage_t": torch.float8_e4m3fn,
-        "parameters": {"weight_scale", "input_scale"},
-        "comfy_tensor_layout": "TensorCoreMXFP8Layout",
-        "group_size": 32,
-    }
-

 # ==============================================================================
 # Re-exports for backward compatibility
--- a/comfy/sd.py
+++ b/comfy/sd.py
@ -871,16 +871,13 @@ class VAE:
                pixels = torch.nn.functional.pad(pixels, (0, self.output_channels - pixels.shape[-1]), mode=mode, value=value)
        return pixels

-    def vae_output_dtype(self):
-        return model_management.intermediate_dtype()
-
    def decode_tiled_(self, samples, tile_x=64, tile_y=64, overlap = 16):
        steps = samples.shape[0] * comfy.utils.get_tiled_scale_steps(samples.shape[3], samples.shape[2], tile_x, tile_y, overlap)
        steps += samples.shape[0] * comfy.utils.get_tiled_scale_steps(samples.shape[3], samples.shape[2], tile_x // 2, tile_y * 2, overlap)
        steps += samples.shape[0] * comfy.utils.get_tiled_scale_steps(samples.shape[3], samples.shape[2], tile_x * 2, tile_y // 2, overlap)
        pbar = comfy.utils.ProgressBar(steps)

-        decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).to(dtype=self.vae_output_dtype())
+        decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).float()
        output = self.process_output(
            (comfy.utils.tiled_scale(samples, decode_fn, tile_x // 2, tile_y * 2, overlap, upscale_amount = self.upscale_ratio, output_device=self.output_device, pbar = pbar) +
            comfy.utils.tiled_scale(samples, decode_fn, tile_x * 2, tile_y // 2, overlap, upscale_amount = self.upscale_ratio, output_device=self.output_device, pbar = pbar) +
@ -890,16 +887,16 @@ class VAE:

    def decode_tiled_1d(self, samples, tile_x=256, overlap=32):
        if samples.ndim == 3:
-            decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).to(dtype=self.vae_output_dtype())
+            decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).float()
        else:
            og_shape = samples.shape
            samples = samples.reshape((og_shape[0], og_shape[1] * og_shape[2], -1))
-            decode_fn = lambda a: self.first_stage_model.decode(a.reshape((-1, og_shape[1], og_shape[2], a.shape[-1])).to(self.vae_dtype).to(self.device)).to(dtype=self.vae_output_dtype())
+            decode_fn = lambda a: self.first_stage_model.decode(a.reshape((-1, og_shape[1], og_shape[2], a.shape[-1])).to(self.vae_dtype).to(self.device)).float()

        return self.process_output(comfy.utils.tiled_scale_multidim(samples, decode_fn, tile=(tile_x,), overlap=overlap, upscale_amount=self.upscale_ratio, out_channels=self.output_channels, output_device=self.output_device))

    def decode_tiled_3d(self, samples, tile_t=999, tile_x=32, tile_y=32, overlap=(1, 8, 8)):
-        decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).to(dtype=self.vae_output_dtype())
+        decode_fn = lambda a: self.first_stage_model.decode(a.to(self.vae_dtype).to(self.device)).float()
        return self.process_output(comfy.utils.tiled_scale_multidim(samples, decode_fn, tile=(tile_t, tile_x, tile_y), overlap=overlap, upscale_amount=self.upscale_ratio, out_channels=self.output_channels, index_formulas=self.upscale_index_formula, output_device=self.output_device))

    def encode_tiled_(self, pixel_samples, tile_x=512, tile_y=512, overlap = 64):
@ -908,7 +905,7 @@ class VAE:
        steps += pixel_samples.shape[0] * comfy.utils.get_tiled_scale_steps(pixel_samples.shape[3], pixel_samples.shape[2], tile_x * 2, tile_y // 2, overlap)
        pbar = comfy.utils.ProgressBar(steps)

-        encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).to(dtype=self.vae_output_dtype())
+        encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).float()
        samples = comfy.utils.tiled_scale(pixel_samples, encode_fn, tile_x, tile_y, overlap, upscale_amount = (1/self.downscale_ratio), out_channels=self.latent_channels, output_device=self.output_device, pbar=pbar)
        samples += comfy.utils.tiled_scale(pixel_samples, encode_fn, tile_x * 2, tile_y // 2, overlap, upscale_amount = (1/self.downscale_ratio), out_channels=self.latent_channels, output_device=self.output_device, pbar=pbar)
        samples += comfy.utils.tiled_scale(pixel_samples, encode_fn, tile_x // 2, tile_y * 2, overlap, upscale_amount = (1/self.downscale_ratio), out_channels=self.latent_channels, output_device=self.output_device, pbar=pbar)
@ -917,7 +914,7 @@ class VAE:

    def encode_tiled_1d(self, samples, tile_x=256 * 2048, overlap=64 * 2048):
        if self.latent_dim == 1:
-            encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).to(dtype=self.vae_output_dtype())
+            encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).float()
            out_channels = self.latent_channels
            upscale_amount = 1 / self.downscale_ratio
        else:
@ -926,7 +923,7 @@ class VAE:
            tile_x = tile_x // extra_channel_size
            overlap = overlap // extra_channel_size
            upscale_amount = 1 / self.downscale_ratio
-            encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).reshape(1, out_channels, -1).to(dtype=self.vae_output_dtype())
+            encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).reshape(1, out_channels, -1).float()

        out = comfy.utils.tiled_scale_multidim(samples, encode_fn, tile=(tile_x,), overlap=overlap, upscale_amount=upscale_amount, out_channels=out_channels, output_device=self.output_device)
        if self.latent_dim == 1:
@ -935,7 +932,7 @@ class VAE:
            return out.reshape(samples.shape[0], self.latent_channels, extra_channel_size, -1)

    def encode_tiled_3d(self, samples, tile_t=9999, tile_x=512, tile_y=512, overlap=(1, 64, 64)):
-        encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).to(dtype=self.vae_output_dtype())
+        encode_fn = lambda a: self.first_stage_model.encode((self.process_input(a)).to(self.vae_dtype).to(self.device)).float()
        return comfy.utils.tiled_scale_multidim(samples, encode_fn, tile=(tile_t, tile_x, tile_y), overlap=overlap, upscale_amount=self.downscale_ratio, out_channels=self.latent_channels, downscale=True, index_formulas=self.downscale_index_formula, output_device=self.output_device)

    def decode(self, samples_in, vae_options={}):
@ -953,9 +950,9 @@ class VAE:

            for x in range(0, samples_in.shape[0], batch_number):
                samples = samples_in[x:x+batch_number].to(self.vae_dtype).to(self.device)
-                out = self.process_output(self.first_stage_model.decode(samples, **vae_options).to(self.output_device).to(dtype=self.vae_output_dtype()))
+                out = self.process_output(self.first_stage_model.decode(samples, **vae_options).to(self.output_device).float())
                if pixel_samples is None:
-                    pixel_samples = torch.empty((samples_in.shape[0],) + tuple(out.shape[1:]), device=self.output_device, dtype=self.vae_output_dtype())
+                    pixel_samples = torch.empty((samples_in.shape[0],) + tuple(out.shape[1:]), device=self.output_device)
                pixel_samples[x:x+batch_number] = out
        except Exception as e:
            model_management.raise_non_oom(e)
@ -1028,9 +1025,9 @@ class VAE:
            samples = None
            for x in range(0, pixel_samples.shape[0], batch_number):
                pixels_in = self.process_input(pixel_samples[x:x + batch_number]).to(self.vae_dtype).to(self.device)
-                out = self.first_stage_model.encode(pixels_in).to(self.output_device).to(dtype=self.vae_output_dtype())
+                out = self.first_stage_model.encode(pixels_in).to(self.output_device).float()
                if samples is None:
-                    samples = torch.empty((pixel_samples.shape[0],) + tuple(out.shape[1:]), device=self.output_device, dtype=self.vae_output_dtype())
+                    samples = torch.empty((pixel_samples.shape[0],) + tuple(out.shape[1:]), device=self.output_device)
                samples[x:x + batch_number] = out

        except Exception as e:
--- a/middleware/cache_middleware.py
+++ b/middleware/cache_middleware.py
@ -32,7 +32,7 @@ async def cache_control(
    )

    if request.path.endswith(".js") or request.path.endswith(".css") or is_entry_point:
-        response.headers.setdefault("Cache-Control", "no-store")
+        response.headers.setdefault("Cache-Control", "no-cache")
        return response

    # Early return for non-image files - no cache headers needed
--- a/nodes.py
+++ b/nodes.py
@ -1724,8 +1724,6 @@ class LoadImage:
        output_masks = []
        w, h = None, None

-        dtype = comfy.model_management.intermediate_dtype()
-
        for i in ImageSequence.Iterator(img):
            i = node_helpers.pillow(ImageOps.exif_transpose, i)

@ -1750,8 +1748,8 @@ class LoadImage:
                mask = 1. - torch.from_numpy(mask)
            else:
                mask = torch.zeros((64,64), dtype=torch.float32, device="cpu")
-            output_images.append(image.to(dtype=dtype))
-            output_masks.append(mask.unsqueeze(0).to(dtype=dtype))
+            output_images.append(image)
+            output_masks.append(mask.unsqueeze(0))

            if img.format == "MPO":
                break  # ignore all frames except the first one for MPO format
--- a/requirements.txt
+++ b/requirements.txt
@ -1,4 +1,4 @@
-comfyui-frontend-package==1.41.20
+comfyui-frontend-package==1.41.19
 comfyui-workflow-templates==0.9.21
 comfyui-embedded-docs==0.4.3
 torch
@ -23,7 +23,7 @@ SQLAlchemy
 filelock
 av>=14.2.0
 comfy-kitchen>=0.2.8
-comfy-aimdo>=0.2.12
+comfy-aimdo>=0.2.11
 requests
 simpleeval>=1.0.0
 blake3
--- a/server.py
+++ b/server.py
@ -310,7 +310,7 @@ class PromptServer():
        @routes.get("/")
        async def get_root(request):
            response = web.FileResponse(os.path.join(self.web_root, "index.html"))
-            response.headers['Cache-Control'] = 'no-store, must-revalidate'
+            response.headers['Cache-Control'] = 'no-cache'
            response.headers["Pragma"] = "no-cache"
            response.headers["Expires"] = "0"
            return response
--- a/tests-unit/server_test/test_cache_control.py
+++ b/tests-unit/server_test/test_cache_control.py
@ -28,31 +28,31 @@ CACHE_SCENARIOS = [
    },
    # JavaScript/CSS scenarios
    {
-        "name": "js_no_store",
+        "name": "js_no_cache",
        "path": "/script.js",
        "status": 200,
-        "expected_cache": "no-store",
+        "expected_cache": "no-cache",
        "should_have_header": True,
    },
    {
-        "name": "css_no_store",
+        "name": "css_no_cache",
        "path": "/styles.css",
        "status": 200,
-        "expected_cache": "no-store",
+        "expected_cache": "no-cache",
        "should_have_header": True,
    },
    {
-        "name": "index_json_no_store",
+        "name": "index_json_no_cache",
        "path": "/api/index.json",
        "status": 200,
-        "expected_cache": "no-store",
+        "expected_cache": "no-cache",
        "should_have_header": True,
    },
    {
-        "name": "localized_index_json_no_store",
+        "name": "localized_index_json_no_cache",
        "path": "/templates/index.zh.json",
        "status": 200,
-        "expected_cache": "no-store",
+        "expected_cache": "no-cache",
        "should_have_header": True,
    },
    # Non-matching files