Compare commits

...

8 Commits

Author SHA1 Message Date
tashiscool
2f13ca038f
Merge 45a2363e6a into b138133ffa 2026-05-03 16:10:23 -04:00
Silver
b138133ffa
Enable triton comfy kitchen via cli-arg (#12730) 2026-05-03 14:07:21 -04:00
Jukka Seppänen
025e6792ee
Batch broadcasting in JoinImageWithAlpha node (#13686)
Some checks failed
Python Linting / Run Ruff (push) Waiting to run
Python Linting / Run Pylint (push) Waiting to run
Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.10, [self-hosted Linux], stable) (push) Waiting to run
Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.11, [self-hosted Linux], stable) (push) Waiting to run
Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.12, [self-hosted Linux], stable) (push) Waiting to run
Full Comfy CI Workflow Runs / test-unix-nightly (12.1, , linux, 3.11, [self-hosted Linux], nightly) (push) Waiting to run
Execution Tests / test (macos-latest) (push) Waiting to run
Execution Tests / test (ubuntu-latest) (push) Waiting to run
Execution Tests / test (windows-latest) (push) Waiting to run
Test server launches without errors / test (push) Waiting to run
Unit Tests / test (macos-latest) (push) Waiting to run
Unit Tests / test (ubuntu-latest) (push) Waiting to run
Unit Tests / test (windows-2022) (push) Waiting to run
Generate Pydantic Stubs from api.comfy.org / generate-models (push) Has been cancelled
* Batch broadcasting in JoinImageWithAlpha node
2026-05-03 16:30:00 +03:00
Luke Mino-Altherr
867b8d2408
fix: gracefully handle port-in-use error on server startup (#13001)
Catch EADDRINUSE OSError when binding the TCP site and exit with a clear error message instead of an unhandled traceback.
2026-05-03 20:44:20 +08:00
Alexis Rolland
d0f0b15cf5
Update ComfyUI screenshot in README (#13683)
Update ComfyUI screenshot to showcase a more modern workflow
2026-05-03 18:48:58 +08:00
Alexis Rolland
b5bb83c964
Fix issue blend images with alpha (#13615)
Make ImageBlend and ImageCompositeMasked nodes handle images with different channel counts
2026-05-03 18:17:08 +08:00
Tashdid Khan
45a2363e6a fix: remove hardcoded local paths from MPS FP8 tests
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 21:27:35 -05:00
Tashdid Khan
edd44a6874 fix: add CPU fallback for FP8 quantization on MPS (Apple Silicon)
MPS does not support float8_e4m3fn/float8_e5m2 dtypes. When FP8-quantized
models (FLUX, SD3.5, Wan 2.2, LTX-Video) are loaded on Apple Silicon, the
quantization step crashes with:

  TypeError: Trying to convert Float8_e4m3fn to the MPS backend but it does
  not have support for that dtype.

This adds device-aware fallbacks that move tensors to CPU for the FP8
quantization step only. The rest of inference remains on MPS.

Three code paths are patched:
- comfy/float.py: stochastic_rounding() — also fixes the secondary
  "Placeholder storage has not been allocated on MPS device!" error
  caused by torch.Generator being bound to MPS.
- comfy/float.py: stochastic_round_quantize_nvfp4*() — these create
  float8_e4m3fn block scales internally.
- comfy/quant_ops.py: _TensorCoreFP8LayoutBase.quantize() — the
  ck.quantize_per_tensor_fp8 path also fails on MPS.

Fixes: #6995, #9255, #11626, #11817

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 21:21:31 -05:00
8 changed files with 198 additions and 12 deletions

View File

@ -31,7 +31,8 @@
[github-downloads-latest-shield]: https://img.shields.io/github/downloads/comfyanonymous/ComfyUI/latest/total?style=flat&label=downloads%40latest
[github-downloads-link]: https://github.com/comfyanonymous/ComfyUI/releases
<img width="1590" height="795" alt="ComfyUI Screenshot" src="https://github.com/user-attachments/assets/4aab0bef-b413-4595-9766-a2c134676d27" />
<img width="1590" height="795" alt="ComfyUI Screenshot" src="https://github.com/user-attachments/assets/36e065e0-bfae-4456-8c7f-8369d5ea48a2" />
<br>
</div>
ComfyUI is the AI creation engine for visual professionals who demand control over every model, every parameter, and every output. Its powerful and modular node graph interface empowers creatives to generate images, videos, 3D models, audio, and more...

View File

@ -91,6 +91,7 @@ parser.add_argument("--directml", type=int, nargs="?", metavar="DIRECTML_DEVICE"
parser.add_argument("--oneapi-device-selector", type=str, default=None, metavar="SELECTOR_STRING", help="Sets the oneAPI device(s) this instance will use.")
parser.add_argument("--supports-fp8-compute", action="store_true", help="ComfyUI will act like if the device supports fp8 compute.")
parser.add_argument("--enable-triton-backend", action="store_true", help="ComfyUI will enable the use of Triton backend in comfy-kitchen. Is disabled at launch by default.")
class LatentPreviewMethod(enum.Enum):
NoPreviews = "none"

View File

@ -55,6 +55,11 @@ def stochastic_rounding(value, dtype, seed=0):
if dtype == torch.bfloat16:
return value.to(dtype=torch.bfloat16)
if dtype == torch.float8_e4m3fn or dtype == torch.float8_e5m2:
# MPS does not support FP8 dtypes — perform rounding on CPU and return the result there.
on_mps = value.device.type == "mps"
if on_mps:
value = value.cpu()
generator = torch.Generator(device=value.device)
generator.manual_seed(seed)
output = torch.empty_like(value, dtype=dtype)
@ -159,6 +164,12 @@ def stochastic_round_quantize_nvfp4(x, per_tensor_scale, pad_16x, seed=0):
"""Round up x to the nearest multiple."""
return ((x + multiple - 1) // multiple) * multiple
# MPS does not support FP8 dtypes used for block scales — perform on CPU.
on_mps = x.device.type == "mps"
if on_mps:
x = x.cpu()
per_tensor_scale = per_tensor_scale.cpu() if isinstance(per_tensor_scale, torch.Tensor) else per_tensor_scale
generator = torch.Generator(device=x.device)
generator.manual_seed(seed)
@ -179,6 +190,12 @@ def stochastic_round_quantize_nvfp4_by_block(x, per_tensor_scale, pad_16x, seed=
"""Round up x to the nearest multiple."""
return ((x + multiple - 1) // multiple) * multiple
# MPS does not support FP8 dtypes used for block scales — perform on CPU.
on_mps = x.device.type == "mps"
if on_mps:
x = x.cpu()
per_tensor_scale = per_tensor_scale.cpu() if isinstance(per_tensor_scale, torch.Tensor) else per_tensor_scale
orig_shape = x.shape
# Handle padding

View File

@ -1,6 +1,8 @@
import torch
import logging
from comfy.cli_args import args
try:
import comfy_kitchen as ck
from comfy_kitchen.tensor import (
@ -21,7 +23,15 @@ try:
ck.registry.disable("cuda")
logging.warning("WARNING: You need pytorch with cu130 or higher to use optimized CUDA operations.")
ck.registry.disable("triton")
if args.enable_triton_backend:
try:
import triton
logging.info("Found triton %s. Enabling comfy-kitchen triton backend.", triton.__version__)
except ImportError as e:
logging.error(f"Failed to import triton, Error: {e}, the comfy-kitchen triton backend will not be available.")
ck.registry.disable("triton")
else:
ck.registry.disable("triton")
for k, v in ck.list_backends().items():
logging.info(f"Found comfy_kitchen backend {k}: {v}")
except ImportError as e:
@ -83,6 +93,12 @@ class _TensorCoreFP8LayoutBase(_CKFp8Layout):
if not isinstance(scale, torch.Tensor):
scale = torch.tensor(scale, device=tensor.device, dtype=torch.float32)
# MPS does not support FP8 dtypes — move to CPU for quantization.
on_mps = tensor.device.type == "mps"
if on_mps:
tensor = tensor.cpu()
scale = scale.cpu()
if stochastic_rounding > 0:
if inplace_ops:
tensor *= (1.0 / scale).to(tensor.dtype)

View File

@ -202,14 +202,11 @@ class JoinImageWithAlpha(io.ComfyNode):
@classmethod
def execute(cls, image: torch.Tensor, alpha: torch.Tensor) -> io.NodeOutput:
batch_size = min(len(image), len(alpha))
out_images = []
batch_size = max(len(image), len(alpha))
alpha = 1.0 - resize_mask(alpha, image.shape[1:])
for i in range(batch_size):
out_images.append(torch.cat((image[i][:,:,:3], alpha[i].unsqueeze(2)), dim=2))
return io.NodeOutput(torch.stack(out_images))
alpha = comfy.utils.repeat_to_batch_size(alpha, batch_size)
image = comfy.utils.repeat_to_batch_size(image, batch_size)
return io.NodeOutput(torch.cat((image[..., :3], alpha.unsqueeze(-1)), dim=-1))
class CompositingExtension(ComfyExtension):

View File

@ -86,6 +86,6 @@ def image_alpha_fix(destination, source):
if destination.shape[-1] < source.shape[-1]:
source = source[...,:destination.shape[-1]]
elif destination.shape[-1] > source.shape[-1]:
destination = torch.nn.functional.pad(destination, (0, 1))
destination[..., -1] = 1.0
source = torch.nn.functional.pad(source, (0, 1))
source[..., -1] = 1.0
return destination, source

View File

@ -1,3 +1,4 @@
import errno
import os
import sys
import asyncio
@ -1245,7 +1246,13 @@ class PromptServer():
address = addr[0]
port = addr[1]
site = web.TCPSite(runner, address, port, ssl_context=ssl_ctx)
await site.start()
try:
await site.start()
except OSError as e:
if e.errno == errno.EADDRINUSE:
logging.error(f"Port {port} is already in use on address {address}. Please close the other application or use a different port with --port.")
raise SystemExit(1)
raise
if not hasattr(self, 'address'):
self.address = address #TODO: remove this

147
tests/test_fp8_mps.py Normal file
View File

@ -0,0 +1,147 @@
"""
Tests for FP8 quantization on MPS (Apple Silicon) devices.
MPS does not natively support float8_e4m3fn or float8_e5m2 dtypes.
These tests verify that:
1. FP8 operations correctly fall back to CPU when on MPS.
2. The round-trip (quantize on CPU -> result on original device) is numerically sound.
3. No "Placeholder storage has not been allocated on MPS device!" errors occur.
"""
import pytest
import torch
import comfy.float
from comfy.quant_ops import TensorCoreFP8E4M3Layout, TensorCoreFP8E5M2Layout
# Skip the entire module if MPS is not available
pytestmark = pytest.mark.skipif(
not torch.backends.mps.is_available(),
reason="MPS backend not available"
)
# ── helpers ──────────────────────────────────────────────────────────────────
def _make_mps_tensor(shape=(256, 256), dtype=torch.float32):
return torch.randn(shape, device="mps", dtype=dtype)
# ── Tests for comfy.float ────────────────────────────────────────────────────
class TestStochasticRoundingMPS:
"""Tests for comfy.float.stochastic_rounding on MPS device."""
def test_stochastic_rounding_fp8_e4m3fn_on_mps(self):
"""stochastic_rounding must not crash when input is on MPS and target dtype is float8_e4m3fn."""
x = _make_mps_tensor((64, 64), dtype=torch.float32)
result = comfy.float.stochastic_rounding(x, dtype=torch.float8_e4m3fn, seed=42)
assert result.dtype == torch.float8_e4m3fn
assert result.shape == x.shape
def test_stochastic_rounding_fp8_e5m2_on_mps(self):
"""stochastic_rounding must not crash when input is on MPS and target dtype is float8_e5m2."""
x = _make_mps_tensor((64, 64), dtype=torch.float32)
result = comfy.float.stochastic_rounding(x, dtype=torch.float8_e5m2, seed=42)
assert result.dtype == torch.float8_e5m2
assert result.shape == x.shape
def test_stochastic_rounding_fp8_result_on_cpu(self):
"""Result of FP8 rounding from MPS input should be on CPU (since MPS can't hold FP8)."""
x = _make_mps_tensor((32, 32), dtype=torch.float32)
result = comfy.float.stochastic_rounding(x, dtype=torch.float8_e4m3fn, seed=42)
# FP8 tensors cannot live on MPS, so result must be on CPU
assert result.device.type == "cpu"
def test_stochastic_rounding_non_fp8_still_works(self):
"""Non-FP8 dtypes on MPS must still work as before (no regression)."""
x = _make_mps_tensor((32, 32), dtype=torch.float32)
r16 = comfy.float.stochastic_rounding(x, dtype=torch.float16, seed=0)
assert r16.dtype == torch.float16
assert r16.device.type == "mps"
rbf16 = comfy.float.stochastic_rounding(x, dtype=torch.bfloat16, seed=0)
assert rbf16.dtype == torch.bfloat16
assert rbf16.device.type == "mps"
def test_stochastic_rounding_fp8_numerical_sanity(self):
"""FP8 round-trip (float32 -> fp8 -> float32) should have bounded error."""
x = torch.randn(128, 128, device="mps", dtype=torch.float32)
x_clamped = torch.clamp(x, min=-448, max=448) # FP8 e4m3fn range
fp8 = comfy.float.stochastic_rounding(x_clamped, dtype=torch.float8_e4m3fn, seed=123)
# Convert back to float32 for comparison
reconstructed = fp8.to(torch.float32)
# Max relative error should be bounded (FP8 e4m3fn has ~0.125 relative precision)
x_cpu = x_clamped.cpu()
max_abs_err = (reconstructed - x_cpu).abs().max().item()
# FP8 e4m3fn max value is 448, min subnormal ~0.001953
# For random normal data, error should be well under 1.0
assert max_abs_err < 2.0, f"FP8 round-trip error too large: {max_abs_err}"
class TestManualStochasticRoundMPS:
"""Tests for comfy.float.manual_stochastic_round_to_float8 on MPS device."""
def test_manual_round_fp8_on_mps_tensor(self):
"""stochastic_rounding handles MPS generator internally without 'Placeholder storage' error."""
x = _make_mps_tensor((16, 16), dtype=torch.float32)
result = comfy.float.stochastic_rounding(x, dtype=torch.float8_e4m3fn, seed=42)
assert result.dtype == torch.float8_e4m3fn
class TestNVFP4StochasticRoundMPS:
"""Tests for NVFP4 stochastic rounding on MPS - also creates FP8 tensors internally."""
def test_nvfp4_stochastic_round_on_mps(self):
"""stochastic_round_quantize_nvfp4 creates FP8 block scales internally."""
# NVFP4 requires 2D input with dimensions divisible by 16
x = torch.randn(32, 32, device="mps", dtype=torch.float32)
scale = torch.tensor(1.0, device="mps", dtype=torch.float32)
# This should not crash - internally creates float8_e4m3fn block scales
qdata, block_scale = comfy.float.stochastic_round_quantize_nvfp4(
x, scale, pad_16x=False, seed=42
)
assert qdata.dtype == torch.uint8
# ── Tests for comfy.quant_ops (integration) ──────────────────────────────────
class TestQuantOpsMPS:
"""Tests for the quantization ops layer that calls into comfy.float."""
def test_fp8_layout_quantize_on_mps(self):
"""TensorCoreFP8E4M3Layout.quantize must work with MPS tensors."""
x = _make_mps_tensor((64, 64), dtype=torch.bfloat16)
qdata, params = TensorCoreFP8E4M3Layout.quantize(
x, scale="recalculate", stochastic_rounding=42
)
assert qdata.dtype == torch.float8_e4m3fn
assert params.orig_dtype == torch.bfloat16
def test_fp8_layout_quantize_without_stochastic_on_mps(self):
"""TensorCoreFP8E4M3Layout.quantize with stochastic_rounding=0 uses ck.quantize_per_tensor_fp8."""
x = _make_mps_tensor((64, 64), dtype=torch.bfloat16)
qdata, params = TensorCoreFP8E4M3Layout.quantize(
x, scale="recalculate", stochastic_rounding=0
)
assert qdata.dtype == torch.float8_e4m3fn
def test_fp8_e5m2_layout_quantize_on_mps(self):
"""TensorCoreFP8E5M2Layout.quantize must work with MPS tensors."""
x = _make_mps_tensor((64, 64), dtype=torch.float32)
qdata, params = TensorCoreFP8E5M2Layout.quantize(
x, scale="recalculate", stochastic_rounding=42
)
assert qdata.dtype == torch.float8_e5m2
if __name__ == "__main__":
pytest.main([__file__, "-v", "--tb=short"])