From 6da00dd899e3ee6f2a0a8163b080a9f373395025 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 5 Jan 2026 18:48:58 -0800 Subject: [PATCH] Initial ops changes to use comfy_kitchen: Initial nvfp4 checkpoint support. (#11635) --------- Co-authored-by: Jedrzej Kosinski --- .github/workflows/test-build.yml | 2 +- .github/workflows/test-launch.yml | 4 +- comfy/model_management.py | 4 +- comfy/ops.py | 164 +++-- comfy/quant_ops.py | 641 +++--------------- requirements.txt | 1 + .../comfy_quant/test_mixed_precision.py | 12 +- tests-unit/comfy_quant/test_quant_registry.py | 190 ------ 8 files changed, 223 insertions(+), 795 deletions(-) delete mode 100644 tests-unit/comfy_quant/test_quant_registry.py diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml index 419873ad8..9160242e9 100644 --- a/.github/workflows/test-build.yml +++ b/.github/workflows/test-build.yml @@ -18,7 +18,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} diff --git a/.github/workflows/test-launch.yml b/.github/workflows/test-launch.yml index fd70aff23..ef0d3f123 100644 --- a/.github/workflows/test-launch.yml +++ b/.github/workflows/test-launch.yml @@ -32,7 +32,9 @@ jobs: working-directory: ComfyUI - name: Check for unhandled exceptions in server log run: | - if grep -qE "Exception|Error" console_output.log; then + grep -v "Found comfy_kitchen backend triton: {'available': False, 'disabled': True, 'unavailable_reason': \"ImportError: No module named 'triton'\", 'capabilities': \[\]}" console_output.log | grep -v "Found comfy_kitchen backend triton: {'available': False, 'disabled': False, 'unavailable_reason': \"ImportError: No module named 'triton'\", 'capabilities': \[\]}" > console_output_filtered.log + cat console_output_filtered.log + if grep -qE "Exception|Error" console_output_filtered.log; then echo "Unhandled exception/error found in server log." 
exit 1 fi diff --git a/comfy/model_management.py b/comfy/model_management.py index 7f5a8aee9..22f4de044 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1156,7 +1156,7 @@ def pin_memory(tensor): if not tensor.is_contiguous(): return False - size = tensor.numel() * tensor.element_size() + size = tensor.nbytes if (TOTAL_PINNED_MEMORY + size) > MAX_PINNED_MEMORY: return False @@ -1183,7 +1183,7 @@ def unpin_memory(tensor): return False ptr = tensor.data_ptr() - size = tensor.numel() * tensor.element_size() + size = tensor.nbytes size_stored = PINNED_MEMORY.get(ptr, None) if size_stored is None: diff --git a/comfy/ops.py b/comfy/ops.py index 16889bb82..f5e1e9230 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -79,7 +79,7 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of if input is not None: if dtype is None: if isinstance(input, QuantizedTensor): - dtype = input._layout_params["orig_dtype"] + dtype = input.params.orig_dtype else: dtype = input.dtype if bias_dtype is None: @@ -412,26 +412,34 @@ def fp8_linear(self, input): return None input_dtype = input.dtype + input_shape = input.shape + tensor_3d = input.ndim == 3 - if input.ndim == 3 or input.ndim == 2: - w, bias, offload_stream = cast_bias_weight(self, input, dtype=dtype, bias_dtype=input_dtype, offloadable=True) - scale_weight = torch.ones((), device=input.device, dtype=torch.float32) + if tensor_3d: + input = input.reshape(-1, input_shape[2]) - scale_input = torch.ones((), device=input.device, dtype=torch.float32) - input = torch.clamp(input, min=-448, max=448, out=input) - layout_params_weight = {'scale': scale_input, 'orig_dtype': input_dtype} - quantized_input = QuantizedTensor(input.to(dtype).contiguous(), "TensorCoreFP8Layout", layout_params_weight) + if input.ndim != 2: + return None + w, bias, offload_stream = cast_bias_weight(self, input, dtype=dtype, bias_dtype=input_dtype, offloadable=True) + scale_weight = torch.ones((), device=input.device, dtype=torch.float32) - # Wrap weight in QuantizedTensor - this enables unified dispatch - # Call F.linear - __torch_dispatch__ routes to fp8_linear handler in quant_ops.py! - layout_params_weight = {'scale': scale_weight, 'orig_dtype': input_dtype} - quantized_weight = QuantizedTensor(w, "TensorCoreFP8Layout", layout_params_weight) - o = torch.nn.functional.linear(quantized_input, quantized_weight, bias) + scale_input = torch.ones((), device=input.device, dtype=torch.float32) + input = torch.clamp(input, min=-448, max=448, out=input) + input_fp8 = input.to(dtype).contiguous() + layout_params_input = TensorCoreFP8Layout.Params(scale=scale_input, orig_dtype=input_dtype, orig_shape=tuple(input_fp8.shape)) + quantized_input = QuantizedTensor(input_fp8, TensorCoreFP8Layout, layout_params_input) - uncast_bias_weight(self, w, bias, offload_stream) - return o + # Wrap weight in QuantizedTensor - this enables unified dispatch + # Call F.linear - __torch_dispatch__ routes to fp8_linear handler in quant_ops.py! 
+ layout_params_weight = TensorCoreFP8Layout.Params(scale=scale_weight, orig_dtype=input_dtype, orig_shape=tuple(w.shape)) + quantized_weight = QuantizedTensor(w, TensorCoreFP8Layout, layout_params_weight) + o = torch.nn.functional.linear(quantized_input, quantized_weight, bias) - return None + uncast_bias_weight(self, w, bias, offload_stream) + if tensor_3d: + o = o.reshape((input_shape[0], input_shape[1], w.shape[0])) + + return o class fp8_ops(manual_cast): class Linear(manual_cast.Linear): @@ -477,7 +485,12 @@ if CUBLAS_IS_AVAILABLE: # ============================================================================== # Mixed Precision Operations # ============================================================================== -from .quant_ops import QuantizedTensor, QUANT_ALGOS +from .quant_ops import ( + QuantizedTensor, + QUANT_ALGOS, + TensorCoreFP8Layout, + get_layout_class, +) def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_precision_mm=False): @@ -497,14 +510,15 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec ) -> None: super().__init__() - if dtype is None: - dtype = MixedPrecisionOps._compute_dtype - - self.factory_kwargs = {"device": device, "dtype": dtype} + self.factory_kwargs = {"device": device, "dtype": MixedPrecisionOps._compute_dtype} + # self.factory_kwargs = {"device": device, "dtype": dtype} self.in_features = in_features self.out_features = out_features - self._has_bias = bias + if bias: + self.bias = torch.nn.Parameter(torch.empty(out_features, **self.factory_kwargs)) + else: + self.register_parameter("bias", None) self.tensor_class = None self._full_precision_mm = MixedPrecisionOps._full_precision_mm @@ -512,6 +526,16 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec def reset_parameters(self): return None + def _load_scale_param(self, state_dict, prefix, param_name, device, manually_loaded_keys, dtype=None): + key = f"{prefix}{param_name}" + value = state_dict.pop(key, None) + if value is not None: + value = value.to(device=device) + if dtype is not None: + value = value.view(dtype=dtype) + manually_loaded_keys.append(key) + return value + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): @@ -529,14 +553,7 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec layer_conf = json.loads(layer_conf.numpy().tobytes()) if layer_conf is None: - dtype = self.factory_kwargs["dtype"] - self.weight = torch.nn.Parameter(weight.to(device=device, dtype=dtype), requires_grad=False) - if dtype != MixedPrecisionOps._compute_dtype: - self.comfy_cast_weights = True - if self._has_bias: - self.bias = torch.nn.Parameter(torch.empty(self.out_features, device=device, dtype=dtype)) - else: - self.register_parameter("bias", None) + self.weight = torch.nn.Parameter(weight.to(device=device, dtype=MixedPrecisionOps._compute_dtype), requires_grad=False) else: self.quant_format = layer_conf.get("format", None) if not self._full_precision_mm: @@ -547,31 +564,46 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec qconfig = QUANT_ALGOS[self.quant_format] self.layout_type = qconfig["comfy_tensor_layout"] + layout_cls = get_layout_class(self.layout_type) - weight_scale_key = f"{prefix}weight_scale" - scale = state_dict.pop(weight_scale_key, None) - if scale is not None: - scale = scale.to(device) - layout_params = { - 'scale': scale, - 'orig_dtype': 
MixedPrecisionOps._compute_dtype, - 'block_size': qconfig.get("group_size", None), - } + # Load format-specific parameters + if self.quant_format in ["float8_e4m3fn", "float8_e5m2"]: + # FP8: single tensor scale + scale = self._load_scale_param(state_dict, prefix, "weight_scale", device, manually_loaded_keys) - if scale is not None: - manually_loaded_keys.append(weight_scale_key) + params = layout_cls.Params( + scale=scale, + orig_dtype=MixedPrecisionOps._compute_dtype, + orig_shape=(self.out_features, self.in_features), + ) + + elif self.quant_format == "nvfp4": + # NVFP4: tensor_scale (weight_scale_2) + block_scale (weight_scale) + tensor_scale = self._load_scale_param(state_dict, prefix, "weight_scale_2", device, manually_loaded_keys) + block_scale = self._load_scale_param(state_dict, prefix, "weight_scale", device, manually_loaded_keys, + dtype=torch.float8_e4m3fn) + + if tensor_scale is None or block_scale is None: + raise ValueError(f"Missing NVFP4 scales for layer {layer_name}") + + params = layout_cls.Params( + scale=tensor_scale, + block_scale=block_scale, + orig_dtype=MixedPrecisionOps._compute_dtype, + orig_shape=(self.out_features, self.in_features), + ) + else: + raise ValueError(f"Unsupported quantization format: {self.quant_format}") self.weight = torch.nn.Parameter( - QuantizedTensor(weight.to(device=device, dtype=qconfig.get("storage_t", None)), self.layout_type, layout_params), + QuantizedTensor(weight.to(device=device, dtype=qconfig["storage_t"]), self.layout_type, params), requires_grad=False ) - if self._has_bias: - self.bias = torch.nn.Parameter(torch.empty(self.out_features, device=device, dtype=MixedPrecisionOps._compute_dtype)) - else: - self.register_parameter("bias", None) - for param_name in qconfig["parameters"]: + if param_name in {"weight_scale", "weight_scale_2"}: + continue # Already handled above + param_key = f"{prefix}{param_name}" _v = state_dict.pop(param_key, None) if _v is None: @@ -588,7 +620,15 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec def state_dict(self, *args, destination=None, prefix="", **kwargs): sd = super().state_dict(*args, destination=destination, prefix=prefix, **kwargs) if isinstance(self.weight, QuantizedTensor): - sd["{}weight_scale".format(prefix)] = self.weight._layout_params['scale'] + layout_cls = self.weight._layout_cls + + # Check if it's any FP8 variant (E4M3 or E5M2) + if layout_cls in ("TensorCoreFP8E4M3Layout", "TensorCoreFP8E5M2Layout", "TensorCoreFP8Layout"): + sd["{}weight_scale".format(prefix)] = self.weight._params.scale + elif layout_cls == "TensorCoreNVFP4Layout": + sd["{}weight_scale_2".format(prefix)] = self.weight._params.scale + sd["{}weight_scale".format(prefix)] = self.weight._params.block_scale + quant_conf = {"format": self.quant_format} if self._full_precision_mm: quant_conf["full_precision_matrix_mult"] = True @@ -607,12 +647,33 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec def forward(self, input, *args, **kwargs): run_every_op() + input_shape = input.shape + tensor_3d = input.ndim == 3 + if self._full_precision_mm or self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: return self.forward_comfy_cast_weights(input, *args, **kwargs) + if (getattr(self, 'layout_type', None) is not None and not isinstance(input, QuantizedTensor)): - input = QuantizedTensor.from_float(input, self.layout_type, scale=getattr(self, 'input_scale', None), dtype=self.weight.dtype) - return self._forward(input, 
self.weight, self.bias) + + # Reshape 3D tensors to 2D for quantization (needed for NVFP4 and others) + if tensor_3d: + input = input.reshape(-1, input_shape[2]) + + if input.ndim != 2: + # Fall back to comfy_cast_weights for non-2D tensors + return self.forward_comfy_cast_weights(input.reshape(input_shape), *args, **kwargs) + + # dtype is now implicit in the layout class + input = QuantizedTensor.from_float(input, self.layout_type, scale=getattr(self, 'input_scale', None)) + + output = self._forward(input, self.weight, self.bias) + + # Reshape output back to 3D if input was 3D + if tensor_3d: + output = output.reshape((input_shape[0], input_shape[1], self.weight.shape[0])) + + return output def convert_weight(self, weight, inplace=False, **kwargs): if isinstance(weight, QuantizedTensor): @@ -622,7 +683,8 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec def set_weight(self, weight, inplace_update=False, seed=None, return_weight=False, **kwargs): if getattr(self, 'layout_type', None) is not None: - weight = QuantizedTensor.from_float(weight, self.layout_type, scale="recalculate", dtype=self.weight.dtype, stochastic_rounding=seed, inplace_ops=True) + # dtype is now implicit in the layout class + weight = QuantizedTensor.from_float(weight, self.layout_type, scale="recalculate", stochastic_rounding=seed, inplace_ops=True) else: weight = weight.to(self.weight.dtype) if return_weight: diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py index cd96541d7..cd737726f 100644 --- a/comfy/quant_ops.py +++ b/comfy/quant_ops.py @@ -1,580 +1,133 @@ import torch import logging -from typing import Tuple, Dict + +try: + import comfy_kitchen as ck + from comfy_kitchen.tensor import ( + QuantizedTensor, + QuantizedLayout, + TensorCoreFP8Layout as _CKFp8Layout, + TensorCoreNVFP4Layout, # Direct import, no wrapper needed + register_layout_op, + register_layout_class, + get_layout_class, + ) + _CK_AVAILABLE = True + ck.registry.disable("triton") + for k, v in ck.list_backends().items(): + logging.info(f"Found comfy_kitchen backend {k}: {v}") +except ImportError as e: + logging.error(f"Failed to import comfy_kitchen, Error: {e}, fp8 and fp4 support will not be available.") + _CK_AVAILABLE = False + + class QuantizedTensor: + pass + + class _CKFp8Layout: + pass + + class TensorCoreNVFP4Layout: + pass + + def register_layout_class(name, cls): + pass + + def get_layout_class(name): + return None + import comfy.float -_LAYOUT_REGISTRY = {} -_GENERIC_UTILS = {} - - -def register_layout_op(torch_op, layout_type): - """ - Decorator to register a layout-specific operation handler. - Args: - torch_op: PyTorch operation (e.g., torch.ops.aten.linear.default) - layout_type: Layout class (e.g., TensorCoreFP8Layout) - Example: - @register_layout_op(torch.ops.aten.linear.default, TensorCoreFP8Layout) - def fp8_linear(func, args, kwargs): - # FP8-specific linear implementation - ... - """ - def decorator(handler_func): - if torch_op not in _LAYOUT_REGISTRY: - _LAYOUT_REGISTRY[torch_op] = {} - _LAYOUT_REGISTRY[torch_op][layout_type] = handler_func - return handler_func - return decorator - - -def register_generic_util(torch_op): - """ - Decorator to register a generic utility that works for all layouts. - Args: - torch_op: PyTorch operation (e.g., torch.ops.aten.detach.default) - - Example: - @register_generic_util(torch.ops.aten.detach.default) - def generic_detach(func, args, kwargs): - # Works for any layout - ... 
- """ - def decorator(handler_func): - _GENERIC_UTILS[torch_op] = handler_func - return handler_func - return decorator - - -def _get_layout_from_args(args): - for arg in args: - if isinstance(arg, QuantizedTensor): - return arg._layout_type - elif isinstance(arg, (list, tuple)): - for item in arg: - if isinstance(item, QuantizedTensor): - return item._layout_type - return None - - -def _move_layout_params_to_device(params, device): - new_params = {} - for k, v in params.items(): - if isinstance(v, torch.Tensor): - new_params[k] = v.to(device=device) - else: - new_params[k] = v - return new_params - - -def _copy_layout_params(params): - new_params = {} - for k, v in params.items(): - if isinstance(v, torch.Tensor): - new_params[k] = v.clone() - else: - new_params[k] = v - return new_params - -def _copy_layout_params_inplace(src, dst, non_blocking=False): - for k, v in src.items(): - if isinstance(v, torch.Tensor): - dst[k].copy_(v, non_blocking=non_blocking) - else: - dst[k] = v - -class QuantizedLayout: - """ - Base class for quantization layouts. - - A layout encapsulates the format-specific logic for quantization/dequantization - and provides a uniform interface for extracting raw tensors needed for computation. - - New quantization formats should subclass this and implement the required methods. - """ - @classmethod - def quantize(cls, tensor, **kwargs) -> Tuple[torch.Tensor, Dict]: - raise NotImplementedError(f"{cls.__name__} must implement quantize()") - - @staticmethod - def dequantize(qdata, **layout_params) -> torch.Tensor: - raise NotImplementedError("TensorLayout must implement dequantize()") - - @classmethod - def get_plain_tensors(cls, qtensor) -> torch.Tensor: - raise NotImplementedError(f"{cls.__name__} must implement get_plain_tensors()") - - -class QuantizedTensor(torch.Tensor): - """ - Universal quantized tensor that works with any layout. - - This tensor subclass uses a pluggable layout system to support multiple - quantization formats (FP8, INT4, INT8, etc.) without code duplication. - - The layout_type determines format-specific behavior, while common operations - (detach, clone, to) are handled generically. - - Attributes: - _qdata: The quantized tensor data - _layout_type: Layout class (e.g., TensorCoreFP8Layout) - _layout_params: Dict with layout-specific params (scale, zero_point, etc.) - """ - - @staticmethod - def __new__(cls, qdata, layout_type, layout_params): - """ - Create a quantized tensor. - - Args: - qdata: The quantized data tensor - layout_type: Layout class (subclass of QuantizedLayout) - layout_params: Dict with layout-specific parameters - """ - return torch.Tensor._make_wrapper_subclass(cls, qdata.shape, device=qdata.device, dtype=qdata.dtype, requires_grad=False) - - def __init__(self, qdata, layout_type, layout_params): - self._qdata = qdata - self._layout_type = layout_type - self._layout_params = layout_params - - def __repr__(self): - layout_name = self._layout_type - param_str = ", ".join(f"{k}={v}" for k, v in list(self._layout_params.items())[:2]) - return f"QuantizedTensor(shape={self.shape}, layout={layout_name}, {param_str})" - - @property - def layout_type(self): - return self._layout_type - - def __tensor_flatten__(self): - """ - Tensor flattening protocol for proper device movement. 
- """ - inner_tensors = ["_qdata"] - ctx = { - "layout_type": self._layout_type, - } - - tensor_params = {} - non_tensor_params = {} - for k, v in self._layout_params.items(): - if isinstance(v, torch.Tensor): - tensor_params[k] = v - else: - non_tensor_params[k] = v - - ctx["tensor_param_keys"] = list(tensor_params.keys()) - ctx["non_tensor_params"] = non_tensor_params - - for k, v in tensor_params.items(): - attr_name = f"_layout_param_{k}" - object.__setattr__(self, attr_name, v) - inner_tensors.append(attr_name) - - return inner_tensors, ctx - - @staticmethod - def __tensor_unflatten__(inner_tensors, ctx, outer_size, outer_stride): - """ - Tensor unflattening protocol for proper device movement. - Reconstructs the QuantizedTensor after device movement. - """ - layout_type = ctx["layout_type"] - layout_params = dict(ctx["non_tensor_params"]) - - for key in ctx["tensor_param_keys"]: - attr_name = f"_layout_param_{key}" - layout_params[key] = inner_tensors[attr_name] - - return QuantizedTensor(inner_tensors["_qdata"], layout_type, layout_params) - - @classmethod - def from_float(cls, tensor, layout_type, **quantize_kwargs) -> 'QuantizedTensor': - qdata, layout_params = LAYOUTS[layout_type].quantize(tensor, **quantize_kwargs) - return cls(qdata, layout_type, layout_params) - - def dequantize(self) -> torch.Tensor: - return LAYOUTS[self._layout_type].dequantize(self._qdata, **self._layout_params) - - @classmethod - def __torch_dispatch__(cls, func, types, args=(), kwargs=None): - kwargs = kwargs or {} - - # Step 1: Check generic utilities first (detach, clone, to, etc.) - if func in _GENERIC_UTILS: - return _GENERIC_UTILS[func](func, args, kwargs) - - # Step 2: Check layout-specific handlers (linear, matmul, etc.) - layout_type = _get_layout_from_args(args) - if layout_type and func in _LAYOUT_REGISTRY: - handler = _LAYOUT_REGISTRY[func].get(layout_type) - if handler: - return handler(func, args, kwargs) - - # Step 3: Fallback to dequantization - if isinstance(args[0] if args else None, QuantizedTensor): - logging.info(f"QuantizedTensor: Unhandled operation {func}, falling back to dequantization. 
kwargs={kwargs}") - return cls._dequant_and_fallback(func, args, kwargs) - - @classmethod - def _dequant_and_fallback(cls, func, args, kwargs): - def dequant_arg(arg): - if isinstance(arg, QuantizedTensor): - return arg.dequantize() - elif isinstance(arg, (list, tuple)): - return type(arg)(dequant_arg(a) for a in arg) - return arg - - new_args = dequant_arg(args) - new_kwargs = dequant_arg(kwargs) - return func(*new_args, **new_kwargs) - - def data_ptr(self): - return self._qdata.data_ptr() - - def is_pinned(self): - return self._qdata.is_pinned() - - def is_contiguous(self, *arg, **kwargs): - return self._qdata.is_contiguous(*arg, **kwargs) - - def storage(self): - return self._qdata.storage() - # ============================================================================== -# Generic Utilities (Layout-Agnostic Operations) +# FP8 Layouts with Comfy-Specific Extensions # ============================================================================== -def _create_transformed_qtensor(qt, transform_fn): - new_data = transform_fn(qt._qdata) - new_params = _copy_layout_params(qt._layout_params) - return QuantizedTensor(new_data, qt._layout_type, new_params) +class _TensorCoreFP8LayoutBase(_CKFp8Layout): + FP8_DTYPE = None # Must be overridden in subclass - -def _handle_device_transfer(qt, target_device, target_dtype=None, target_layout=None, op_name="to"): - if target_layout is not None and target_layout != torch.strided: - logging.warning( - f"QuantizedTensor: layout change requested to {target_layout}, " - f"but not supported. Ignoring layout." - ) - - # Handle device transfer - current_device = qt._qdata.device - if target_device is not None: - # Normalize device for comparison - if isinstance(target_device, str): - target_device = torch.device(target_device) - if isinstance(current_device, str): - current_device = torch.device(current_device) - - if target_device != current_device: - logging.debug(f"QuantizedTensor.{op_name}: Moving from {current_device} to {target_device}") - new_q_data = qt._qdata.to(device=target_device) - new_params = _move_layout_params_to_device(qt._layout_params, target_device) - if target_dtype is not None: - new_params["orig_dtype"] = target_dtype - new_qt = QuantizedTensor(new_q_data, qt._layout_type, new_params) - logging.debug(f"QuantizedTensor.{op_name}: Created new tensor on {target_device}") - return new_qt - - logging.debug(f"QuantizedTensor.{op_name}: No device change needed, returning original") - return qt - - -@register_generic_util(torch.ops.aten.detach.default) -def generic_detach(func, args, kwargs): - """Detach operation - creates a detached copy of the quantized tensor.""" - qt = args[0] - if isinstance(qt, QuantizedTensor): - return _create_transformed_qtensor(qt, lambda x: x.detach()) - return func(*args, **kwargs) - - -@register_generic_util(torch.ops.aten.clone.default) -def generic_clone(func, args, kwargs): - """Clone operation - creates a deep copy of the quantized tensor.""" - qt = args[0] - if isinstance(qt, QuantizedTensor): - return _create_transformed_qtensor(qt, lambda x: x.clone()) - return func(*args, **kwargs) - - -@register_generic_util(torch.ops.aten._to_copy.default) -def generic_to_copy(func, args, kwargs): - """Device/dtype transfer operation - handles .to(device) calls.""" - qt = args[0] - if isinstance(qt, QuantizedTensor): - return _handle_device_transfer( - qt, - target_device=kwargs.get('device', None), - target_dtype=kwargs.get('dtype', None), - op_name="_to_copy" - ) - return func(*args, **kwargs) - - 
-@register_generic_util(torch.ops.aten.to.dtype_layout) -def generic_to_dtype_layout(func, args, kwargs): - """Handle .to(device) calls using the dtype_layout variant.""" - qt = args[0] - if isinstance(qt, QuantizedTensor): - return _handle_device_transfer( - qt, - target_device=kwargs.get('device', None), - target_dtype=kwargs.get('dtype', None), - target_layout=kwargs.get('layout', None), - op_name="to" - ) - return func(*args, **kwargs) - - -@register_generic_util(torch.ops.aten.copy_.default) -def generic_copy_(func, args, kwargs): - qt_dest = args[0] - src = args[1] - non_blocking = args[2] if len(args) > 2 else False - if isinstance(qt_dest, QuantizedTensor): - if isinstance(src, QuantizedTensor): - # Copy from another quantized tensor - qt_dest._qdata.copy_(src._qdata, non_blocking=non_blocking) - qt_dest._layout_type = src._layout_type - orig_dtype = qt_dest._layout_params["orig_dtype"] - _copy_layout_params_inplace(src._layout_params, qt_dest._layout_params, non_blocking=non_blocking) - qt_dest._layout_params["orig_dtype"] = orig_dtype - else: - # Copy from regular tensor - just copy raw data - qt_dest._qdata.copy_(src) - return qt_dest - return func(*args, **kwargs) - - -@register_generic_util(torch.ops.aten.to.dtype) -def generic_to_dtype(func, args, kwargs): - """Handle .to(dtype) calls - dtype conversion only.""" - src = args[0] - if isinstance(src, QuantizedTensor): - # For dtype-only conversion, just change the orig_dtype, no real cast is needed - target_dtype = args[1] if len(args) > 1 else kwargs.get('dtype') - src._layout_params["orig_dtype"] = target_dtype - return src - return func(*args, **kwargs) - - -@register_generic_util(torch.ops.aten._has_compatible_shallow_copy_type.default) -def generic_has_compatible_shallow_copy_type(func, args, kwargs): - return True - - -@register_generic_util(torch.ops.aten.empty_like.default) -def generic_empty_like(func, args, kwargs): - """Empty_like operation - creates an empty tensor with the same quantized structure.""" - qt = args[0] - if isinstance(qt, QuantizedTensor): - # Create empty tensor with same shape and dtype as the quantized data - hp_dtype = kwargs.pop('dtype', qt._layout_params["orig_dtype"]) - new_qdata = torch.empty_like(qt._qdata, **kwargs) - - # Handle device transfer for layout params - target_device = kwargs.get('device', new_qdata.device) - new_params = _move_layout_params_to_device(qt._layout_params, target_device) - - # Update orig_dtype if dtype is specified - new_params['orig_dtype'] = hp_dtype - - return QuantizedTensor(new_qdata, qt._layout_type, new_params) - return func(*args, **kwargs) - -# ============================================================================== -# FP8 Layout + Operation Handlers -# ============================================================================== -class TensorCoreFP8Layout(QuantizedLayout): - """ - Storage format: - - qdata: FP8 tensor (torch.float8_e4m3fn or torch.float8_e5m2) - - scale: Scalar tensor (float32) for dequantization - - orig_dtype: Original dtype before quantization (for casting back) - """ @classmethod - def quantize(cls, tensor, scale=None, dtype=torch.float8_e4m3fn, stochastic_rounding=0, inplace_ops=False): + def quantize(cls, tensor, scale=None, stochastic_rounding=0, inplace_ops=False): + if cls.FP8_DTYPE is None: + raise NotImplementedError(f"{cls.__name__} must define FP8_DTYPE") + orig_dtype = tensor.dtype + orig_shape = tuple(tensor.shape) if isinstance(scale, str) and scale == "recalculate": - scale = 
torch.amax(tensor.abs()).to(dtype=torch.float32) / torch.finfo(dtype).max + scale = torch.amax(tensor.abs()).to(dtype=torch.float32) / torch.finfo(cls.FP8_DTYPE).max if tensor.dtype not in [torch.float32, torch.bfloat16]: # Prevent scale from being too small tensor_info = torch.finfo(tensor.dtype) scale = (1.0 / torch.clamp((1.0 / scale), min=tensor_info.min, max=tensor_info.max)) - if scale is not None: - if not isinstance(scale, torch.Tensor): - scale = torch.tensor(scale) - scale = scale.to(device=tensor.device, dtype=torch.float32) + if scale is None: + scale = torch.ones((), device=tensor.device, dtype=torch.float32) + if not isinstance(scale, torch.Tensor): + scale = torch.tensor(scale, device=tensor.device, dtype=torch.float32) + if stochastic_rounding > 0: if inplace_ops: tensor *= (1.0 / scale).to(tensor.dtype) else: tensor = tensor * (1.0 / scale).to(tensor.dtype) + qdata = comfy.float.stochastic_rounding(tensor, dtype=cls.FP8_DTYPE, seed=stochastic_rounding) else: - scale = torch.ones((), device=tensor.device, dtype=torch.float32) + qdata = ck.quantize_per_tensor_fp8(tensor, scale, cls.FP8_DTYPE) - if stochastic_rounding > 0: - tensor = comfy.float.stochastic_rounding(tensor, dtype=dtype, seed=stochastic_rounding) - else: - lp_amax = torch.finfo(dtype).max - torch.clamp(tensor, min=-lp_amax, max=lp_amax, out=tensor) - tensor = tensor.to(dtype, memory_format=torch.contiguous_format) + params = cls.Params(scale=scale.float(), orig_dtype=orig_dtype, orig_shape=orig_shape) + return qdata, params - layout_params = { - 'scale': scale, - 'orig_dtype': orig_dtype - } - return tensor, layout_params - @staticmethod - def dequantize(qdata, scale, orig_dtype, **kwargs): - plain_tensor = torch.ops.aten._to_copy.default(qdata, dtype=orig_dtype) - plain_tensor.mul_(scale) - return plain_tensor +class TensorCoreFP8E4M3Layout(_TensorCoreFP8LayoutBase): + FP8_DTYPE = torch.float8_e4m3fn - @classmethod - def get_plain_tensors(cls, qtensor): - return qtensor._qdata, qtensor._layout_params['scale'] + +class TensorCoreFP8E5M2Layout(_TensorCoreFP8LayoutBase): + FP8_DTYPE = torch.float8_e5m2 + + +# Backward compatibility alias - default to E4M3 +TensorCoreFP8Layout = TensorCoreFP8E4M3Layout + + +# ============================================================================== +# Registry +# ============================================================================== + +register_layout_class("TensorCoreFP8Layout", TensorCoreFP8Layout) +register_layout_class("TensorCoreFP8E4M3Layout", TensorCoreFP8E4M3Layout) +register_layout_class("TensorCoreFP8E5M2Layout", TensorCoreFP8E5M2Layout) +register_layout_class("TensorCoreNVFP4Layout", TensorCoreNVFP4Layout) QUANT_ALGOS = { "float8_e4m3fn": { "storage_t": torch.float8_e4m3fn, "parameters": {"weight_scale", "input_scale"}, - "comfy_tensor_layout": "TensorCoreFP8Layout", + "comfy_tensor_layout": "TensorCoreFP8E4M3Layout", + }, + "float8_e5m2": { + "storage_t": torch.float8_e5m2, + "parameters": {"weight_scale", "input_scale"}, + "comfy_tensor_layout": "TensorCoreFP8E5M2Layout", + }, + "nvfp4": { + "storage_t": torch.uint8, + "parameters": {"weight_scale", "weight_scale_2", "input_scale"}, + "comfy_tensor_layout": "TensorCoreNVFP4Layout", + "group_size": 16, }, } -LAYOUTS = { - "TensorCoreFP8Layout": TensorCoreFP8Layout, -} +# ============================================================================== +# Re-exports for backward compatibility +# ============================================================================== 
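
Note: the `nvfp4` entry in QUANT_ALGOS stores weights as packed `uint8` with a group size of 16, a per-block FP8 scale (`weight_scale`) and a per-tensor FP32 scale (`weight_scale_2`). The sketch below is only a rough illustration of that two-level scheme; the nibble order, scale shapes and the `dequant_nvfp4` helper are assumptions, not the behaviour of comfy_kitchen's `TensorCoreNVFP4Layout` kernels.

```python
import torch

# E2M1 (FP4) magnitudes; the high bit of each 4-bit code is the sign.
E2M1_VALUES = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])

def dequant_nvfp4(packed, block_scale, tensor_scale, group_size=16):
    """packed: uint8, two FP4 values per byte, shape (rows, cols // 2).
    block_scale: one FP8 scale per group of `group_size` values, shape (rows, cols // group_size).
    tensor_scale: single FP32 scale for the whole tensor."""
    lo = packed & 0x0F          # assumed: element 2k in the low nibble
    hi = (packed >> 4) & 0x0F   # assumed: element 2k+1 in the high nibble
    codes = torch.stack((lo, hi), dim=-1).reshape(packed.shape[0], -1)
    sign = torch.where(codes >= 8, -1.0, 1.0)
    vals = sign * E2M1_VALUES[(codes & 0x07).long()]
    rows, cols = vals.shape
    vals = vals.reshape(rows, cols // group_size, group_size)
    vals = vals * block_scale.to(torch.float32).unsqueeze(-1)   # per-block scale
    return vals.reshape(rows, cols) * tensor_scale               # per-tensor scale

packed = torch.randint(0, 256, (4, 32), dtype=torch.uint8)       # 4 rows of 64 FP4 values
block_scale = torch.rand(4, 4).to(torch.float8_e4m3fn)           # one scale per 16 values
print(dequant_nvfp4(packed, block_scale, torch.tensor(0.5)).shape)  # torch.Size([4, 64])
```
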
-@register_layout_op(torch.ops.aten.linear.default, "TensorCoreFP8Layout") -def fp8_linear(func, args, kwargs): - input_tensor = args[0] - weight = args[1] - bias = args[2] if len(args) > 2 else None - - if isinstance(input_tensor, QuantizedTensor) and isinstance(weight, QuantizedTensor): - plain_input, scale_a = TensorCoreFP8Layout.get_plain_tensors(input_tensor) - plain_weight, scale_b = TensorCoreFP8Layout.get_plain_tensors(weight) - - out_dtype = kwargs.get("out_dtype") - if out_dtype is None: - out_dtype = input_tensor._layout_params['orig_dtype'] - - weight_t = plain_weight.t() - - tensor_2d = False - if len(plain_input.shape) == 2: - tensor_2d = True - plain_input = plain_input.unsqueeze(1) - - input_shape = plain_input.shape - if len(input_shape) != 3: - return None - - try: - output = torch._scaled_mm( - plain_input.reshape(-1, input_shape[2]).contiguous(), - weight_t, - bias=bias, - scale_a=scale_a, - scale_b=scale_b, - out_dtype=out_dtype, - ) - - if isinstance(output, tuple): # TODO: remove when we drop support for torch 2.4 - output = output[0] - - if not tensor_2d: - output = output.reshape((-1, input_shape[1], weight.shape[0])) - - if output.dtype in [torch.float8_e4m3fn, torch.float8_e5m2]: - output_scale = scale_a * scale_b - output_params = { - 'scale': output_scale, - 'orig_dtype': input_tensor._layout_params['orig_dtype'] - } - return QuantizedTensor(output, "TensorCoreFP8Layout", output_params) - else: - return output - - except Exception as e: - raise RuntimeError(f"FP8 _scaled_mm failed, falling back to dequantization: {e}") - - # Case 2: DQ Fallback - if isinstance(weight, QuantizedTensor): - weight = weight.dequantize() - if isinstance(input_tensor, QuantizedTensor): - input_tensor = input_tensor.dequantize() - - return torch.nn.functional.linear(input_tensor, weight, bias) - -def fp8_mm_(input_tensor, weight, bias=None, out_dtype=None): - if out_dtype is None: - out_dtype = input_tensor._layout_params['orig_dtype'] - - plain_input, scale_a = TensorCoreFP8Layout.get_plain_tensors(input_tensor) - plain_weight, scale_b = TensorCoreFP8Layout.get_plain_tensors(weight) - - output = torch._scaled_mm( - plain_input.contiguous(), - plain_weight, - bias=bias, - scale_a=scale_a, - scale_b=scale_b, - out_dtype=out_dtype, - ) - - if isinstance(output, tuple): # TODO: remove when we drop support for torch 2.4 - output = output[0] - return output - -@register_layout_op(torch.ops.aten.addmm.default, "TensorCoreFP8Layout") -def fp8_addmm(func, args, kwargs): - input_tensor = args[1] - weight = args[2] - bias = args[0] - - if isinstance(input_tensor, QuantizedTensor) and isinstance(weight, QuantizedTensor): - return fp8_mm_(input_tensor, weight, bias=bias, out_dtype=kwargs.get("out_dtype", None)) - - a = list(args) - if isinstance(args[0], QuantizedTensor): - a[0] = args[0].dequantize() - if isinstance(args[1], QuantizedTensor): - a[1] = args[1].dequantize() - if isinstance(args[2], QuantizedTensor): - a[2] = args[2].dequantize() - - return func(*a, **kwargs) - -@register_layout_op(torch.ops.aten.mm.default, "TensorCoreFP8Layout") -def fp8_mm(func, args, kwargs): - input_tensor = args[0] - weight = args[1] - - if isinstance(input_tensor, QuantizedTensor) and isinstance(weight, QuantizedTensor): - return fp8_mm_(input_tensor, weight, bias=None, out_dtype=kwargs.get("out_dtype", None)) - - a = list(args) - if isinstance(args[0], QuantizedTensor): - a[0] = args[0].dequantize() - if isinstance(args[1], QuantizedTensor): - a[1] = args[1].dequantize() - return func(*a, **kwargs) - 
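
Note: the handlers removed in this hunk wrapped `torch._scaled_mm` for per-tensor FP8 matmuls; comfy_kitchen now provides the equivalent kernels. A minimal sketch of that call pattern, assuming a GPU with FP8 support and a recent PyTorch; the helper name and the amax-based scale choice are illustrative, not the exact removed implementation.

```python
import torch

def fp8_linear_example(x, w, bias=None):
    # Per-tensor scales: map the dynamic range onto the FP8 E4M3 range (max 448).
    scale_x = x.abs().amax().float() / torch.finfo(torch.float8_e4m3fn).max
    scale_w = w.abs().amax().float() / torch.finfo(torch.float8_e4m3fn).max
    x_fp8 = (x / scale_x).to(torch.float8_e4m3fn)
    w_fp8 = (w / scale_w).to(torch.float8_e4m3fn)
    # _scaled_mm expects the second operand in column-major layout, hence .t()
    out = torch._scaled_mm(
        x_fp8.contiguous(), w_fp8.t(),
        scale_a=scale_x, scale_b=scale_w,
        bias=bias, out_dtype=x.dtype,
    )
    if isinstance(out, tuple):  # older PyTorch returned (out, amax)
        out = out[0]
    return out

if torch.cuda.is_available():
    x = torch.randn(16, 64, device="cuda", dtype=torch.bfloat16)
    w = torch.randn(32, 64, device="cuda", dtype=torch.bfloat16)
    print(fp8_linear_example(x, w).shape)  # torch.Size([16, 32])
```
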
-@register_layout_op(torch.ops.aten.view.default, "TensorCoreFP8Layout") -@register_layout_op(torch.ops.aten.t.default, "TensorCoreFP8Layout") -def fp8_func(func, args, kwargs): - input_tensor = args[0] - if isinstance(input_tensor, QuantizedTensor): - plain_input, scale_a = TensorCoreFP8Layout.get_plain_tensors(input_tensor) - ar = list(args) - ar[0] = plain_input - return QuantizedTensor(func(*ar, **kwargs), "TensorCoreFP8Layout", input_tensor._layout_params) - return func(*args, **kwargs) +__all__ = [ + "QuantizedTensor", + "QuantizedLayout", + "TensorCoreFP8Layout", + "TensorCoreFP8E4M3Layout", + "TensorCoreFP8E5M2Layout", + "TensorCoreNVFP4Layout", + "QUANT_ALGOS", + "register_layout_op", +] diff --git a/requirements.txt b/requirements.txt index 3a05799eb..0ee152032 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,6 +21,7 @@ psutil alembic SQLAlchemy av>=14.2.0 +comfy-kitchen>=0.2.0 #non essential dependencies: kornia>=0.7.1 diff --git a/tests-unit/comfy_quant/test_mixed_precision.py b/tests-unit/comfy_quant/test_mixed_precision.py index 3a54941e6..7b2eac940 100644 --- a/tests-unit/comfy_quant/test_mixed_precision.py +++ b/tests-unit/comfy_quant/test_mixed_precision.py @@ -103,18 +103,18 @@ class TestMixedPrecisionOps(unittest.TestCase): # Verify weights are wrapped in QuantizedTensor self.assertIsInstance(model.layer1.weight, QuantizedTensor) - self.assertEqual(model.layer1.weight._layout_type, "TensorCoreFP8Layout") + self.assertEqual(model.layer1.weight._layout_cls, "TensorCoreFP8E4M3Layout") # Layer 2 should NOT be quantized self.assertNotIsInstance(model.layer2.weight, QuantizedTensor) # Layer 3 should be quantized self.assertIsInstance(model.layer3.weight, QuantizedTensor) - self.assertEqual(model.layer3.weight._layout_type, "TensorCoreFP8Layout") + self.assertEqual(model.layer3.weight._layout_cls, "TensorCoreFP8E4M3Layout") # Verify scales were loaded - self.assertEqual(model.layer1.weight._layout_params['scale'].item(), 2.0) - self.assertEqual(model.layer3.weight._layout_params['scale'].item(), 1.5) + self.assertEqual(model.layer1.weight._params.scale.item(), 2.0) + self.assertEqual(model.layer3.weight._params.scale.item(), 1.5) # Forward pass input_tensor = torch.randn(5, 10, dtype=torch.bfloat16) @@ -154,8 +154,8 @@ class TestMixedPrecisionOps(unittest.TestCase): # Verify layer1.weight is a QuantizedTensor with scale preserved self.assertIsInstance(state_dict2["layer1.weight"], QuantizedTensor) - self.assertEqual(state_dict2["layer1.weight"]._layout_params['scale'].item(), 3.0) - self.assertEqual(state_dict2["layer1.weight"]._layout_type, "TensorCoreFP8Layout") + self.assertEqual(state_dict2["layer1.weight"]._params.scale.item(), 3.0) + self.assertEqual(state_dict2["layer1.weight"]._layout_cls, "TensorCoreFP8E4M3Layout") # Verify non-quantized layers are standard tensors self.assertNotIsInstance(state_dict2["layer2.weight"], QuantizedTensor) diff --git a/tests-unit/comfy_quant/test_quant_registry.py b/tests-unit/comfy_quant/test_quant_registry.py deleted file mode 100644 index 9cb54ede8..000000000 --- a/tests-unit/comfy_quant/test_quant_registry.py +++ /dev/null @@ -1,190 +0,0 @@ -import unittest -import torch -import sys -import os - -# Add comfy to path -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..")) - -def has_gpu(): - return torch.cuda.is_available() - -from comfy.cli_args import args -if not has_gpu(): - args.cpu = True - -from comfy.quant_ops import QuantizedTensor, TensorCoreFP8Layout - - -class 
TestQuantizedTensor(unittest.TestCase): - """Test the QuantizedTensor subclass with FP8 layout""" - - def test_creation(self): - """Test creating a QuantizedTensor with TensorCoreFP8Layout""" - fp8_data = torch.randn(256, 128, dtype=torch.float32).to(torch.float8_e4m3fn) - scale = torch.tensor(2.0) - layout_params = {'scale': scale, 'orig_dtype': torch.bfloat16} - - qt = QuantizedTensor(fp8_data, "TensorCoreFP8Layout", layout_params) - - self.assertIsInstance(qt, QuantizedTensor) - self.assertEqual(qt.shape, (256, 128)) - self.assertEqual(qt.dtype, torch.float8_e4m3fn) - self.assertEqual(qt._layout_params['scale'], scale) - self.assertEqual(qt._layout_params['orig_dtype'], torch.bfloat16) - self.assertEqual(qt._layout_type, "TensorCoreFP8Layout") - - def test_dequantize(self): - """Test explicit dequantization""" - - fp8_data = torch.ones(10, 20, dtype=torch.float32).to(torch.float8_e4m3fn) - scale = torch.tensor(3.0) - layout_params = {'scale': scale, 'orig_dtype': torch.float32} - - qt = QuantizedTensor(fp8_data, "TensorCoreFP8Layout", layout_params) - dequantized = qt.dequantize() - - self.assertEqual(dequantized.dtype, torch.float32) - self.assertTrue(torch.allclose(dequantized, torch.ones(10, 20) * 3.0, rtol=0.1)) - - def test_from_float(self): - """Test creating QuantizedTensor from float tensor""" - float_tensor = torch.randn(64, 32, dtype=torch.float32) - scale = torch.tensor(1.5) - - qt = QuantizedTensor.from_float( - float_tensor, - "TensorCoreFP8Layout", - scale=scale, - dtype=torch.float8_e4m3fn - ) - - self.assertIsInstance(qt, QuantizedTensor) - self.assertEqual(qt.dtype, torch.float8_e4m3fn) - self.assertEqual(qt.shape, (64, 32)) - - # Verify dequantization gives approximately original values - dequantized = qt.dequantize() - mean_rel_error = ((dequantized - float_tensor).abs() / (float_tensor.abs() + 1e-6)).mean() - self.assertLess(mean_rel_error, 0.1) - - -class TestGenericUtilities(unittest.TestCase): - """Test generic utility operations""" - - def test_detach(self): - """Test detach operation on quantized tensor""" - fp8_data = torch.randn(10, 20, dtype=torch.float32).to(torch.float8_e4m3fn) - scale = torch.tensor(1.5) - layout_params = {'scale': scale, 'orig_dtype': torch.float32} - qt = QuantizedTensor(fp8_data, "TensorCoreFP8Layout", layout_params) - - # Detach should return a new QuantizedTensor - qt_detached = qt.detach() - - self.assertIsInstance(qt_detached, QuantizedTensor) - self.assertEqual(qt_detached.shape, qt.shape) - self.assertEqual(qt_detached._layout_type, "TensorCoreFP8Layout") - - def test_clone(self): - """Test clone operation on quantized tensor""" - fp8_data = torch.randn(10, 20, dtype=torch.float32).to(torch.float8_e4m3fn) - scale = torch.tensor(1.5) - layout_params = {'scale': scale, 'orig_dtype': torch.float32} - qt = QuantizedTensor(fp8_data, "TensorCoreFP8Layout", layout_params) - - # Clone should return a new QuantizedTensor - qt_cloned = qt.clone() - - self.assertIsInstance(qt_cloned, QuantizedTensor) - self.assertEqual(qt_cloned.shape, qt.shape) - self.assertEqual(qt_cloned._layout_type, "TensorCoreFP8Layout") - - # Verify it's a deep copy - self.assertIsNot(qt_cloned._qdata, qt._qdata) - - @unittest.skipUnless(has_gpu(), "GPU not available") - def test_to_device(self): - """Test device transfer""" - fp8_data = torch.randn(10, 20, dtype=torch.float32).to(torch.float8_e4m3fn) - scale = torch.tensor(1.5) - layout_params = {'scale': scale, 'orig_dtype': torch.float32} - qt = QuantizedTensor(fp8_data, "TensorCoreFP8Layout", layout_params) - - # 
Moving to same device should work (CPU to CPU) - qt_cpu = qt.to('cpu') - - self.assertIsInstance(qt_cpu, QuantizedTensor) - self.assertEqual(qt_cpu.device.type, 'cpu') - self.assertEqual(qt_cpu._layout_params['scale'].device.type, 'cpu') - - -class TestTensorCoreFP8Layout(unittest.TestCase): - """Test the TensorCoreFP8Layout implementation""" - - def test_quantize(self): - """Test quantization method""" - float_tensor = torch.randn(32, 64, dtype=torch.float32) - scale = torch.tensor(1.5) - - qdata, layout_params = TensorCoreFP8Layout.quantize( - float_tensor, - scale=scale, - dtype=torch.float8_e4m3fn - ) - - self.assertEqual(qdata.dtype, torch.float8_e4m3fn) - self.assertEqual(qdata.shape, float_tensor.shape) - self.assertIn('scale', layout_params) - self.assertIn('orig_dtype', layout_params) - self.assertEqual(layout_params['orig_dtype'], torch.float32) - - def test_dequantize(self): - """Test dequantization method""" - float_tensor = torch.ones(10, 20, dtype=torch.float32) * 3.0 - scale = torch.tensor(1.0) - - qdata, layout_params = TensorCoreFP8Layout.quantize( - float_tensor, - scale=scale, - dtype=torch.float8_e4m3fn - ) - - dequantized = TensorCoreFP8Layout.dequantize(qdata, **layout_params) - - # Should approximately match original - self.assertTrue(torch.allclose(dequantized, float_tensor, rtol=0.1, atol=0.1)) - - -class TestFallbackMechanism(unittest.TestCase): - """Test fallback for unsupported operations""" - - def test_unsupported_op_dequantizes(self): - """Test that unsupported operations fall back to dequantization""" - # Set seed for reproducibility - torch.manual_seed(42) - - # Create quantized tensor - a_fp32 = torch.randn(10, 20, dtype=torch.float32) - scale = torch.tensor(1.0) - a_q = QuantizedTensor.from_float( - a_fp32, - "TensorCoreFP8Layout", - scale=scale, - dtype=torch.float8_e4m3fn - ) - - # Call an operation that doesn't have a registered handler - # For example, torch.abs - result = torch.abs(a_q) - - # Should work via fallback (dequantize → abs → return) - self.assertNotIsInstance(result, QuantizedTensor) - expected = torch.abs(a_fp32) - # FP8 introduces quantization error, so use loose tolerance - mean_error = (result - expected).abs().mean() - self.assertLess(mean_error, 0.05, f"Mean error {mean_error:.4f} is too large") - - -if __name__ == "__main__": - unittest.main()
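
Note: the deleted file covered the old in-tree layout registry; its round-trip accuracy checks can still be expressed against plain PyTorch FP8 casting, independent of comfy_kitchen. A small sketch in the same unittest style, assuming the per-tensor scale scheme used elsewhere in this patch (amax divided by the FP8 E4M3 maximum):

```python
import unittest
import torch

class TestFP8RoundTrip(unittest.TestCase):
    """Pure-PyTorch sanity check of per-tensor FP8 quantization; it does not
    exercise comfy_kitchen itself."""

    def test_round_trip_error(self):
        torch.manual_seed(0)
        x = torch.randn(64, 32, dtype=torch.float32)
        scale = x.abs().amax() / torch.finfo(torch.float8_e4m3fn).max
        q = (x / scale).to(torch.float8_e4m3fn)       # quantize
        deq = q.to(torch.float32) * scale             # dequantize
        rel_err = ((deq - x).abs() / (x.abs() + 1e-6)).mean()
        self.assertLess(rel_err.item(), 0.1)

if __name__ == "__main__":
    unittest.main()
```
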