diff --git a/comfy/ops.py b/comfy/ops.py
index 96dffa85d..2a90a5ba2 100644
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -77,7 +77,10 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of
     # will add async-offload support to your cast and improve performance.
     if input is not None:
         if dtype is None:
-            dtype = input.dtype
+            if isinstance(input, QuantizedTensor):
+                dtype = input._layout_params["orig_dtype"]
+            else:
+                dtype = input.dtype
         if bias_dtype is None:
             bias_dtype = dtype
         if device is None:
@@ -534,18 +537,7 @@ if CUBLAS_IS_AVAILABLE:
 # ==============================================================================
 # Mixed Precision Operations
 # ==============================================================================
-from .quant_ops import QuantizedTensor
-
-QUANT_FORMAT_MIXINS = {
-    "float8_e4m3fn": {
-        "dtype": torch.float8_e4m3fn,
-        "layout_type": "TensorCoreFP8Layout",
-        "parameters": {
-            "weight_scale": torch.nn.Parameter(torch.zeros((), dtype=torch.float32), requires_grad=False),
-            "input_scale": torch.nn.Parameter(torch.zeros((), dtype=torch.float32), requires_grad=False),
-        }
-    }
-}
+from .quant_ops import QuantizedTensor, QUANT_ALGOS
 
 class MixedPrecisionOps(disable_weight_init):
     _layer_quant_config = {}
@@ -596,23 +588,24 @@ class MixedPrecisionOps(disable_weight_init):
                 if quant_format is None:
                     raise ValueError(f"Unknown quantization format for layer {layer_name}")
 
-                mixin = QUANT_FORMAT_MIXINS[quant_format]
-                self.layout_type = mixin["layout_type"]
+                qconfig = QUANT_ALGOS[quant_format]
+                self.layout_type = qconfig["comfy_tensor_layout"]
 
-                scale_key = f"{prefix}weight_scale"
+                weight_scale_key = f"{prefix}weight_scale"
                 layout_params = {
-                    'scale': state_dict.pop(scale_key, None),
-                    'orig_dtype': MixedPrecisionOps._compute_dtype
+                    'scale': state_dict.pop(weight_scale_key, None),
+                    'orig_dtype': MixedPrecisionOps._compute_dtype,
+                    'block_size': qconfig.get("group_size", None),
                 }
                 if layout_params['scale'] is not None:
-                    manually_loaded_keys.append(scale_key)
+                    manually_loaded_keys.append(weight_scale_key)
 
                 self.weight = torch.nn.Parameter(
-                    QuantizedTensor(weight.to(device=device, dtype=mixin["dtype"]), self.layout_type, layout_params),
+                    QuantizedTensor(weight.to(device=device), self.layout_type, layout_params),
                     requires_grad=False
                 )
 
-                for param_name, param_value in mixin["parameters"].items():
+                for param_name in qconfig["parameters"]:
                     param_key = f"{prefix}{param_name}"
                     _v = state_dict.pop(param_key, None)
                     if _v is None:
@@ -643,7 +636,7 @@ class MixedPrecisionOps(disable_weight_init):
             if (getattr(self, 'layout_type', None) is not None and
                 getattr(self, 'input_scale', None) is not None and
                 not isinstance(input, QuantizedTensor)):
-                input = QuantizedTensor.from_float(input, self.layout_type, scale=self.input_scale, fp8_dtype=self.weight.dtype)
+                input = QuantizedTensor.from_float(input, self.layout_type, scale=self.input_scale, dtype=self.weight.dtype)
             return self._forward(input, self.weight, self.bias)
 
 
diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py
index c56e32a73..1d058bece 100644
--- a/comfy/quant_ops.py
+++ b/comfy/quant_ops.py
@@ -74,6 +74,12 @@ def _copy_layout_params(params):
             new_params[k] = v
     return new_params
 
+def _copy_layout_params_inplace(src, dst, non_blocking=False):
+    for k, v in src.items():
+        if isinstance(v, torch.Tensor):
+            dst[k].copy_(v, non_blocking=non_blocking)
+        else:
+            dst[k] = v
 
 class QuantizedLayout:
     """
@@ -318,13 +324,13 @@ def generic_to_dtype_layout(func, args, kwargs):
 def generic_copy_(func, args, kwargs):
     qt_dest = args[0]
     src = args[1]
-
+    non_blocking = args[2] if len(args) > 2 else False
     if isinstance(qt_dest, QuantizedTensor):
         if isinstance(src, QuantizedTensor):
             # Copy from another quantized tensor
-            qt_dest._qdata.copy_(src._qdata)
+            qt_dest._qdata.copy_(src._qdata, non_blocking=non_blocking)
             qt_dest._layout_type = src._layout_type
-            qt_dest._layout_params = _copy_layout_params(src._layout_params)
+            _copy_layout_params_inplace(src._layout_params, qt_dest._layout_params, non_blocking=non_blocking)
         else:
             # Copy from regular tensor - just copy raw data
             qt_dest._qdata.copy_(src)
@@ -336,6 +342,26 @@ def generic_copy_(func, args, kwargs):
 def generic_has_compatible_shallow_copy_type(func, args, kwargs):
     return True
 
+
+@register_generic_util(torch.ops.aten.empty_like.default)
+def generic_empty_like(func, args, kwargs):
+    """Empty_like operation - creates an empty tensor with the same quantized structure."""
+    qt = args[0]
+    if isinstance(qt, QuantizedTensor):
+        # Create empty tensor with same shape and dtype as the quantized data
+        hp_dtype = kwargs.pop('dtype', qt._layout_params["orig_dtype"])
+        new_qdata = torch.empty_like(qt._qdata, **kwargs)
+
+        # Handle device transfer for layout params
+        target_device = kwargs.get('device', new_qdata.device)
+        new_params = _move_layout_params_to_device(qt._layout_params, target_device)
+
+        # Update orig_dtype if dtype is specified
+        new_params['orig_dtype'] = hp_dtype
+
+        return QuantizedTensor(new_qdata, qt._layout_type, new_params)
+    return func(*args, **kwargs)
+
 # ==============================================================================
 # FP8 Layout + Operation Handlers
 # ==============================================================================
@@ -378,6 +404,13 @@ class TensorCoreFP8Layout(QuantizedLayout):
     def get_plain_tensors(cls, qtensor):
         return qtensor._qdata, qtensor._layout_params['scale']
 
+QUANT_ALGOS = {
+    "float8_e4m3fn": {
+        "storage_t": torch.float8_e4m3fn,
+        "parameters": {"weight_scale", "input_scale"},
+        "comfy_tensor_layout": "TensorCoreFP8Layout",
+    },
+}
 
 LAYOUTS = {
     "TensorCoreFP8Layout": TensorCoreFP8Layout,