mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-04-14 20:42:31 +08:00
Fix OOM regression in _apply() for quantized models during inference
Skip unnecessary clone of inference-mode tensors when already inside
torch.inference_mode(), matching the existing guard in set_attr_param.
The unconditional clone introduced in 20561aa9 caused transient VRAM
doubling during model movement for FP8/quantized models.
This commit is contained in:
parent
31283d2892
commit
b7872e24f4
@ -1151,7 +1151,7 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec
|
||||
if param is None:
|
||||
continue
|
||||
p = fn(param)
|
||||
if p.is_inference():
|
||||
if (not torch.is_inference_mode_enabled()) and p.is_inference():
|
||||
p = p.clone()
|
||||
self.register_parameter(key, torch.nn.Parameter(p, requires_grad=False))
|
||||
for key, buf in self._buffers.items():
|
||||
|
||||
Loading…
Reference in New Issue
Block a user