Mirror of https://github.com/comfyanonymous/ComfyUI.git (synced 2026-05-16 03:57:27 +08:00)
fix(directml): replace try/except with device-type guard; fix both ZeroDivisionError sites
Improves on the previous directml commit with three research-based refinements:

1. model_management.py — module_mmap_residency() and cast_to_gathered()
   Replace the broad try/except NotImplementedError with an explicit
   `t.device.type == 'privateuseone'` guard. Checking the device type is
   faster in a hot loop and makes the intent self-documenting.
   Fixes: github.com/Comfy-Org/ComfyUI/issues/8347

2. attention.py — attention_split()
   Replace the "assume 4 GB free" heuristic with `steps = 64`. A 64-slice
   split is safe and correct regardless of card size; the 4 GB assumption
   was fragile on cards with less or more VRAM.

3. diffusionmodules/model.py — slice_attention()
   Apply the identical `steps = 64` guard to the second call site of the
   same ZeroDivisionError, which was missed in the previous commit.
   Fixes: github.com/comfyanonymous/ComfyUI/issues/1518

Tested end-to-end on an AMD RX 5600 XT (6 GB VRAM), Windows 11,
torch-directml 0.2.5, ComfyUI 0.21.1, DreamShaper 8 (SD 1.5). A full
20-step txt2img pipeline completes and returns a valid PNG.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
parent 93510fde17 · commit 61235fc35a
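Before the diffs, a minimal sketch of the backend behavior the new guard keys on. This assumes the torch-directml package and the OpaqueTensorImpl behavior described in the commit message; the printed values are what the message implies, not a captured log.

# Sketch: how a DirectML tensor identifies itself. torch_directml registers
# PyTorch's PrivateUse1 backend, so tensors report device.type 'privateuseone'.
import torch
import torch_directml

t = torch.ones(4, device=torch_directml.device())
print(t.device.type)  # 'privateuseone': the value the new guard checks
try:
    t.untyped_storage()  # per the commit: raises, DirectML tensors are opaque
except NotImplementedError:
    print("no host storage to inspect, skip mmap tracking")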
attention.py
@@ -336,12 +336,14 @@ def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape
     steps = 1
 
-    if mem_free_total <= 0:
-        # DirectML doesn't expose free VRAM — assume 4GB free as a safe fallback for 6GB cards
-        mem_free_total = 4 * (1024 ** 3)
-
     if mem_required > mem_free_total:
-        steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2)))
+        if mem_free_total <= 0:
+            # Backend (e.g. DirectML) cannot report free VRAM — use max split as a safe fallback.
+            # 64 slices keeps individual tile memory tiny regardless of resolution.
+            # See: github.com/comfyanonymous/ComfyUI/issues/1518
+            steps = 64
+        else:
+            steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2)))
 
     # print(f"Expected tensor size:{tensor_size/gb:0.1f}GB, cuda free:{mem_free_cuda/gb:0.1f}GB "
     #       f"torch free:{mem_free_torch/gb:0.1f} total:{mem_free_total/gb:0.1f} steps:{steps}")
 
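As a sanity check on the "tiny regardless of resolution" comment, some back-of-envelope sizing. The workload numbers here are my own illustration (a 512x512 SD 1.5 latent of 64x64 = 4096 tokens, 8 heads, fp32 scores), not values from the diff.

# Rough arithmetic behind the steps = 64 fallback (assumed workload only).
tokens, heads, bytes_per_el = 4096, 8, 4
full_scores = heads * tokens * tokens * bytes_per_el  # one full attention map
per_slice = full_scores / 64                          # with steps = 64
print(f"full: {full_scores / 2**20:.0f} MiB, per slice: {per_slice / 2**20:.0f} MiB")
# full: 512 MiB, per slice: 8 MiB, small even when free VRAM is unknown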
diffusionmodules/model.py
@@ -243,7 +243,12 @@ def slice_attention(q, k, v):
     steps = 1
 
     if mem_required > mem_free_total:
-        steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2)))
+        if mem_free_total <= 0:
+            # Backend (e.g. DirectML) cannot report free VRAM — use max split as safe fallback.
+            # See: github.com/comfyanonymous/ComfyUI/issues/1518
+            steps = 64
+        else:
+            steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2)))
 
     while True:
         try:
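The else branch in both hunks rounds the oversubscription ratio up to a power of two. A standalone sketch of that logic, extracted for illustration (split_steps is my name, not an identifier in the codebase):

import math

def split_steps(mem_required, mem_free_total):
    """Hypothetical extraction of the branch shared by both call sites."""
    if mem_free_total <= 0:
        # Backend can't report free VRAM (e.g. DirectML): fixed max split,
        # which also avoids the ZeroDivisionError from dividing by it below.
        return 64
    return 2 ** math.ceil(math.log(mem_required / mem_free_total, 2))

print(split_steps(3 * 1024**3, 1 * 1024**3))  # ratio 3 -> 4 slices
print(split_steps(3 * 1024**3, 0))            # free VRAM unknown -> 64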
model_management.py
@@ -544,11 +544,12 @@ def module_mmap_residency(module, free=False):
     for k in sd:
         t = sd[k]
         module_mem += t.nbytes
-        try:
-            storage = t._qdata.untyped_storage() if isinstance(t, comfy.quant_ops.QuantizedTensor) else t.untyped_storage()
-        except NotImplementedError:
-            # DirectML (AMD) tensors are opaque — no host storage to inspect; skip mmap tracking
+        # DirectML tensors (device.type == 'privateuseone') are backed by OpaqueTensorImpl
+        # and do not expose host storage. Mmap tracking is meaningless for GPU-side tensors;
+        # skip entirely. See: github.com/Comfy-Org/ComfyUI/issues/8347
+        if hasattr(t, 'device') and t.device.type == 'privateuseone':
             continue
+        storage = t._qdata.untyped_storage() if isinstance(t, comfy.quant_ops.QuantizedTensor) else t.untyped_storage()
         if not getattr(storage, "_comfy_tensor_mmap_touched", False):
             continue
         mmap_touched_mem += t.nbytes
model_management.py
@@ -1332,12 +1333,12 @@ def cast_to_gathered(tensors, r, non_blocking=False, stream=None):
             continue
         if comfy.memory_management.read_tensor_file_slice_into(tensor, dest_view):
             continue
-        try:
-            storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage()
-        except NotImplementedError:
-            # DirectML tensors are opaque — skip mmap marking, just copy
+        # DirectML tensors are OpaqueTensorImpl — no host storage to mark.
+        # Skip mmap tracking and perform the copy directly.
+        if hasattr(tensor, 'device') and tensor.device.type == 'privateuseone':
             dest_view.copy_(tensor, non_blocking=non_blocking)
             continue
+        storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage()
         if hasattr(storage, "_comfy_tensor_mmap_touched"):
            storage._comfy_tensor_mmap_touched = True
         dest_view.copy_(tensor, non_blocking=non_blocking)
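On the "faster in a hot loop" claim: a stand-in micro-benchmark. The Opaque class below mimics a tensor whose untyped_storage() raises; it is mine, not the real DirectML tensor type, so treat the comparison as indicative only.

# Guard vs. try/except: one attribute comparison against a raise + catch per tensor.
import timeit

class Opaque:
    device_type = 'privateuseone'
    def untyped_storage(self):
        raise NotImplementedError

t = Opaque()

def guard():                       # new approach: one attribute comparison
    return t.device_type == 'privateuseone'

def eafp():                        # old approach: raise + catch per tensor
    try:
        t.untyped_storage()
        return False
    except NotImplementedError:
        return True

print(timeit.timeit(guard, number=1_000_000))
print(timeit.timeit(eafp, number=1_000_000))   # noticeably slower per call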