From 61235fc35a54b72efbe67758fbf53a44ceffdd2f Mon Sep 17 00:00:00 2001 From: Emiliooooo Date: Thu, 14 May 2026 21:09:35 -0400 Subject: [PATCH] fix(directml): replace try/except with device-type guard; fix both ZeroDivisionError sites MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Improves on the previous directml commit with three research-based refinements: 1. model_management.py — module_mmap_residency() and cast_to_gathered() Replace broad try/except NotImplementedError with an explicit `t.device.type == 'privateuseone'` guard. Checking device type is faster in a hot loop and makes the intent self-documenting. Fixes: github.com/Comfy-Org/ComfyUI/issues/8347 2. attention.py — attention_split() Replace the "assume 4 GB free" heuristic with `steps = 64`. 64-slice chunking is safe and correct regardless of card size; the 4 GB assumption was fragile on cards with less or more VRAM. 3. diffusionmodules/model.py — slice_attention() Apply the identical `steps = 64` guard to the second call site for the same ZeroDivisionError (was missed in the previous commit). Fixes: github.com/comfyanonymous/ComfyUI/issues/1518 Tested end-to-end on AMD RX 5600 XT (6 GB VRAM), Windows 11, torch-directml 0.2.5, ComfyUI 0.21.1, DreamShaper 8 (SD 1.5). Full 20-step txt2img pipeline completes and returns a valid PNG. 
Co-Authored-By: Claude Sonnet 4.5 --- comfy/ldm/modules/attention.py | 12 +++++++----- comfy/ldm/modules/diffusionmodules/model.py | 7 ++++++- comfy/model_management.py | 17 +++++++++-------- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py index 37b2a8ee3..172dd18b4 100644 --- a/comfy/ldm/modules/attention.py +++ b/comfy/ldm/modules/attention.py @@ -336,12 +336,14 @@ def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape steps = 1 - if mem_free_total <= 0: - # DirectML doesn't expose free VRAM — assume 4GB free as a safe fallback for 6GB cards - mem_free_total = 4 * (1024 ** 3) - if mem_required > mem_free_total: - steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2))) + if mem_free_total <= 0: + # Backend (e.g. DirectML) cannot report free VRAM — use max split as a safe fallback. + # 64 slices keeps individual tile memory tiny regardless of resolution. + # See: github.com/comfyanonymous/ComfyUI/issues/1518 + steps = 64 + elif mem_required > mem_free_total: + steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2))) # print(f"Expected tensor size:{tensor_size/gb:0.1f}GB, cuda free:{mem_free_cuda/gb:0.1f}GB " # f"torch free:{mem_free_torch/gb:0.1f} total:{mem_free_total/gb:0.1f} steps:{steps}") diff --git a/comfy/ldm/modules/diffusionmodules/model.py b/comfy/ldm/modules/diffusionmodules/model.py index fcbaa074f..3aa8db1a4 100644 --- a/comfy/ldm/modules/diffusionmodules/model.py +++ b/comfy/ldm/modules/diffusionmodules/model.py @@ -243,7 +243,12 @@ def slice_attention(q, k, v): steps = 1 if mem_required > mem_free_total: - steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2))) + if mem_free_total <= 0: + # Backend (e.g. DirectML) cannot report free VRAM — use max split as safe fallback. 
+ # See: github.com/comfyanonymous/ComfyUI/issues/1518 + steps = 64 + else: + steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2))) while True: try: diff --git a/comfy/model_management.py b/comfy/model_management.py index a14627842..6ae363897 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -544,11 +544,12 @@ def module_mmap_residency(module, free=False): for k in sd: t = sd[k] module_mem += t.nbytes - try: - storage = t._qdata.untyped_storage() if isinstance(t, comfy.quant_ops.QuantizedTensor) else t.untyped_storage() - except NotImplementedError: - # DirectML (AMD) tensors are opaque — no host storage to inspect; skip mmap tracking + # DirectML tensors (device.type == 'privateuseone') are backed by OpaqueTensorImpl + # and do not expose host storage. Mmap tracking is meaningless for GPU-side tensors; + # skip entirely. See: github.com/Comfy-Org/ComfyUI/issues/8347 + if hasattr(t, 'device') and t.device.type == 'privateuseone': continue + storage = t._qdata.untyped_storage() if isinstance(t, comfy.quant_ops.QuantizedTensor) else t.untyped_storage() if not getattr(storage, "_comfy_tensor_mmap_touched", False): continue mmap_touched_mem += t.nbytes @@ -1332,12 +1333,12 @@ def cast_to_gathered(tensors, r, non_blocking=False, stream=None): continue if comfy.memory_management.read_tensor_file_slice_into(tensor, dest_view): continue - try: - storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage() - except NotImplementedError: - # DirectML tensors are opaque — skip mmap marking, just copy + # DirectML tensors are OpaqueTensorImpl — no host storage to mark. + # Skip mmap tracking and perform the copy directly. 
+ if hasattr(tensor, 'device') and tensor.device.type == 'privateuseone': dest_view.copy_(tensor, non_blocking=non_blocking) continue + storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage() if hasattr(storage, "_comfy_tensor_mmap_touched"): storage._comfy_tensor_mmap_touched = True dest_view.copy_(tensor, non_blocking=non_blocking)