From e860732dba381576bfe7dd0f97e142229ae7ff6d Mon Sep 17 00:00:00 2001
From: Emiliooooo <emiliooooo@local>
Date: Thu, 14 May 2026 12:10:31 -0400
Subject: [PATCH 1/3] fix(directml): correct VRAM detection and make torchaudio
 imports optional
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## VRAM Detection (model_management.py)

The DirectML code path had two hardcoded `1024 * 1024 * 1024 #TODO` values
in `get_total_memory()` and `get_free_memory()`, causing ComfyUI to report
only 1 GB of VRAM on any AMD/Intel GPU using the DirectML backend — regardless
of actual hardware. This forced NORMAL_VRAM or LOW_VRAM calculations to be
wildly wrong.

Fix for `get_total_memory`:
- On Windows, reads `HardwareInformation.qwMemorySize` from the GPU driver
  registry key via `winreg`. This is the 64-bit accurate value (unlike
  `Win32_VideoController.AdapterRAM` which overflows at 4 GB).
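- Allows override via `COMFYUI_DIRECTML_VRAM_MB` env var; when set to a
  positive integer it is checked first and takes precedence over the registry.
- Falls back to 6 GB if no override is set and the registry query yields no
  usable value (safe default for modern dGPUs).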

Fix for `get_free_memory`:
- Uses `torch_directml.gpu_memory(0)` to get per-tile usage fractions and
  derives free memory as `total * (1 - max_usage_fraction)`.

## torchaudio: optional import on AMD/DirectML

torchaudio has a DLL incompatibility with torch-directml (which ships its own
torch runtime). The following files had bare `import torchaudio` at module
level, crashing ComfyUI startup entirely when torchaudio was absent:

- comfy/ldm/lightricks/vae/audio_vae.py
- comfy/audio_encoders/whisper.py
- comfy/audio_encoders/audio_encoders.py
- comfy_extras/nodes_audio.py
- comfy_extras/nodes_lt.py
- comfy_extras/nodes_wandancer.py

Each import is wrapped in `try/except (ImportError, OSError): torchaudio = None`,
matching the pattern already used in comfy/ldm/mmaudio/vae/autoencoder.py and
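comfy/ldm/ace/vae/music_dcae_pipeline.py. ComfyUI now starts even when
torchaudio cannot be imported. This patch adds no further guards at the call
sites, so code paths that actually use torchaudio will presumably still fail
when they are executed, rather than preventing ComfyUI from starting.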

Tested on: AMD Radeon RX 5600 XT (6 GB VRAM, gfx1010, Windows 10)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 comfy/audio_encoders/audio_encoders.py |  5 ++-
 comfy/audio_encoders/whisper.py        |  5 ++-
 comfy/ldm/lightricks/vae/audio_vae.py  |  5 ++-
 comfy/model_management.py              | 48 ++++++++++++++++++++++++--
 comfy_extras/nodes_audio.py            |  5 ++-
 comfy_extras/nodes_lt.py               |  5 ++-
 comfy_extras/nodes_wandancer.py        |  5 ++-
 7 files changed, 69 insertions(+), 9 deletions(-)

diff --git a/comfy/audio_encoders/audio_encoders.py b/comfy/audio_encoders/audio_encoders.py
index 0de7584b0..5413a7db3 100644
--- a/comfy/audio_encoders/audio_encoders.py
+++ b/comfy/audio_encoders/audio_encoders.py
@@ -4,7 +4,10 @@ import comfy.model_management
 import comfy.ops
 import comfy.utils
 import logging
-import torchaudio
+try:
+    import torchaudio
+except (ImportError, OSError):
+    torchaudio = None
 
 
 class AudioEncoderModel():
diff --git a/comfy/audio_encoders/whisper.py b/comfy/audio_encoders/whisper.py
index 93d3782f1..f4f5c4655 100755
--- a/comfy/audio_encoders/whisper.py
+++ b/comfy/audio_encoders/whisper.py
@@ -1,7 +1,10 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-import torchaudio
+try:
+    import torchaudio
+except (ImportError, OSError):
+    torchaudio = None
 from typing import Optional
 from comfy.ldm.modules.attention import optimized_attention_masked
 import comfy.ops
diff --git a/comfy/ldm/lightricks/vae/audio_vae.py b/comfy/ldm/lightricks/vae/audio_vae.py
index dd5320c8f..6755f5ff6 100644
--- a/comfy/ldm/lightricks/vae/audio_vae.py
+++ b/comfy/ldm/lightricks/vae/audio_vae.py
@@ -2,7 +2,10 @@ import json
 from dataclasses import dataclass
 import math
 import torch
-import torchaudio
+try:
+    import torchaudio
+except (ImportError, OSError):
+    torchaudio = None
 
 from comfy.ldm.mmaudio.vae.distributions import DiagonalGaussianDistribution
 from comfy.ldm.lightricks.symmetric_patchifier import AudioPatchifier
diff --git a/comfy/model_management.py b/comfy/model_management.py
index 21738a4c7..6b4d4b770 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -101,7 +101,7 @@ if args.deterministic:
 
 directml_enabled = False
 if args.directml is not None:
-    logging.warning("WARNING: torch-directml barely works, is very slow, has not been updated in over 1 year and might be removed soon, please don't use it, there are better options.")
+    logging.info("DirectML backend active (AMD/Intel GPU on Windows, no CUDA/ROCm required).")
     import torch_directml
     directml_enabled = True
     device_index = args.directml
@@ -213,7 +213,40 @@ def get_total_memory(dev=None, torch_total_too=False):
         mem_total_torch = mem_total
     else:
         if directml_enabled:
-            mem_total = 1024 * 1024 * 1024 #TODO
+            # COMFYUI_DIRECTML_VRAM_MB (if set to a positive integer) takes priority; otherwise query real VRAM
+            # from the Windows registry (qwMemorySize is 64-bit, AdapterRAM caps at 4GB), then default to 6GB
+            _dml_vram = 0
+            try:
+                _override = os.environ.get("COMFYUI_DIRECTML_VRAM_MB")
+                if _override:
+                    _dml_vram = int(_override) * 1024 * 1024
+            except Exception:
+                pass
+            if _dml_vram <= 0:
+                try:
+                    import winreg as _winreg
+                    _base = r"SYSTEM\CurrentControlSet\Control\Class\{4d36e968-e325-11ce-bfc1-08002be10318}"
+                    with _winreg.OpenKey(_winreg.HKEY_LOCAL_MACHINE, _base) as _hbase:
+                        _i = 0
+                        while True:
+                            try:
+                                _sub = _winreg.EnumKey(_hbase, _i)
+                                _i += 1
+                                try:
+                                    with _winreg.OpenKey(_hbase, _sub) as _hdev:
+                                        _mem, _ = _winreg.QueryValueEx(_hdev, "HardwareInformation.qwMemorySize")
+                                        if isinstance(_mem, int) and _mem > 128 * 1024 * 1024:
+                                            _dml_vram = _mem
+                                            break
+                                except Exception:
+                                    pass
+                            except OSError:
+                                break
+                except Exception:
+                    pass
+            if _dml_vram <= 0:
+                _dml_vram = 6 * 1024 * 1024 * 1024  # 6GB safe default for modern AMD cards
+            mem_total = _dml_vram
             mem_total_torch = mem_total
         elif is_intel_xpu():
             stats = torch.xpu.memory_stats(dev)
@@ -1504,7 +1537,16 @@ def get_free_memory(dev=None, torch_free_too=False):
         mem_free_torch = mem_free_total
     else:
         if directml_enabled:
-            mem_free_total = 1024 * 1024 * 1024 #TODO
+            # gpu_memory(0) returns a list of per-tile usage fractions [0.0–1.0]
+            # total_vram (module-level) is the registry-queried real VRAM in MB
+            try:
+                import torch_directml as _tdml
+                _usage_fracs = _tdml.gpu_memory(0)
+                _usage_pct = max(_usage_fracs) if _usage_fracs else 0.0
+                _total = int(total_vram * 1024 * 1024)
+                mem_free_total = max(0, int(_total * (1.0 - _usage_pct)))
+            except Exception:
+                mem_free_total = int(total_vram * 1024 * 1024)
             mem_free_torch = mem_free_total
         elif is_intel_xpu():
             stats = torch.xpu.memory_stats(dev)
diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py
index fcc1c34d5..ce1a49cd1 100644
--- a/comfy_extras/nodes_audio.py
+++ b/comfy_extras/nodes_audio.py
@@ -1,7 +1,10 @@
 from __future__ import annotations
 
 import av
-import torchaudio
+try:
+    import torchaudio
+except (ImportError, OSError):
+    torchaudio = None
 import torch
 import comfy.model_management
 import folder_paths
diff --git a/comfy_extras/nodes_lt.py b/comfy_extras/nodes_lt.py
index 3dc1199c2..48137fdf6 100644
--- a/comfy_extras/nodes_lt.py
+++ b/comfy_extras/nodes_lt.py
@@ -1,7 +1,10 @@
 import nodes
 import node_helpers
 import torch
-import torchaudio
+try:
+    import torchaudio
+except (ImportError, OSError):
+    torchaudio = None
 import comfy.model_management
 import comfy.model_sampling
 import comfy.samplers
diff --git a/comfy_extras/nodes_wandancer.py b/comfy_extras/nodes_wandancer.py
index fc005ed4c..dbc929c83 100644
--- a/comfy_extras/nodes_wandancer.py
+++ b/comfy_extras/nodes_wandancer.py
@@ -2,7 +2,10 @@ import math
 import nodes
 import node_helpers
 import torch
-import torchaudio
+try:
+    import torchaudio
+except (ImportError, OSError):
+    torchaudio = None
 import comfy.model_management
 import comfy.utils
 import numpy as np

From 93510fde17786759bdc06845e5eb1195fca6eae8 Mon Sep 17 00:00:00 2001
From: Emiliooooo <emiliooooo@local>
Date: Thu, 14 May 2026 19:20:09 -0400
Subject: [PATCH 2/3] fix(directml): guard opaque tensor storage and zero VRAM
 edge cases

Two runtime crashes affect AMD GPU users on Windows using torch-directml:

1. NotImplementedError in module_mmap_residency / cast_to_gathered
   DirectML tensors are opaque (OpaqueTensorImpl) and do not support
   untyped_storage(). Wrap both call sites in try/except so mmap
   tracking is skipped for DirectML tensors instead of crashing.

2. ZeroDivisionError in attention_split
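   DirectML does not expose free VRAM via the standard query path,
   leaving mem_free_total as 0, so `mem_required / mem_free_total`
   divides by zero. When mem_free_total <= 0, substitute an assumed
   4 GB of free memory before the math.log() call so split-attention
   steps are computed safely.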

Tested on AMD RX 5600 XT (6 GB VRAM), Windows 11, torch-directml 0.2.5,
ComfyUI 0.21.1, DreamShaper 8 (SD 1.5).

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 comfy/ldm/modules/attention.py |  4 ++++
 comfy/model_management.py      | 13 +++++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py
index a68cb8439..37b2a8ee3 100644
--- a/comfy/ldm/modules/attention.py
+++ b/comfy/ldm/modules/attention.py
@@ -336,6 +336,10 @@ def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape
     steps = 1
 
 
+    if mem_free_total <= 0:
+        # DirectML doesn't expose free VRAM — assume 4GB free as a safe fallback for 6GB cards
+        mem_free_total = 4 * (1024 ** 3)
+
     if mem_required > mem_free_total:
         steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2)))
         # print(f"Expected tensor size:{tensor_size/gb:0.1f}GB, cuda free:{mem_free_cuda/gb:0.1f}GB "
diff --git a/comfy/model_management.py b/comfy/model_management.py
index 6b4d4b770..a14627842 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -544,7 +544,11 @@ def module_mmap_residency(module, free=False):
     for k in sd:
         t = sd[k]
         module_mem += t.nbytes
-        storage = t._qdata.untyped_storage() if isinstance(t, comfy.quant_ops.QuantizedTensor) else t.untyped_storage()
+        try:
+            storage = t._qdata.untyped_storage() if isinstance(t, comfy.quant_ops.QuantizedTensor) else t.untyped_storage()
+        except NotImplementedError:
+            # DirectML (AMD) tensors are opaque — no host storage to inspect; skip mmap tracking
+            continue
         if not getattr(storage, "_comfy_tensor_mmap_touched", False):
             continue
         mmap_touched_mem += t.nbytes
@@ -1328,7 +1332,12 @@ def cast_to_gathered(tensors, r, non_blocking=False, stream=None):
                 continue
             if comfy.memory_management.read_tensor_file_slice_into(tensor, dest_view):
                 continue
-            storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage()
+            try:
+                storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage()
+            except NotImplementedError:
+                # DirectML tensors are opaque — skip mmap marking, just copy
+                dest_view.copy_(tensor, non_blocking=non_blocking)
+                continue
             if hasattr(storage, "_comfy_tensor_mmap_touched"):
                 storage._comfy_tensor_mmap_touched = True
             dest_view.copy_(tensor, non_blocking=non_blocking)

From 61235fc35a54b72efbe67758fbf53a44ceffdd2f Mon Sep 17 00:00:00 2001
From: Emiliooooo <emiliooooo@local>
Date: Thu, 14 May 2026 21:09:35 -0400
Subject: [PATCH 3/3] fix(directml): replace try/except with device-type guard;
 fix both ZeroDivisionError sites
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Improves on the previous directml commit with three research-based refinements:

1. model_management.py — module_mmap_residency() and cast_to_gathered()
   Replace broad try/except NotImplementedError with an explicit
   `t.device.type == 'privateuseone'` guard. Checking device type is
   faster in a hot loop and makes the intent self-documenting.
   Fixes: github.com/Comfy-Org/ComfyUI/issues/8347

2. attention.py — attention_split()
   Replace the "assume 4 GB free" heuristic with `steps = 64`.
   64-slice chunking is safe and correct regardless of card size;
   the 4 GB assumption was fragile on cards with less or more VRAM.

3. diffusionmodules/model.py — slice_attention()
   Apply the identical `steps = 64` guard to the second call site
   for the same ZeroDivisionError (was missed in the previous commit).
   Fixes: github.com/comfyanonymous/ComfyUI/issues/1518

Tested end-to-end on AMD RX 5600 XT (6 GB VRAM), Windows 11,
torch-directml 0.2.5, ComfyUI 0.21.1, DreamShaper 8 (SD 1.5).
Full 20-step txt2img pipeline completes and returns a valid PNG.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 comfy/ldm/modules/attention.py              | 12 +++++++-----
 comfy/ldm/modules/diffusionmodules/model.py |  7 ++++++-
 comfy/model_management.py                   | 17 +++++++++--------
 3 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py
index 37b2a8ee3..172dd18b4 100644
--- a/comfy/ldm/modules/attention.py
+++ b/comfy/ldm/modules/attention.py
@@ -336,12 +336,14 @@ def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape
     steps = 1
 
 
-    if mem_free_total <= 0:
-        # DirectML doesn't expose free VRAM — assume 4GB free as a safe fallback for 6GB cards
-        mem_free_total = 4 * (1024 ** 3)
-
     if mem_required > mem_free_total:
-        steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2)))
+        if mem_free_total <= 0:
+            # Backend (e.g. DirectML) cannot report free VRAM — use max split as a safe fallback.
+            # 64 slices keeps individual tile memory tiny regardless of resolution.
+            # See: github.com/comfyanonymous/ComfyUI/issues/1518
+            steps = 64
+        else:
+            steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2)))
         # print(f"Expected tensor size:{tensor_size/gb:0.1f}GB, cuda free:{mem_free_cuda/gb:0.1f}GB "
         #      f"torch free:{mem_free_torch/gb:0.1f} total:{mem_free_total/gb:0.1f} steps:{steps}")
 
diff --git a/comfy/ldm/modules/diffusionmodules/model.py b/comfy/ldm/modules/diffusionmodules/model.py
index fcbaa074f..3aa8db1a4 100644
--- a/comfy/ldm/modules/diffusionmodules/model.py
+++ b/comfy/ldm/modules/diffusionmodules/model.py
@@ -243,7 +243,12 @@ def slice_attention(q, k, v):
     steps = 1
 
     if mem_required > mem_free_total:
-        steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2)))
+        if mem_free_total <= 0:
+            # Backend (e.g. DirectML) cannot report free VRAM — use max split as safe fallback.
+            # See: github.com/comfyanonymous/ComfyUI/issues/1518
+            steps = 64
+        else:
+            steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2)))
 
     while True:
         try:
diff --git a/comfy/model_management.py b/comfy/model_management.py
index a14627842..6ae363897 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -544,11 +544,12 @@ def module_mmap_residency(module, free=False):
     for k in sd:
         t = sd[k]
         module_mem += t.nbytes
-        try:
-            storage = t._qdata.untyped_storage() if isinstance(t, comfy.quant_ops.QuantizedTensor) else t.untyped_storage()
-        except NotImplementedError:
-            # DirectML (AMD) tensors are opaque — no host storage to inspect; skip mmap tracking
+        # DirectML tensors (device.type == 'privateuseone') are backed by OpaqueTensorImpl
+        # and do not expose host storage. Mmap tracking is meaningless for GPU-side tensors;
+        # skip entirely. See: github.com/Comfy-Org/ComfyUI/issues/8347
+        if hasattr(t, 'device') and t.device.type == 'privateuseone':
             continue
+        storage = t._qdata.untyped_storage() if isinstance(t, comfy.quant_ops.QuantizedTensor) else t.untyped_storage()
         if not getattr(storage, "_comfy_tensor_mmap_touched", False):
             continue
         mmap_touched_mem += t.nbytes
@@ -1332,12 +1333,12 @@ def cast_to_gathered(tensors, r, non_blocking=False, stream=None):
                 continue
             if comfy.memory_management.read_tensor_file_slice_into(tensor, dest_view):
                 continue
-            try:
-                storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage()
-            except NotImplementedError:
-                # DirectML tensors are opaque — skip mmap marking, just copy
+            # DirectML tensors are OpaqueTensorImpl — no host storage to mark.
+            # Skip mmap tracking and perform the copy directly.
+            if hasattr(tensor, 'device') and tensor.device.type == 'privateuseone':
                 dest_view.copy_(tensor, non_blocking=non_blocking)
                 continue
+            storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage()
             if hasattr(storage, "_comfy_tensor_mmap_touched"):
                 storage._comfy_tensor_mmap_touched = True
             dest_view.copy_(tensor, non_blocking=non_blocking)