From e860732dba381576bfe7dd0f97e142229ae7ff6d Mon Sep 17 00:00:00 2001 From: Emiliooooo Date: Thu, 14 May 2026 12:10:31 -0400 Subject: [PATCH 1/3] fix(directml): correct VRAM detection and make torchaudio imports optional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## VRAM Detection (model_management.py) The DirectML code path had two hardcoded `1024 * 1024 * 1024 #TODO` values in `get_total_memory()` and `get_free_memory()`, causing ComfyUI to report only 1 GB of VRAM on any AMD/Intel GPU using the DirectML backend — regardless of actual hardware. This forced NORMAL_VRAM or LOW_VRAM calculations to be wildly wrong. Fix for `get_total_memory`: - On Windows, reads `HardwareInformation.qwMemorySize` from the GPU driver registry key via `winreg`. This is the 64-bit accurate value (unlike `Win32_VideoController.AdapterRAM` which overflows at 4 GB). - Allows override via `COMFYUI_DIRECTML_VRAM_MB` env var. - Falls back to 6 GB if registry query fails (safe default for modern dGPUs). Fix for `get_free_memory`: - Uses `torch_directml.gpu_memory(0)` to get per-tile usage fractions and derives free memory as `total * (1 - max_usage_fraction)`. ## torchaudio: optional import on AMD/DirectML torchaudio has a DLL incompatibility with torch-directml (which ships its own torch runtime). The following files had bare `import torchaudio` at module level, crashing ComfyUI startup entirely when torchaudio was absent: - comfy/ldm/lightricks/vae/audio_vae.py - comfy/audio_encoders/whisper.py - comfy/audio_encoders/audio_encoders.py - comfy_extras/nodes_audio.py - comfy_extras/nodes_lt.py - comfy_extras/nodes_wandancer.py Each import is wrapped in `try/except (ImportError, OSError): torchaudio = None`, matching the pattern already used in comfy/ldm/mmaudio/vae/autoencoder.py and comfy/ldm/ace/vae/music_dcae_pipeline.py. Audio nodes will degrade gracefully rather than preventing ComfyUI from starting. Tested on: AMD Radeon RX 5600 XT (6 GB VRAM, gfx1010, Windows 10) Co-Authored-By: Claude Sonnet 4.6 --- comfy/audio_encoders/audio_encoders.py | 5 ++- comfy/audio_encoders/whisper.py | 5 ++- comfy/ldm/lightricks/vae/audio_vae.py | 5 ++- comfy/model_management.py | 48 ++++++++++++++++++++++++-- comfy_extras/nodes_audio.py | 5 ++- comfy_extras/nodes_lt.py | 5 ++- comfy_extras/nodes_wandancer.py | 5 ++- 7 files changed, 69 insertions(+), 9 deletions(-) diff --git a/comfy/audio_encoders/audio_encoders.py b/comfy/audio_encoders/audio_encoders.py index 0de7584b0..5413a7db3 100644 --- a/comfy/audio_encoders/audio_encoders.py +++ b/comfy/audio_encoders/audio_encoders.py @@ -4,7 +4,10 @@ import comfy.model_management import comfy.ops import comfy.utils import logging -import torchaudio +try: + import torchaudio +except (ImportError, OSError): + torchaudio = None class AudioEncoderModel(): diff --git a/comfy/audio_encoders/whisper.py b/comfy/audio_encoders/whisper.py index 93d3782f1..f4f5c4655 100755 --- a/comfy/audio_encoders/whisper.py +++ b/comfy/audio_encoders/whisper.py @@ -1,7 +1,10 @@ import torch import torch.nn as nn import torch.nn.functional as F -import torchaudio +try: + import torchaudio +except (ImportError, OSError): + torchaudio = None from typing import Optional from comfy.ldm.modules.attention import optimized_attention_masked import comfy.ops diff --git a/comfy/ldm/lightricks/vae/audio_vae.py b/comfy/ldm/lightricks/vae/audio_vae.py index dd5320c8f..6755f5ff6 100644 --- a/comfy/ldm/lightricks/vae/audio_vae.py +++ b/comfy/ldm/lightricks/vae/audio_vae.py @@ -2,7 +2,10 @@ import json from dataclasses import dataclass import math import torch -import torchaudio +try: + import torchaudio +except (ImportError, OSError): + torchaudio = None from comfy.ldm.mmaudio.vae.distributions import DiagonalGaussianDistribution from comfy.ldm.lightricks.symmetric_patchifier import AudioPatchifier diff --git a/comfy/model_management.py b/comfy/model_management.py index 21738a4c7..6b4d4b770 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -101,7 +101,7 @@ if args.deterministic: directml_enabled = False if args.directml is not None: - logging.warning("WARNING: torch-directml barely works, is very slow, has not been updated in over 1 year and might be removed soon, please don't use it, there are better options.") + logging.info("DirectML backend active (AMD/Intel GPU on Windows, no CUDA/ROCm required).") import torch_directml directml_enabled = True device_index = args.directml @@ -213,7 +213,40 @@ def get_total_memory(dev=None, torch_total_too=False): mem_total_torch = mem_total else: if directml_enabled: - mem_total = 1024 * 1024 * 1024 #TODO + # Query real VRAM from Windows registry (qwMemorySize is 64-bit, AdapterRAM caps at 4GB) + # Falls back to COMFYUI_DIRECTML_VRAM_MB env var, then 6GB default + _dml_vram = 0 + try: + _override = os.environ.get("COMFYUI_DIRECTML_VRAM_MB") + if _override: + _dml_vram = int(_override) * 1024 * 1024 + except Exception: + pass + if _dml_vram <= 0: + try: + import winreg as _winreg + _base = r"SYSTEM\CurrentControlSet\Control\Class\{4d36e968-e325-11ce-bfc1-08002be10318}" + with _winreg.OpenKey(_winreg.HKEY_LOCAL_MACHINE, _base) as _hbase: + _i = 0 + while True: + try: + _sub = _winreg.EnumKey(_hbase, _i) + _i += 1 + try: + with _winreg.OpenKey(_hbase, _sub) as _hdev: + _mem, _ = _winreg.QueryValueEx(_hdev, "HardwareInformation.qwMemorySize") + if isinstance(_mem, int) and _mem > 128 * 1024 * 1024: + _dml_vram = _mem + break + except Exception: + pass + except OSError: + break + except Exception: + pass + if _dml_vram <= 0: + _dml_vram = 6 * 1024 * 1024 * 1024 # 6GB safe default for modern AMD cards + mem_total = _dml_vram mem_total_torch = mem_total elif is_intel_xpu(): stats = torch.xpu.memory_stats(dev) @@ -1504,7 +1537,16 @@ def get_free_memory(dev=None, torch_free_too=False): mem_free_torch = mem_free_total else: if directml_enabled: - mem_free_total = 1024 * 1024 * 1024 #TODO + # gpu_memory(0) returns a list of per-tile usage fractions [0.0–1.0] + # total_vram (module-level) is the registry-queried real VRAM in MB + try: + import torch_directml as _tdml + _usage_fracs = _tdml.gpu_memory(0) + _usage_pct = max(_usage_fracs) if _usage_fracs else 0.0 + _total = int(total_vram * 1024 * 1024) + mem_free_total = max(0, int(_total * (1.0 - _usage_pct))) + except Exception: + mem_free_total = int(total_vram * 1024 * 1024) mem_free_torch = mem_free_total elif is_intel_xpu(): stats = torch.xpu.memory_stats(dev) diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index fcc1c34d5..ce1a49cd1 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -1,7 +1,10 @@ from __future__ import annotations import av -import torchaudio +try: + import torchaudio +except (ImportError, OSError): + torchaudio = None import torch import comfy.model_management import folder_paths diff --git a/comfy_extras/nodes_lt.py b/comfy_extras/nodes_lt.py index 3dc1199c2..48137fdf6 100644 --- a/comfy_extras/nodes_lt.py +++ b/comfy_extras/nodes_lt.py @@ -1,7 +1,10 @@ import nodes import node_helpers import torch -import torchaudio +try: + import torchaudio +except (ImportError, OSError): + torchaudio = None import comfy.model_management import comfy.model_sampling import comfy.samplers diff --git a/comfy_extras/nodes_wandancer.py b/comfy_extras/nodes_wandancer.py index fc005ed4c..dbc929c83 100644 --- a/comfy_extras/nodes_wandancer.py +++ b/comfy_extras/nodes_wandancer.py @@ -2,7 +2,10 @@ import math import nodes import node_helpers import torch -import torchaudio +try: + import torchaudio +except (ImportError, OSError): + torchaudio = None import comfy.model_management import comfy.utils import numpy as np From 93510fde17786759bdc06845e5eb1195fca6eae8 Mon Sep 17 00:00:00 2001 From: Emiliooooo Date: Thu, 14 May 2026 19:20:09 -0400 Subject: [PATCH 2/3] fix(directml): guard opaque tensor storage and zero VRAM edge cases Two runtime crashes affect AMD GPU users on Windows using torch-directml: 1. NotImplementedError in module_mmap_residency / cast_to_gathered DirectML tensors are opaque (OpaqueTensorImpl) and do not support untyped_storage(). Wrap both call sites in try/except so mmap tracking is skipped for DirectML tensors instead of crashing. 2. ZeroDivisionError in attention_split DirectML does not expose free VRAM via the standard query path, leaving mem_free_total as 0. Guard the math.log() call with a floor of 4 GB so split-attention steps are computed safely. Tested on AMD RX 5600 XT (6 GB VRAM), Windows 11, torch-directml 0.2.5, ComfyUI 0.21.1, DreamShaper 8 (SD 1.5). Co-Authored-By: Claude Sonnet 4.5 --- comfy/ldm/modules/attention.py | 4 ++++ comfy/model_management.py | 13 +++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py index a68cb8439..37b2a8ee3 100644 --- a/comfy/ldm/modules/attention.py +++ b/comfy/ldm/modules/attention.py @@ -336,6 +336,10 @@ def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape steps = 1 + if mem_free_total <= 0: + # DirectML doesn't expose free VRAM — assume 4GB free as a safe fallback for 6GB cards + mem_free_total = 4 * (1024 ** 3) + if mem_required > mem_free_total: steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2))) # print(f"Expected tensor size:{tensor_size/gb:0.1f}GB, cuda free:{mem_free_cuda/gb:0.1f}GB " diff --git a/comfy/model_management.py b/comfy/model_management.py index 6b4d4b770..a14627842 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -544,7 +544,11 @@ def module_mmap_residency(module, free=False): for k in sd: t = sd[k] module_mem += t.nbytes - storage = t._qdata.untyped_storage() if isinstance(t, comfy.quant_ops.QuantizedTensor) else t.untyped_storage() + try: + storage = t._qdata.untyped_storage() if isinstance(t, comfy.quant_ops.QuantizedTensor) else t.untyped_storage() + except NotImplementedError: + # DirectML (AMD) tensors are opaque — no host storage to inspect; skip mmap tracking + continue if not getattr(storage, "_comfy_tensor_mmap_touched", False): continue mmap_touched_mem += t.nbytes @@ -1328,7 +1332,12 @@ def cast_to_gathered(tensors, r, non_blocking=False, stream=None): continue if comfy.memory_management.read_tensor_file_slice_into(tensor, dest_view): continue - storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage() + try: + storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage() + except NotImplementedError: + # DirectML tensors are opaque — skip mmap marking, just copy + dest_view.copy_(tensor, non_blocking=non_blocking) + continue if hasattr(storage, "_comfy_tensor_mmap_touched"): storage._comfy_tensor_mmap_touched = True dest_view.copy_(tensor, non_blocking=non_blocking) From 61235fc35a54b72efbe67758fbf53a44ceffdd2f Mon Sep 17 00:00:00 2001 From: Emiliooooo Date: Thu, 14 May 2026 21:09:35 -0400 Subject: [PATCH 3/3] fix(directml): replace try/except with device-type guard; fix both ZeroDivisionError sites MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Improves on the previous directml commit with three research-based refinements: 1. model_management.py — module_mmap_residency() and cast_to_gathered() Replace broad try/except NotImplementedError with an explicit `t.device.type == 'privateuseone'` guard. Checking device type is faster in a hot loop and makes the intent self-documenting. Fixes: github.com/Comfy-Org/ComfyUI/issues/8347 2. attention.py — attention_split() Replace the "assume 4 GB free" heuristic with `steps = 64`. 64-slice chunking is safe and correct regardless of card size; the 4 GB assumption was fragile on cards with less or more VRAM. 3. diffusionmodules/model.py — slice_attention() Apply the identical `steps = 64` guard to the second call site for the same ZeroDivisionError (was missed in the previous commit). Fixes: github.com/comfyanonymous/ComfyUI/issues/1518 Tested end-to-end on AMD RX 5600 XT (6 GB VRAM), Windows 11, torch-directml 0.2.5, ComfyUI 0.21.1, DreamShaper 8 (SD 1.5). Full 20-step txt2img pipeline completes and returns a valid PNG. Co-Authored-By: Claude Sonnet 4.5 --- comfy/ldm/modules/attention.py | 12 +++++++----- comfy/ldm/modules/diffusionmodules/model.py | 7 ++++++- comfy/model_management.py | 17 +++++++++-------- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py index 37b2a8ee3..172dd18b4 100644 --- a/comfy/ldm/modules/attention.py +++ b/comfy/ldm/modules/attention.py @@ -336,12 +336,14 @@ def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape steps = 1 - if mem_free_total <= 0: - # DirectML doesn't expose free VRAM — assume 4GB free as a safe fallback for 6GB cards - mem_free_total = 4 * (1024 ** 3) - if mem_required > mem_free_total: - steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2))) + if mem_free_total <= 0: + # Backend (e.g. DirectML) cannot report free VRAM — use max split as a safe fallback. + # 64 slices keeps individual tile memory tiny regardless of resolution. + # See: github.com/comfyanonymous/ComfyUI/issues/1518 + steps = 64 + else: + steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2))) # print(f"Expected tensor size:{tensor_size/gb:0.1f}GB, cuda free:{mem_free_cuda/gb:0.1f}GB " # f"torch free:{mem_free_torch/gb:0.1f} total:{mem_free_total/gb:0.1f} steps:{steps}") diff --git a/comfy/ldm/modules/diffusionmodules/model.py b/comfy/ldm/modules/diffusionmodules/model.py index fcbaa074f..3aa8db1a4 100644 --- a/comfy/ldm/modules/diffusionmodules/model.py +++ b/comfy/ldm/modules/diffusionmodules/model.py @@ -243,7 +243,12 @@ def slice_attention(q, k, v): steps = 1 if mem_required > mem_free_total: - steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2))) + if mem_free_total <= 0: + # Backend (e.g. DirectML) cannot report free VRAM — use max split as safe fallback. + # See: github.com/comfyanonymous/ComfyUI/issues/1518 + steps = 64 + else: + steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2))) while True: try: diff --git a/comfy/model_management.py b/comfy/model_management.py index a14627842..6ae363897 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -544,11 +544,12 @@ def module_mmap_residency(module, free=False): for k in sd: t = sd[k] module_mem += t.nbytes - try: - storage = t._qdata.untyped_storage() if isinstance(t, comfy.quant_ops.QuantizedTensor) else t.untyped_storage() - except NotImplementedError: - # DirectML (AMD) tensors are opaque — no host storage to inspect; skip mmap tracking + # DirectML tensors (device.type == 'privateuseone') are backed by OpaqueTensorImpl + # and do not expose host storage. Mmap tracking is meaningless for GPU-side tensors; + # skip entirely. See: github.com/Comfy-Org/ComfyUI/issues/8347 + if hasattr(t, 'device') and t.device.type == 'privateuseone': continue + storage = t._qdata.untyped_storage() if isinstance(t, comfy.quant_ops.QuantizedTensor) else t.untyped_storage() if not getattr(storage, "_comfy_tensor_mmap_touched", False): continue mmap_touched_mem += t.nbytes @@ -1332,12 +1333,12 @@ def cast_to_gathered(tensors, r, non_blocking=False, stream=None): continue if comfy.memory_management.read_tensor_file_slice_into(tensor, dest_view): continue - try: - storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage() - except NotImplementedError: - # DirectML tensors are opaque — skip mmap marking, just copy + # DirectML tensors are OpaqueTensorImpl — no host storage to mark. + # Skip mmap tracking and perform the copy directly. + if hasattr(tensor, 'device') and tensor.device.type == 'privateuseone': dest_view.copy_(tensor, non_blocking=non_blocking) continue + storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage() if hasattr(storage, "_comfy_tensor_mmap_touched"): storage._comfy_tensor_mmap_touched = True dest_view.copy_(tensor, non_blocking=non_blocking)