diff --git a/comfy/audio_encoders/audio_encoders.py b/comfy/audio_encoders/audio_encoders.py
index 0de7584b0..5413a7db3 100644
--- a/comfy/audio_encoders/audio_encoders.py
+++ b/comfy/audio_encoders/audio_encoders.py
@@ -4,7 +4,10 @@ import comfy.model_management
 import comfy.ops
 import comfy.utils
 import logging
-import torchaudio
+try:
+    import torchaudio
+except (ImportError, OSError):
+    torchaudio = None
 
 
 class AudioEncoderModel():
diff --git a/comfy/audio_encoders/whisper.py b/comfy/audio_encoders/whisper.py
index 93d3782f1..f4f5c4655 100755
--- a/comfy/audio_encoders/whisper.py
+++ b/comfy/audio_encoders/whisper.py
@@ -1,7 +1,10 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-import torchaudio
+try:
+    import torchaudio
+except (ImportError, OSError):
+    torchaudio = None
 from typing import Optional
 from comfy.ldm.modules.attention import optimized_attention_masked
 import comfy.ops
diff --git a/comfy/ldm/lightricks/vae/audio_vae.py b/comfy/ldm/lightricks/vae/audio_vae.py
index dd5320c8f..6755f5ff6 100644
--- a/comfy/ldm/lightricks/vae/audio_vae.py
+++ b/comfy/ldm/lightricks/vae/audio_vae.py
@@ -2,7 +2,10 @@ import json
 from dataclasses import dataclass
 import math
 import torch
-import torchaudio
+try:
+    import torchaudio
+except (ImportError, OSError):
+    torchaudio = None
 from comfy.ldm.mmaudio.vae.distributions import DiagonalGaussianDistribution
 from comfy.ldm.lightricks.symmetric_patchifier import AudioPatchifier
 
diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py
index a68cb8439..172dd18b4 100644
--- a/comfy/ldm/modules/attention.py
+++ b/comfy/ldm/modules/attention.py
@@ -337,7 +337,13 @@ def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape
     steps = 1
 
     if mem_required > mem_free_total:
-        steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2)))
+        if mem_free_total <= 0:
+            # Backend (e.g. DirectML) cannot report free VRAM; use the max split as a safe fallback.
+            # 64 slices keeps individual tile memory tiny regardless of resolution.
+            # See: github.com/comfyanonymous/ComfyUI/issues/1518
+            steps = 64
+        else:
+            steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2)))
         # print(f"Expected tensor size:{tensor_size/gb:0.1f}GB, cuda free:{mem_free_cuda/gb:0.1f}GB "
         #       f"torch free:{mem_free_torch/gb:0.1f} total:{mem_free_total/gb:0.1f} steps:{steps}")
 
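Both split-attention hunks (comfy/ldm/modules/attention.py above, comfy/ldm/modules/diffusionmodules/model.py below) guard the same calculation: when a backend reports zero or negative free memory, mem_required / mem_free_total either divides by zero or hands math.log a non-positive value, so the unguarded line raises ZeroDivisionError or ValueError instead of falling back gracefully. A minimal standalone sketch of the guarded calculation; the function name, max_steps parameter, and asserts are illustrative, not part of the patch:

import math

def estimate_split_steps(mem_required, mem_free_total, max_steps=64):
    # No split needed when the full tensor fits.
    if mem_required <= mem_free_total:
        return 1
    # Unknown/unreported free VRAM (e.g. DirectML): take the maximum split.
    if mem_free_total <= 0:
        return max_steps
    # Otherwise round the required/free ratio up to the next power of two.
    return 2 ** math.ceil(math.log(mem_required / mem_free_total, 2))

assert estimate_split_steps(8, 16) == 1
assert estimate_split_steps(9, 2) == 8       # ratio 4.5 -> next power of two is 8
assert estimate_split_steps(10**9, 0) == 64  # DirectML-style unknown free VRAM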
diff --git a/comfy/ldm/modules/diffusionmodules/model.py b/comfy/ldm/modules/diffusionmodules/model.py
index fcbaa074f..3aa8db1a4 100644
--- a/comfy/ldm/modules/diffusionmodules/model.py
+++ b/comfy/ldm/modules/diffusionmodules/model.py
@@ -243,7 +243,12 @@ def slice_attention(q, k, v):
     steps = 1
 
     if mem_required > mem_free_total:
-        steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2)))
+        if mem_free_total <= 0:
+            # Backend (e.g. DirectML) cannot report free VRAM; use the max split as a safe fallback.
+            # See: github.com/comfyanonymous/ComfyUI/issues/1518
+            steps = 64
+        else:
+            steps = 2**(math.ceil(math.log(mem_required / mem_free_total, 2)))
 
     while True:
         try:
diff --git a/comfy/model_management.py b/comfy/model_management.py
index 21738a4c7..6ae363897 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -101,7 +101,7 @@ if args.deterministic:
 
 directml_enabled = False
 if args.directml is not None:
-    logging.warning("WARNING: torch-directml barely works, is very slow, has not been updated in over 1 year and might be removed soon, please don't use it, there are better options.")
+    logging.info("DirectML backend active (AMD/Intel GPU on Windows, no CUDA/ROCm required).")
     import torch_directml
     directml_enabled = True
     device_index = args.directml
@@ -213,7 +213,40 @@ def get_total_memory(dev=None, torch_total_too=False):
         mem_total_torch = mem_total
     else:
        if directml_enabled:
-            mem_total = 1024 * 1024 * 1024 #TODO
+            # Prefer the COMFYUI_DIRECTML_VRAM_MB env override, then query real VRAM from the
+            # Windows registry (qwMemorySize is 64-bit; AdapterRAM caps at 4GB), then a 6GB default.
+            _dml_vram = 0
+            try:
+                _override = os.environ.get("COMFYUI_DIRECTML_VRAM_MB")
+                if _override:
+                    _dml_vram = int(_override) * 1024 * 1024
+            except Exception:
+                pass
+            if _dml_vram <= 0:
+                try:
+                    import winreg as _winreg
+                    _base = r"SYSTEM\CurrentControlSet\Control\Class\{4d36e968-e325-11ce-bfc1-08002be10318}"
+                    with _winreg.OpenKey(_winreg.HKEY_LOCAL_MACHINE, _base) as _hbase:
+                        _i = 0
+                        while True:
+                            try:
+                                _sub = _winreg.EnumKey(_hbase, _i)
+                                _i += 1
+                                try:
+                                    with _winreg.OpenKey(_hbase, _sub) as _hdev:
+                                        _mem, _ = _winreg.QueryValueEx(_hdev, "HardwareInformation.qwMemorySize")
+                                        if isinstance(_mem, int) and _mem > 128 * 1024 * 1024:
+                                            _dml_vram = _mem
+                                            break
+                                except Exception:
+                                    pass
+                            except OSError:
+                                break
+                except Exception:
+                    pass
+            if _dml_vram <= 0:
+                _dml_vram = 6 * 1024 * 1024 * 1024  # 6GB safe default for modern AMD cards
+            mem_total = _dml_vram
             mem_total_torch = mem_total
         elif is_intel_xpu():
             stats = torch.xpu.memory_stats(dev)
@@ -511,6 +544,11 @@ def module_mmap_residency(module, free=False):
     for k in sd:
         t = sd[k]
         module_mem += t.nbytes
+        # DirectML tensors (device.type == 'privateuseone') are backed by OpaqueTensorImpl
+        # and do not expose host storage; mmap tracking is meaningless for GPU-side tensors,
+        # so skip them entirely. See: github.com/Comfy-Org/ComfyUI/issues/8347
+        if hasattr(t, 'device') and t.device.type == 'privateuseone':
+            continue
         storage = t._qdata.untyped_storage() if isinstance(t, comfy.quant_ops.QuantizedTensor) else t.untyped_storage()
         if not getattr(storage, "_comfy_tensor_mmap_touched", False):
             continue
@@ -1295,6 +1333,11 @@ def cast_to_gathered(tensors, r, non_blocking=False, stream=None):
             continue
         if comfy.memory_management.read_tensor_file_slice_into(tensor, dest_view):
             continue
+        # DirectML tensors are OpaqueTensorImpl; there is no host storage to mark.
+        # Skip mmap tracking and perform the copy directly.
+        if hasattr(tensor, 'device') and tensor.device.type == 'privateuseone':
+            dest_view.copy_(tensor, non_blocking=non_blocking)
+            continue
         storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage()
         if hasattr(storage, "_comfy_tensor_mmap_touched"):
             storage._comfy_tensor_mmap_touched = True
@@ -1504,7 +1547,16 @@ def get_free_memory(dev=None, torch_free_too=False):
         mem_free_torch = mem_free_total
     else:
         if directml_enabled:
-            mem_free_total = 1024 * 1024 * 1024 #TODO
+            # gpu_memory(0) returns a list of per-tile usage fractions [0.0-1.0];
+            # total_vram (module-level) is the registry-queried real VRAM in MB.
+            try:
+                import torch_directml as _tdml
+                _usage_fracs = _tdml.gpu_memory(0)
+                _usage_pct = max(_usage_fracs) if _usage_fracs else 0.0
+                _total = int(total_vram * 1024 * 1024)
+                mem_free_total = max(0, int(_total * (1.0 - _usage_pct)))
+            except Exception:
+                mem_free_total = int(total_vram * 1024 * 1024)
             mem_free_torch = mem_free_total
         elif is_intel_xpu():
             stats = torch.xpu.memory_stats(dev)
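The get_total_memory() change amounts to a three-step precedence chain: explicit COMFYUI_DIRECTML_VRAM_MB override, then the registry's 64-bit qwMemorySize value, then a 6GB default. A condensed cross-platform sketch of that chain; directml_vram_bytes and read_registry_bytes are hypothetical stand-ins for the patch's inline logic and its winreg walk, which only runs on Windows:

import os

def directml_vram_bytes(read_registry_bytes, default_bytes=6 * 1024**3):
    # 1) Manual override for adapters the registry misreports.
    override = os.environ.get("COMFYUI_DIRECTML_VRAM_MB")
    if override:
        try:
            return int(override) * 1024 * 1024
        except ValueError:
            pass  # the patch ignores a malformed override the same way
    # 2) Registry-reported VRAM (qwMemorySize), 3) safe default.
    vram = read_registry_bytes()
    return vram if vram > 0 else default_bytes

os.environ["COMFYUI_DIRECTML_VRAM_MB"] = "8192"
print(directml_vram_bytes(lambda: 0))  # 8589934592, i.e. 8GB from the override

The env var gives users a manual escape hatch when the registry value is missing or wrong for their adapter.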
diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py
index fcc1c34d5..ce1a49cd1 100644
--- a/comfy_extras/nodes_audio.py
+++ b/comfy_extras/nodes_audio.py
@@ -1,7 +1,10 @@
 from __future__ import annotations
 
 import av
-import torchaudio
+try:
+    import torchaudio
+except (ImportError, OSError):
+    torchaudio = None
 import torch
 import comfy.model_management
 import folder_paths
diff --git a/comfy_extras/nodes_lt.py b/comfy_extras/nodes_lt.py
index 3dc1199c2..48137fdf6 100644
--- a/comfy_extras/nodes_lt.py
+++ b/comfy_extras/nodes_lt.py
@@ -1,7 +1,10 @@
 import nodes
 import node_helpers
 import torch
-import torchaudio
+try:
+    import torchaudio
+except (ImportError, OSError):
+    torchaudio = None
 import comfy.model_management
 import comfy.model_sampling
 import comfy.samplers
diff --git a/comfy_extras/nodes_wandancer.py b/comfy_extras/nodes_wandancer.py
index fc005ed4c..dbc929c83 100644
--- a/comfy_extras/nodes_wandancer.py
+++ b/comfy_extras/nodes_wandancer.py
@@ -2,7 +2,10 @@ import math
 import nodes
 import node_helpers
 import torch
-import torchaudio
+try:
+    import torchaudio
+except (ImportError, OSError):
+    torchaudio = None
 import comfy.model_management
 import comfy.utils
 import numpy as np
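All six torchaudio imports now degrade to torchaudio = None instead of aborting at import time; catching OSError alongside ImportError matters because a torchaudio build whose native extension fails to load (missing or mismatched DLLs) raises OSError, not ImportError. Call sites that actually touch torchaudio still need an explicit check. A sketch of the call-site pattern; the resample helper is hypothetical, not part of the patch:

try:
    import torchaudio
except (ImportError, OSError):  # OSError: native extension failed to load
    torchaudio = None

def resample(waveform, sr_in, sr_out):
    # Fail with an actionable message rather than an AttributeError on None.
    if torchaudio is None:
        raise RuntimeError("torchaudio is required for audio resampling but could not be imported")
    return torchaudio.functional.resample(waveform, sr_in, sr_out)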