diff --git a/baselines/system_info.txt b/baselines/system_info.txt index b0c103fa9..c5cb2b08e 100644 --- a/baselines/system_info.txt +++ b/baselines/system_info.txt @@ -28,7 +28,7 @@ torchaudio: scottt/rocm-TheRock v6.5.0rc-pytorch/torchaudio-2.6.0a0+1a8f621-cp3 numpy: pinned to <2 (1.26.4) for wheel compatibility --- Startup --- -cd C:\\Users\\LvHHu\\ComfyUI +cd %USERPROFILE%\\ComfyUI .\\venv\\Scripts\\activate python main.py diff --git a/baselines/system_info_realvisxl.txt b/baselines/system_info_realvisxl.txt index 8a6d3b839..988361e69 100644 --- a/baselines/system_info_realvisxl.txt +++ b/baselines/system_info_realvisxl.txt @@ -22,7 +22,7 @@ Functions: _load_safetensors_no_mmap(), _LARGE_FILE_MMAP_THRESHOLD = 4_000_000_0 Branch: load_torch_file() elif os.path.getsize > threshold and cuda available --- Startup command --- -cd C:\Users\LvHHu\ComfyUI +cd %USERPROFILE%\ComfyUI .\venv\Scripts\activate python main.py --disable-dynamic-vram --disable-mmap diff --git a/comfy/utils.py b/comfy/utils.py index ceddf7af1..32db92e61 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -22,6 +22,7 @@ import math import struct import ctypes import os +import sys import comfy.memory_management import safetensors.torch import numpy as np @@ -122,9 +123,11 @@ def load_safetensors(ckpt): _LARGE_FILE_MMAP_THRESHOLD = 4_000_000_000 # 4 GB -def _load_safetensors_no_mmap(ckpt): +def _load_safetensors_no_mmap(ckpt, device=None): # Windows + ROCm/CUDA UMA: large mmaps fail after GPU virtual address space is reserved. # Read tensors sequentially from file instead. + if device is None: + device = torch.device("cpu") sd = {} with open(ckpt, "rb") as fh: header_len = struct.unpack(" _LARGE_FILE_MMAP_THRESHOLD and torch.cuda.is_available(): - # File > 4 GB with active CUDA/ROCm: mmap would exhaust Windows virtual - # address space reserved by UMA GPU init. Use sequential file-read instead. 
- sd, metadata = _load_safetensors_no_mmap(ckpt) + elif (os.path.getsize(ckpt) > _LARGE_FILE_MMAP_THRESHOLD + and sys.platform == "win32" + and torch.cuda.is_available()): + # Windows ROCm/UMA: GPU init reserves ~14 GB of virtual address space, + # preventing mmap of files >4 GB. Use sequential file-read instead. + # Scoped to Windows only to avoid overhead on Linux/Mac CUDA systems. + sd, metadata = _load_safetensors_no_mmap(ckpt, device=device) if not return_metadata: metadata = None else: