mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-07-03 13:19:23 +08:00
fix: add no-mmap safetensors loader for >4GB files on Windows ROCm/UMA
Root cause: on Strix Halo (UMA), ROCm initialization reserves ~14 GB of Windows virtual address space for the GPU. This prevents safetensors from mmap-ing files larger than ~4 GB (e.g. SDXL fp16, ~6.5 GB), causing access violations. SD1.5 (3.97 GB) is below the threshold and unaffected. Fix in comfy/utils.py: - Add _LARGE_FILE_MMAP_THRESHOLD = 4_000_000_000. - Add _load_safetensors_no_mmap(), which reads tensors via open() + seek() + read() instead of mmap, then clones each tensor for independent ownership. - In load_torch_file(), automatically route files larger than the threshold through _load_safetensors_no_mmap() when torch.cuda.is_available() is true. Tested: RealVisXL_V4.0.safetensors (6.46 GB) loads and generates 768x1024 portrait images at ~5 it/s on an AMD Radeon 8050S (gfx1151). The SD1.5 baseline is unaffected (it still uses the original mmap path).
This commit is contained in:
parent
b6a730b24e
commit
e912b910a2
55
baselines/system_info_realvisxl.txt
Normal file
55
baselines/system_info_realvisxl.txt
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
=== ComfyUI RealVisXL no-mmap Stable Baseline ===
|
||||||
|
Date: 2026-06-20
|
||||||
|
|
||||||
|
--- Previous baseline ---
|
||||||
|
Tag: rocm-sd15-working-baseline (preserved, not modified)
|
||||||
|
|
||||||
|
--- This baseline adds ---
|
||||||
|
Fix: comfy/utils.py: _load_safetensors_no_mmap() for files >4 GB
|
||||||
|
Model: RealVisXL_V4.0.safetensors (6.46 GB) - path only, not in git
|
||||||
|
Test: 768x1024, 25 steps, cfg=6, dpmpp_2m, karras -> OK
|
||||||
|
|
||||||
|
--- Root cause of crash (diagnosed & fixed) ---
|
||||||
|
Strix Halo UMA: ROCm init reserves ~14 GB GPU virtual address space.
|
||||||
|
safetensors mmap of files >~4 GB then fails (Windows VA space exhausted).
|
||||||
|
SD1.5 (3.97 GB) < threshold -> mmap OK.
|
||||||
|
SDXL fp16 (~6.5 GB) > threshold -> access violation in safe_open().
|
||||||
|
Fix: sequential file-read (open+seek+read) bypasses mmap entirely.
|
||||||
|
|
||||||
|
--- Patch location ---
|
||||||
|
File: comfy/utils.py
|
||||||
|
Function: _load_safetensors_no_mmap(); constant: _LARGE_FILE_MMAP_THRESHOLD = 4_000_000_000
|
||||||
|
Branch: load_torch_file(): elif os.path.getsize(ckpt) > _LARGE_FILE_MMAP_THRESHOLD and torch.cuda.is_available()
|
||||||
|
|
||||||
|
--- Startup command ---
|
||||||
|
cd C:\Users\LvHHu\ComfyUI
|
||||||
|
.\venv\Scripts\activate
|
||||||
|
python main.py --disable-dynamic-vram --disable-mmap
|
||||||
|
|
||||||
|
--- GPU / ROCm ---
|
||||||
|
torch: 2.7.0a0+git3f903c3
|
||||||
|
Device: AMD Radeon(TM) 8050S Graphics
|
||||||
|
VRAM GB: 14.37
|
||||||
|
ROCm: 6.5 / gfx1151 (Strix Halo)
|
||||||
|
|
||||||
|
--- Models in checkpoints (not in git) ---
|
||||||
|
v1-5-pruned-emaonly.safetensors 3.97 GB SD1.5 baseline
|
||||||
|
RealVisXL_V4.0.safetensors 6.46 GB SDXL realistic portrait
|
||||||
|
|
||||||
|
--- Working parameters (RealVisXL) ---
|
||||||
|
Resolution: 768x1024
|
||||||
|
Steps: 25
|
||||||
|
CFG: 6
|
||||||
|
Sampler: dpmpp_2m
|
||||||
|
Scheduler: karras
|
||||||
|
Batch size: 1
|
||||||
|
|
||||||
|
--- Recovery commands ---
|
||||||
|
# 1. Return to this code state:
|
||||||
|
git checkout rocm-realvisxl-nommap-working
|
||||||
|
|
||||||
|
# 2. Re-download RealVisXL if needed (not in git):
|
||||||
|
# python -c "from huggingface_hub import hf_hub_download; hf_hub_download(repo_id='SG161222/RealVisXL_V4.0', filename='RealVisXL_V4.0.safetensors', local_dir='models/checkpoints')"
|
||||||
|
|
||||||
|
# 3. Start ComfyUI:
|
||||||
|
# python main.py --disable-dynamic-vram --disable-mmap
|
||||||
@ -119,6 +119,32 @@ def load_safetensors(ckpt):
|
|||||||
return sd, header.get("__metadata__", {}),
|
return sd, header.get("__metadata__", {}),
|
||||||
|
|
||||||
|
|
||||||
|
# Size cutoff in bytes: load_torch_file() compares os.path.getsize(ckpt) against
# this value to decide whether to call _load_safetensors_no_mmap().
# NOTE: this is 4 decimal GB (4e9 bytes, about 3.73 GiB), not 4 GiB (4 * 1024**3).
_LARGE_FILE_MMAP_THRESHOLD = 4_000_000_000 # 4 GB (decimal)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_safetensors_no_mmap(ckpt):
    """Load a safetensors checkpoint with plain file reads instead of mmap.

    Workaround for Windows + ROCm/CUDA UMA setups, where large mmaps fail
    after the GPU virtual address space has been reserved. Tensors are read
    one at a time with seek + read and then cloned so each owns its storage.

    Args:
        ckpt: Path of the safetensors file to read.

    Returns:
        A ``(sd, metadata)`` tuple. ``sd`` maps each tensor name in the
        header to a tensor; ``metadata`` is the header's ``__metadata__``
        entry, or ``{}`` when the header has none.

    Raises:
        ValueError: If the file ends before the length prefix, the header,
            or a tensor's data has been read completely.
        KeyError: If a header entry lacks a required field or names a dtype
            that is not present in ``_TYPES``.
    """
    sd = {}
    with open(ckpt, "rb") as fh:
        # The file starts with the header size as a little-endian unsigned
        # 64-bit integer, followed by the JSON header itself.
        prefix = fh.read(8)
        if len(prefix) != 8:
            raise ValueError(f"{ckpt}: file is too short to contain a safetensors header length")
        header_len = struct.unpack("<Q", prefix)[0]

        header_bytes = fh.read(header_len)
        if len(header_bytes) != header_len:
            raise ValueError(
                f"{ckpt}: truncated safetensors header "
                f"(expected {header_len} bytes, got {len(header_bytes)})"
            )
        header = json.loads(header_bytes.decode("utf-8"))

        # data_offsets in the header are relative to the end of the header.
        data_start = 8 + header_len
        for name, info in header.items():
            if name == "__metadata__":
                continue
            start, end = info["data_offsets"]
            dtype = _TYPES[info["dtype"]]
            shape = info["shape"]
            nbytes = end - start

            if nbytes == 0:
                # Nothing to read for a zero-byte tensor. Decide this from the
                # declared size, not from what read() returned, so that a
                # truncated file can never yield uninitialized weights.
                sd[name] = torch.empty(shape, dtype=dtype)
                continue

            # Read straight into a writable buffer: avoids the intermediate
            # bytes object and the extra bytes -> bytearray copy.
            buf = bytearray(nbytes)
            fh.seek(data_start + start)
            got = fh.readinto(buf)
            if got != nbytes:
                raise ValueError(
                    f"{ckpt}: truncated data for tensor {name!r} "
                    f"(expected {nbytes} bytes, got {got})"
                )
            # clone() detaches the tensor from the Python-owned buffer.
            sd[name] = torch.frombuffer(buf, dtype=dtype).reshape(shape).clone()
    return sd, header.get("__metadata__", {})
|
||||||
|
|
||||||
|
|
||||||
def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False):
|
def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False):
|
||||||
if device is None:
|
if device is None:
|
||||||
device = torch.device("cpu")
|
device = torch.device("cpu")
|
||||||
@ -129,6 +155,12 @@ def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False):
|
|||||||
sd, metadata = load_safetensors(ckpt)
|
sd, metadata = load_safetensors(ckpt)
|
||||||
if not return_metadata:
|
if not return_metadata:
|
||||||
metadata = None
|
metadata = None
|
||||||
|
elif os.path.getsize(ckpt) > _LARGE_FILE_MMAP_THRESHOLD and torch.cuda.is_available():
|
||||||
|
# File > 4 GB with active CUDA/ROCm: mmap would exhaust Windows virtual
|
||||||
|
# address space reserved by UMA GPU init. Use sequential file-read instead.
|
||||||
|
sd, metadata = _load_safetensors_no_mmap(ckpt)
|
||||||
|
if not return_metadata:
|
||||||
|
metadata = None
|
||||||
else:
|
else:
|
||||||
with safetensors.safe_open(ckpt, framework="pt", device=device.type) as f:
|
with safetensors.safe_open(ckpt, framework="pt", device=device.type) as f:
|
||||||
sd = {}
|
sd = {}
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user