From b6a730b24e31d6a88a22cebc915eb3ae4e2bbc30 Mon Sep 17 00:00:00 2001 From: Houde Date: Sat, 20 Jun 2026 17:48:15 +0100 Subject: [PATCH 1/3] chore: add ROCm stable baseline snapshot (gfx1151 / Strix Halo) - torch 2.7.0a0 + ROCm 6.5 via scottt/rocm-TheRock gfx1151 wheels - numpy pinned to 1.26.4 for wheel compatibility - SD1.5 512x512 20 steps ~5 it/s confirmed stable - Saved workflow: sd15_test_rocm_workflow.json - AMD Radeon 8050S, 14.37 GB UMA VRAM correctly detected --- baselines/environment_rocm_working.txt | 83 ++++++++++++++ baselines/system_info.txt | 36 ++++++ .../workflows/sd15_test_rocm_workflow.json | 107 ++++++++++++++++++ 3 files changed, 226 insertions(+) create mode 100644 baselines/environment_rocm_working.txt create mode 100644 baselines/system_info.txt create mode 100644 baselines/workflows/sd15_test_rocm_workflow.json diff --git a/baselines/environment_rocm_working.txt b/baselines/environment_rocm_working.txt new file mode 100644 index 000000000..0bc13ee9f --- /dev/null +++ b/baselines/environment_rocm_working.txt @@ -0,0 +1,83 @@ +aiohappyeyeballs==2.6.2 +aiohttp==3.14.1 +aiosignal==1.4.0 +alembic==1.18.4 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anyio==4.13.0 +attrs==26.1.0 +av==17.1.0 +blake3==1.0.8 +certifi==2026.5.20 +charset-normalizer==3.4.7 +click==8.4.1 +colorama==0.4.6 +comfy-aimdo==0.4.9 +comfy-kitchen==0.2.10 +comfyui-embedded-docs==0.5.3 +comfyui-workflow-templates-core==0.3.252 +comfyui-workflow-templates-media-api==0.3.80 +comfyui-workflow-templates-media-image==0.3.150 +comfyui-workflow-templates-media-other==0.3.217 +comfyui-workflow-templates-media-video==0.3.91 +comfyui_frontend_package==1.45.15 +comfyui_workflow_templates==0.9.98 +einops==0.8.2 +filelock==3.29.4 +frozenlist==1.8.0 +fsspec==2026.4.0 +glfw==2.10.0 +greenlet==3.5.1 +h11==0.16.0 +hf-xet==1.5.1 +httpcore==1.0.9 +httpx==0.28.1 +huggingface_hub==1.19.0 +idna==3.18 +Jinja2==3.1.6 +kornia==0.8.3 +kornia_rs==0.1.14 +Mako==1.3.12 +markdown-it-py==4.2.0 
+MarkupSafe==3.0.3 +mdurl==0.1.2 +mpmath==1.3.0 +multidict==6.7.1 +networkx==3.6.1 +numpy==1.26.4 +packaging==26.2 +pillow==12.2.0 +propcache==0.5.2 +psutil==7.2.2 +pydantic==2.13.4 +pydantic-settings==2.14.1 +pydantic_core==2.46.4 +Pygments==2.20.0 +PyOpenGL==3.1.10 +python-dotenv==1.2.2 +PyYAML==6.0.3 +regex==2026.5.9 +requests==2.34.2 +rich==15.0.0 +safetensors==0.8.0 +scipy==1.17.1 +sentencepiece==0.2.1 +setuptools==82.0.1 +shellingham==1.5.4 +simpleeval==1.0.7 +spandrel==0.4.2 +SQLAlchemy==2.0.50 +sympy==1.14.0 +tokenizers==0.22.2 +torch @ https://github.com/scottt/rocm-TheRock/releases/download/v6.5.0rc-pytorch/torch-2.7.0a0+git3f903c3-cp312-cp312-win_amd64.whl#sha256=ab308d20b8568354781ceaad1c9a1637b6dff16ab42e589fa87b19fa87f3c839 +torchaudio @ https://github.com/scottt/rocm-TheRock/releases/download/v6.5.0rc-pytorch/torchaudio-2.6.0a0+1a8f621-cp312-cp312-win_amd64.whl#sha256=caa1291b5040325d67ac2d6bddb9c3ec9478337dfc70a4d08bda8a557c834698 +torchsde==0.2.6 +torchvision @ https://github.com/scottt/rocm-TheRock/releases/download/v6.5.0rc-pytorch/torchvision-0.22.0+9eb57cd-cp312-cp312-win_amd64.whl#sha256=47fbcdc9b5e80ee7ab40c27bbf5cd36f7a7091eae3d43a09eebd833a391de1ec +tqdm==4.68.2 +trampoline==0.1.2 +transformers==5.12.0 +typer==0.25.1 +typing-inspection==0.4.2 +typing_extensions==4.15.0 +urllib3==2.7.0 +yarl==1.24.2 diff --git a/baselines/system_info.txt b/baselines/system_info.txt new file mode 100644 index 000000000..b0c103fa9 --- /dev/null +++ b/baselines/system_info.txt @@ -0,0 +1,36 @@ +=== ComfyUI ROCm Stable Baseline === +Date: 2026-06-20 + +--- Python --- +Python: 3.12.10 + +--- PyTorch / ROCm --- +torch: 2.7.0a0+git3f903c3 +CUDA avail: True +Device: AMD Radeon(TM) 8050S Graphics +VRAM (GB): 14.37 +ROCm/HIP: 6.5.25205-c1c2abe52 + +--- torch packages --- +torch: 2.7.0a0+git3f903c3 +torchvision: 0.22.0+9eb57cd +torchaudio: 2.6.0a0+1a8f621 + +--- ComfyUI --- +Version: 0.24.0 +Backend: ROCm 6.5 (scottt/rocm-TheRock gfx1151 wheel) +Tested: SD1.5 512x512 
20 steps ~5 it/s, stable (note: the saved workflow file is configured for 30 steps) + +--- Wheel sources (gfx1151 / Strix Halo) --- +torch: scottt/rocm-TheRock v6.5.0rc-pytorch/torch-2.7.0a0+git3f903c3-cp312-cp312-win_amd64.whl +torchvision: scottt/rocm-TheRock v6.5.0rc-pytorch/torchvision-0.22.0+9eb57cd-cp312-cp312-win_amd64.whl +torchaudio: scottt/rocm-TheRock v6.5.0rc-pytorch/torchaudio-2.6.0a0+1a8f621-cp312-cp312-win_amd64.whl +numpy: pinned to <2 (1.26.4) for wheel compatibility + +--- Startup --- +cd C:\\Users\\LvHHu\\ComfyUI +.\\venv\\Scripts\\activate +python main.py + +--- Saved workflow --- +baselines/workflows/sd15_test_rocm_workflow.json diff --git a/baselines/workflows/sd15_test_rocm_workflow.json b/baselines/workflows/sd15_test_rocm_workflow.json new file mode 100644 index 000000000..bb0f4517b --- /dev/null +++ b/baselines/workflows/sd15_test_rocm_workflow.json @@ -0,0 +1,107 @@ +{ + "2": { + "inputs": { + "ckpt_name": "v1-5-pruned-emaonly.safetensors" + }, + "class_type": "CheckpointLoaderSimple", + "_meta": { + "title": "Checkpoint加载器(简易)" + } + }, + "3": { + "inputs": { + "text": "aa cute fluffy kitten, big round eyes, detailed fur, soft natural window light, cozy indoor background, shallow depth of field, photorealistic, high quality, 50mm lens", + "clip": [ + "2", + 1 + ] + }, + "class_type": "CLIPTextEncode", + "_meta": { + "title": "CLIP文本编码" + } + }, + "4": { + "inputs": { + "text": "low quality, blurry, deformed, ugly, bad anatomy, distorted face, extra limbs, bad eyes, oversaturated", + "clip": [ + "2", + 1 + ] + }, + "class_type": "CLIPTextEncode", + "_meta": { + "title": "CLIP文本编码" + } + }, + "5": { + "inputs": { + "width": 512, + "height": 512, + "batch_size": 1 + }, + "class_type": "EmptyLatentImage", + "_meta": { + "title": "空Latent图像" + } + }, + "6": { + "inputs": { + "seed": 826325619577598, + "steps": 30, + "cfg": 7, + "sampler_name": "dpmpp_2m", + "scheduler": "normal", + "denoise": 1, + "model": [ + "2", + 0 + ], + "positive": [ + "3", + 0 + ], + "negative": [ + "4", + 0 + ], + 
"latent_image": [ + "5", + 0 + ] + }, + "class_type": "KSampler", + "_meta": { + "title": "K采样器" + } + }, + "7": { + "inputs": { + "samples": [ + "6", + 0 + ], + "vae": [ + "2", + 2 + ] + }, + "class_type": "VAEDecode", + "_meta": { + "title": "VAE解码" + } + }, + "8": { + "inputs": { + "filename_prefix": "ComfyUI", + "images": [ + "7", + 0 + ] + }, + "class_type": "SaveImage", + "_meta": { + "title": "保存图像" + } + } +} \ No newline at end of file From e912b910a2beaaa7cfeb6c991ea327e586892083 Mon Sep 17 00:00:00 2001 From: Houde Date: Sat, 20 Jun 2026 19:03:20 +0100 Subject: [PATCH 2/3] fix: add no-mmap safetensors loader for >4GB files on Windows ROCm/UMA Root cause: Strix Halo UMA ROCm init reserves ~14 GB of Windows virtual address space for GPU. This prevents safetensors from mmap-ing files larger than ~4 GB (SDXL fp16 ~6.5 GB), causing access violations. SD1.5 (3.97 GB) is below the threshold and unaffected. Fix in comfy/utils.py: - Add _LARGE_FILE_MMAP_THRESHOLD = 4_000_000_000 - Add _load_safetensors_no_mmap(): reads tensors via open()+seek()+read() instead of mmap, then clones each tensor for independent ownership - In load_torch_file(): route files >4 GB with CUDA active through _load_safetensors_no_mmap() automatically Tested: RealVisXL_V4.0.safetensors (6.46 GB) loads and generates 768x1024 portrait images at ~5 it/s on AMD Radeon 8050S (gfx1151). SD1.5 baseline unaffected (still uses original mmap path). 
--- baselines/system_info_realvisxl.txt | 55 +++++++++++++++++++++++++++++ comfy/utils.py | 32 +++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 baselines/system_info_realvisxl.txt diff --git a/baselines/system_info_realvisxl.txt b/baselines/system_info_realvisxl.txt new file mode 100644 index 000000000..8a6d3b839 --- /dev/null +++ b/baselines/system_info_realvisxl.txt @@ -0,0 +1,55 @@ +=== ComfyUI RealVisXL no-mmap Stable Baseline === +Date: 2026-06-20 + +--- Previous baseline --- +Tag: rocm-sd15-working-baseline (preserved, not modified) + +--- This baseline adds --- +Fix: comfy/utils.py: _load_safetensors_no_mmap() for files >4 GB +Model: RealVisXL_V4.0.safetensors (6.46 GB) - path only, not in git +Test: 768x1024, 25 steps, cfg=6, dpmpp_2m, karras -> OK + +--- Root cause of crash (diagnosed & fixed) --- +Strix Halo UMA: ROCm init reserves ~14 GB GPU virtual address space. +safetensors mmap of files >~4 GB then fails (Windows VA space exhausted). +SD1.5 (3.97 GB) < threshold -> mmap OK. NOTE(review): the sizes in this file appear to be binary units (GiB); 3.97 GiB is ~4.26e9 bytes, which is above the 4_000_000_000-byte threshold used in code, so SD1.5 may also be routed through the no-mmap loader - confirm. +SDXL fp16 (~6.5 GB) > threshold -> access violation in safe_open(). +Fix: sequential file-read (open+seek+read) bypasses mmap entirely. + +--- Patch location --- +File: comfy/utils.py +Functions: _load_safetensors_no_mmap(), _LARGE_FILE_MMAP_THRESHOLD = 4_000_000_000 +Branch: load_torch_file() elif os.path.getsize > threshold and cuda available + +--- Startup command --- +cd C:\Users\LvHHu\ComfyUI +.\venv\Scripts\activate +python main.py --disable-dynamic-vram --disable-mmap + +--- GPU / ROCm --- +torch: 2.7.0a0+git3f903c3 +Device: AMD Radeon(TM) 8050S Graphics +VRAM GB: 14.37 +ROCm: 6.5 / gfx1151 (Strix Halo) + +--- Models in checkpoints (not in git) --- +v1-5-pruned-emaonly.safetensors 3.97 GB SD1.5 baseline +RealVisXL_V4.0.safetensors 6.46 GB SDXL realistic portrait + +--- Working parameters (RealVisXL) --- +Resolution: 768x1024 +Steps: 25 +CFG: 6 +Sampler: dpmpp_2m +Scheduler: karras +Batch size: 1 + +--- Recovery commands --- +# 1. 
Return to this code state: +git checkout rocm-realvisxl-nommap-working + +# 2. Re-download RealVisXL if needed (not in git): +# python -c "from huggingface_hub import hf_hub_download; hf_hub_download(repo_id='SG161222/RealVisXL_V4.0', filename='RealVisXL_V4.0.safetensors', local_dir='models/checkpoints')" + +# 3. Start ComfyUI: +# python main.py --disable-dynamic-vram --disable-mmap diff --git a/comfy/utils.py b/comfy/utils.py index 09d783fff..ceddf7af1 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -119,6 +119,32 @@ def load_safetensors(ckpt): return sd, header.get("__metadata__", {}), +_LARGE_FILE_MMAP_THRESHOLD = 4_000_000_000 # 4 GB + + +def _load_safetensors_no_mmap(ckpt): + # Windows + ROCm/CUDA UMA: large mmaps fail after GPU virtual address space is reserved. + # Read tensors sequentially from file instead. + sd = {} + with open(ckpt, "rb") as fh: + header_len = struct.unpack(" _LARGE_FILE_MMAP_THRESHOLD and torch.cuda.is_available(): + # File > 4 GB with active CUDA/ROCm: mmap would exhaust Windows virtual + # address space reserved by UMA GPU init. Use sequential file-read instead. 
+ sd, metadata = _load_safetensors_no_mmap(ckpt) + if not return_metadata: + metadata = None else: with safetensors.safe_open(ckpt, framework="pt", device=device.type) as f: sd = {} From 0df0b0d61394261e2f18616ca77b9c5271a19007 Mon Sep 17 00:00:00 2001 From: Houde Date: Tue, 23 Jun 2026 14:51:22 +0100 Subject: [PATCH 3/3] fix: address CodeRabbit review comments on PR #14587 - utils.py: add device param to _load_safetensors_no_mmap, move tensors to target device instead of always returning CPU tensors - utils.py: validate read length == expected bytes; raise RuntimeError on partial/corrupt reads instead of silently creating empty tensors - utils.py: scope no-mmap fallback to sys.platform == win32 to avoid unnecessary overhead on Linux/Mac CUDA systems; add sys import - baselines: replace hardcoded LvHHu username with %USERPROFILE% in startup commands for portability Co-Authored-By: Claude Sonnet 4.6 --- baselines/system_info.txt | 2 +- baselines/system_info_realvisxl.txt | 2 +- comfy/utils.py | 32 ++++++++++++++++++++--------- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/baselines/system_info.txt b/baselines/system_info.txt index b0c103fa9..c5cb2b08e 100644 --- a/baselines/system_info.txt +++ b/baselines/system_info.txt @@ -28,7 +28,7 @@ torchaudio: scottt/rocm-TheRock v6.5.0rc-pytorch/torchaudio-2.6.0a0+1a8f621-cp3 numpy: pinned to <2 (1.26.4) for wheel compatibility --- Startup --- -cd C:\\Users\\LvHHu\\ComfyUI +cd %USERPROFILE%\ComfyUI .\\venv\\Scripts\\activate python main.py diff --git a/baselines/system_info_realvisxl.txt b/baselines/system_info_realvisxl.txt index 8a6d3b839..988361e69 100644 --- a/baselines/system_info_realvisxl.txt +++ b/baselines/system_info_realvisxl.txt @@ -22,7 +22,7 @@ Functions: _load_safetensors_no_mmap(), _LARGE_FILE_MMAP_THRESHOLD = 4_000_000_0 Branch: load_torch_file() elif os.path.getsize > threshold and cuda available --- Startup command --- -cd C:\Users\LvHHu\ComfyUI +cd %USERPROFILE%\ComfyUI 
.\venv\Scripts\activate python main.py --disable-dynamic-vram --disable-mmap diff --git a/comfy/utils.py b/comfy/utils.py index ceddf7af1..32db92e61 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -22,6 +22,7 @@ import math import struct import ctypes import os +import sys import comfy.memory_management import safetensors.torch import numpy as np @@ -122,9 +123,11 @@ def load_safetensors(ckpt): _LARGE_FILE_MMAP_THRESHOLD = 4_000_000_000 # 4 GB -def _load_safetensors_no_mmap(ckpt): +def _load_safetensors_no_mmap(ckpt, device=None): # Windows + ROCm/CUDA UMA: large mmaps fail after GPU virtual address space is reserved. # Read tensors sequentially from file instead. + if device is None: + device = torch.device("cpu") sd = {} with open(ckpt, "rb") as fh: header_len = struct.unpack(" _LARGE_FILE_MMAP_THRESHOLD and torch.cuda.is_available(): - # File > 4 GB with active CUDA/ROCm: mmap would exhaust Windows virtual - # address space reserved by UMA GPU init. Use sequential file-read instead. - sd, metadata = _load_safetensors_no_mmap(ckpt) + elif (os.path.getsize(ckpt) > _LARGE_FILE_MMAP_THRESHOLD + and sys.platform == "win32" + and torch.cuda.is_available()): + # Windows ROCm/UMA: GPU init reserves ~14 GB of virtual address space, + # preventing mmap of files >4 GB. Use sequential file-read instead. + # Scoped to Windows only to avoid overhead on Linux/Mac CUDA systems. + sd, metadata = _load_safetensors_no_mmap(ckpt, device=device) if not return_metadata: metadata = None else: