From b6a730b24e31d6a88a22cebc915eb3ae4e2bbc30 Mon Sep 17 00:00:00 2001
From: Houde <LvHHuuaadai@proton.me>
Date: Sat, 20 Jun 2026 17:48:15 +0100
Subject: [PATCH 1/3] chore: add ROCm stable baseline snapshot (gfx1151 / Strix
 Halo)

- torch 2.7.0a0 + ROCm 6.5 via scottt/rocm-TheRock gfx1151 wheels
- numpy pinned to 1.26.4 for wheel compatibility
- SD1.5 512x512 20 steps ~5 it/s confirmed stable
- Saved workflow: sd15_test_rocm_workflow.json
- AMD Radeon 8050S, 14.37 GB UMA VRAM correctly detected
---
 baselines/environment_rocm_working.txt        |  83 ++++++++++++++
 baselines/system_info.txt                     |  36 ++++++
 .../workflows/sd15_test_rocm_workflow.json    | 107 ++++++++++++++++++
 3 files changed, 226 insertions(+)
 create mode 100644 baselines/environment_rocm_working.txt
 create mode 100644 baselines/system_info.txt
 create mode 100644 baselines/workflows/sd15_test_rocm_workflow.json

diff --git a/baselines/environment_rocm_working.txt b/baselines/environment_rocm_working.txt
new file mode 100644
index 000000000..0bc13ee9f
--- /dev/null
+++ b/baselines/environment_rocm_working.txt
@@ -0,0 +1,83 @@
+aiohappyeyeballs==2.6.2
+aiohttp==3.14.1
+aiosignal==1.4.0
+alembic==1.18.4
+annotated-doc==0.0.4
+annotated-types==0.7.0
+anyio==4.13.0
+attrs==26.1.0
+av==17.1.0
+blake3==1.0.8
+certifi==2026.5.20
+charset-normalizer==3.4.7
+click==8.4.1
+colorama==0.4.6
+comfy-aimdo==0.4.9
+comfy-kitchen==0.2.10
+comfyui-embedded-docs==0.5.3
+comfyui-workflow-templates-core==0.3.252
+comfyui-workflow-templates-media-api==0.3.80
+comfyui-workflow-templates-media-image==0.3.150
+comfyui-workflow-templates-media-other==0.3.217
+comfyui-workflow-templates-media-video==0.3.91
+comfyui_frontend_package==1.45.15
+comfyui_workflow_templates==0.9.98
+einops==0.8.2
+filelock==3.29.4
+frozenlist==1.8.0
+fsspec==2026.4.0
+glfw==2.10.0
+greenlet==3.5.1
+h11==0.16.0
+hf-xet==1.5.1
+httpcore==1.0.9
+httpx==0.28.1
+huggingface_hub==1.19.0
+idna==3.18
+Jinja2==3.1.6
+kornia==0.8.3
+kornia_rs==0.1.14
+Mako==1.3.12
+markdown-it-py==4.2.0
+MarkupSafe==3.0.3
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.7.1
+networkx==3.6.1
+numpy==1.26.4
+packaging==26.2
+pillow==12.2.0
+propcache==0.5.2
+psutil==7.2.2
+pydantic==2.13.4
+pydantic-settings==2.14.1
+pydantic_core==2.46.4
+Pygments==2.20.0
+PyOpenGL==3.1.10
+python-dotenv==1.2.2
+PyYAML==6.0.3
+regex==2026.5.9
+requests==2.34.2
+rich==15.0.0
+safetensors==0.8.0
+scipy==1.17.1
+sentencepiece==0.2.1
+setuptools==82.0.1
+shellingham==1.5.4
+simpleeval==1.0.7
+spandrel==0.4.2
+SQLAlchemy==2.0.50
+sympy==1.14.0
+tokenizers==0.22.2
+torch @ https://github.com/scottt/rocm-TheRock/releases/download/v6.5.0rc-pytorch/torch-2.7.0a0+git3f903c3-cp312-cp312-win_amd64.whl#sha256=ab308d20b8568354781ceaad1c9a1637b6dff16ab42e589fa87b19fa87f3c839
+torchaudio @ https://github.com/scottt/rocm-TheRock/releases/download/v6.5.0rc-pytorch/torchaudio-2.6.0a0+1a8f621-cp312-cp312-win_amd64.whl#sha256=caa1291b5040325d67ac2d6bddb9c3ec9478337dfc70a4d08bda8a557c834698
+torchsde==0.2.6
+torchvision @ https://github.com/scottt/rocm-TheRock/releases/download/v6.5.0rc-pytorch/torchvision-0.22.0+9eb57cd-cp312-cp312-win_amd64.whl#sha256=47fbcdc9b5e80ee7ab40c27bbf5cd36f7a7091eae3d43a09eebd833a391de1ec
+tqdm==4.68.2
+trampoline==0.1.2
+transformers==5.12.0
+typer==0.25.1
+typing-inspection==0.4.2
+typing_extensions==4.15.0
+urllib3==2.7.0
+yarl==1.24.2
diff --git a/baselines/system_info.txt b/baselines/system_info.txt
new file mode 100644
index 000000000..b0c103fa9
--- /dev/null
+++ b/baselines/system_info.txt
@@ -0,0 +1,36 @@
+=== ComfyUI ROCm Stable Baseline ===
+Date: 2026-06-20
+
+--- Python ---
+Python: 3.12.10
+
+--- PyTorch / ROCm ---
+torch:      2.7.0a0+git3f903c3
+CUDA avail: True
+Device:     AMD Radeon(TM) 8050S Graphics
+VRAM (GB):  14.37
+ROCm/HIP:   6.5.25205-c1c2abe52
+
+--- torch packages ---
+torch: 2.7.0a0+git3f903c3
+torchvision: 0.22.0+9eb57cd
+torchaudio: 2.6.0a0+1a8f621
+
+--- ComfyUI ---
+Version: 0.24.0
+Backend: ROCm 6.5 (scottt/rocm-TheRock gfx1151 wheel)
+Tested:  SD1.5 512x512 20steps ~5 it/s, stable
+
+--- Wheel sources (gfx1151 / Strix Halo) ---
+torch:       scottt/rocm-TheRock v6.5.0rc-pytorch/torch-2.7.0a0+git3f903c3-cp312-cp312-win_amd64.whl
+torchvision: scottt/rocm-TheRock v6.5.0rc-pytorch/torchvision-0.22.0+9eb57cd-cp312-cp312-win_amd64.whl
+torchaudio:  scottt/rocm-TheRock v6.5.0rc-pytorch/torchaudio-2.6.0a0+1a8f621-cp312-cp312-win_amd64.whl
+numpy:       pinned to <2 (1.26.4) for wheel compatibility
+
+--- Startup ---
+cd C:\\Users\\LvHHu\\ComfyUI
+.\\venv\\Scripts\\activate
+python main.py
+
+--- Saved workflow ---
+baselines/workflows/sd15_test_rocm_workflow.json
diff --git a/baselines/workflows/sd15_test_rocm_workflow.json b/baselines/workflows/sd15_test_rocm_workflow.json
new file mode 100644
index 000000000..bb0f4517b
--- /dev/null
+++ b/baselines/workflows/sd15_test_rocm_workflow.json
@@ -0,0 +1,107 @@
+{
+  "2": {
+    "inputs": {
+      "ckpt_name": "v1-5-pruned-emaonly.safetensors"
+    },
+    "class_type": "CheckpointLoaderSimple",
+    "_meta": {
+      "title": "Checkpoint加载器（简易）"
+    }
+  },
+  "3": {
+    "inputs": {
+      "text": "a cute fluffy kitten, big round eyes, detailed fur, soft natural window light, cozy indoor background, shallow depth of field, photorealistic, high quality, 50mm lens",
+      "clip": [
+        "2",
+        1
+      ]
+    },
+    "class_type": "CLIPTextEncode",
+    "_meta": {
+      "title": "CLIP文本编码"
+    }
+  },
+  "4": {
+    "inputs": {
+      "text": "low quality, blurry, deformed, ugly, bad anatomy, distorted face, extra limbs, bad eyes, oversaturated",
+      "clip": [
+        "2",
+        1
+      ]
+    },
+    "class_type": "CLIPTextEncode",
+    "_meta": {
+      "title": "CLIP文本编码"
+    }
+  },
+  "5": {
+    "inputs": {
+      "width": 512,
+      "height": 512,
+      "batch_size": 1
+    },
+    "class_type": "EmptyLatentImage",
+    "_meta": {
+      "title": "空Latent图像"
+    }
+  },
+  "6": {
+    "inputs": {
+      "seed": 826325619577598,
+      "steps": 30,
+      "cfg": 7,
+      "sampler_name": "dpmpp_2m",
+      "scheduler": "normal",
+      "denoise": 1,
+      "model": [
+        "2",
+        0
+      ],
+      "positive": [
+        "3",
+        0
+      ],
+      "negative": [
+        "4",
+        0
+      ],
+      "latent_image": [
+        "5",
+        0
+      ]
+    },
+    "class_type": "KSampler",
+    "_meta": {
+      "title": "K采样器"
+    }
+  },
+  "7": {
+    "inputs": {
+      "samples": [
+        "6",
+        0
+      ],
+      "vae": [
+        "2",
+        2
+      ]
+    },
+    "class_type": "VAEDecode",
+    "_meta": {
+      "title": "VAE解码"
+    }
+  },
+  "8": {
+    "inputs": {
+      "filename_prefix": "ComfyUI",
+      "images": [
+        "7",
+        0
+      ]
+    },
+    "class_type": "SaveImage",
+    "_meta": {
+      "title": "保存图像"
+    }
+  }
+}
\ No newline at end of file

From e912b910a2beaaa7cfeb6c991ea327e586892083 Mon Sep 17 00:00:00 2001
From: Houde <LvHHuuaadai@proton.me>
Date: Sat, 20 Jun 2026 19:03:20 +0100
Subject: [PATCH 2/3] fix: add no-mmap safetensors loader for >4GB files on
 Windows ROCm/UMA

Root cause: Strix Halo UMA ROCm init reserves ~14 GB of Windows virtual
address space for GPU. This prevents safetensors from mmap-ing files
larger than ~4 GB (SDXL fp16 ~6.5 GB), causing access violations.
SD1.5 (3.97 GB) is below the threshold and unaffected.

Fix in comfy/utils.py:
- Add _LARGE_FILE_MMAP_THRESHOLD = 4_000_000_000
- Add _load_safetensors_no_mmap(): reads tensors via open()+seek()+read()
  instead of mmap, then clones each tensor for independent ownership
- In load_torch_file(): route files >4 GB with CUDA active through
  _load_safetensors_no_mmap() automatically

Tested: RealVisXL_V4.0.safetensors (6.46 GB) loads and generates
768x1024 portrait images at ~5 it/s on AMD Radeon 8050S (gfx1151).
SD1.5 baseline unaffected (still uses original mmap path).
---
 baselines/system_info_realvisxl.txt | 55 +++++++++++++++++++++++++++++
 comfy/utils.py                      | 32 +++++++++++++++++
 2 files changed, 87 insertions(+)
 create mode 100644 baselines/system_info_realvisxl.txt

diff --git a/baselines/system_info_realvisxl.txt b/baselines/system_info_realvisxl.txt
new file mode 100644
index 000000000..8a6d3b839
--- /dev/null
+++ b/baselines/system_info_realvisxl.txt
@@ -0,0 +1,55 @@
+=== ComfyUI RealVisXL no-mmap Stable Baseline ===
+Date: 2026-06-20
+
+--- Previous baseline ---
+Tag: rocm-sd15-working-baseline (preserved, not modified)
+
+--- This baseline adds ---
+Fix:   comfy/utils.py: _load_safetensors_no_mmap() for files >4 GB
+Model: RealVisXL_V4.0.safetensors (6.46 GB) - path only, not in git
+Test:  768x1024, 25 steps, cfg=6, dpmpp_2m, karras -> OK
+
+--- Root cause of crash (diagnosed & fixed) ---
+Strix Halo UMA: ROCm init reserves ~14 GB GPU virtual address space.
+safetensors mmap of files >~4 GB then fails (Windows VA space exhausted).
+SD1.5 (3.97 GB) < threshold -> mmap OK.
+SDXL fp16 (~6.5 GB) > threshold -> access violation in safe_open().
+Fix: sequential file-read (open+seek+read) bypasses mmap entirely.
+
+--- Patch location ---
+File:      comfy/utils.py
+Functions: _load_safetensors_no_mmap(), _LARGE_FILE_MMAP_THRESHOLD = 4_000_000_000
+Branch:    load_torch_file() elif os.path.getsize > threshold and cuda available
+
+--- Startup command ---
+cd C:\Users\LvHHu\ComfyUI
+.\venv\Scripts\activate
+python main.py --disable-dynamic-vram --disable-mmap
+
+--- GPU / ROCm ---
+torch:    2.7.0a0+git3f903c3
+Device:   AMD Radeon(TM) 8050S Graphics
+VRAM GB:  14.37
+ROCm:     6.5 / gfx1151 (Strix Halo)
+
+--- Models in checkpoints (not in git) ---
+v1-5-pruned-emaonly.safetensors   3.97 GB  SD1.5 baseline
+RealVisXL_V4.0.safetensors        6.46 GB  SDXL realistic portrait
+
+--- Working parameters (RealVisXL) ---
+Resolution:   768x1024
+Steps:        25
+CFG:          6
+Sampler:      dpmpp_2m
+Scheduler:    karras
+Batch size:   1
+
+--- Recovery commands ---
+# 1. Return to this code state:
+git checkout rocm-realvisxl-nommap-working
+
+# 2. Re-download RealVisXL if needed (not in git):
+# python -c "from huggingface_hub import hf_hub_download; hf_hub_download(repo_id='SG161222/RealVisXL_V4.0', filename='RealVisXL_V4.0.safetensors', local_dir='models/checkpoints')"
+
+# 3. Start ComfyUI:
+# python main.py --disable-dynamic-vram --disable-mmap
diff --git a/comfy/utils.py b/comfy/utils.py
index 09d783fff..ceddf7af1 100644
--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -119,6 +119,32 @@ def load_safetensors(ckpt):
     return sd, header.get("__metadata__", {}),
 
 
+_LARGE_FILE_MMAP_THRESHOLD = 4_000_000_000  # 4 GB
+
+
+def _load_safetensors_no_mmap(ckpt):
+    # Windows + ROCm/CUDA UMA: large mmaps fail after GPU virtual address space is reserved.
+    # Read tensors sequentially from file instead.
+    sd = {}
+    with open(ckpt, "rb") as fh:
+        header_len = struct.unpack("<Q", fh.read(8))[0]
+        header = json.loads(fh.read(header_len).decode("utf-8"))
+        data_start = 8 + header_len
+        for name, info in header.items():
+            if name == "__metadata__":
+                continue
+            start, end = info["data_offsets"]
+            dtype = _TYPES[info["dtype"]]
+            shape = info["shape"]
+            fh.seek(data_start + start)
+            raw = fh.read(end - start)
+            if raw:
+                sd[name] = torch.frombuffer(bytearray(raw), dtype=dtype).reshape(shape).clone()
+            else:
+                sd[name] = torch.empty(shape, dtype=dtype)
+    return sd, header.get("__metadata__", {})
+
+
 def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False):
     if device is None:
         device = torch.device("cpu")
@@ -129,6 +155,12 @@ def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False):
                 sd, metadata = load_safetensors(ckpt)
                 if not return_metadata:
                     metadata = None
+            elif os.path.getsize(ckpt) > _LARGE_FILE_MMAP_THRESHOLD and torch.cuda.is_available():
+                # File > 4 GB with active CUDA/ROCm: mmap would exhaust Windows virtual
+                # address space reserved by UMA GPU init. Use sequential file-read instead.
+                sd, metadata = _load_safetensors_no_mmap(ckpt)
+                if not return_metadata:
+                    metadata = None
             else:
                 with safetensors.safe_open(ckpt, framework="pt", device=device.type) as f:
                     sd = {}

From 0df0b0d61394261e2f18616ca77b9c5271a19007 Mon Sep 17 00:00:00 2001
From: Houde <LvHHuuaadai@proton.me>
Date: Tue, 23 Jun 2026 14:51:22 +0100
Subject: [PATCH 3/3] fix: address CodeRabbit review comments on PR #14587

- utils.py: add device param to _load_safetensors_no_mmap, move tensors
  to target device instead of always returning CPU tensors
- utils.py: validate read length == expected bytes; raise RuntimeError
  on partial/corrupt reads instead of silently creating empty tensors
- utils.py: scope no-mmap fallback to sys.platform == "win32" to avoid
  unnecessary overhead on Linux/macOS systems; add sys import
- baselines: replace hardcoded LvHHu username with %USERPROFILE% in
  startup commands for portability

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 baselines/system_info.txt           |  2 +-
 baselines/system_info_realvisxl.txt |  2 +-
 comfy/utils.py                      | 32 ++++++++++++++++++++---------
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/baselines/system_info.txt b/baselines/system_info.txt
index b0c103fa9..c5cb2b08e 100644
--- a/baselines/system_info.txt
+++ b/baselines/system_info.txt
@@ -28,7 +28,7 @@ torchaudio:  scottt/rocm-TheRock v6.5.0rc-pytorch/torchaudio-2.6.0a0+1a8f621-cp3
 numpy:       pinned to <2 (1.26.4) for wheel compatibility
 
 --- Startup ---
-cd C:\\Users\\LvHHu\\ComfyUI
+cd %USERPROFILE%\ComfyUI
 .\\venv\\Scripts\\activate
 python main.py
 
diff --git a/baselines/system_info_realvisxl.txt b/baselines/system_info_realvisxl.txt
index 8a6d3b839..988361e69 100644
--- a/baselines/system_info_realvisxl.txt
+++ b/baselines/system_info_realvisxl.txt
@@ -22,7 +22,7 @@ Functions: _load_safetensors_no_mmap(), _LARGE_FILE_MMAP_THRESHOLD = 4_000_000_0
 Branch:    load_torch_file() elif os.path.getsize > threshold and cuda available
 
 --- Startup command ---
-cd C:\Users\LvHHu\ComfyUI
+cd %USERPROFILE%\ComfyUI
 .\venv\Scripts\activate
 python main.py --disable-dynamic-vram --disable-mmap
 
diff --git a/comfy/utils.py b/comfy/utils.py
index ceddf7af1..32db92e61 100644
--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -22,6 +22,7 @@ import math
 import struct
 import ctypes
 import os
+import sys
 import comfy.memory_management
 import safetensors.torch
 import numpy as np
@@ -122,9 +123,11 @@ def load_safetensors(ckpt):
 _LARGE_FILE_MMAP_THRESHOLD = 4_000_000_000  # 4 GB
 
 
-def _load_safetensors_no_mmap(ckpt):
+def _load_safetensors_no_mmap(ckpt, device=None):
     # Windows + ROCm/CUDA UMA: large mmaps fail after GPU virtual address space is reserved.
     # Read tensors sequentially from file instead.
+    if device is None:
+        device = torch.device("cpu")
     sd = {}
     with open(ckpt, "rb") as fh:
         header_len = struct.unpack("<Q", fh.read(8))[0]
@@ -136,12 +139,18 @@ def _load_safetensors_no_mmap(ckpt):
             start, end = info["data_offsets"]
             dtype = _TYPES[info["dtype"]]
             shape = info["shape"]
+            expected = end - start
+            if expected == 0:
+                sd[name] = torch.empty(shape, dtype=dtype, device=device)
+                continue
             fh.seek(data_start + start)
-            raw = fh.read(end - start)
-            if raw:
-                sd[name] = torch.frombuffer(bytearray(raw), dtype=dtype).reshape(shape).clone()
-            else:
-                sd[name] = torch.empty(shape, dtype=dtype)
+            raw = fh.read(expected)
+            if len(raw) != expected:
+                raise RuntimeError(
+                    f"Safetensors read error: tensor '{name}' expected {expected} bytes, got {len(raw)}. "
+                    f"File may be corrupt or truncated."
+                )
+            sd[name] = torch.frombuffer(bytearray(raw), dtype=dtype).reshape(shape).clone().to(device=device)
     return sd, header.get("__metadata__", {})
 
 
@@ -155,10 +164,13 @@ def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False):
                 sd, metadata = load_safetensors(ckpt)
                 if not return_metadata:
                     metadata = None
-            elif os.path.getsize(ckpt) > _LARGE_FILE_MMAP_THRESHOLD and torch.cuda.is_available():
-                # File > 4 GB with active CUDA/ROCm: mmap would exhaust Windows virtual
-                # address space reserved by UMA GPU init. Use sequential file-read instead.
-                sd, metadata = _load_safetensors_no_mmap(ckpt)
+            elif (os.path.getsize(ckpt) > _LARGE_FILE_MMAP_THRESHOLD
+                  and sys.platform == "win32"
+                  and torch.cuda.is_available()):
+                # Windows ROCm/UMA: GPU init reserves ~14 GB of virtual address space,
+                # preventing mmap of files >4 GB. Use sequential file-read instead.
+                # Scoped to Windows only to avoid overhead on Linux/Mac CUDA systems.
+                sd, metadata = _load_safetensors_no_mmap(ckpt, device=device)
                 if not return_metadata:
                     metadata = None
             else: