Merge 0df0b0d613 into 35c1470935

2026-07-03 13:19:23 +08:00 · 2026-07-03 03:05:59 +08:00 · 2026-07-03 03:05:59 +08:00 · 5770141fb9
commit 5770141fb9
parent 35c1470935 0df0b0d613
5 changed files with 325 additions and 0 deletions
--- a/baselines/environment_rocm_working.txt
+++ b/baselines/environment_rocm_working.txt
@ -0,0 +1,83 @@
+aiohappyeyeballs==2.6.2
+aiohttp==3.14.1
+aiosignal==1.4.0
+alembic==1.18.4
+annotated-doc==0.0.4
+annotated-types==0.7.0
+anyio==4.13.0
+attrs==26.1.0
+av==17.1.0
+blake3==1.0.8
+certifi==2026.5.20
+charset-normalizer==3.4.7
+click==8.4.1
+colorama==0.4.6
+comfy-aimdo==0.4.9
+comfy-kitchen==0.2.10
+comfyui-embedded-docs==0.5.3
+comfyui-workflow-templates-core==0.3.252
+comfyui-workflow-templates-media-api==0.3.80
+comfyui-workflow-templates-media-image==0.3.150
+comfyui-workflow-templates-media-other==0.3.217
+comfyui-workflow-templates-media-video==0.3.91
+comfyui_frontend_package==1.45.15
+comfyui_workflow_templates==0.9.98
+einops==0.8.2
+filelock==3.29.4
+frozenlist==1.8.0
+fsspec==2026.4.0
+glfw==2.10.0
+greenlet==3.5.1
+h11==0.16.0
+hf-xet==1.5.1
+httpcore==1.0.9
+httpx==0.28.1
+huggingface_hub==1.19.0
+idna==3.18
+Jinja2==3.1.6
+kornia==0.8.3
+kornia_rs==0.1.14
+Mako==1.3.12
+markdown-it-py==4.2.0
+MarkupSafe==3.0.3
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.7.1
+networkx==3.6.1
+numpy==1.26.4
+packaging==26.2
+pillow==12.2.0
+propcache==0.5.2
+psutil==7.2.2
+pydantic==2.13.4
+pydantic-settings==2.14.1
+pydantic_core==2.46.4
+Pygments==2.20.0
+PyOpenGL==3.1.10
+python-dotenv==1.2.2
+PyYAML==6.0.3
+regex==2026.5.9
+requests==2.34.2
+rich==15.0.0
+safetensors==0.8.0
+scipy==1.17.1
+sentencepiece==0.2.1
+setuptools==82.0.1
+shellingham==1.5.4
+simpleeval==1.0.7
+spandrel==0.4.2
+SQLAlchemy==2.0.50
+sympy==1.14.0
+tokenizers==0.22.2
+torch @ https://github.com/scottt/rocm-TheRock/releases/download/v6.5.0rc-pytorch/torch-2.7.0a0+git3f903c3-cp312-cp312-win_amd64.whl#sha256=ab308d20b8568354781ceaad1c9a1637b6dff16ab42e589fa87b19fa87f3c839
+torchaudio @ https://github.com/scottt/rocm-TheRock/releases/download/v6.5.0rc-pytorch/torchaudio-2.6.0a0+1a8f621-cp312-cp312-win_amd64.whl#sha256=caa1291b5040325d67ac2d6bddb9c3ec9478337dfc70a4d08bda8a557c834698
+torchsde==0.2.6
+torchvision @ https://github.com/scottt/rocm-TheRock/releases/download/v6.5.0rc-pytorch/torchvision-0.22.0+9eb57cd-cp312-cp312-win_amd64.whl#sha256=47fbcdc9b5e80ee7ab40c27bbf5cd36f7a7091eae3d43a09eebd833a391de1ec
+tqdm==4.68.2
+trampoline==0.1.2
+transformers==5.12.0
+typer==0.25.1
+typing-inspection==0.4.2
+typing_extensions==4.15.0
+urllib3==2.7.0
+yarl==1.24.2
--- a/baselines/system_info.txt
+++ b/baselines/system_info.txt
@ -0,0 +1,36 @@
+=== ComfyUI ROCm Stable Baseline ===
+Date: 2026-06-20
+
+--- Python ---
+Python: 3.12.10
+
+--- PyTorch / ROCm ---
+torch:      2.7.0a0+git3f903c3
+CUDA avail: True
+Device:     AMD Radeon(TM) 8050S Graphics
+VRAM (GB):  14.37
+ROCm/HIP:   6.5.25205-c1c2abe52
+
+--- torch packages ---
+torch: 2.7.0a0+git3f903c3
+torchvision: 0.22.0+9eb57cd
+torchaudio: 2.6.0a0+1a8f621
+
+--- ComfyUI ---
+Version: 0.24.0
+Backend: ROCm 6.5 (scottt/rocm-TheRock gfx1151 wheel)
+Tested:  SD1.5 512x512 20steps ~5 it/s, stable
+
+--- Wheel sources (gfx1151 / Strix Halo) ---
+torch:       scottt/rocm-TheRock v6.5.0rc-pytorch/torch-2.7.0a0+git3f903c3-cp312-cp312-win_amd64.whl
+torchvision: scottt/rocm-TheRock v6.5.0rc-pytorch/torchvision-0.22.0+9eb57cd-cp312-cp312-win_amd64.whl
+torchaudio:  scottt/rocm-TheRock v6.5.0rc-pytorch/torchaudio-2.6.0a0+1a8f621-cp312-cp312-win_amd64.whl
+numpy:       pinned to <2 (1.26.4) for wheel compatibility
+
+--- Startup ---
+cd %USERPROFILE%\ComfyUI
+.\venv\Scripts\activate
+python main.py
+
+--- Saved workflow ---
+baselines/workflows/sd15_test_rocm_workflow.json
--- a/baselines/system_info_realvisxl.txt
+++ b/baselines/system_info_realvisxl.txt
@ -0,0 +1,55 @@
+=== ComfyUI RealVisXL no-mmap Stable Baseline ===
+Date: 2026-06-20
+
+--- Previous baseline ---
+Tag: rocm-sd15-working-baseline (preserved, not modified)
+
+--- This baseline adds ---
+Fix:   comfy/utils.py: _load_safetensors_no_mmap() for files >4 GB
+Model: RealVisXL_V4.0.safetensors (6.46 GB) - path only, not in git
+Test:  768x1024, 25 steps, cfg=6, dpmpp_2m, karras -> OK
+
+--- Root cause of crash (diagnosed & fixed) ---
+Strix Halo UMA: ROCm init reserves ~14 GB GPU virtual address space.
+safetensors mmap of files >~4 GB then fails (Windows VA space exhausted).
+SD1.5 (3.97 GB) < threshold -> mmap OK.
+SDXL fp16 (~6.5 GB) > threshold -> access violation in safe_open().
+Fix: sequential file-read (open+seek+read) bypasses mmap entirely.
+
+--- Patch location ---
+File:      comfy/utils.py
+Symbols:   _load_safetensors_no_mmap(), _LARGE_FILE_MMAP_THRESHOLD = 4_000_000_000
+Branch:    load_torch_file() elif os.path.getsize(ckpt) > threshold and sys.platform == "win32" and torch.cuda.is_available()
+
+--- Startup command ---
+cd %USERPROFILE%\ComfyUI
+.\venv\Scripts\activate
+python main.py --disable-dynamic-vram --disable-mmap
+
+--- GPU / ROCm ---
+torch:    2.7.0a0+git3f903c3
+Device:   AMD Radeon(TM) 8050S Graphics
+VRAM GB:  14.37
+ROCm:     6.5 / gfx1151 (Strix Halo)
+
+--- Models in checkpoints (not in git) ---
+v1-5-pruned-emaonly.safetensors   3.97 GB  SD1.5 baseline
+RealVisXL_V4.0.safetensors        6.46 GB  SDXL realistic portrait
+
+--- Working parameters (RealVisXL) ---
+Resolution:   768x1024
+Steps:        25
+CFG:          6
+Sampler:      dpmpp_2m
+Scheduler:    karras
+Batch size:   1
+
+--- Recovery commands ---
+# 1. Return to this code state:
+git checkout rocm-realvisxl-nommap-working
+
+# 2. Re-download RealVisXL if needed (not in git):
+# python -c "from huggingface_hub import hf_hub_download; hf_hub_download(repo_id='SG161222/RealVisXL_V4.0', filename='RealVisXL_V4.0.safetensors', local_dir='models/checkpoints')"
+
+# 3. Start ComfyUI:
+# python main.py --disable-dynamic-vram --disable-mmap
--- a/baselines/workflows/sd15_test_rocm_workflow.json
+++ b/baselines/workflows/sd15_test_rocm_workflow.json
@ -0,0 +1,107 @@
+{
+  "2": {
+    "inputs": {
+      "ckpt_name": "v1-5-pruned-emaonly.safetensors"
+    },
+    "class_type": "CheckpointLoaderSimple",
+    "_meta": {
+      "title": "Checkpoint加载器（简易）"
+    }
+  },
+  "3": {
+    "inputs": {
+      "text": "aa cute fluffy kitten, big round eyes, detailed fur, soft natural window light, cozy indoor background, shallow depth of field, photorealistic, high quality, 50mm lens",
+      "clip": [
+        "2",
+        1
+      ]
+    },
+    "class_type": "CLIPTextEncode",
+    "_meta": {
+      "title": "CLIP文本编码"
+    }
+  },
+  "4": {
+    "inputs": {
+      "text": "low quality, blurry, deformed, ugly, bad anatomy, distorted face, extra limbs, bad eyes, oversaturated",
+      "clip": [
+        "2",
+        1
+      ]
+    },
+    "class_type": "CLIPTextEncode",
+    "_meta": {
+      "title": "CLIP文本编码"
+    }
+  },
+  "5": {
+    "inputs": {
+      "width": 512,
+      "height": 512,
+      "batch_size": 1
+    },
+    "class_type": "EmptyLatentImage",
+    "_meta": {
+      "title": "空Latent图像"
+    }
+  },
+  "6": {
+    "inputs": {
+      "seed": 826325619577598,
+      "steps": 30,
+      "cfg": 7,
+      "sampler_name": "dpmpp_2m",
+      "scheduler": "normal",
+      "denoise": 1,
+      "model": [
+        "2",
+        0
+      ],
+      "positive": [
+        "3",
+        0
+      ],
+      "negative": [
+        "4",
+        0
+      ],
+      "latent_image": [
+        "5",
+        0
+      ]
+    },
+    "class_type": "KSampler",
+    "_meta": {
+      "title": "K采样器"
+    }
+  },
+  "7": {
+    "inputs": {
+      "samples": [
+        "6",
+        0
+      ],
+      "vae": [
+        "2",
+        2
+      ]
+    },
+    "class_type": "VAEDecode",
+    "_meta": {
+      "title": "VAE解码"
+    }
+  },
+  "8": {
+    "inputs": {
+      "filename_prefix": "ComfyUI",
+      "images": [
+        "7",
+        0
+      ]
+    },
+    "class_type": "SaveImage",
+    "_meta": {
+      "title": "保存图像"
+    }
+  }
+}
--- a/comfy/utils.py
+++ b/comfy/utils.py
@ -22,6 +22,7 @@ import math
 import struct
 import ctypes
 import os
+import sys
 import comfy.memory_management
 import safetensors.torch
 import numpy as np
@ -119,6 +120,40 @@ def load_safetensors(ckpt):
    return sd, header.get("__metadata__", {}),


+_LARGE_FILE_MMAP_THRESHOLD = 4_000_000_000  # 4 GB (4e9 bytes); load_torch_file() avoids mmap above this size on win32 when torch.cuda.is_available()
+
+
+def _load_safetensors_no_mmap(ckpt, device=None):
+    """Load a safetensors checkpoint with plain file reads instead of mmap.
+
+    Windows + ROCm/CUDA UMA: large mmaps fail after GPU virtual address space
+    is reserved, so every tensor is read sequentially from the file instead.
+
+    Args:
+        ckpt: Path of the safetensors file to read.
+        device: Target ``torch.device`` for the loaded tensors; defaults to CPU.
+
+    Returns:
+        A ``(state_dict, metadata)`` tuple: tensors keyed by name, and the
+        header's ``__metadata__`` entry (``{}`` when the header has none).
+
+    Raises:
+        RuntimeError: If the length prefix, the JSON header, or a tensor's
+            data is shorter than declared, or a tensor's offsets are reversed
+            (file corrupt or truncated).
+    """
+    if device is None:
+        device = torch.device("cpu")
+    sd = {}
+    with open(ckpt, "rb") as fh:
+        # Layout read here: 8-byte little-endian header length, JSON header,
+        # then tensor data addressed relative to the end of the header.
+        prefix = fh.read(8)
+        if len(prefix) != 8:
+            raise RuntimeError(
+                f"Safetensors read error: expected 8-byte header length, got {len(prefix)} bytes. "
+                f"File may be corrupt or truncated."
+            )
+        header_len = struct.unpack("<Q", prefix)[0]
+        header_bytes = fh.read(header_len)
+        if len(header_bytes) != header_len:
+            raise RuntimeError(
+                f"Safetensors read error: header expected {header_len} bytes, got {len(header_bytes)}. "
+                f"File may be corrupt or truncated."
+            )
+        header = json.loads(header_bytes.decode("utf-8"))
+        data_start = 8 + header_len
+        for name, info in header.items():
+            if name == "__metadata__":
+                continue
+            start, end = info["data_offsets"]
+            dtype = _TYPES[info["dtype"]]
+            shape = info["shape"]
+            expected = end - start
+            if expected < 0:
+                raise RuntimeError(
+                    f"Safetensors read error: tensor '{name}' has invalid data_offsets [{start}, {end}]. "
+                    f"File may be corrupt or truncated."
+                )
+            if expected == 0:
+                # Zero-byte tensors have nothing to read; build them directly.
+                sd[name] = torch.empty(shape, dtype=dtype, device=device)
+                continue
+            fh.seek(data_start + start)
+            # Read straight into a writable buffer (torch.frombuffer warns on
+            # read-only ones); this avoids a full bytes -> bytearray copy of
+            # every tensor, which matters for multi-GB checkpoints.
+            raw = bytearray(expected)
+            got = fh.readinto(raw)
+            if got != expected:
+                raise RuntimeError(
+                    f"Safetensors read error: tensor '{name}' expected {expected} bytes, got {got}. "
+                    f"File may be corrupt or truncated."
+                )
+            # clone() gives the tensor its own storage, independent of the
+            # temporary Python buffer, before it is moved to the target device.
+            sd[name] = torch.frombuffer(raw, dtype=dtype).reshape(shape).clone().to(device=device)
+    return sd, header.get("__metadata__", {})
+
+
 def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False):
    if device is None:
        device = torch.device("cpu")
@ -129,6 +164,15 @@ def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False):
                sd, metadata = load_safetensors(ckpt)
                if not return_metadata:
                    metadata = None
+            elif (os.path.getsize(ckpt) > _LARGE_FILE_MMAP_THRESHOLD
+                  and sys.platform == "win32"
+                  and torch.cuda.is_available()):
+                # Windows ROCm/UMA: GPU init reserves ~14 GB of virtual address space,
+                # preventing mmap of files >4 GB. Use sequential file-read instead.
+                # Scoped to Windows only to avoid overhead on Linux/Mac CUDA systems.
+                sd, metadata = _load_safetensors_no_mmap(ckpt, device=device)
+                if not return_metadata:
+                    metadata = None
            else:
                with safetensors.safe_open(ckpt, framework="pt", device=device.type) as f:
                    sd = {}