diff --git a/baselines/system_info.txt b/baselines/system_info.txt
index b0c103fa9..c5cb2b08e 100644
--- a/baselines/system_info.txt
+++ b/baselines/system_info.txt
@@ -28,7 +28,7 @@ torchaudio:  scottt/rocm-TheRock v6.5.0rc-pytorch/torchaudio-2.6.0a0+1a8f621-cp3
 numpy:       pinned to <2 (1.26.4) for wheel compatibility
 
 --- Startup ---
-cd C:\\Users\\LvHHu\\ComfyUI
+cd %USERPROFILE%\ComfyUI
 .\\venv\\Scripts\\activate
 python main.py
 
diff --git a/baselines/system_info_realvisxl.txt b/baselines/system_info_realvisxl.txt
index 8a6d3b839..988361e69 100644
--- a/baselines/system_info_realvisxl.txt
+++ b/baselines/system_info_realvisxl.txt
@@ -22,7 +22,7 @@ Functions: _load_safetensors_no_mmap(), _LARGE_FILE_MMAP_THRESHOLD = 4_000_000_0
 Branch:    load_torch_file() elif os.path.getsize > threshold and cuda available
 
 --- Startup command ---
-cd C:\Users\LvHHu\ComfyUI
+cd %USERPROFILE%\ComfyUI
 .\venv\Scripts\activate
 python main.py --disable-dynamic-vram --disable-mmap
 
diff --git a/comfy/utils.py b/comfy/utils.py
index ceddf7af1..32db92e61 100644
--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -22,6 +22,7 @@ import math
 import struct
 import ctypes
 import os
+import sys
 import comfy.memory_management
 import safetensors.torch
 import numpy as np
@@ -122,9 +123,11 @@ def load_safetensors(ckpt):
 _LARGE_FILE_MMAP_THRESHOLD = 4_000_000_000  # 4 GB
 
 
-def _load_safetensors_no_mmap(ckpt):
+def _load_safetensors_no_mmap(ckpt, device=None):
     # Windows + ROCm/CUDA UMA: large mmaps fail after GPU virtual address space is reserved.
     # Read tensors sequentially from file instead.
+    if device is None:
+        device = torch.device("cpu")
     sd = {}
     with open(ckpt, "rb") as fh:
         header_len = struct.unpack("<Q", fh.read(8))[0]
@@ -136,12 +139,18 @@ def _load_safetensors_no_mmap(ckpt):
             start, end = info["data_offsets"]
             dtype = _TYPES[info["dtype"]]
             shape = info["shape"]
+            expected = end - start
+            if expected == 0:
+                sd[name] = torch.empty(shape, dtype=dtype, device=device)
+                continue
             fh.seek(data_start + start)
-            raw = fh.read(end - start)
-            if raw:
-                sd[name] = torch.frombuffer(bytearray(raw), dtype=dtype).reshape(shape).clone()
-            else:
-                sd[name] = torch.empty(shape, dtype=dtype)
+            raw = fh.read(expected)
+            if len(raw) != expected:
+                raise RuntimeError(
+                    f"Safetensors read error: tensor '{name}' expected {expected} bytes, got {len(raw)}. "
+                    f"File may be corrupt or truncated."
+                )
+            sd[name] = torch.frombuffer(bytearray(raw), dtype=dtype).reshape(shape).clone().to(device=device)
     return sd, header.get("__metadata__", {})
 
 
@@ -155,10 +164,13 @@ def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False):
                 sd, metadata = load_safetensors(ckpt)
                 if not return_metadata:
                     metadata = None
-            elif os.path.getsize(ckpt) > _LARGE_FILE_MMAP_THRESHOLD and torch.cuda.is_available():
-                # File > 4 GB with active CUDA/ROCm: mmap would exhaust Windows virtual
-                # address space reserved by UMA GPU init. Use sequential file-read instead.
-                sd, metadata = _load_safetensors_no_mmap(ckpt)
+            elif (os.path.getsize(ckpt) > _LARGE_FILE_MMAP_THRESHOLD
+                  and sys.platform == "win32"
+                  and torch.cuda.is_available()):
+                # Windows ROCm/UMA: GPU init reserves ~14 GB of virtual address space,
+                # preventing mmap of files >4 GB. Use sequential file-read instead.
+                # Scoped to Windows only to avoid the no-mmap read overhead on Linux/macOS systems.
+                sd, metadata = _load_safetensors_no_mmap(ckpt, device=device)
                 if not return_metadata:
                     metadata = None
             else: