diff --git a/comfy/utils.py b/comfy/utils.py index 78c491b98..3f54ac324 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -119,6 +119,76 @@ def load_safetensors(ckpt): return sd, header.get("__metadata__", {}), +def load_safetensors_no_mmap(ckpt, device=None, return_metadata=False): + # Load a .safetensors / .sft file without ever mmap'ing it. + # + # safetensors.safe_open() (and therefore safetensors.torch.load_file) always + # mmaps the underlying file in Rust. On systems with unified CPU/GPU memory + # like NVIDIA Grace Blackwell / DGX Spark, Apple Silicon, AMD APUs, etc. + # this is fatal for large models: the OS page-cache pages backing the mmap + # and any subsequent device copy both reside in the same physical memory + # pool, doubling peak memory and causing OOM well before the hardware + # limit is reached. + # See: https://github.com/Comfy-Org/ComfyUI/issues/10896 + # https://github.com/safetensors/safetensors/issues/758 + # https://github.com/safetensors/safetensors/pull/759 + # + # This is a temporary workaround until upstream safetensors exposes a + # public ``mmap=False`` option. Here we parse the safetensors header + # ourselves and read each tensor straight from disk into a per-tensor + # ``bytearray`` via ``readinto``, then zero-copy-wrap it as a torch tensor + # with ``torch.frombuffer``. Peak memory is one model copy (plus, if a + # non-CPU device is requested, the bytes of a single tensor in flight + # while it is being moved). + if device is None: + device = torch.device("cpu") + + sd = {} + metadata = None + with open(ckpt, "rb") as f: + header_bytes = f.read(8) + if len(header_bytes) != 8: + raise ValueError("HeaderTooLarge: file is too small to be a valid safetensors file: {}".format(ckpt)) + header_size = struct.unpack("