From 1fe3a13f8476f333cb825e0b4a7f436a27684f36 Mon Sep 17 00:00:00 2001 From: Rattus Date: Thu, 7 May 2026 21:08:15 +1000 Subject: [PATCH 01/15] model_management: disable non-dynamic smart memory Disable smart memory outright for non-dynamic models. This is a minor step towards deprecation of --disable-dynamic-vram and the legacy ModelPatcher. This is needed for estimate-free model development, where new models can opt out of supplying a memory estimate and not have to worry about hard VRAM allocations due to legacy non-dynamic model patchers. This is also a general stability increase for a lot of stray use cases where estimates may still be off; going forward we are not going to accurately maintain such estimates. --- comfy/model_management.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 21738a4c7..ebef03ceb 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -674,10 +674,10 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins i = x[-1] memory_to_free = 1e32 pins_to_free = 1e32 - if not DISABLE_SMART_MEMORY or device is None: + if current_loaded_models[i].model.is_dynamic() and (not DISABLE_SMART_MEMORY or device is None): memory_to_free = 0 if device is None else memory_required - get_free_memory(device) pins_to_free = pins_required - get_free_ram() - if current_loaded_models[i].model.is_dynamic() and for_dynamic: + if for_dynamic: #don't actually unload dynamic models for the sake of other dynamic models #as that works on-demand. memory_required -= current_loaded_models[i].model.loaded_size() From 157965a1c99792e6250e6027ba2045efdd148528 Mon Sep 17 00:00:00 2001 From: Rattus Date: Mon, 4 May 2026 12:32:12 +1000 Subject: [PATCH 02/15] pinned_memory: implement with aimdo growable buffer Use a single growable buffer so we can do threaded pre-warming on pinned memory.
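For readers skimming the series, a minimal sketch of the bookkeeping this patch moves to, assuming nothing about the real comfy_aimdo.host_buffer API beyond what the diff shows: one growable pinned buffer per model/device, each module's pin carved from the tail at the current size, and a LIFO stack of (module, offset) pairs so later RAM-pressure eviction can truncate back down. GrowablePinBuffer and its bytearray backing are illustrative stand-ins, not the real page-locked allocation.

class GrowablePinBuffer:
    """Illustrative stand-in for the growable pinned host buffer."""
    def __init__(self):
        self.data = bytearray()   # stands in for pinned (page-locked) memory
        self.stack = []           # (module, offset) pairs in allocation order

    @property
    def size(self):
        return len(self.data)

    def pin(self, module, nbytes):
        offset = self.size
        self.data.extend(bytes(nbytes))   # analogous to hostbuf.extend(size=...)
        module._pin = (offset, nbytes)    # the module keeps a view into the buffer
        self.stack.append((module, offset))
        return offset

    def unpin_lifo(self, bytes_to_free):
        # Eviction walks the stack from the top, mirroring the
        # hostbuf.truncate(offset) calls in the diff below.
        freed = 0
        while self.stack and freed < bytes_to_free:
            module, offset = self.stack.pop()
            freed += module._pin[1]
            del module._pin
            del self.data[offset:]
        return freed

Any attribute-bearing object (e.g. types.SimpleNamespace) works as the module in a quick test; the point is only the extend/truncate discipline that the later patches in this series rely on.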
--- comfy/model_patcher.py | 34 +++++++++++++++++++++------------- comfy/pinned_memory.py | 30 ++++++++++-------------------- 2 files changed, 31 insertions(+), 33 deletions(-) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 2ea14bc2c..dc5f0e577 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -35,6 +35,7 @@ import comfy.model_management import comfy.ops import comfy.patcher_extension import comfy.utils +import comfy_aimdo.host_buffer from comfy.comfy_types import UnetWrapperFunction from comfy.quant_ops import QuantizedTensor from comfy.patcher_extension import CallbacksMP, PatcherInjection, WrappersMP @@ -1543,6 +1544,10 @@ class ModelPatcherDynamic(ModelPatcher): super().__init__(model, load_device, offload_device, size, weight_inplace_update) if not hasattr(self.model, "dynamic_vbars"): self.model.dynamic_vbars = {} + if not hasattr(self.model, "dynamic_pins"): + self.model.dynamic_pins = {} + if self.load_device not in self.model.dynamic_pins: + self.model.dynamic_pins[self.load_device] = {"hostbuf": comfy_aimdo.host_buffer.HostBuffer(0), "stack": [], "failed": False} self.non_dynamic_delegate_model = None assert load_device is not None @@ -1604,6 +1609,8 @@ class ModelPatcherDynamic(ModelPatcher): self.unpatch_hooks() vbar = self._vbar_get(create=True) + pin_state = self.model.dynamic_pins[self.load_device] + pin_state["failed"] = False if vbar is not None: vbar.prioritize() @@ -1655,8 +1662,8 @@ class ModelPatcherDynamic(ModelPatcher): if hasattr(m, "comfy_cast_weights"): m.comfy_cast_weights = True - m.pin_failed = False m.seed_key = n + m._pin_state = pin_state set_dirty(m, dirty) force_load, v_weight_size = setup_param(self, m, n, "weight") @@ -1734,20 +1741,21 @@ class ModelPatcherDynamic(ModelPatcher): return freed def pinned_memory_size(self): - total = 0 - loading = self._load_list(for_dynamic=True) - for x in loading: - _, _, _, _, m, _ = x - pin = comfy.pinned_memory.get_pin(m) - if pin is not None: - total += pin.numel() * pin.element_size() - return total + return self.model.dynamic_pins[self.load_device]["hostbuf"].size def partially_unload_ram(self, ram_to_unload): - loading = self._load_list(for_dynamic=True, default_device=self.offload_device) - for x in loading: - *_, m, _ = x - ram_to_unload -= comfy.pinned_memory.unpin_memory(m) + pin_state = self.model.dynamic_pins[self.load_device] + hostbuf = pin_state["hostbuf"] + stack = self.model.dynamic_pins[self.load_device]["stack"] + while len(stack) > 0: + module, offset = stack.pop() + size = module._pin.numel() * module._pin.element_size() + del module._pin + hostbuf.truncate(offset) + comfy.model_management.TOTAL_PINNED_MEMORY -= size + if comfy.model_management.TOTAL_PINNED_MEMORY < 0: + comfy.model_management.TOTAL_PINNED_MEMORY = 0 + ram_to_unload -= size if ram_to_unload <= 0: return diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py index 6d3ba367a..3638066c8 100644 --- a/comfy/pinned_memory.py +++ b/comfy/pinned_memory.py @@ -9,35 +9,25 @@ def get_pin(module): return getattr(module, "_pin", None) def pin_memory(module): - if module.pin_failed or args.disable_pinned_memory or get_pin(module) is not None: + pin_state = module._pin_state + if pin_state["failed"] or args.disable_pinned_memory or get_pin(module) is not None: return + hostbuf = pin_state["hostbuf"] size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ]) - + offset = hostbuf.size if comfy.model_management.MAX_PINNED_MEMORY <= 0 or (comfy.model_management.TOTAL_PINNED_MEMORY + 
size) > comfy.model_management.MAX_PINNED_MEMORY: - module.pin_failed = True + pin_state["failed"] = True return False try: - hostbuf = comfy_aimdo.host_buffer.HostBuffer(size) + hostbuf.extend(size=size) except RuntimeError: - module.pin_failed = True + pin_state["failed"] = True return False - module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf) - module._pin_hostbuf = hostbuf + module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size] + module._pin.untyped_storage()._comfy_hostbuf = hostbuf + pin_state["stack"].append((module, offset)) comfy.model_management.TOTAL_PINNED_MEMORY += size return True - -def unpin_memory(module): - if get_pin(module) is None: - return 0 - size = module._pin.numel() * module._pin.element_size() - - comfy.model_management.TOTAL_PINNED_MEMORY -= size - if comfy.model_management.TOTAL_PINNED_MEMORY < 0: - comfy.model_management.TOTAL_PINNED_MEMORY = 0 - - del module._pin - del module._pin_hostbuf - return size From b66b6420681a83f5bd247dec42d95f96113503d7 Mon Sep 17 00:00:00 2001 From: Rattus Date: Mon, 4 May 2026 12:47:28 +1000 Subject: [PATCH 03/15] mm: use aimdo to do transfer from disk to pin Aimdo implements a faster threaded loader. --- comfy/memory_management.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/comfy/memory_management.py b/comfy/memory_management.py index 48e3c11da..4a628b05c 100644 --- a/comfy/memory_management.py +++ b/comfy/memory_management.py @@ -48,6 +48,12 @@ def read_tensor_file_slice_into(tensor, destination): if info.size == 0: return True + hostbuf = getattr(destination.untyped_storage(), "_comfy_hostbuf", None) + if hostbuf is not None: + hostbuf.read_file_slice(file_obj, info.offset, info.size, + offset=destination.data_ptr() - hostbuf.get_raw_address()) + return True + buf_type = ctypes.c_ubyte * info.size view = memoryview(buf_type.from_address(destination.data_ptr())) From 8070cb77809145e7cf24b94eeb7f55710cdfcd17 Mon Sep 17 00:00:00 2001 From: Rattus Date: Thu, 7 May 2026 14:04:48 +1000 Subject: [PATCH 04/15] Add stream host pin buffer for AIMDO casts Introduce per-offload-stream HostBuffer reuse for pinned staging, include it in cast buffer reset synchronization. Defer actual casts that go via this pin path to a separate pass such that the buffer can be allocated monolithically (to avoid cudaHostRegister thrash). 
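A rough sketch of the deferral pattern in plain PyTorch, not the comfy_aimdo path; it assumes CUDA is available, all sources share one dtype, and the destinations already live on the GPU. The first pass only measures offsets, the pinned staging buffer is registered once at the total size, and the queued copies are then replayed through slices of it, which is what avoids the cudaHostRegister thrash mentioned above.

import contextlib
import torch

def stage_through_pin(sources, dests, offload_stream=None):
    # Pass 1: record (src, offset, numel, dst) against a running offset; nothing allocated yet.
    queue, total = [], 0
    for src, dst in zip(sources, dests):
        queue.append((src, total, src.numel(), dst))
        total += src.numel()

    # Pass 2: one monolithic pinned allocation, then replay the queued copies through it.
    staging = torch.empty(total, dtype=sources[0].dtype, pin_memory=True)
    ctx = torch.cuda.stream(offload_stream) if offload_stream is not None else contextlib.nullcontext()
    with ctx:
        for src, off, n, dst in queue:
            pin = staging[off:off + n].view_as(src)
            pin.copy_(src)                      # plain host memory -> pinned host memory
            dst.copy_(pin, non_blocking=True)   # pinned host -> device, async on the offload stream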
--- comfy/model_management.py | 18 +++++++++++-- comfy/ops.py | 56 ++++++++++++++++++++++++++++++++------- 2 files changed, 62 insertions(+), 12 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index ebef03ceb..facdd0873 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -31,6 +31,7 @@ from contextlib import nullcontext import comfy.memory_management import comfy.utils import comfy.quant_ops +import comfy_aimdo.host_buffer import comfy_aimdo.vram_buffer class VRAMState(Enum): @@ -1180,8 +1181,10 @@ STREAM_CAST_BUFFERS = {} LARGEST_CASTED_WEIGHT = (None, 0) STREAM_AIMDO_CAST_BUFFERS = {} LARGEST_AIMDO_CASTED_WEIGHT = (None, 0) +STREAM_PIN_BUFFERS = {} DEFAULT_AIMDO_CAST_BUFFER_RESERVATION_SIZE = 16 * 1024 ** 3 +DEFAULT_PIN_BUFFER_PRIME_SIZE = 1024 ** 2 def get_cast_buffer(offload_stream, device, size, ref): global LARGEST_CASTED_WEIGHT @@ -1220,21 +1223,32 @@ def get_aimdo_cast_buffer(offload_stream, device): if cast_buffer is None: cast_buffer = comfy_aimdo.vram_buffer.VRAMBuffer(DEFAULT_AIMDO_CAST_BUFFER_RESERVATION_SIZE, device.index) STREAM_AIMDO_CAST_BUFFERS[offload_stream] = cast_buffer - return cast_buffer + +def get_pin_buffer(offload_stream): + pin_buffer = STREAM_PIN_BUFFERS.get(offload_stream, None) + if pin_buffer is None: + # A small non-zero default primes HostBuffer's larger virtual reservation. + pin_buffer = comfy_aimdo.host_buffer.HostBuffer(DEFAULT_PIN_BUFFER_PRIME_SIZE) + STREAM_PIN_BUFFERS[offload_stream] = pin_buffer + elif offload_stream is not None: + offload_stream.synchronize() + return pin_buffer + def reset_cast_buffers(): global LARGEST_CASTED_WEIGHT global LARGEST_AIMDO_CASTED_WEIGHT LARGEST_CASTED_WEIGHT = (None, 0) LARGEST_AIMDO_CASTED_WEIGHT = (None, 0) - for offload_stream in set(STREAM_CAST_BUFFERS) | set(STREAM_AIMDO_CAST_BUFFERS): + for offload_stream in set(STREAM_CAST_BUFFERS) | set(STREAM_AIMDO_CAST_BUFFERS) | set(STREAM_PIN_BUFFERS): if offload_stream is not None: offload_stream.synchronize() synchronize() STREAM_CAST_BUFFERS.clear() STREAM_AIMDO_CAST_BUFFERS.clear() + STREAM_PIN_BUFFERS.clear() soft_empty_cache() def get_offload_stream(device): diff --git a/comfy/ops.py b/comfy/ops.py index 77ad1d527..3d196f438 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -75,6 +75,8 @@ except: cast_to = comfy.model_management.cast_to #TODO: remove once no more references +STREAM_PIN_BUFFER_HEADROOM = 8 * 1024 * 1024 + def cast_to_input(weight, input, non_blocking=False, copy=True): return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy) @@ -91,6 +93,9 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin offload_stream = None cast_buffer = None cast_buffer_offset = 0 + stream_pin_hostbuf = None + stream_pin_offset = 0 + stream_pin_queue = [] def ensure_offload_stream(module, required_size, check_largest): nonlocal offload_stream @@ -124,6 +129,20 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin cast_buffer_offset += buffer_size return buffer + def get_stream_pin_buffer_offset(buffer_size): + nonlocal stream_pin_hostbuf + nonlocal stream_pin_offset + + if buffer_size == 0 or offload_stream is None: + return None + + if stream_pin_hostbuf is None: + stream_pin_hostbuf = comfy.model_management.get_pin_buffer(offload_stream) + + offset = stream_pin_offset + stream_pin_offset += buffer_size + return offset + for s in comfy_modules: signature = comfy_aimdo.model_vbar.vbar_fault(s._v) resident = 
comfy_aimdo.model_vbar.vbar_signature_compare(signature, s._v_signature) @@ -162,17 +181,21 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin if xfer_dest is None: xfer_dest = get_cast_buffer(dest_size) - if signature is None and pin is None: - comfy.pinned_memory.pin_memory(s) - pin = comfy.pinned_memory.get_pin(s) - else: - pin = None + if pin is None: + if signature is None: + comfy.pinned_memory.pin_memory(s) + pin = comfy.pinned_memory.get_pin(s) + if pin is not None: + comfy.model_management.cast_to_gathered(xfer_source, pin) + xfer_source = [ pin ] + if pin is None: + pin_offset = get_stream_pin_buffer_offset(dest_size) + if pin_offset is not None: + stream_pin_queue.append((xfer_source, pin_offset, dest_size, xfer_dest)) + xfer_source = None - if pin is not None: - comfy.model_management.cast_to_gathered(xfer_source, pin) - xfer_source = [ pin ] - #send it over - comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=offload_stream) + if xfer_source is not None: + comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=offload_stream) for param_key in ("weight", "bias"): lowvram_fn = getattr(s, param_key + "_lowvram_function", None) @@ -186,6 +209,19 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin prefetch["needs_cast"] = needs_cast s._prefetch = prefetch + if stream_pin_offset > 0: + stream_pin_hostbuf_size = getattr(stream_pin_hostbuf, "_comfy_stream_pin_size", stream_pin_hostbuf.size) + if stream_pin_hostbuf_size < stream_pin_offset: + stream_pin_hostbuf_size = stream_pin_offset + STREAM_PIN_BUFFER_HEADROOM + stream_pin_hostbuf.extend(size=stream_pin_hostbuf_size, reallocate=True) + stream_pin_hostbuf._comfy_stream_pin_size = stream_pin_hostbuf_size + stream_pin_tensor = comfy_aimdo.torch.hostbuf_to_tensor(stream_pin_hostbuf, size=stream_pin_offset) + stream_pin_tensor.untyped_storage()._comfy_hostbuf = stream_pin_hostbuf + for xfer_source, pin_offset, pin_size, xfer_dest in stream_pin_queue: + pin = stream_pin_tensor[pin_offset:pin_offset + pin_size] + comfy.model_management.cast_to_gathered(xfer_source, pin) + comfy.model_management.cast_to_gathered([ pin ], xfer_dest, non_blocking=non_blocking, stream=offload_stream) + return offload_stream From 17955235b2d95e9e8aa6f9719bcc1a29d8976ceb Mon Sep 17 00:00:00 2001 From: Rattus Date: Thu, 7 May 2026 18:33:02 +1000 Subject: [PATCH 05/15] remove old pin path --- comfy/model_management.py | 74 +++++++-------------------------------- comfy/model_patcher.py | 3 -- comfy/utils.py | 2 -- comfy/windows.py | 52 --------------------------- 4 files changed, 13 insertions(+), 118 deletions(-) delete mode 100644 comfy/windows.py diff --git a/comfy/model_management.py b/comfy/model_management.py index facdd0873..4b96d1492 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -496,6 +496,8 @@ except: current_loaded_models = [] +DIRTY_MMAPS = set() + def module_size(module): module_mem = 0 sd = module.state_dict() @@ -504,27 +506,11 @@ def module_size(module): module_mem += t.nbytes return module_mem -def module_mmap_residency(module, free=False): - mmap_touched_mem = 0 - module_mem = 0 - bounced_mmaps = set() - sd = module.state_dict() - for k in sd: - t = sd[k] - module_mem += t.nbytes - storage = t._qdata.untyped_storage() if isinstance(t, comfy.quant_ops.QuantizedTensor) else t.untyped_storage() - if not getattr(storage, "_comfy_tensor_mmap_touched", False): - continue - 
mmap_touched_mem += t.nbytes - if not free: - continue - storage._comfy_tensor_mmap_touched = False - mmap_obj = storage._comfy_tensor_mmap_refs[0] - if mmap_obj in bounced_mmaps: - continue - mmap_obj.bounce() - bounced_mmaps.add(mmap_obj) - return mmap_touched_mem, module_mem +def mark_mmap_dirty(storage): + mmap_refs = getattr(storage, "_comfy_tensor_mmap_refs", None) + if mmap_refs is not None: + DIRTY_MMAPS.add(mmap_refs[0]) + class LoadedModel: def __init__(self, model): @@ -554,9 +540,6 @@ class LoadedModel: def model_memory(self): return self.model.model_size() - def model_mmap_residency(self, free=False): - return self.model.model_mmap_residency(free=free) - def model_loaded_memory(self): return self.model.loaded_size() @@ -636,15 +619,9 @@ WINDOWS = any(platform.win32_ver()) EXTRA_RESERVED_VRAM = 400 * 1024 * 1024 if WINDOWS: - import comfy.windows EXTRA_RESERVED_VRAM = 600 * 1024 * 1024 #Windows is higher because of the shared vram issue if total_vram > (15 * 1024): # more extra reserved vram on 16GB+ cards EXTRA_RESERVED_VRAM += 100 * 1024 * 1024 - def get_free_ram(): - return comfy.windows.get_free_ram() -else: - def get_free_ram(): - return psutil.virtual_memory().available if args.reserve_vram is not None: EXTRA_RESERVED_VRAM = args.reserve_vram * 1024 * 1024 * 1024 @@ -658,7 +635,6 @@ def minimum_inference_memory(): def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins_required=0, ram_required=0): cleanup_models_gc() - comfy.memory_management.extra_ram_release(max(pins_required, ram_required)) unloaded_model = [] can_unload = [] unloaded_models = [] @@ -674,10 +650,8 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins for x in can_unload_sorted: i = x[-1] memory_to_free = 1e32 - pins_to_free = 1e32 if current_loaded_models[i].model.is_dynamic() and (not DISABLE_SMART_MEMORY or device is None): memory_to_free = 0 if device is None else memory_required - get_free_memory(device) - pins_to_free = pins_required - get_free_ram() if for_dynamic: #don't actually unload dynamic models for the sake of other dynamic models #as that works on-demand. 
@@ -686,18 +660,6 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins if memory_to_free > 0 and current_loaded_models[i].model_unload(memory_to_free): logging.debug(f"Unloading {current_loaded_models[i].model.model.__class__.__name__}") unloaded_model.append(i) - if pins_to_free > 0: - logging.debug(f"PIN Unloading {current_loaded_models[i].model.model.__class__.__name__}") - current_loaded_models[i].model.partially_unload_ram(pins_to_free) - - for x in can_unload_sorted: - i = x[-1] - ram_to_free = ram_required - psutil.virtual_memory().available - if ram_to_free <= 0 and i not in unloaded_model: - continue - resident_memory, _ = current_loaded_models[i].model_mmap_residency(free=True) - if resident_memory > 0: - logging.debug(f"RAM Unloading {current_loaded_models[i].model.model.__class__.__name__}") for i in sorted(unloaded_model, reverse=True): unloaded_models.append(current_loaded_models.pop(i)) @@ -763,29 +725,16 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu model_to_unload.model.detach(unpatch_all=False) model_to_unload.model_finalizer.detach() - total_memory_required = {} - total_pins_required = {} - total_ram_required = {} for loaded_model in models_to_load: device = loaded_model.device total_memory_required[device] = total_memory_required.get(device, 0) + loaded_model.model_memory_required(device) - resident_memory, model_memory = loaded_model.model_mmap_residency() - pinned_memory = loaded_model.model.pinned_memory_size() - #FIXME: This can over-free the pins as it budgets to pin the entire model. We should - #make this JIT to keep as much pinned as possible. - pins_required = model_memory - pinned_memory - ram_required = model_memory - resident_memory - total_pins_required[device] = total_pins_required.get(device, 0) + pins_required - total_ram_required[device] = total_ram_required.get(device, 0) + ram_required for device in total_memory_required: if device != torch.device("cpu"): free_memory(total_memory_required[device] * 1.1 + extra_mem, device, - for_dynamic=free_for_dynamic, - pins_required=total_pins_required[device], - ram_required=total_ram_required[device]) + for_dynamic=free_for_dynamic) for device in total_memory_required: if device != torch.device("cpu"): @@ -1246,6 +1195,10 @@ def reset_cast_buffers(): offload_stream.synchronize() synchronize() + for mmap_obj in DIRTY_MMAPS: + mmap_obj.bounce() + + DIRTY_MMAPS.clear() STREAM_CAST_BUFFERS.clear() STREAM_AIMDO_CAST_BUFFERS.clear() STREAM_PIN_BUFFERS.clear() @@ -1310,8 +1263,7 @@ def cast_to_gathered(tensors, r, non_blocking=False, stream=None): if comfy.memory_management.read_tensor_file_slice_into(tensor, dest_view): continue storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage() - if hasattr(storage, "_comfy_tensor_mmap_touched"): - storage._comfy_tensor_mmap_touched = True + mark_mmap_dirty(storage) dest_view.copy_(tensor, non_blocking=non_blocking) diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index dc5f0e577..43712c7a0 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -342,9 +342,6 @@ class ModelPatcher: self.size = comfy.model_management.module_size(self.model) return self.size - def model_mmap_residency(self, free=False): - return comfy.model_management.module_mmap_residency(self.model, free=free) - def loaded_size(self): return self.model.model_loaded_weight_memory diff --git a/comfy/utils.py b/comfy/utils.py index b75972027..fabe18b51 
100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -113,7 +113,6 @@ def load_safetensors(ckpt): "_comfy_tensor_file_slice", comfy.memory_management.TensorFileSlice(f, threading.get_ident(), data_base_offset + start, end - start)) setattr(storage, "_comfy_tensor_mmap_refs", (model_mmap, mv)) - setattr(storage, "_comfy_tensor_mmap_touched", False) sd[name] = tensor return sd, header.get("__metadata__", {}), @@ -1445,4 +1444,3 @@ def deepcopy_list_dict(obj, memo=None): memo[obj_id] = res return res - diff --git a/comfy/windows.py b/comfy/windows.py deleted file mode 100644 index 213dc481d..000000000 --- a/comfy/windows.py +++ /dev/null @@ -1,52 +0,0 @@ -import ctypes -import logging -import psutil -from ctypes import wintypes - -import comfy_aimdo.control - -psapi = ctypes.WinDLL("psapi") -kernel32 = ctypes.WinDLL("kernel32") - -class PERFORMANCE_INFORMATION(ctypes.Structure): - _fields_ = [ - ("cb", wintypes.DWORD), - ("CommitTotal", ctypes.c_size_t), - ("CommitLimit", ctypes.c_size_t), - ("CommitPeak", ctypes.c_size_t), - ("PhysicalTotal", ctypes.c_size_t), - ("PhysicalAvailable", ctypes.c_size_t), - ("SystemCache", ctypes.c_size_t), - ("KernelTotal", ctypes.c_size_t), - ("KernelPaged", ctypes.c_size_t), - ("KernelNonpaged", ctypes.c_size_t), - ("PageSize", ctypes.c_size_t), - ("HandleCount", wintypes.DWORD), - ("ProcessCount", wintypes.DWORD), - ("ThreadCount", wintypes.DWORD), - ] - -def get_free_ram(): - #Windows is way too conservative and chalks recently used uncommitted model RAM - #as "in-use". So, calculate free RAM for the sake of general use as the greater of: - # - #1: What psutil says - #2: Total Memory - (Committed Memory - VRAM in use) - # - #We have to subtract VRAM in use from the comitted memory as WDDM creates a naked - #commit charge for all VRAM used just incase it wants to page it all out. This just - #isn't realistic so "overcommit" on our calculations by just subtracting it off. - - pi = PERFORMANCE_INFORMATION() - pi.cb = ctypes.sizeof(pi) - - if not psapi.GetPerformanceInfo(ctypes.byref(pi), pi.cb): - logging.warning("WARNING: Failed to query windows performance info. RAM usage may be sub optimal") - return psutil.virtual_memory().available - - committed = pi.CommitTotal * pi.PageSize - total = pi.PhysicalTotal * pi.PageSize - - return max(psutil.virtual_memory().available, - total - (committed - comfy_aimdo.control.get_total_vram_usage())) - From 8187cd783e20ac71cbf88d51338d632996838cb3 Mon Sep 17 00:00:00 2001 From: Rattus Date: Thu, 7 May 2026 19:55:41 +1000 Subject: [PATCH 06/15] Implement JIT pinned memory pressure Replace the predictive pin pressure mechanism with JIT PIN memory pressure. 
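The shape of the just-in-time check, as a small sketch with made-up module-level names (MAX_PINNED, HYSTERESIS and the evictor callables are illustrative; in the diff they correspond to MAX_PINNED_MEMORY, PIN_PRESSURE_HYSTERESIS and partially_unload_ram on dynamic models walked from the back of the loaded-model list): nothing is freed ahead of time, only when a new pin would push the total over the cap, and a little extra is freed so the next pin does not immediately hit the cap again.

MAX_PINNED = 8 * 1024 ** 3      # stand-in for MAX_PINNED_MEMORY
HYSTERESIS = 128 * 1024 ** 2    # stand-in for PIN_PRESSURE_HYSTERESIS

def ensure_budget(total_pinned, request, evictors):
    """evictors: callables that free up to N pinned bytes, least important first."""
    shortfall = total_pinned + request - MAX_PINNED
    if shortfall <= 0:
        return total_pinned     # the new pin fits, nothing to do
    shortfall += HYSTERESIS     # free a little extra to avoid thrashing at the cap
    for evict in evictors:
        freed = evict(shortfall)
        total_pinned -= freed
        shortfall -= freed
        if shortfall <= 0:
            break
    return total_pinned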
--- comfy/model_management.py | 52 ++++++++++++++++++++++++++++++++++----- comfy/model_patcher.py | 17 ++++++++++--- comfy/ops.py | 8 ++++-- comfy/pinned_memory.py | 4 +-- 4 files changed, 66 insertions(+), 15 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 4b96d1492..6a2126cb5 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -498,6 +498,8 @@ current_loaded_models = [] DIRTY_MMAPS = set() +PIN_PRESSURE_HYSTERESIS = 128 * 1024 * 1024 + def module_size(module): module_mem = 0 sd = module.state_dict() @@ -511,6 +513,21 @@ def mark_mmap_dirty(storage): if mmap_refs is not None: DIRTY_MMAPS.add(mmap_refs[0]) +def ensure_pin_budget(size, evict_active=False): + if MAX_PINNED_MEMORY <= 0: + return + + shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY + if shortfall <= 0: + return + + shortfall += PIN_PRESSURE_HYSTERESIS + for loaded_model in reversed(current_loaded_models): + model = loaded_model.model + if model is not None and model.is_dynamic() and (evict_active or not model.dynamic_pins[model.load_device]["active"]): + shortfall -= model.partially_unload_ram(shortfall) + if shortfall <= 0: + break class LoadedModel: def __init__(self, model): @@ -1133,7 +1150,6 @@ LARGEST_AIMDO_CASTED_WEIGHT = (None, 0) STREAM_PIN_BUFFERS = {} DEFAULT_AIMDO_CAST_BUFFER_RESERVATION_SIZE = 16 * 1024 ** 3 -DEFAULT_PIN_BUFFER_PRIME_SIZE = 1024 ** 2 def get_cast_buffer(offload_stream, device, size, ref): global LARGEST_CASTED_WEIGHT @@ -1177,14 +1193,29 @@ def get_aimdo_cast_buffer(offload_stream, device): def get_pin_buffer(offload_stream): pin_buffer = STREAM_PIN_BUFFERS.get(offload_stream, None) if pin_buffer is None: - # A small non-zero default primes HostBuffer's larger virtual reservation. - pin_buffer = comfy_aimdo.host_buffer.HostBuffer(DEFAULT_PIN_BUFFER_PRIME_SIZE) + pin_buffer = comfy_aimdo.host_buffer.HostBuffer(0) STREAM_PIN_BUFFERS[offload_stream] = pin_buffer elif offload_stream is not None: offload_stream.synchronize() return pin_buffer +def resize_pin_buffer(pin_buffer, size): + global TOTAL_PINNED_MEMORY + old_size = getattr(pin_buffer, "_comfy_stream_pin_size", 0) + if size <= old_size: + return True + growth = size - old_size + ensure_pin_budget(growth, evict_active=True) + try: + pin_buffer.extend(size=size, reallocate=True) + except RuntimeError: + return False + pin_buffer._comfy_stream_pin_size = size + TOTAL_PINNED_MEMORY += growth + return True + def reset_cast_buffers(): + global TOTAL_PINNED_MEMORY global LARGEST_CASTED_WEIGHT global LARGEST_AIMDO_CASTED_WEIGHT @@ -1197,8 +1228,18 @@ def reset_cast_buffers(): for mmap_obj in DIRTY_MMAPS: mmap_obj.bounce() - DIRTY_MMAPS.clear() + + for pin_buffer in STREAM_PIN_BUFFERS.values(): + TOTAL_PINNED_MEMORY -= getattr(pin_buffer, "_comfy_stream_pin_size", 0) + if TOTAL_PINNED_MEMORY < 0: + TOTAL_PINNED_MEMORY = 0 + + for loaded_model in current_loaded_models: + model = loaded_model.model + if model is not None and model.is_dynamic(): + model.dynamic_pins[model.load_device]["active"] = False + STREAM_CAST_BUFFERS.clear() STREAM_AIMDO_CAST_BUFFERS.clear() STREAM_PIN_BUFFERS.clear() @@ -1344,8 +1385,7 @@ def pin_memory(tensor): return False size = tensor.nbytes - if (TOTAL_PINNED_MEMORY + size) > MAX_PINNED_MEMORY: - return False + ensure_pin_budget(size) ptr = tensor.data_ptr() if ptr == 0: diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 43712c7a0..def0901dc 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -1117,7 +1117,7 @@ class 
ModelPatcher: return 0 def partially_unload_ram(self, ram_to_unload): - pass + return 0 def detach(self, unpatch_all=True): self.eject_model() @@ -1544,7 +1544,12 @@ class ModelPatcherDynamic(ModelPatcher): if not hasattr(self.model, "dynamic_pins"): self.model.dynamic_pins = {} if self.load_device not in self.model.dynamic_pins: - self.model.dynamic_pins[self.load_device] = {"hostbuf": comfy_aimdo.host_buffer.HostBuffer(0), "stack": [], "failed": False} + self.model.dynamic_pins[self.load_device] = { + "hostbuf": comfy_aimdo.host_buffer.HostBuffer(0), + "stack": [], + "failed": False, + "active": False, + } self.non_dynamic_delegate_model = None assert load_device is not None @@ -1608,6 +1613,7 @@ class ModelPatcherDynamic(ModelPatcher): vbar = self._vbar_get(create=True) pin_state = self.model.dynamic_pins[self.load_device] pin_state["failed"] = False + pin_state["active"] = True if vbar is not None: vbar.prioritize() @@ -1741,9 +1747,10 @@ class ModelPatcherDynamic(ModelPatcher): return self.model.dynamic_pins[self.load_device]["hostbuf"].size def partially_unload_ram(self, ram_to_unload): + freed = 0 pin_state = self.model.dynamic_pins[self.load_device] hostbuf = pin_state["hostbuf"] - stack = self.model.dynamic_pins[self.load_device]["stack"] + stack = pin_state["stack"] while len(stack) > 0: module, offset = stack.pop() size = module._pin.numel() * module._pin.element_size() @@ -1752,9 +1759,11 @@ class ModelPatcherDynamic(ModelPatcher): comfy.model_management.TOTAL_PINNED_MEMORY -= size if comfy.model_management.TOTAL_PINNED_MEMORY < 0: comfy.model_management.TOTAL_PINNED_MEMORY = 0 + freed += size ram_to_unload -= size if ram_to_unload <= 0: - return + return freed + return freed def patch_model(self, device_to=None, lowvram_model_memory=0, load_weights=True, force_patch_weights=False): #This isn't used by the core at all and can only be to load a model out of diff --git a/comfy/ops.py b/comfy/ops.py index 3d196f438..ee3184894 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -138,6 +138,8 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin if stream_pin_hostbuf is None: stream_pin_hostbuf = comfy.model_management.get_pin_buffer(offload_stream) + if stream_pin_hostbuf is None: + return None offset = stream_pin_offset stream_pin_offset += buffer_size @@ -213,8 +215,10 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin stream_pin_hostbuf_size = getattr(stream_pin_hostbuf, "_comfy_stream_pin_size", stream_pin_hostbuf.size) if stream_pin_hostbuf_size < stream_pin_offset: stream_pin_hostbuf_size = stream_pin_offset + STREAM_PIN_BUFFER_HEADROOM - stream_pin_hostbuf.extend(size=stream_pin_hostbuf_size, reallocate=True) - stream_pin_hostbuf._comfy_stream_pin_size = stream_pin_hostbuf_size + if not comfy.model_management.resize_pin_buffer(stream_pin_hostbuf, stream_pin_hostbuf_size): + for xfer_source, _, _, xfer_dest in stream_pin_queue: + comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=offload_stream) + return offload_stream stream_pin_tensor = comfy_aimdo.torch.hostbuf_to_tensor(stream_pin_hostbuf, size=stream_pin_offset) stream_pin_tensor.untyped_storage()._comfy_hostbuf = stream_pin_hostbuf for xfer_source, pin_offset, pin_size, xfer_dest in stream_pin_queue: diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py index 3638066c8..a35759aad 100644 --- a/comfy/pinned_memory.py +++ b/comfy/pinned_memory.py @@ -16,9 +16,7 @@ def pin_memory(module): hostbuf = 
pin_state["hostbuf"] size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ]) offset = hostbuf.size - if comfy.model_management.MAX_PINNED_MEMORY <= 0 or (comfy.model_management.TOTAL_PINNED_MEMORY + size) > comfy.model_management.MAX_PINNED_MEMORY: - pin_state["failed"] = True - return False + comfy.model_management.ensure_pin_budget(size) try: hostbuf.extend(size=size) From 2b927e17838b733dde6a660fe521bb0f13768528 Mon Sep 17 00:00:00 2001 From: Rattus Date: Thu, 7 May 2026 23:50:37 +1000 Subject: [PATCH 07/15] LowVRAMPatch: change to two-phase visit --- comfy/lora.py | 19 +++++++++++++------ comfy/model_patcher.py | 11 +++++++++-- comfy/ops.py | 2 +- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/comfy/lora.py b/comfy/lora.py index db8f16bcb..f7c7c21a5 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -475,16 +475,23 @@ def calculate_weight(patches, weight, key, intermediate_dtype=torch.float32, ori return weight -def prefetch_prepared_value(value, allocate_buffer, stream): +def prefetch_prepared_value(value, counter, destination, stream): if isinstance(value, torch.Tensor): - dest = allocate_buffer(comfy.memory_management.vram_aligned_size(value)) - comfy.model_management.cast_to_gathered([value], dest, non_blocking=True, stream=stream) + size = comfy.memory_management.vram_aligned_size(value) + offset = counter[0] + counter[0] += size + if destination is None: + return value + + dest = destination[offset:offset + size] + if stream is not None: + comfy.model_management.cast_to_gathered([value], dest, non_blocking=True, stream=stream) return comfy.memory_management.interpret_gathered_like([value], dest)[0] elif isinstance(value, weight_adapter.WeightAdapterBase): - return type(value)(value.loaded_keys, prefetch_prepared_value(value.weights, allocate_buffer, stream)) + return type(value)(value.loaded_keys, prefetch_prepared_value(value.weights, counter, destination, stream)) elif isinstance(value, tuple): - return tuple(prefetch_prepared_value(item, allocate_buffer, stream) for item in value) + return tuple(prefetch_prepared_value(item, counter, destination, stream) for item in value) elif isinstance(value, list): - return [prefetch_prepared_value(item, allocate_buffer, stream) for item in value] + return [prefetch_prepared_value(item, counter, destination, stream) for item in value] return value diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index def0901dc..dc58cd42e 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -125,9 +125,16 @@ class LowVramPatch: self.set_func = set_func self.prepared_patches = None - def prepare(self, allocate_buffer, stream): + def memory_required(self): + counter = [0] + for patch in self.patches[self.key]: + comfy.lora.prefetch_prepared_value(patch[1], counter, None, None) + return counter[0] + + def prepare(self, destination, stream): + counter = [0] self.prepared_patches = [ - (patch[0], comfy.lora.prefetch_prepared_value(patch[1], allocate_buffer, stream), patch[2], patch[3], patch[4]) + (patch[0], comfy.lora.prefetch_prepared_value(patch[1], counter, destination, stream), patch[2], patch[3], patch[4]) for patch in self.patches[self.key] ] diff --git a/comfy/ops.py b/comfy/ops.py index ee3184894..bd3de3677 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -203,7 +203,7 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin lowvram_fn = getattr(s, param_key + "_lowvram_function", None) if lowvram_fn is not None: ensure_offload_stream(s, cast_buffer_offset, 
False) - lowvram_fn.prepare(lambda size: get_cast_buffer(size), offload_stream) + lowvram_fn.prepare(get_cast_buffer(lowvram_fn.memory_required()), offload_stream) prefetch["xfer_dest"] = xfer_dest prefetch["cast_dest"] = cast_dest From 8e473d756f39c5cac5397a8c3b4442e75617068c Mon Sep 17 00:00:00 2001 From: Rattus Date: Fri, 8 May 2026 08:19:47 +1000 Subject: [PATCH 08/15] lora: re-implement as inplace swiss-army-knife operation --- comfy/lora.py | 10 +++++----- comfy/model_management.py | 7 +++---- comfy/model_patcher.py | 13 +++++++++---- comfy/ops.py | 25 ++++++++++++++++++------- 4 files changed, 35 insertions(+), 20 deletions(-) diff --git a/comfy/lora.py b/comfy/lora.py index f7c7c21a5..2b8699710 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -475,7 +475,7 @@ def calculate_weight(patches, weight, key, intermediate_dtype=torch.float32, ori return weight -def prefetch_prepared_value(value, counter, destination, stream): +def prefetch_prepared_value(value, counter, destination, stream, copy): if isinstance(value, torch.Tensor): size = comfy.memory_management.vram_aligned_size(value) offset = counter[0] @@ -484,14 +484,14 @@ def prefetch_prepared_value(value, counter, destination, stream): return value dest = destination[offset:offset + size] - if stream is not None: + if copy: comfy.model_management.cast_to_gathered([value], dest, non_blocking=True, stream=stream) return comfy.memory_management.interpret_gathered_like([value], dest)[0] elif isinstance(value, weight_adapter.WeightAdapterBase): - return type(value)(value.loaded_keys, prefetch_prepared_value(value.weights, counter, destination, stream)) + return type(value)(value.loaded_keys, prefetch_prepared_value(value.weights, counter, destination, stream, copy)) elif isinstance(value, tuple): - return tuple(prefetch_prepared_value(item, counter, destination, stream) for item in value) + return tuple(prefetch_prepared_value(item, counter, destination, stream, copy) for item in value) elif isinstance(value, list): - return [prefetch_prepared_value(item, counter, destination, stream) for item in value] + return [prefetch_prepared_value(item, counter, destination, stream, copy) for item in value] return value diff --git a/comfy/model_management.py b/comfy/model_management.py index 6a2126cb5..40f72fa1b 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1201,7 +1201,7 @@ def get_pin_buffer(offload_stream): def resize_pin_buffer(pin_buffer, size): global TOTAL_PINNED_MEMORY - old_size = getattr(pin_buffer, "_comfy_stream_pin_size", 0) + old_size = pin_buffer.size if size <= old_size: return True growth = size - old_size @@ -1210,8 +1210,7 @@ def resize_pin_buffer(pin_buffer, size): pin_buffer.extend(size=size, reallocate=True) except RuntimeError: return False - pin_buffer._comfy_stream_pin_size = size - TOTAL_PINNED_MEMORY += growth + TOTAL_PINNED_MEMORY += pin_buffer.size - old_size return True def reset_cast_buffers(): @@ -1231,7 +1230,7 @@ def reset_cast_buffers(): DIRTY_MMAPS.clear() for pin_buffer in STREAM_PIN_BUFFERS.values(): - TOTAL_PINNED_MEMORY -= getattr(pin_buffer, "_comfy_stream_pin_size", 0) + TOTAL_PINNED_MEMORY -= pin_buffer.size if TOTAL_PINNED_MEMORY < 0: TOTAL_PINNED_MEMORY = 0 diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index dc58cd42e..a88603df9 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -118,6 +118,8 @@ def string_to_seed(data): return comfy.utils.string_to_seed(data) class LowVramPatch: + is_lowvram_patch = True + def __init__(self, key, patches, 
convert_func=None, set_func=None): self.key = key self.patches = patches @@ -128,15 +130,18 @@ class LowVramPatch: def memory_required(self): counter = [0] for patch in self.patches[self.key]: - comfy.lora.prefetch_prepared_value(patch[1], counter, None, None) + comfy.lora.prefetch_prepared_value(patch[1], counter, None, None, False) return counter[0] - def prepare(self, destination, stream): + def prepare(self, destination, stream, copy=True, commit=True): counter = [0] - self.prepared_patches = [ - (patch[0], comfy.lora.prefetch_prepared_value(patch[1], counter, destination, stream), patch[2], patch[3], patch[4]) + prepared_patches = [ + (patch[0], comfy.lora.prefetch_prepared_value(patch[1], counter, destination, stream, copy), patch[2], patch[3], patch[4]) for patch in self.patches[self.key] ] + if commit: + self.prepared_patches = prepared_patches + return prepared_patches def clear_prepared(self): self.prepared_patches = None diff --git a/comfy/ops.py b/comfy/ops.py index bd3de3677..8603b50a6 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -203,7 +203,14 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin lowvram_fn = getattr(s, param_key + "_lowvram_function", None) if lowvram_fn is not None: ensure_offload_stream(s, cast_buffer_offset, False) - lowvram_fn.prepare(get_cast_buffer(lowvram_fn.memory_required()), offload_stream) + lowvram_size = lowvram_fn.memory_required() + lowvram_dest = get_cast_buffer(lowvram_size) + lowvram_fn.prepare(lowvram_dest, None, copy=False, commit=True) + pin_offset = get_stream_pin_buffer_offset(lowvram_size) + if pin_offset is not None: + stream_pin_queue.append((lowvram_fn, pin_offset, lowvram_size, lowvram_dest)) + else: + lowvram_fn.prepare(lowvram_dest, offload_stream, copy=True, commit=True) prefetch["xfer_dest"] = xfer_dest prefetch["cast_dest"] = cast_dest @@ -211,19 +218,23 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin prefetch["needs_cast"] = needs_cast s._prefetch = prefetch + def cast_maybe_lowvram_patch(xfer_source, xfer_dest, stream): + if getattr(xfer_source, "is_lowvram_patch", False): + xfer_source.prepare(xfer_dest, stream, copy=True, commit=False) + else: + comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=stream) + if stream_pin_offset > 0: - stream_pin_hostbuf_size = getattr(stream_pin_hostbuf, "_comfy_stream_pin_size", stream_pin_hostbuf.size) - if stream_pin_hostbuf_size < stream_pin_offset: - stream_pin_hostbuf_size = stream_pin_offset + STREAM_PIN_BUFFER_HEADROOM - if not comfy.model_management.resize_pin_buffer(stream_pin_hostbuf, stream_pin_hostbuf_size): + if stream_pin_hostbuf.size < stream_pin_offset: + if not comfy.model_management.resize_pin_buffer(stream_pin_hostbuf, stream_pin_offset + STREAM_PIN_BUFFER_HEADROOM): for xfer_source, _, _, xfer_dest in stream_pin_queue: - comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=offload_stream) + cast_maybe_lowvram_patch(xfer_source, xfer_dest, offload_stream) return offload_stream stream_pin_tensor = comfy_aimdo.torch.hostbuf_to_tensor(stream_pin_hostbuf, size=stream_pin_offset) stream_pin_tensor.untyped_storage()._comfy_hostbuf = stream_pin_hostbuf for xfer_source, pin_offset, pin_size, xfer_dest in stream_pin_queue: pin = stream_pin_tensor[pin_offset:pin_offset + pin_size] - comfy.model_management.cast_to_gathered(xfer_source, pin) + cast_maybe_lowvram_patch(xfer_source, pin, None) 
comfy.model_management.cast_to_gathered([ pin ], xfer_dest, non_blocking=non_blocking, stream=offload_stream) return offload_stream From e48dace1452df67a3661bcf6d5144e4a7aa8f867 Mon Sep 17 00:00:00 2001 From: Rattus Date: Fri, 8 May 2026 09:47:08 +1000 Subject: [PATCH 09/15] prepare for multiple pin sets --- comfy/model_management.py | 2 ++ comfy/model_patcher.py | 37 +++++++++++++++++++------------------ comfy/pinned_memory.py | 13 +++++++------ 3 files changed, 28 insertions(+), 24 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 40f72fa1b..ca4318a45 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1238,6 +1238,8 @@ def reset_cast_buffers(): model = loaded_model.model if model is not None and model.is_dynamic(): model.dynamic_pins[model.load_device]["active"] = False + model.partially_unload_ram(1e30, subsets=[ "patches" ]) + model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0), []) STREAM_CAST_BUFFERS.clear() STREAM_AIMDO_CAST_BUFFERS.clear() diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index a88603df9..530db214c 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -1557,8 +1557,8 @@ class ModelPatcherDynamic(ModelPatcher): self.model.dynamic_pins = {} if self.load_device not in self.model.dynamic_pins: self.model.dynamic_pins[self.load_device] = { - "hostbuf": comfy_aimdo.host_buffer.HostBuffer(0), - "stack": [], + "weights": (comfy_aimdo.host_buffer.HostBuffer(0), []), + "patches": (comfy_aimdo.host_buffer.HostBuffer(0), []), "failed": False, "active": False, } @@ -1756,25 +1756,26 @@ class ModelPatcherDynamic(ModelPatcher): return freed def pinned_memory_size(self): - return self.model.dynamic_pins[self.load_device]["hostbuf"].size + return (self.model.dynamic_pins[self.load_device]["weights"][0].size + + self.model.dynamic_pins[self.load_device]["patches"][0].size) - def partially_unload_ram(self, ram_to_unload): + def partially_unload_ram(self, ram_to_unload, subsets=[ "weights", "patches" ]): freed = 0 pin_state = self.model.dynamic_pins[self.load_device] - hostbuf = pin_state["hostbuf"] - stack = pin_state["stack"] - while len(stack) > 0: - module, offset = stack.pop() - size = module._pin.numel() * module._pin.element_size() - del module._pin - hostbuf.truncate(offset) - comfy.model_management.TOTAL_PINNED_MEMORY -= size - if comfy.model_management.TOTAL_PINNED_MEMORY < 0: - comfy.model_management.TOTAL_PINNED_MEMORY = 0 - freed += size - ram_to_unload -= size - if ram_to_unload <= 0: - return freed + for subset in subsets: + hostbuf, stack = pin_state[subset] + while len(stack) > 0: + module, offset = stack.pop() + size = module._pin.numel() * module._pin.element_size() + del module._pin + hostbuf.truncate(offset) + comfy.model_management.TOTAL_PINNED_MEMORY -= size + if comfy.model_management.TOTAL_PINNED_MEMORY < 0: + comfy.model_management.TOTAL_PINNED_MEMORY = 0 + freed += size + ram_to_unload -= size + if ram_to_unload <= 0: + return freed return freed def patch_model(self, device_to=None, lowvram_model_memory=0, load_weights=True, force_patch_weights=False): diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py index a35759aad..208c777f8 100644 --- a/comfy/pinned_memory.py +++ b/comfy/pinned_memory.py @@ -5,16 +5,17 @@ import comfy_aimdo.torch from comfy.cli_args import args -def get_pin(module): +def get_pin(module, subset="weights"): return getattr(module, "_pin", None) -def pin_memory(module): +def pin_memory(module, 
subset="weights", size=None): pin_state = module._pin_state - if pin_state["failed"] or args.disable_pinned_memory or get_pin(module) is not None: + if pin_state["failed"] or args.disable_pinned_memory or get_pin(module, subset) is not None: return - hostbuf = pin_state["hostbuf"] - size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ]) + hostbuf, stack = pin_state[subset] + if size is None: + size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ]) offset = hostbuf.size comfy.model_management.ensure_pin_budget(size) @@ -26,6 +27,6 @@ def pin_memory(module): module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size] module._pin.untyped_storage()._comfy_hostbuf = hostbuf - pin_state["stack"].append((module, offset)) + stack.append((module, offset)) comfy.model_management.TOTAL_PINNED_MEMORY += size return True From 3a3b75a7e3cc1175e3f9f0d90c5838fb83c9b518 Mon Sep 17 00:00:00 2001 From: Rattus Date: Fri, 8 May 2026 23:28:43 +1000 Subject: [PATCH 10/15] implement pinned loras --- comfy/model_management.py | 6 ++-- comfy/model_patcher.py | 8 +++-- comfy/ops.py | 63 +++++++++++++++++++++------------------ 3 files changed, 42 insertions(+), 35 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index ca4318a45..145a32080 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -524,7 +524,7 @@ def ensure_pin_budget(size, evict_active=False): shortfall += PIN_PRESSURE_HYSTERESIS for loaded_model in reversed(current_loaded_models): model = loaded_model.model - if model is not None and model.is_dynamic() and (evict_active or not model.dynamic_pins[model.load_device]["active"]): + if model is not None and model.is_dynamic() and (evict_active or not model.model.dynamic_pins[model.load_device]["active"]): shortfall -= model.partially_unload_ram(shortfall) if shortfall <= 0: break @@ -1237,9 +1237,9 @@ def reset_cast_buffers(): for loaded_model in current_loaded_models: model = loaded_model.model if model is not None and model.is_dynamic(): - model.dynamic_pins[model.load_device]["active"] = False + model.model.dynamic_pins[model.load_device]["active"] = False model.partially_unload_ram(1e30, subsets=[ "patches" ]) - model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0), []) + model.model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), []) STREAM_CAST_BUFFERS.clear() STREAM_AIMDO_CAST_BUFFERS.clear() diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 530db214c..f4845bb43 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -1557,8 +1557,8 @@ class ModelPatcherDynamic(ModelPatcher): self.model.dynamic_pins = {} if self.load_device not in self.model.dynamic_pins: self.model.dynamic_pins[self.load_device] = { - "weights": (comfy_aimdo.host_buffer.HostBuffer(0), []), - "patches": (comfy_aimdo.host_buffer.HostBuffer(0), []), + "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 64 * 1024 * 1024), []), + "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), []), "failed": False, "active": False, } @@ -1651,7 +1651,9 @@ class ModelPatcherDynamic(ModelPatcher): if key in self.patches: if comfy.lora.calculate_shape(self.patches[key], weight, key) != weight.shape: return (True, 0) - setattr(m, param_key + "_lowvram_function", LowVramPatch(key, self.patches)) + lowvram_patch = LowVramPatch(key, self.patches) + lowvram_patch._pin_state = pin_state + setattr(m, 
param_key + "_lowvram_function", lowvram_patch) num_patches += 1 else: setattr(m, param_key + "_lowvram_function", None) diff --git a/comfy/ops.py b/comfy/ops.py index 8603b50a6..629b54e4c 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -183,34 +183,45 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin if xfer_dest is None: xfer_dest = get_cast_buffer(dest_size) - if pin is None: - if signature is None: - comfy.pinned_memory.pin_memory(s) - pin = comfy.pinned_memory.get_pin(s) - if pin is not None: - comfy.model_management.cast_to_gathered(xfer_source, pin) - xfer_source = [ pin ] - if pin is None: - pin_offset = get_stream_pin_buffer_offset(dest_size) - if pin_offset is not None: - stream_pin_queue.append((xfer_source, pin_offset, dest_size, xfer_dest)) - xfer_source = None + def cast_maybe_lowvram_patch(xfer_source, xfer_dest, stream): + if xfer_source is not None: + if getattr(xfer_source, "is_lowvram_patch", False): + xfer_source.prepare(xfer_dest, stream, copy=True, commit=False) + else: + comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=stream) - if xfer_source is not None: - comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=offload_stream) + def handle_pin_miss(m, source, dest, subset="weights", size=None): + pin = None + if signature is None: + comfy.pinned_memory.pin_memory(m, subset=subset, size=size) + pin = comfy.pinned_memory.get_pin(m, subset=subset) + if pin is not None: + cast_maybe_lowvram_patch(source, pin, None) + return [ pin ] + if pin is None: + pin_offset = get_stream_pin_buffer_offset(size) + if pin_offset is not None: + stream_pin_queue.append((source, pin_offset, size, dest)) + return None + return source + + if pin is None: + xfer_source = handle_pin_miss(s, xfer_source, xfer_dest, size=dest_size) + + cast_maybe_lowvram_patch(xfer_source, xfer_dest, offload_stream) for param_key in ("weight", "bias"): - lowvram_fn = getattr(s, param_key + "_lowvram_function", None) - if lowvram_fn is not None: + lowvram_source = getattr(s, param_key + "_lowvram_function", None) + if lowvram_source is not None: ensure_offload_stream(s, cast_buffer_offset, False) - lowvram_size = lowvram_fn.memory_required() + lowvram_size = lowvram_source.memory_required() lowvram_dest = get_cast_buffer(lowvram_size) - lowvram_fn.prepare(lowvram_dest, None, copy=False, commit=True) - pin_offset = get_stream_pin_buffer_offset(lowvram_size) - if pin_offset is not None: - stream_pin_queue.append((lowvram_fn, pin_offset, lowvram_size, lowvram_dest)) - else: - lowvram_fn.prepare(lowvram_dest, offload_stream, copy=True, commit=True) + lowvram_source.prepare(lowvram_dest, None, copy=False, commit=True) + + pin = comfy.pinned_memory.get_pin(lowvram_source, subset="patches") + lowvram_source = handle_pin_miss(lowvram_source, lowvram_source, lowvram_dest, subset="patches", size=lowvram_size) if pin is None else [ pin ] + + cast_maybe_lowvram_patch(lowvram_source, lowvram_dest, offload_stream) prefetch["xfer_dest"] = xfer_dest prefetch["cast_dest"] = cast_dest @@ -218,12 +229,6 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin prefetch["needs_cast"] = needs_cast s._prefetch = prefetch - def cast_maybe_lowvram_patch(xfer_source, xfer_dest, stream): - if getattr(xfer_source, "is_lowvram_patch", False): - xfer_source.prepare(xfer_dest, stream, copy=True, commit=False) - else: - comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, 
non_blocking=non_blocking, stream=stream) - if stream_pin_offset > 0: if stream_pin_hostbuf.size < stream_pin_offset: if not comfy.model_management.resize_pin_buffer(stream_pin_hostbuf, stream_pin_offset + STREAM_PIN_BUFFER_HEADROOM): From c395f2d5b7ec83b22c597e0e0d936e3cc35f822e Mon Sep 17 00:00:00 2001 From: Rattus Date: Fri, 8 May 2026 23:30:47 +1000 Subject: [PATCH 11/15] requirements: comfy-aimdo 0.4.0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c5a6f4cec..eba0fc5ca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,7 +23,7 @@ SQLAlchemy>=2.0.0 filelock av>=14.2.0 comfy-kitchen>=0.2.8 -comfy-aimdo==0.3.0 +comfy-aimdo==0.4.0 requests simpleeval>=1.0.0 blake3 From 44c0a0602b575287b48f09cdb11e6969683e39da Mon Sep 17 00:00:00 2001 From: Rattus Date: Mon, 11 May 2026 18:51:39 +1000 Subject: [PATCH 12/15] ops: remove unused arg This was defeatured in aimdo iteration --- comfy/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/ops.py b/comfy/ops.py index 629b54e4c..d425ea7eb 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -235,7 +235,7 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin for xfer_source, _, _, xfer_dest in stream_pin_queue: cast_maybe_lowvram_patch(xfer_source, xfer_dest, offload_stream) return offload_stream - stream_pin_tensor = comfy_aimdo.torch.hostbuf_to_tensor(stream_pin_hostbuf, size=stream_pin_offset) + stream_pin_tensor = comfy_aimdo.torch.hostbuf_to_tensor(stream_pin_hostbuf) stream_pin_tensor.untyped_storage()._comfy_hostbuf = stream_pin_hostbuf for xfer_source, pin_offset, pin_size, xfer_dest in stream_pin_queue: pin = stream_pin_tensor[pin_offset:pin_offset + pin_size] From ee927aafa8770a4bea8cfc2fcbedba5f86656097 Mon Sep 17 00:00:00 2001 From: Rattus Date: Sat, 9 May 2026 18:41:57 +1000 Subject: [PATCH 13/15] ops: sync the CPU with only the offload stream activity This was syncing with the offload stream which itself is synced with the compute stream, so this was syncing CPU with compute transitively. Define the event to sync it more gently. 
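In plain PyTorch terms the change looks roughly like the sketch below (class and method names are illustrative, not the ComfyUI API): instead of synchronizing the whole offload stream, which is also ordered after compute work, an event is recorded right after the pinned-buffer copies and the CPU blocks on that event alone before reusing the buffer.

import torch

class PinStaging:
    """Pinned staging buffer that the CPU syncs against via its own event only."""
    def __init__(self, nbytes):
        self.buffer = torch.empty(nbytes, dtype=torch.uint8, pin_memory=True)
        self.ready = None   # torch.cuda.Event recorded after the last copy out of the buffer

    def acquire(self):
        # Wait only for the copies recorded before `ready`, not for everything
        # else queued on the offload stream.
        if self.ready is not None:
            self.ready.synchronize()
            self.ready = None
        return self.buffer

    def release(self, offload_stream):
        self.ready = torch.cuda.Event()
        self.ready.record(offload_stream)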
--- comfy/model_management.py | 5 ++++- comfy/ops.py | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 145a32080..c1d0901fc 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1196,7 +1196,10 @@ def get_pin_buffer(offload_stream): pin_buffer = comfy_aimdo.host_buffer.HostBuffer(0) STREAM_PIN_BUFFERS[offload_stream] = pin_buffer elif offload_stream is not None: - offload_stream.synchronize() + event = getattr(pin_buffer, "_comfy_event", None) + if event is not None: + event.synchronize() + delattr(pin_buffer, "_comfy_event") return pin_buffer def resize_pin_buffer(pin_buffer, size): diff --git a/comfy/ops.py b/comfy/ops.py index d425ea7eb..be744a030 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -241,6 +241,7 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin pin = stream_pin_tensor[pin_offset:pin_offset + pin_size] cast_maybe_lowvram_patch(xfer_source, pin, None) comfy.model_management.cast_to_gathered([ pin ], xfer_dest, non_blocking=non_blocking, stream=offload_stream) + stream_pin_hostbuf._comfy_event = offload_stream.record_event() return offload_stream From d61026d020946b986a3a4a1969d9198c90b7e8ec Mon Sep 17 00:00:00 2001 From: Rattus Date: Wed, 13 May 2026 09:17:23 +1000 Subject: [PATCH 14/15] pins: implement freeing intermediate for pinned memory Pinning is more important than inactive intermediates and the stream pin buffer is more important than even active intermediates. --- comfy/memory_management.py | 4 ++-- comfy/model_management.py | 2 ++ comfy/pinned_memory.py | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/comfy/memory_management.py b/comfy/memory_management.py index 4a628b05c..7645064f5 100644 --- a/comfy/memory_management.py +++ b/comfy/memory_management.py @@ -157,7 +157,7 @@ def set_ram_cache_release_state(callback, headroom): extra_ram_release_callback = callback RAM_CACHE_HEADROOM = max(0, int(headroom)) -def extra_ram_release(target): +def extra_ram_release(target, free_active=False): if extra_ram_release_callback is None: return 0 - return extra_ram_release_callback(target) + return extra_ram_release_callback(target, free_active=free_active) diff --git a/comfy/model_management.py b/comfy/model_management.py index c1d0901fc..697359d3a 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -1208,6 +1208,7 @@ def resize_pin_buffer(pin_buffer, size): if size <= old_size: return True growth = size - old_size + comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM, free_active=True) ensure_pin_budget(growth, evict_active=True) try: pin_buffer.extend(size=size, reallocate=True) @@ -1389,6 +1390,7 @@ def pin_memory(tensor): return False size = tensor.nbytes + comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM) ensure_pin_budget(size) ptr = tensor.data_ptr() diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py index 208c777f8..35cbbcd9e 100644 --- a/comfy/pinned_memory.py +++ b/comfy/pinned_memory.py @@ -17,6 +17,7 @@ def pin_memory(module, subset="weights", size=None): if size is None: size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ]) offset = hostbuf.size + comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM) comfy.model_management.ensure_pin_budget(size) try: From 3f717816e1f194fa2a9a105fb425b7bbfbb781f7 Mon Sep 17 00:00:00 2001 From: Rattus Date: Wed, 13 May 2026 
21:57:35 +1000 Subject: [PATCH 15/15] execution: implement pin eviction on RAM pressure Add back proper pin freeing on RAM pressure. --- comfy/model_management.py | 19 ++++++++++++------- execution.py | 5 ++++- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 697359d3a..f358621c9 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -513,6 +513,17 @@ def mark_mmap_dirty(storage): if mmap_refs is not None: DIRTY_MMAPS.add(mmap_refs[0]) +def free_pins(size, evict_active=False): + if size <= 0: + return + + for loaded_model in reversed(current_loaded_models): + model = loaded_model.model + if model is not None and model.is_dynamic() and (evict_active or not model.model.dynamic_pins[model.load_device]["active"]): + size -= model.partially_unload_ram(size) + if size <= 0: + break + def ensure_pin_budget(size, evict_active=False): if MAX_PINNED_MEMORY <= 0: return @@ -521,13 +532,7 @@ def ensure_pin_budget(size, evict_active=False): if shortfall <= 0: return - shortfall += PIN_PRESSURE_HYSTERESIS - for loaded_model in reversed(current_loaded_models): - model = loaded_model.model - if model is not None and model.is_dynamic() and (evict_active or not model.model.dynamic_pins[model.load_device]["active"]): - shortfall -= model.partially_unload_ram(shortfall) - if shortfall <= 0: - break + free_pins(shortfall + PIN_PRESSURE_HYSTERESIS, evict_active=evict_active) class LoadedModel: def __init__(self, model): diff --git a/execution.py b/execution.py index f37d0360d..5605f09e7 100644 --- a/execution.py +++ b/execution.py @@ -2,6 +2,7 @@ import copy import heapq import inspect import logging +import psutil import sys import threading import time @@ -780,7 +781,9 @@ class PromptExecutor: execution_list.complete_node_execution() if self.cache_type == CacheType.RAM_PRESSURE: - comfy.model_management.free_memory(0, None, pins_required=ram_headroom, ram_required=ram_headroom) + ram_release_callback(ram_headroom) + ram_shortfall = ram_headroom - psutil.virtual_memory().available + comfy.model_management.free_pins(ram_shortfall) ram_release_callback(ram_headroom, free_active=True) else: # Only execute when the while-loop ends without break
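The ordering the executor hunk above ends up with, condensed into a sketch (the callback and helper names here are illustrative, not the real ComfyUI API): drop inactive cached RAM first, measure what is still missing against what the OS reports as available, evict pinned memory for just that shortfall, and only then release active caches.

import psutil

def relieve_ram_pressure(headroom, release_caches, free_pins):
    release_caches(headroom)                                  # inactive caches first
    shortfall = headroom - psutil.virtual_memory().available
    if shortfall > 0:
        free_pins(shortfall)                                  # then pinned memory, just the shortfall
    release_caches(headroom, free_active=True)                # finally, active caches too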