implement pin registration swaps

Uncap the Windows pins from the 50% limit by extending the pool, and add
a pressure mechanism that moves the pin reservations on demand.

This unfortunately implies a GPU sync to do the freeing, so significant
hysteresis needs to be added to consolidate these pressure events.
Rattus 2026-05-15 00:28:13 +10:00
parent 3f717816e1
commit 31150538b0
3 changed files with 94 additions and 21 deletions
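In outline, the change splits the old single pinned-memory accounting into two budgets: how much host RAM the staging pool may allocate, and how much of it may be page-locked (registered) at a time. A sketch of the split, using names from the diff below (the comments are interpretation, not code from the commit):

# The two budgets this commit separates (sketch):
TOTAL_MODEL_MEMORY: int   # bytes allocated in host staging buffers
TOTAL_PINNED_MEMORY: int  # the subset currently cudaHostRegister'ed
# ensure_pin_budget()       enforces MAX_MODEL_MEMORY (allocation cap)
# ensure_pin_registerable() enforces MAX_PINNED_MEMORY (registration cap)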


@@ -498,7 +498,11 @@ current_loaded_models = []
 DIRTY_MMAPS = set()
-PIN_PRESSURE_HYSTERESIS = 128 * 1024 * 1024
+PIN_PRESSURE_HYSTERESIS = 256 * 1024 * 1024
+#Freeing registerables on pressure does imply a GPU sync, so go big on
+#the hysteresis so each expensive sync gives us back a good chunk.
+REGISTERABLE_PIN_HYSTERESIS = 768 * 1024 * 1024
 def module_size(module):
     module_mem = 0
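To put the new constant in scale, a worked example with a hypothetical pressure event:

# Hypothetical pressure event against the registration cap:
REGISTERABLE_PIN_HYSTERESIS = 768 * 1024 * 1024
shortfall = 10 * 1024 * 1024                       # 10 MiB over the cap
to_free = shortfall + REGISTERABLE_PIN_HYSTERESIS  # ~778 MiB freed
# One expensive GPU sync now buys ~768 MiB of registration headroom
# before the next pressure event, instead of syncing per pin request.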
@@ -525,15 +529,28 @@ def free_pins(size, evict_active=False):
             break
 def ensure_pin_budget(size, evict_active=False):
-    if MAX_PINNED_MEMORY <= 0:
+    if MAX_MODEL_MEMORY <= 0:
         return
-    shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY
+    shortfall = TOTAL_MODEL_MEMORY + size - MAX_MODEL_MEMORY
     if shortfall <= 0:
         return
     free_pins(shortfall + PIN_PRESSURE_HYSTERESIS, evict_active=evict_active)
+def ensure_pin_registerable(size, evict_active=False):
+    shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY
+    if MAX_PINNED_MEMORY <= 0 or shortfall <= 0:
+        return
+    shortfall += REGISTERABLE_PIN_HYSTERESIS
+    for loaded_model in reversed(current_loaded_models):
+        model = loaded_model.model
+        if model is not None and model.is_dynamic() and (evict_active or not model.model.dynamic_pins[model.load_device]["active"]):
+            shortfall -= model.unregister_inactive_pins(shortfall)
+            if shortfall <= 0:
+                return
 class LoadedModel:
     def __init__(self, model):
         self._set_model(model)
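The new ensure_pin_registerable() walks loaded models from the back of current_loaded_models, which (assuming the list keeps most-recently-used models toward the front, as elsewhere in model management) unregisters the coldest models' pins first. A simplified sketch of the pattern, with a hypothetical stand-in for the real ModelPatcher API:

# Simplified sketch of the pressure walk; `relieve` and the model
# objects are hypothetical stand-ins, not the real API.
def relieve(models, shortfall, hysteresis):
    target = shortfall + hysteresis   # free extra to consolidate syncs
    for m in reversed(models):        # coldest models first (assumed)
        target -= m.unregister_inactive_pins(target)
        if target <= 0:
            break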
@@ -1208,22 +1225,24 @@ def get_pin_buffer(offload_stream):
     return pin_buffer
 def resize_pin_buffer(pin_buffer, size):
-    global TOTAL_PINNED_MEMORY
+    global TOTAL_MODEL_MEMORY, TOTAL_PINNED_MEMORY
     old_size = pin_buffer.size
     if size <= old_size:
         return True
     growth = size - old_size
     comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM, free_active=True)
     ensure_pin_budget(growth, evict_active=True)
+    ensure_pin_registerable(growth, evict_active=True)
     try:
         pin_buffer.extend(size=size, reallocate=True)
     except RuntimeError:
         return False
+    TOTAL_MODEL_MEMORY += pin_buffer.size - old_size
     TOTAL_PINNED_MEMORY += pin_buffer.size - old_size
     return True
 def reset_cast_buffers():
-    global TOTAL_PINNED_MEMORY
+    global TOTAL_MODEL_MEMORY, TOTAL_PINNED_MEMORY
     global LARGEST_CASTED_WEIGHT
     global LARGEST_AIMDO_CASTED_WEIGHT
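resize_pin_buffer() now checks both caps before reallocating and grows both counters on success; the boolean result lets a caller degrade gracefully. A hypothetical caller-side fallback (not from this commit):

# Hypothetical fallback if growth fails even after pressure release:
# stage the copy through ordinary pageable memory instead.
if not resize_pin_buffer(pin_buffer, needed_size):
    staging = torch.empty(needed_size, dtype=torch.uint8)  # unpinned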
@@ -1239,16 +1258,17 @@ def reset_cast_buffers():
     DIRTY_MMAPS.clear()
     for pin_buffer in STREAM_PIN_BUFFERS.values():
+        TOTAL_MODEL_MEMORY -= pin_buffer.size
         TOTAL_PINNED_MEMORY -= pin_buffer.size
-    if TOTAL_PINNED_MEMORY < 0:
-        TOTAL_PINNED_MEMORY = 0
+    TOTAL_MODEL_MEMORY = max(0, TOTAL_MODEL_MEMORY)
+    TOTAL_PINNED_MEMORY = max(0, TOTAL_PINNED_MEMORY)
     for loaded_model in current_loaded_models:
         model = loaded_model.model
         if model is not None and model.is_dynamic():
             model.model.dynamic_pins[model.load_device]["active"] = False
             model.partially_unload_ram(1e30, subsets=[ "patches" ])
-            model.model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), [])
+            model.model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), [], [-1])
     STREAM_CAST_BUFFERS.clear()
     STREAM_AIMDO_CAST_BUFFERS.clear()
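Note the rebuilt "patches" pin state now carries the three-element shape used throughout this commit; reading the diff, [-1] appears to be the empty-split sentinel meaning "no stack entry is registered" (stack indices start at 0):

# Rebuilt empty pin state (HostBuffer is the repo's own comfy_aimdo type):
patches = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), [], [-1])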
@@ -1352,14 +1372,18 @@ def cast_to_device(tensor, device, dtype, copy=False):
 PINNED_MEMORY = {}
+TOTAL_MODEL_MEMORY = 0
 TOTAL_PINNED_MEMORY = 0
+MAX_MODEL_MEMORY = -1
 MAX_PINNED_MEMORY = -1
 if not args.disable_pinned_memory:
     if is_nvidia() or is_amd():
+        ram = get_total_memory(torch.device("cpu"))
+        MAX_MODEL_MEMORY = min(ram - 4 * 1024 * 1024 * 1024, ram * 0.90)
         if WINDOWS:
-            MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.40 # Windows limit is apparently 50%
+            MAX_PINNED_MEMORY = ram * 0.40 # Windows limit is apparently 50%
         else:
-            MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.90
+            MAX_PINNED_MEMORY = ram * 0.90
         logging.info("Enabled pinned memory {}".format(MAX_PINNED_MEMORY // (1024 * 1024)))
 PINNING_ALLOWED_TYPES = set(["Tensor", "Parameter", "QuantizedTensor"])
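A worked example of the two caps on a hypothetical 64 GiB Windows machine:

# Hypothetical 64 GiB Windows box:
ram = 64 * 1024**3
max_model  = min(ram - 4 * 1024**3, ram * 0.90)  # 57.6 GiB allocatable
max_pinned = ram * 0.40                          # 25.6 GiB registered
# Before this change the 40% pin cap bounded the whole staging pool;
# now only the registered subset is held under it.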
@@ -1396,7 +1420,7 @@ def pin_memory(tensor):
     size = tensor.nbytes
     comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM)
-    ensure_pin_budget(size)
+    ensure_pin_registerable(size)
     ptr = tensor.data_ptr()
     if ptr == 0:
@@ -1433,7 +1457,8 @@ def unpin_memory(tensor):
         return False
     if torch.cuda.cudart().cudaHostUnregister(ptr) == 0:
-        TOTAL_PINNED_MEMORY -= PINNED_MEMORY.pop(ptr)
+        size = PINNED_MEMORY.pop(ptr)
+        TOTAL_PINNED_MEMORY -= size
         return True
     else:
         logging.warning("Unpin error.")


@@ -1557,8 +1557,8 @@ class ModelPatcherDynamic(ModelPatcher):
             self.model.dynamic_pins = {}
         if self.load_device not in self.model.dynamic_pins:
             self.model.dynamic_pins[self.load_device] = {
-                "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 64 * 1024 * 1024), []),
-                "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), []),
+                "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 64 * 1024 * 1024), [], [-1]),
+                "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), [], [-1]),
                 "failed": False,
                 "active": False,
             }
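Each subset's pin state is now a (HostBuffer, stack, stack_split) triple. stack_split is a single-element list rather than a bare int so the split index can be mutated in place through the otherwise-immutable tuple, visible to every holder of a reference:

# Why the split index is boxed in a one-element list: tuples are
# immutable, but a list element can be updated in place by any holder
# of the tuple. `buf` stands in for the comfy_aimdo HostBuffer here.
buf = object()
state = (buf, [], [-1])        # (host buffer, pin stack, split box)
_, stack, stack_split = state
stack_split[0] = 5             # update is visible through state[2] too
assert state[2][0] == 5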
@@ -1761,19 +1761,44 @@ class ModelPatcherDynamic(ModelPatcher):
         return (self.model.dynamic_pins[self.load_device]["weights"][0].size +
                 self.model.dynamic_pins[self.load_device]["patches"][0].size)
+    def unregister_inactive_pins(self, ram_to_unload, subsets=[ "weights", "patches" ]):
+        freed = 0
+        pin_state = self.model.dynamic_pins[self.load_device]
+        for subset in subsets:
+            hostbuf, stack, stack_split = pin_state[subset]
+            split = stack_split[0]
+            while split >= 0:
+                module, offset = stack[split]
+                split -= 1
+                stack_split[0] = split
+                if not module._pin_registered:
+                    continue
+                size = module._pin.numel() * module._pin.element_size()
+                if torch.cuda.cudart().cudaHostUnregister(module._pin.data_ptr()) != 0:
+                    comfy.model_management.discard_cuda_async_error()
+                    continue
+                module._pin_registered = False
+                comfy.model_management.TOTAL_PINNED_MEMORY = max(0, comfy.model_management.TOTAL_PINNED_MEMORY - size)
+                freed += size
+                ram_to_unload -= size
+                if ram_to_unload <= 0:
+                    return freed
+        return freed
     def partially_unload_ram(self, ram_to_unload, subsets=[ "weights", "patches" ]):
         freed = 0
         pin_state = self.model.dynamic_pins[self.load_device]
         for subset in subsets:
-            hostbuf, stack = pin_state[subset]
+            hostbuf, stack, stack_split = pin_state[subset]
             while len(stack) > 0:
                 module, offset = stack.pop()
                 size = module._pin.numel() * module._pin.element_size()
                 del module._pin
-                hostbuf.truncate(offset)
-                comfy.model_management.TOTAL_PINNED_MEMORY -= size
-                if comfy.model_management.TOTAL_PINNED_MEMORY < 0:
-                    comfy.model_management.TOTAL_PINNED_MEMORY = 0
+                hostbuf.truncate(offset, do_unregister=module._pin_registered)
+                stack_split[0] = min(stack_split[0], len(stack) - 1)
+                comfy.model_management.TOTAL_MODEL_MEMORY = max(0, comfy.model_management.TOTAL_MODEL_MEMORY - size)
+                if module._pin_registered:
+                    comfy.model_management.TOTAL_PINNED_MEMORY = max(0, comfy.model_management.TOTAL_PINNED_MEMORY - size)
                 freed += size
                 ram_to_unload -= size
                 if ram_to_unload <= 0:
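Taken together, stack_split[0] acts as a high-water mark: the highest stack index that may still hold a registered pin. unregister_inactive_pins() walks down from it and skips entries that are already unregistered; pin_memory() pushes it back up when a pin is re-registered; partially_unload_ram() clamps it when the stack shrinks. Roughly:

# Rough picture of the invariant (indices into `stack`):
#   stack:       [p0, p1, p2, p3]
#   stack_split: [2]   -> entries at index <= 2 *may* be registered;
#                         entries above 2 are definitely unregistered.
# unregister walk: split goes 2 -> 1 -> 0 -> -1, skipping dead entries
# re-register p3:  stack_split[0] = max(split, 3) -> 3
# stack.pop():     stack_split[0] = min(split, len(stack) - 1)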


@@ -2,6 +2,7 @@ import comfy.model_management
 import comfy.memory_management
 import comfy_aimdo.host_buffer
 import comfy_aimdo.torch
+import torch
 from comfy.cli_args import args
@@ -10,15 +11,33 @@ def get_pin(module, subset="weights"):
 def pin_memory(module, subset="weights", size=None):
     pin_state = module._pin_state
-    if pin_state["failed"] or args.disable_pinned_memory or get_pin(module, subset) is not None:
+    if pin_state["failed"] or args.disable_pinned_memory:
         return
-    hostbuf, stack = pin_state[subset]
+    hostbuf, stack, stack_split = pin_state[subset]
+    pin = get_pin(module, subset)
+    if pin is not None:
+        if module._pin_registered:
+            return
+        size = module._pin.nbytes
+        comfy.model_management.ensure_pin_registerable(size)
+        if torch.cuda.cudart().cudaHostRegister(module._pin.data_ptr(), size, 1) != 0:
+            comfy.model_management.discard_cuda_async_error()
+            return False
+        module._pin_registered = True
+        stack_split[0] = max(stack_split[0], module._pin_stack_index)
+        comfy.model_management.TOTAL_PINNED_MEMORY += size
+        return True
     if size is None:
         size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ])
     offset = hostbuf.size
     comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM)
     comfy.model_management.ensure_pin_budget(size)
+    comfy.model_management.ensure_pin_registerable(size)
     try:
         hostbuf.extend(size=size)
@@ -29,5 +48,9 @@ def pin_memory(module, subset="weights", size=None):
     module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size]
     module._pin.untyped_storage()._comfy_hostbuf = hostbuf
     stack.append((module, offset))
+    module._pin_registered = True
+    module._pin_stack_index = len(stack) - 1
+    stack_split[0] = max(stack_split[0], module._pin_stack_index)
+    comfy.model_management.TOTAL_MODEL_MEMORY += size
+    comfy.model_management.TOTAL_PINNED_MEMORY += size
     return True
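For reference, the register/unregister round trip this helper performs, reduced to a standalone snippet (flag 1 is cudaHostRegisterPortable; per the commit message, the unregister side is what forces the GPU sync). This assumes a CUDA build of PyTorch with an initialized device:

import torch

t = torch.empty(1024, dtype=torch.uint8)      # ordinary pageable tensor
cudart = torch.cuda.cudart()
if cudart.cudaHostRegister(t.data_ptr(), t.nbytes, 1) == 0:  # 1 = portable
    # t's pages are now locked; H2D copies from it can run asynchronously
    cudart.cudaHostUnregister(t.data_ptr())   # the expensive, syncing side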