# Patch summary (reviewer notes). Everything in this preamble sits before the
# first "diff --git" header and is not part of any hunk.
# NOTE: in this copy the patch's line breaks are collapsed into spaces; they
# would need to be restored before a patch tool could apply it.
#
# Purpose: split host-memory accounting into two budgets.
#   - TOTAL_MODEL_MEMORY / MAX_MODEL_MEMORY are what ensure_pin_budget() now
#     checks before calling free_pins().
#   - TOTAL_PINNED_MEMORY / MAX_PINNED_MEMORY are checked by the new
#     ensure_pin_registerable(), which lowers the pinned total by calling
#     cudaHostUnregister on existing pins while leaving module._pin and the
#     host buffer in place.
#
# comfy/model_management.py
#   - PIN_PRESSURE_HYSTERESIS: 128 MiB -> 256 MiB. New
#     REGISTERABLE_PIN_HYSTERESIS = 768 MiB, added to the shortfall inside
#     ensure_pin_registerable().
#   - ensure_pin_budget(): same flow as before, but compares
#     TOTAL_MODEL_MEMORY + size against MAX_MODEL_MEMORY.
#   - ensure_pin_registerable(size, evict_active=False): returns early when
#     MAX_PINNED_MEMORY <= 0 or there is no shortfall. Otherwise it walks
#     current_loaded_models in reverse and calls unregister_inactive_pins()
#     on each dynamic model whose dynamic_pins[load_device]["active"] is
#     false (or on every dynamic model when evict_active is true), stopping
#     once the shortfall is <= 0 or the list is exhausted.
#   - resize_pin_buffer(): calls both ensure_* helpers with evict_active=True
#     before pin_buffer.extend(), then adds the growth to both totals.
#   - reset_cast_buffers(): subtracts each stream pin buffer's size from both
#     totals, clamps both at 0 with max(), and recreates the "patches" entry
#     as a 3-tuple.
#   - Module setup: TOTAL_MODEL_MEMORY = 0 and MAX_MODEL_MEMORY = -1 by
#     default; when pinned memory is enabled on NVIDIA/AMD,
#     MAX_MODEL_MEMORY = min(ram - 4 GiB, ram * 0.90). MAX_PINNED_MEMORY stays
#     at ram * 0.40 on Windows and ram * 0.90 otherwise.
#   - pin_memory(tensor): calls ensure_pin_registerable(size) where it used
#     to call ensure_pin_budget(size).
#   - unpin_memory(): the popped size goes through a local variable; the
#     arithmetic is unchanged.
#
# comfy/model_patcher.py (ModelPatcherDynamic)
#   - dynamic_pins["weights"] / ["patches"] become
#     (HostBuffer, stack, [split]) with split starting at -1. split is a
#     one-element list holding the stack index at which
#     unregister_inactive_pins() starts its downward walk; pin_memory() raises
#     it and partially_unload_ram() caps it at len(stack) - 1.
#   - unregister_inactive_pins(ram_to_unload, subsets): for each subset, walks
#     the stack from index split down to 0, storing the lowered split on
#     every step. Modules whose _pin_registered is false are skipped. For the
#     rest it calls cudaHostUnregister(module._pin.data_ptr()); a non-zero
#     result calls discard_cuda_async_error() and moves on, otherwise the
#     module is marked unregistered and its byte size is subtracted from
#     TOTAL_PINNED_MEMORY (clamped at 0). Returns the number of bytes
#     unregistered, returning early once ram_to_unload <= 0.
#   - partially_unload_ram(): passes do_unregister=module._pin_registered to
#     hostbuf.truncate(), always subtracts size from TOTAL_MODEL_MEMORY, and
#     subtracts from TOTAL_PINNED_MEMORY only when the module was still
#     registered.
#
# comfy/pinned_memory.py
#   - Adds "import torch".
#   - pin_memory(module, ...): an existing pin no longer short-circuits
#     unconditionally. If the pin is still registered the function returns
#     with a bare "return" (None). Otherwise it calls
#     ensure_pin_registerable(), re-registers module._pin with
#     cudaHostRegister(ptr, nbytes, 1) and returns True, or returns False
#     after discard_cuda_async_error() when registration fails.
#   - New allocations call extra_ram_release(), ensure_pin_budget() and
#     ensure_pin_registerable() before hostbuf.extend(), then record
#     _pin_registered / _pin_stack_index on the module, raise split, and add
#     size to both totals.
#
# NOTE(review): when ram <= 4 GiB, MAX_MODEL_MEMORY is <= 0 and
#   ensure_pin_budget() returns immediately, so no model-memory budget is
#   enforced on such hosts - confirm this is intended.
# NOTE(review): unregister_inactive_pins() lowers split before attempting
#   cudaHostUnregister; a module whose unregister fails keeps
#   _pin_registered == True but is skipped by later walks unless split is
#   raised past it again - confirm this is acceptable.
# NOTE(review): the re-register branch checks get_pin(module, subset) but
#   then uses module._pin directly; get_pin() is not visible here - verify
#   both refer to the same tensor for every subset.
# NOTE(review): pinned_memory.pin_memory() now yields None, True or False
#   depending on the path - confirm callers treat None and False alike.
# NOTE(review): hostbuf and offset are unpacked but unused in
#   unregister_inactive_pins().
# NOTE(review): the flag value 1 passed to cudaHostRegister is presumably
#   cudaHostRegisterPortable - confirm against the CUDA runtime headers.
diff --git a/comfy/model_management.py b/comfy/model_management.py index f358621c9..19a916362 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -498,7 +498,11 @@ current_loaded_models = [] DIRTY_MMAPS = set() -PIN_PRESSURE_HYSTERESIS = 128 * 1024 * 1024 +PIN_PRESSURE_HYSTERESIS = 256 * 1024 * 1024 + +#Freeing registerables on pressure does imply a GPU sync, so go big on +#the hysteresis so each expensive sync gives us back a good chunk. +REGISTERABLE_PIN_HYSTERESIS = 768 * 1024 * 1024 def module_size(module): module_mem = 0 @@ -525,15 +529,28 @@ def free_pins(size, evict_active=False): break def ensure_pin_budget(size, evict_active=False): - if MAX_PINNED_MEMORY <= 0: + if MAX_MODEL_MEMORY <= 0: return - shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY + shortfall = TOTAL_MODEL_MEMORY + size - MAX_MODEL_MEMORY if shortfall <= 0: return free_pins(shortfall + PIN_PRESSURE_HYSTERESIS, evict_active=evict_active) +def ensure_pin_registerable(size, evict_active=False): + shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY + if MAX_PINNED_MEMORY <= 0 or shortfall <= 0: + return + + shortfall += REGISTERABLE_PIN_HYSTERESIS + for loaded_model in reversed(current_loaded_models): + model = loaded_model.model + if model is not None and model.is_dynamic() and (evict_active or not model.model.dynamic_pins[model.load_device]["active"]): + shortfall -= model.unregister_inactive_pins(shortfall) + if shortfall <= 0: + return + class LoadedModel: def __init__(self, model): self._set_model(model) @@ -1208,22 +1225,24 @@ def get_pin_buffer(offload_stream): return pin_buffer def resize_pin_buffer(pin_buffer, size): - global TOTAL_PINNED_MEMORY + global TOTAL_MODEL_MEMORY, TOTAL_PINNED_MEMORY old_size = pin_buffer.size if size <= old_size: return True growth = size - old_size comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM, free_active=True) ensure_pin_budget(growth, evict_active=True) + 
ensure_pin_registerable(growth, evict_active=True) try: pin_buffer.extend(size=size, reallocate=True) except RuntimeError: return False + TOTAL_MODEL_MEMORY += pin_buffer.size - old_size TOTAL_PINNED_MEMORY += pin_buffer.size - old_size return True def reset_cast_buffers(): - global TOTAL_PINNED_MEMORY + global TOTAL_MODEL_MEMORY, TOTAL_PINNED_MEMORY global LARGEST_CASTED_WEIGHT global LARGEST_AIMDO_CASTED_WEIGHT @@ -1239,16 +1258,17 @@ def reset_cast_buffers(): DIRTY_MMAPS.clear() for pin_buffer in STREAM_PIN_BUFFERS.values(): + TOTAL_MODEL_MEMORY -= pin_buffer.size TOTAL_PINNED_MEMORY -= pin_buffer.size - if TOTAL_PINNED_MEMORY < 0: - TOTAL_PINNED_MEMORY = 0 + TOTAL_MODEL_MEMORY = max(0, TOTAL_MODEL_MEMORY) + TOTAL_PINNED_MEMORY = max(0, TOTAL_PINNED_MEMORY) for loaded_model in current_loaded_models: model = loaded_model.model if model is not None and model.is_dynamic(): model.model.dynamic_pins[model.load_device]["active"] = False model.partially_unload_ram(1e30, subsets=[ "patches" ]) - model.model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), []) + model.model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), [], [-1]) STREAM_CAST_BUFFERS.clear() STREAM_AIMDO_CAST_BUFFERS.clear() @@ -1352,14 +1372,18 @@ def cast_to_device(tensor, device, dtype, copy=False): PINNED_MEMORY = {} +TOTAL_MODEL_MEMORY = 0 TOTAL_PINNED_MEMORY = 0 +MAX_MODEL_MEMORY = -1 MAX_PINNED_MEMORY = -1 if not args.disable_pinned_memory: if is_nvidia() or is_amd(): + ram = get_total_memory(torch.device("cpu")) + MAX_MODEL_MEMORY = min(ram - 4 * 1024 * 1024 * 1024, ram * 0.90) if WINDOWS: - MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.40 # Windows limit is apparently 50% + MAX_PINNED_MEMORY = ram * 0.40 # Windows limit is apparently 50% else: - MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.90 + MAX_PINNED_MEMORY = ram * 0.90 logging.info("Enabled pinned 
memory {}".format(MAX_PINNED_MEMORY // (1024 * 1024))) PINNING_ALLOWED_TYPES = set(["Tensor", "Parameter", "QuantizedTensor"]) @@ -1396,7 +1420,7 @@ def pin_memory(tensor): size = tensor.nbytes comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM) - ensure_pin_budget(size) + ensure_pin_registerable(size) ptr = tensor.data_ptr() if ptr == 0: @@ -1433,7 +1457,8 @@ def unpin_memory(tensor): return False if torch.cuda.cudart().cudaHostUnregister(ptr) == 0: - TOTAL_PINNED_MEMORY -= PINNED_MEMORY.pop(ptr) + size = PINNED_MEMORY.pop(ptr) + TOTAL_PINNED_MEMORY -= size return True else: logging.warning("Unpin error.") diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index f4845bb43..7dc4d7801 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -1557,8 +1557,8 @@ class ModelPatcherDynamic(ModelPatcher): self.model.dynamic_pins = {} if self.load_device not in self.model.dynamic_pins: self.model.dynamic_pins[self.load_device] = { - "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 64 * 1024 * 1024), []), - "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), []), + "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 64 * 1024 * 1024), [], [-1]), + "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), [], [-1]), "failed": False, "active": False, } @@ -1761,19 +1761,44 @@ class ModelPatcherDynamic(ModelPatcher): return (self.model.dynamic_pins[self.load_device]["weights"][0].size + self.model.dynamic_pins[self.load_device]["patches"][0].size) + def unregister_inactive_pins(self, ram_to_unload, subsets=[ "weights", "patches" ]): + freed = 0 + pin_state = self.model.dynamic_pins[self.load_device] + for subset in subsets: + hostbuf, stack, stack_split = pin_state[subset] + split = stack_split[0] + while split >= 0: + module, offset = stack[split] + split -= 1 + stack_split[0] = split + if not module._pin_registered: + continue + size = module._pin.numel() * module._pin.element_size() + if 
torch.cuda.cudart().cudaHostUnregister(module._pin.data_ptr()) != 0: + comfy.model_management.discard_cuda_async_error() + continue + module._pin_registered = False + comfy.model_management.TOTAL_PINNED_MEMORY = max(0, comfy.model_management.TOTAL_PINNED_MEMORY - size) + freed += size + ram_to_unload -= size + if ram_to_unload <= 0: + return freed + return freed + def partially_unload_ram(self, ram_to_unload, subsets=[ "weights", "patches" ]): freed = 0 pin_state = self.model.dynamic_pins[self.load_device] for subset in subsets: - hostbuf, stack = pin_state[subset] + hostbuf, stack, stack_split = pin_state[subset] while len(stack) > 0: module, offset = stack.pop() size = module._pin.numel() * module._pin.element_size() del module._pin - hostbuf.truncate(offset) - comfy.model_management.TOTAL_PINNED_MEMORY -= size - if comfy.model_management.TOTAL_PINNED_MEMORY < 0: - comfy.model_management.TOTAL_PINNED_MEMORY = 0 + hostbuf.truncate(offset, do_unregister=module._pin_registered) + stack_split[0] = min(stack_split[0], len(stack) - 1) + comfy.model_management.TOTAL_MODEL_MEMORY = max(0, comfy.model_management.TOTAL_MODEL_MEMORY - size) + if module._pin_registered: + comfy.model_management.TOTAL_PINNED_MEMORY = max(0, comfy.model_management.TOTAL_PINNED_MEMORY - size) freed += size ram_to_unload -= size if ram_to_unload <= 0: diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py index 35cbbcd9e..8fe69916f 100644 --- a/comfy/pinned_memory.py +++ b/comfy/pinned_memory.py @@ -2,6 +2,7 @@ import comfy.model_management import comfy.memory_management import comfy_aimdo.host_buffer import comfy_aimdo.torch +import torch from comfy.cli_args import args @@ -10,15 +11,33 @@ def get_pin(module, subset="weights"): def pin_memory(module, subset="weights", size=None): pin_state = module._pin_state - if pin_state["failed"] or args.disable_pinned_memory or get_pin(module, subset) is not None: + if pin_state["failed"] or args.disable_pinned_memory: return - hostbuf, stack = 
pin_state[subset] + hostbuf, stack, stack_split = pin_state[subset] + pin = get_pin(module, subset) + if pin is not None: + if module._pin_registered: + return + + size = module._pin.nbytes + comfy.model_management.ensure_pin_registerable(size) + + if torch.cuda.cudart().cudaHostRegister(module._pin.data_ptr(), size, 1) != 0: + comfy.model_management.discard_cuda_async_error() + return False + module._pin_registered = True + stack_split[0] = max(stack_split[0], module._pin_stack_index) + comfy.model_management.TOTAL_PINNED_MEMORY += size + return True + if size is None: size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ]) offset = hostbuf.size + comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM) comfy.model_management.ensure_pin_budget(size) + comfy.model_management.ensure_pin_registerable(size) try: hostbuf.extend(size=size) @@ -29,5 +48,9 @@ def pin_memory(module, subset="weights", size=None): module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size] module._pin.untyped_storage()._comfy_hostbuf = hostbuf stack.append((module, offset)) + module._pin_registered = True + module._pin_stack_index = len(stack) - 1 + stack_split[0] = max(stack_split[0], module._pin_stack_index) + comfy.model_management.TOTAL_MODEL_MEMORY += size comfy.model_management.TOTAL_PINNED_MEMORY += size return True