diff --git a/comfy/cli_args.py b/comfy/cli_args.py
index 9dadb0093..d5d13008b 100644
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@@ -110,13 +110,11 @@ parser.add_argument("--preview-method", type=LatentPreviewMethod, default=Latent
 parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.")
 
-CACHE_RAM_AUTO_GB = -1.0
-
 cache_group = parser.add_mutually_exclusive_group()
+cache_group.add_argument("--cache-ram", nargs='*', type=float, default=[], metavar="GB", help="Use RAM pressure caching with the specified headroom thresholds. This is the default caching mode. The first value sets the active-cache threshold; the optional second value sets the inactive-cache/pin threshold. Defaults when no values are provided: active 25%% of system RAM (min 4GB, max 32GB), inactive 75%% of system RAM (min 12GB, max 96GB).")
 cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.")
 cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
 cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.")
-cache_group.add_argument("--cache-ram", nargs='?', const=CACHE_RAM_AUTO_GB, type=float, default=0, help="Use RAM pressure caching with the specified headroom threshold. If available RAM drops below the threshold the cache removes large items to free RAM. Default (when no value is provided): 25%% of system RAM (min 4GB, max 32GB).")
 
 attn_group = parser.add_mutually_exclusive_group()
 attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.")
@@ -246,6 +244,9 @@ if comfy.options.args_parsing:
 else:
     args = parser.parse_args([])
 
+if args.cache_ram is not None and len(args.cache_ram) > 2:
+    parser.error("--cache-ram accepts at most two values: active GB and inactive GB")
+
 if args.windows_standalone_build:
     args.auto_launch = True
 
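For reference, a minimal standalone sketch of how a zero-, one-, or two-value --cache-ram resolves into the active and inactive thresholds described in the help text. It mirrors the prompt_worker logic in main.py further down, but takes RAM in GB directly; resolve_cache_ram is an illustrative name, not part of the codebase:

    def resolve_cache_ram(values, total_ram_gb):
        # Defaults from the help text: active = 25% of RAM clamped to [4, 32] GB,
        # inactive = 75% of RAM clamped to [12, 96] GB.
        active = min(32.0, max(4.0, total_ram_gb * 0.25))
        inactive = min(96.0, max(12.0, total_ram_gb * 0.75))
        if len(values) > 0:
            active = values[0]
        if len(values) > 1:
            inactive = values[1]
        return active, inactive

    print(resolve_cache_ram([], 64.0))          # (16.0, 48.0): both derived
    print(resolve_cache_ram([8.0], 64.0))       # (8.0, 48.0): explicit active only
    print(resolve_cache_ram([8.0, 24.0], 64.0)) # (8.0, 24.0): both explicit
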
diff --git a/comfy/model_management.py b/comfy/model_management.py
index f358621c9..19a916362 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -498,7 +498,11 @@ current_loaded_models = []
 
 DIRTY_MMAPS = set()
 
-PIN_PRESSURE_HYSTERESIS = 128 * 1024 * 1024
+PIN_PRESSURE_HYSTERESIS = 256 * 1024 * 1024
+
+# Freeing registered pins under pressure implies a GPU sync, so go big on
+# the hysteresis so that each expensive sync gives us back a good chunk.
+REGISTERABLE_PIN_HYSTERESIS = 768 * 1024 * 1024
 
 def module_size(module):
     module_mem = 0
@@ -525,15 +529,28 @@ def free_pins(size, evict_active=False):
             break
 
 def ensure_pin_budget(size, evict_active=False):
-    if MAX_PINNED_MEMORY <= 0:
+    if MAX_MODEL_MEMORY <= 0:
         return
 
-    shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY
+    shortfall = TOTAL_MODEL_MEMORY + size - MAX_MODEL_MEMORY
     if shortfall <= 0:
         return
 
     free_pins(shortfall + PIN_PRESSURE_HYSTERESIS, evict_active=evict_active)
 
+def ensure_pin_registerable(size, evict_active=False):
+    shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY
+    if MAX_PINNED_MEMORY <= 0 or shortfall <= 0:
+        return
+
+    shortfall += REGISTERABLE_PIN_HYSTERESIS
+    for loaded_model in reversed(current_loaded_models):
+        model = loaded_model.model
+        if model is not None and model.is_dynamic() and (evict_active or not model.model.dynamic_pins[model.load_device]["active"]):
+            shortfall -= model.unregister_inactive_pins(shortfall)
+            if shortfall <= 0:
+                return
+
 class LoadedModel:
     def __init__(self, model):
         self._set_model(model)
@@ -1208,22 +1225,24 @@ def get_pin_buffer(offload_stream):
     return pin_buffer
 
 def resize_pin_buffer(pin_buffer, size):
-    global TOTAL_PINNED_MEMORY
+    global TOTAL_MODEL_MEMORY, TOTAL_PINNED_MEMORY
     old_size = pin_buffer.size
     if size <= old_size:
         return True
     growth = size - old_size
     comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM, free_active=True)
     ensure_pin_budget(growth, evict_active=True)
+    ensure_pin_registerable(growth, evict_active=True)
     try:
         pin_buffer.extend(size=size, reallocate=True)
     except RuntimeError:
         return False
+    TOTAL_MODEL_MEMORY += pin_buffer.size - old_size
     TOTAL_PINNED_MEMORY += pin_buffer.size - old_size
     return True
 
 def reset_cast_buffers():
-    global TOTAL_PINNED_MEMORY
+    global TOTAL_MODEL_MEMORY, TOTAL_PINNED_MEMORY
     global LARGEST_CASTED_WEIGHT
     global LARGEST_AIMDO_CASTED_WEIGHT
@@ -1239,16 +1258,17 @@
     DIRTY_MMAPS.clear()
 
     for pin_buffer in STREAM_PIN_BUFFERS.values():
+        TOTAL_MODEL_MEMORY -= pin_buffer.size
         TOTAL_PINNED_MEMORY -= pin_buffer.size
-    if TOTAL_PINNED_MEMORY < 0:
-        TOTAL_PINNED_MEMORY = 0
+    TOTAL_MODEL_MEMORY = max(0, TOTAL_MODEL_MEMORY)
+    TOTAL_PINNED_MEMORY = max(0, TOTAL_PINNED_MEMORY)
 
     for loaded_model in current_loaded_models:
         model = loaded_model.model
         if model is not None and model.is_dynamic():
             model.model.dynamic_pins[model.load_device]["active"] = False
             model.partially_unload_ram(1e30, subsets=[ "patches" ])
-            model.model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), [])
+            model.model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), [], [-1])
 
     STREAM_CAST_BUFFERS.clear()
     STREAM_AIMDO_CAST_BUFFERS.clear()
@@ -1352,14 +1372,18 @@ def cast_to_device(tensor, device, dtype, copy=False):
 
 PINNED_MEMORY = {}
+TOTAL_MODEL_MEMORY = 0
 TOTAL_PINNED_MEMORY = 0
+MAX_MODEL_MEMORY = -1
 MAX_PINNED_MEMORY = -1
 if not args.disable_pinned_memory:
     if is_nvidia() or is_amd():
+        ram = get_total_memory(torch.device("cpu"))
+        MAX_MODEL_MEMORY = min(ram - 4 * 1024 * 1024 * 1024, ram * 0.90)
         if WINDOWS:
-            MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.40 # Windows limit is apparently 50%
+            MAX_PINNED_MEMORY = ram * 0.40 # Windows limit is apparently 50%
         else:
-            MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.90
+            MAX_PINNED_MEMORY = ram * 0.90
         logging.info("Enabled pinned memory {}".format(MAX_PINNED_MEMORY // (1024 * 1024)))
 
 PINNING_ALLOWED_TYPES = set(["Tensor", "Parameter", "QuantizedTensor"])
@@ -1396,7 +1420,7 @@ def pin_memory(tensor):
     size = tensor.nbytes
 
     comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM)
-    ensure_pin_budget(size)
+    ensure_pin_registerable(size)
 
     ptr = tensor.data_ptr()
     if ptr == 0:
@@ -1433,7 +1457,8 @@ def unpin_memory(tensor):
         return False
 
     if torch.cuda.cudart().cudaHostUnregister(ptr) == 0:
-        TOTAL_PINNED_MEMORY -= PINNED_MEMORY.pop(ptr)
+        size = PINNED_MEMORY.pop(ptr)
+        TOTAL_PINNED_MEMORY -= size
         return True
     else:
         logging.warning("Unpin error.")
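Both counters above follow the same budgeting pattern: do nothing while the new allocation fits under the cap, and once it would overshoot, free the shortfall plus a fixed hysteresis so the next few allocations do not immediately trigger another expensive free (for registered pins, a GPU sync). A minimal sketch of that pattern, with illustrative names:

    REGISTER_HYSTERESIS = 768 * 1024 * 1024  # mirrors REGISTERABLE_PIN_HYSTERESIS

    def bytes_to_free(total, cap, request, hysteresis=REGISTER_HYSTERESIS):
        shortfall = total + request - cap
        if cap <= 0 or shortfall <= 0:
            return 0  # no cap configured, or the request still fits
        # Overshoot: free past the cap so each expensive sync pays for itself.
        return shortfall + hysteresis

    # 30 GB pinned, 32 GB cap, 4 GB request -> free 2 GB plus 0.75 GB of slack.
    print(bytes_to_free(30 << 30, 32 << 30, 4 << 30) / 2**30)  # 2.75
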
diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py
index f4845bb43..7dc4d7801 100644
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -1557,8 +1557,8 @@ class ModelPatcherDynamic(ModelPatcher):
             self.model.dynamic_pins = {}
         if self.load_device not in self.model.dynamic_pins:
             self.model.dynamic_pins[self.load_device] = {
-                "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 64 * 1024 * 1024), []),
-                "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), []),
+                "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 64 * 1024 * 1024), [], [-1]),
+                "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), [], [-1]),
                 "failed": False,
                 "active": False,
             }
@@ -1761,19 +1761,44 @@ class ModelPatcherDynamic(ModelPatcher):
         return (self.model.dynamic_pins[self.load_device]["weights"][0].size +
                 self.model.dynamic_pins[self.load_device]["patches"][0].size)
 
+    def unregister_inactive_pins(self, ram_to_unload, subsets=[ "weights", "patches" ]):
+        freed = 0
+        pin_state = self.model.dynamic_pins[self.load_device]
+        for subset in subsets:
+            hostbuf, stack, stack_split = pin_state[subset]
+            split = stack_split[0]
+            while split >= 0:
+                module, offset = stack[split]
+                split -= 1
+                stack_split[0] = split
+                if not module._pin_registered:
+                    continue
+                size = module._pin.numel() * module._pin.element_size()
+                if torch.cuda.cudart().cudaHostUnregister(module._pin.data_ptr()) != 0:
+                    comfy.model_management.discard_cuda_async_error()
+                    continue
+                module._pin_registered = False
+                comfy.model_management.TOTAL_PINNED_MEMORY = max(0, comfy.model_management.TOTAL_PINNED_MEMORY - size)
+                freed += size
+                ram_to_unload -= size
+                if ram_to_unload <= 0:
+                    return freed
+        return freed
+
     def partially_unload_ram(self, ram_to_unload, subsets=[ "weights", "patches" ]):
         freed = 0
         pin_state = self.model.dynamic_pins[self.load_device]
         for subset in subsets:
-            hostbuf, stack = pin_state[subset]
+            hostbuf, stack, stack_split = pin_state[subset]
             while len(stack) > 0:
                 module, offset = stack.pop()
                 size = module._pin.numel() * module._pin.element_size()
                 del module._pin
-                hostbuf.truncate(offset)
-                comfy.model_management.TOTAL_PINNED_MEMORY -= size
-                if comfy.model_management.TOTAL_PINNED_MEMORY < 0:
-                    comfy.model_management.TOTAL_PINNED_MEMORY = 0
+                hostbuf.truncate(offset, do_unregister=module._pin_registered)
+                stack_split[0] = min(stack_split[0], len(stack) - 1)
+                comfy.model_management.TOTAL_MODEL_MEMORY = max(0, comfy.model_management.TOTAL_MODEL_MEMORY - size)
+                if module._pin_registered:
+                    comfy.model_management.TOTAL_PINNED_MEMORY = max(0, comfy.model_management.TOTAL_PINNED_MEMORY - size)
                 freed += size
                 ram_to_unload -= size
                 if ram_to_unload <= 0:
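The new third element of each pin-state tuple, stack_split, is a one-element list (mutable in place through the shared tuple) holding the highest stack index that may still be host-registered: pin_memory raises it when it registers a pin, unregister_inactive_pins walks down from it, and partially_unload_ram clamps it as entries are popped. A toy model of that invariant (PinStack is hypothetical, and byte sizes are reduced to counts):

    class PinStack:
        def __init__(self):
            self.stack = []   # (module, offset) entries, pinned LIFO
            self.split = -1   # highest index that may still be registered

        def push_registered(self, entry):
            self.stack.append(entry)
            self.split = max(self.split, len(self.stack) - 1)

        def unregister_from_top(self, n):
            # Walk down from the split; entries above it are already unregistered.
            while self.split >= 0 and n > 0:
                self.split -= 1
                n -= 1  # stand-in for subtracting the freed entry's bytes

        def pop(self):
            entry = self.stack.pop()
            self.split = min(self.split, len(self.stack) - 1)
            return entry

    ps = PinStack()
    for i in range(3):
        ps.push_registered(("module%d" % i, i))
    ps.unregister_from_top(2)
    assert ps.split == 0  # only the bottom entry is still registered
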
diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py
index 35cbbcd9e..8fe69916f 100644
--- a/comfy/pinned_memory.py
+++ b/comfy/pinned_memory.py
@@ -2,6 +2,7 @@ import comfy.model_management
 import comfy.memory_management
 import comfy_aimdo.host_buffer
 import comfy_aimdo.torch
+import torch
 
 from comfy.cli_args import args
 
@@ -10,15 +11,33 @@ def get_pin(module, subset="weights"):
 
 def pin_memory(module, subset="weights", size=None):
     pin_state = module._pin_state
-    if pin_state["failed"] or args.disable_pinned_memory or get_pin(module, subset) is not None:
+    if pin_state["failed"] or args.disable_pinned_memory:
         return
 
-    hostbuf, stack = pin_state[subset]
+    hostbuf, stack, stack_split = pin_state[subset]
+    pin = get_pin(module, subset)
+    if pin is not None:
+        if module._pin_registered:
+            return
+
+        size = module._pin.nbytes
+        comfy.model_management.ensure_pin_registerable(size)
+
+        if torch.cuda.cudart().cudaHostRegister(module._pin.data_ptr(), size, 1) != 0:
+            comfy.model_management.discard_cuda_async_error()
+            return False
+        module._pin_registered = True
+        stack_split[0] = max(stack_split[0], module._pin_stack_index)
+        comfy.model_management.TOTAL_PINNED_MEMORY += size
+        return True
+
     if size is None:
         size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ])
 
     offset = hostbuf.size
+    comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM)
     comfy.model_management.ensure_pin_budget(size)
+    comfy.model_management.ensure_pin_registerable(size)
 
     try:
         hostbuf.extend(size=size)
@@ -29,5 +48,9 @@
     module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size]
     module._pin.untyped_storage()._comfy_hostbuf = hostbuf
     stack.append((module, offset))
+    module._pin_registered = True
+    module._pin_stack_index = len(stack) - 1
+    stack_split[0] = max(stack_split[0], module._pin_stack_index)
+    comfy.model_management.TOTAL_MODEL_MEMORY += size
     comfy.model_management.TOTAL_PINNED_MEMORY += size
     return True
diff --git a/execution.py b/execution.py
index 5605f09e7..9c3968810 100644
--- a/execution.py
+++ b/execution.py
@@ -728,6 +728,7 @@ class PromptExecutor:
         self._notify_prompt_lifecycle("start", prompt_id)
 
         ram_headroom = int(self.cache_args["ram"] * (1024 ** 3))
+        ram_inactive_headroom = int(self.cache_args["ram_inactive"] * (1024 ** 3))
         ram_release_callback = self.caches.outputs.ram_release if self.cache_type == CacheType.RAM_PRESSURE else None
         comfy.memory_management.set_ram_cache_release_state(ram_release_callback, ram_headroom)
@@ -781,7 +782,7 @@
                 execution_list.complete_node_execution()
 
         if self.cache_type == CacheType.RAM_PRESSURE:
-            ram_release_callback(ram_headroom)
+            ram_release_callback(ram_inactive_headroom)
            ram_shortfall = ram_headroom - psutil.virtual_memory().available
             comfy.model_management.free_pins(ram_shortfall)
             ram_release_callback(ram_headroom, free_active=True)
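End-of-prompt cleanup is now two-tiered: cached outputs are first trimmed against the larger inactive headroom, pins are then freed against any remaining shortfall versus the active headroom, and active cache entries are evicted only as a last resort. A schematic restatement of the block above (psutil is the real dependency; release_cache and free_pins stand in for ram_release_callback and comfy.model_management.free_pins):

    import psutil

    def end_of_prompt_release(ram_headroom, ram_inactive_headroom, release_cache, free_pins):
        # Pass 1: trim inactive cache entries down to the looser threshold.
        release_cache(ram_inactive_headroom)
        # Pass 2: if available RAM is still below the active threshold, free pins.
        shortfall = ram_headroom - psutil.virtual_memory().available
        free_pins(shortfall)  # called unconditionally, mirroring the diff above
        # Pass 3: as a last resort, evict active cache entries as well.
        release_cache(ram_headroom, free_active=True)
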
diff --git a/main.py b/main.py
index a6fdaf43c..1e47cab84 100644
--- a/main.py
+++ b/main.py
@@ -283,19 +283,25 @@ def _collect_output_absolute_paths(history_result: dict) -> list[str]:
 def prompt_worker(q, server_instance):
     current_time: float = 0.0
-    cache_ram = args.cache_ram
-    if cache_ram < 0:
+    cache_ram = 0
+    cache_ram_inactive = 0
+    if not args.cache_classic and not args.cache_none and args.cache_lru <= 0:
         cache_ram = min(32.0, max(4.0, comfy.model_management.total_ram * 0.25 / 1024.0))
+        cache_ram_inactive = min(96.0, max(12.0, comfy.model_management.total_ram * 0.75 / 1024.0))
+    if len(args.cache_ram) > 0:
+        cache_ram = args.cache_ram[0]
+    if len(args.cache_ram) > 1:
+        cache_ram_inactive = args.cache_ram[1]
 
-    cache_type = execution.CacheType.CLASSIC
-    if args.cache_lru > 0:
+    cache_type = execution.CacheType.RAM_PRESSURE
+    if args.cache_classic:
+        cache_type = execution.CacheType.CLASSIC
+    elif args.cache_lru > 0:
         cache_type = execution.CacheType.LRU
-    elif cache_ram > 0:
-        cache_type = execution.CacheType.RAM_PRESSURE
     elif args.cache_none:
         cache_type = execution.CacheType.NONE
 
-    e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_args={ "lru" : args.cache_lru, "ram" : cache_ram } )
+    e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_args={ "lru" : args.cache_lru, "ram" : cache_ram, "ram_inactive" : cache_ram_inactive } )
     last_gc_collect = 0
     need_gc = False
     gc_collect_interval = 10.0
diff --git a/requirements.txt b/requirements.txt
index eba0fc5ca..6754c94c4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -23,7 +23,7 @@ SQLAlchemy>=2.0.0
 filelock
 av>=14.2.0
 comfy-kitchen>=0.2.8
-comfy-aimdo==0.4.0
+comfy-aimdo==0.4.1
 requests
 simpleeval>=1.0.0
 blake3
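Assuming the semantics above, typical invocations look like this (values illustrative; the last line trips the new parser.error guard in cli_args.py):

    python main.py                      # RAM-pressure caching, both thresholds derived
    python main.py --cache-ram          # same, flag given with no values
    python main.py --cache-ram 8        # 8 GB active headroom, inactive stays derived
    python main.py --cache-ram 8 24     # 8 GB active, 24 GB inactive/pin headroom
    python main.py --cache-ram 8 24 99  # error: accepts at most two values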