Merge pull request #9 from rattus128/dev/threaded-loader-2-ram-cache
threaded loader 2 + ram cache (CORE-43,CORE-117)
commit ea5775c620
@@ -110,13 +110,11 @@ parser.add_argument("--preview-method", type=LatentPreviewMethod, default=Latent
 parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.")

-CACHE_RAM_AUTO_GB = -1.0
-
 cache_group = parser.add_mutually_exclusive_group()
+cache_group.add_argument("--cache-ram", nargs='*', type=float, default=[], metavar="GB", help="Use RAM pressure caching with the specified headroom thresholds. This is the default caching mode. The first value sets the active-cache threshold; the optional second value sets the inactive-cache/pin threshold. Defaults when no values are provided: active 25%% of system RAM (min 4GB, max 32GB), inactive 75%% of system RAM (min 12GB, max 96GB).")
 cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.")
 cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
 cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.")
-cache_group.add_argument("--cache-ram", nargs='?', const=CACHE_RAM_AUTO_GB, type=float, default=0, help="Use RAM pressure caching with the specified headroom threshold. If available RAM drops below the threshold the cache removes large items to free RAM. Default (when no value is provided): 25%% of system RAM (min 4GB, max 32GB).")

 attn_group = parser.add_mutually_exclusive_group()
 attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.")
@@ -246,6 +244,9 @@ if comfy.options.args_parsing:
 else:
     args = parser.parse_args([])

+if args.cache_ram is not None and len(args.cache_ram) > 2:
+    parser.error("--cache-ram accepts at most two values: active GB and inactive GB")
+
 if args.windows_standalone_build:
     args.auto_launch = True
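For reference, a minimal standalone sketch of how the reworked --cache-ram flag parses with nargs='*' (argument names copied from the hunk above; the example invocations are hypothetical):

import argparse

parser = argparse.ArgumentParser()
cache_group = parser.add_mutually_exclusive_group()
cache_group.add_argument("--cache-ram", nargs='*', type=float, default=[], metavar="GB")
cache_group.add_argument("--cache-classic", action="store_true")

print(parser.parse_args([]).cache_ram)                          # [] -> auto defaults for both thresholds
print(parser.parse_args(["--cache-ram"]).cache_ram)             # [] -> also auto defaults
print(parser.parse_args(["--cache-ram", "8"]).cache_ram)        # [8.0] -> active threshold only
print(parser.parse_args(["--cache-ram", "8", "24"]).cache_ram)  # [8.0, 24.0] -> active + inactive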
@@ -498,7 +498,11 @@ current_loaded_models = []
 DIRTY_MMAPS = set()

-PIN_PRESSURE_HYSTERESIS = 128 * 1024 * 1024
+PIN_PRESSURE_HYSTERESIS = 256 * 1024 * 1024
+
+#Freeing registerables on pressure does imply a GPU sync, so go big on
+#the hysteresis so each expensive sync gives us back a good chunk.
+REGISTERABLE_PIN_HYSTERESIS = 768 * 1024 * 1024

 def module_size(module):
     module_mem = 0
@@ -525,15 +529,28 @@ def free_pins(size, evict_active=False):
             break

 def ensure_pin_budget(size, evict_active=False):
-    if MAX_PINNED_MEMORY <= 0:
+    if MAX_MODEL_MEMORY <= 0:
         return

-    shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY
+    shortfall = TOTAL_MODEL_MEMORY + size - MAX_MODEL_MEMORY
     if shortfall <= 0:
         return

     free_pins(shortfall + PIN_PRESSURE_HYSTERESIS, evict_active=evict_active)

+def ensure_pin_registerable(size, evict_active=False):
+    shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY
+    if MAX_PINNED_MEMORY <= 0 or shortfall <= 0:
+        return
+
+    shortfall += REGISTERABLE_PIN_HYSTERESIS
+    for loaded_model in reversed(current_loaded_models):
+        model = loaded_model.model
+        if model is not None and model.is_dynamic() and (evict_active or not model.model.dynamic_pins[model.load_device]["active"]):
+            shortfall -= model.unregister_inactive_pins(shortfall)
+            if shortfall <= 0:
+                return
+
 class LoadedModel:
     def __init__(self, model):
         self._set_model(model)
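Reading the new globals together: ensure_pin_budget now guards how much host RAM is allocated for model staging buffers (TOTAL_MODEL_MEMORY against MAX_MODEL_MEMORY), while the new ensure_pin_registerable guards how much of that memory may remain page-locked at once (TOTAL_PINNED_MEMORY against MAX_PINNED_MEMORY). A rough toy sketch of the two checks, with made-up limits and simplified logic:

GiB = 1024**3

TOTAL_MODEL_MEMORY = 0      # bytes allocated in host staging buffers
TOTAL_PINNED_MEMORY = 0     # bytes of those buffers currently cudaHostRegister'd
MAX_MODEL_MEMORY = 48 * GiB     # hypothetical limits; see the initialisation hunk below
MAX_PINNED_MEMORY = 24 * GiB

def over_alloc_budget(size):
    # Pressure here frees whole pinned buffers (plus PIN_PRESSURE_HYSTERESIS).
    return TOTAL_MODEL_MEMORY + size - MAX_MODEL_MEMORY > 0

def over_register_budget(size):
    # Pressure here only unregisters pins, keeping the RAM copy (plus the larger
    # REGISTERABLE_PIN_HYSTERESIS, since each pass implies a GPU sync).
    return TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY > 0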
@@ -1208,22 +1225,24 @@ def get_pin_buffer(offload_stream):
     return pin_buffer

 def resize_pin_buffer(pin_buffer, size):
-    global TOTAL_PINNED_MEMORY
+    global TOTAL_MODEL_MEMORY, TOTAL_PINNED_MEMORY
     old_size = pin_buffer.size
     if size <= old_size:
         return True

     growth = size - old_size
     comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM, free_active=True)
     ensure_pin_budget(growth, evict_active=True)
+    ensure_pin_registerable(growth, evict_active=True)
     try:
         pin_buffer.extend(size=size, reallocate=True)
     except RuntimeError:
         return False
+    TOTAL_MODEL_MEMORY += pin_buffer.size - old_size
     TOTAL_PINNED_MEMORY += pin_buffer.size - old_size
     return True

 def reset_cast_buffers():
-    global TOTAL_PINNED_MEMORY
+    global TOTAL_MODEL_MEMORY, TOTAL_PINNED_MEMORY
     global LARGEST_CASTED_WEIGHT
     global LARGEST_AIMDO_CASTED_WEIGHT
@@ -1239,16 +1258,17 @@ def reset_cast_buffers():
     DIRTY_MMAPS.clear()

     for pin_buffer in STREAM_PIN_BUFFERS.values():
+        TOTAL_MODEL_MEMORY -= pin_buffer.size
         TOTAL_PINNED_MEMORY -= pin_buffer.size
-    if TOTAL_PINNED_MEMORY < 0:
-        TOTAL_PINNED_MEMORY = 0
+    TOTAL_MODEL_MEMORY = max(0, TOTAL_MODEL_MEMORY)
+    TOTAL_PINNED_MEMORY = max(0, TOTAL_PINNED_MEMORY)

     for loaded_model in current_loaded_models:
         model = loaded_model.model
         if model is not None and model.is_dynamic():
             model.model.dynamic_pins[model.load_device]["active"] = False
             model.partially_unload_ram(1e30, subsets=[ "patches" ])
-            model.model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), [])
+            model.model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), [], [-1])

     STREAM_CAST_BUFFERS.clear()
     STREAM_AIMDO_CAST_BUFFERS.clear()
@@ -1352,14 +1372,18 @@ def cast_to_device(tensor, device, dtype, copy=False):


 PINNED_MEMORY = {}
+TOTAL_MODEL_MEMORY = 0
 TOTAL_PINNED_MEMORY = 0
+MAX_MODEL_MEMORY = -1
 MAX_PINNED_MEMORY = -1
 if not args.disable_pinned_memory:
     if is_nvidia() or is_amd():
+        ram = get_total_memory(torch.device("cpu"))
+        MAX_MODEL_MEMORY = min(ram - 4 * 1024 * 1024 * 1024, ram * 0.90)
         if WINDOWS:
-            MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.40 # Windows limit is apparently 50%
+            MAX_PINNED_MEMORY = ram * 0.40 # Windows limit is apparently 50%
         else:
-            MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.90
+            MAX_PINNED_MEMORY = ram * 0.90
         logging.info("Enabled pinned memory {}".format(MAX_PINNED_MEMORY // (1024 * 1024)))

 PINNING_ALLOWED_TYPES = set(["Tensor", "Parameter", "QuantizedTensor"])
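Worked numbers for the limits above, assuming a hypothetical 64GB machine (the constants come from the hunk; the figures are illustrative only):

GiB = 1024**3
ram = 64 * GiB                                       # hypothetical total system RAM

MAX_MODEL_MEMORY = min(ram - 4 * GiB, ram * 0.90)    # min(60 GiB, 57.6 GiB) = 57.6 GiB
MAX_PINNED_MEMORY_WINDOWS = ram * 0.40               # 25.6 GiB (Windows pin limit is ~50%)
MAX_PINNED_MEMORY_OTHER = ram * 0.90                 # 57.6 GiB
print(MAX_MODEL_MEMORY / GiB, MAX_PINNED_MEMORY_WINDOWS / GiB, MAX_PINNED_MEMORY_OTHER / GiB)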
@@ -1396,7 +1420,7 @@ def pin_memory(tensor):

     size = tensor.nbytes
     comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM)
-    ensure_pin_budget(size)
+    ensure_pin_registerable(size)

     ptr = tensor.data_ptr()
     if ptr == 0:
@@ -1433,7 +1457,8 @@ def unpin_memory(tensor):
         return False

     if torch.cuda.cudart().cudaHostUnregister(ptr) == 0:
-        TOTAL_PINNED_MEMORY -= PINNED_MEMORY.pop(ptr)
+        size = PINNED_MEMORY.pop(ptr)
+        TOTAL_PINNED_MEMORY -= size
         return True
     else:
         logging.warning("Unpin error.")
@@ -1557,8 +1557,8 @@ class ModelPatcherDynamic(ModelPatcher):
             self.model.dynamic_pins = {}
         if self.load_device not in self.model.dynamic_pins:
             self.model.dynamic_pins[self.load_device] = {
-                "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 64 * 1024 * 1024), []),
-                "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), []),
+                "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 64 * 1024 * 1024), [], [-1]),
+                "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024), [], [-1]),
                 "failed": False,
                 "active": False,
             }
@@ -1761,19 +1761,44 @@ class ModelPatcherDynamic(ModelPatcher):
         return (self.model.dynamic_pins[self.load_device]["weights"][0].size +
                 self.model.dynamic_pins[self.load_device]["patches"][0].size)

+    def unregister_inactive_pins(self, ram_to_unload, subsets=[ "weights", "patches" ]):
+        freed = 0
+        pin_state = self.model.dynamic_pins[self.load_device]
+        for subset in subsets:
+            hostbuf, stack, stack_split = pin_state[subset]
+            split = stack_split[0]
+            while split >= 0:
+                module, offset = stack[split]
+                split -= 1
+                stack_split[0] = split
+                if not module._pin_registered:
+                    continue
+                size = module._pin.numel() * module._pin.element_size()
+                if torch.cuda.cudart().cudaHostUnregister(module._pin.data_ptr()) != 0:
+                    comfy.model_management.discard_cuda_async_error()
+                    continue
+                module._pin_registered = False
+                comfy.model_management.TOTAL_PINNED_MEMORY = max(0, comfy.model_management.TOTAL_PINNED_MEMORY - size)
+                freed += size
+                ram_to_unload -= size
+                if ram_to_unload <= 0:
+                    return freed
+        return freed
+
     def partially_unload_ram(self, ram_to_unload, subsets=[ "weights", "patches" ]):
         freed = 0
         pin_state = self.model.dynamic_pins[self.load_device]
         for subset in subsets:
-            hostbuf, stack = pin_state[subset]
+            hostbuf, stack, stack_split = pin_state[subset]
             while len(stack) > 0:
                 module, offset = stack.pop()
                 size = module._pin.numel() * module._pin.element_size()
                 del module._pin
-                hostbuf.truncate(offset)
-                comfy.model_management.TOTAL_PINNED_MEMORY -= size
-                if comfy.model_management.TOTAL_PINNED_MEMORY < 0:
-                    comfy.model_management.TOTAL_PINNED_MEMORY = 0
+                hostbuf.truncate(offset, do_unregister=module._pin_registered)
+                stack_split[0] = min(stack_split[0], len(stack) - 1)
+                comfy.model_management.TOTAL_MODEL_MEMORY = max(0, comfy.model_management.TOTAL_MODEL_MEMORY - size)
+                if module._pin_registered:
+                    comfy.model_management.TOTAL_PINNED_MEMORY = max(0, comfy.model_management.TOTAL_PINNED_MEMORY - size)
                 freed += size
                 ram_to_unload -= size
                 if ram_to_unload <= 0:
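The [-1] added as a third element of each pin-state tuple looks like a mutable watermark: it records the highest stack index whose pin is currently registered, so unregister_inactive_pins above only walks the registered tail while partially_unload_ram clamps it as entries are popped. A simplified standalone sketch of that bookkeeping (the helper names here are hypothetical, not the actual API):

stack = []           # (module, offset) entries, in the order they were pinned
stack_split = [-1]   # index of the highest entry that is still page-locked

def note_registered(stack_index):
    # Called when a module's pin is (re)registered; raise the watermark.
    stack_split[0] = max(stack_split[0], stack_index)

def walk_registered_tail():
    # Yield registered entries newest-first, lowering the watermark as we go.
    split = stack_split[0]
    while split >= 0:
        module, offset = stack[split]
        split -= 1
        stack_split[0] = split
        yield module, offset

def note_popped():
    # Called when the newest entry is freed entirely; keep the watermark valid.
    stack.pop()
    stack_split[0] = min(stack_split[0], len(stack) - 1)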
@@ -2,6 +2,7 @@ import comfy.model_management
 import comfy.memory_management
 import comfy_aimdo.host_buffer
 import comfy_aimdo.torch
+import torch

 from comfy.cli_args import args
@@ -10,15 +11,33 @@ def get_pin(module, subset="weights"):

 def pin_memory(module, subset="weights", size=None):
     pin_state = module._pin_state
-    if pin_state["failed"] or args.disable_pinned_memory or get_pin(module, subset) is not None:
+    if pin_state["failed"] or args.disable_pinned_memory:
         return

-    hostbuf, stack = pin_state[subset]
+    hostbuf, stack, stack_split = pin_state[subset]
+    pin = get_pin(module, subset)
+    if pin is not None:
+        if module._pin_registered:
+            return
+
+        size = module._pin.nbytes
+        comfy.model_management.ensure_pin_registerable(size)
+
+        if torch.cuda.cudart().cudaHostRegister(module._pin.data_ptr(), size, 1) != 0:
+            comfy.model_management.discard_cuda_async_error()
+            return False
+        module._pin_registered = True
+        stack_split[0] = max(stack_split[0], module._pin_stack_index)
+        comfy.model_management.TOTAL_PINNED_MEMORY += size
+        return True
+
     if size is None:
         size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ])
     offset = hostbuf.size

     comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM)
     comfy.model_management.ensure_pin_budget(size)
+    comfy.model_management.ensure_pin_registerable(size)

     try:
         hostbuf.extend(size=size)
@@ -29,5 +48,9 @@ def pin_memory(module, subset="weights", size=None):
     module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size]
     module._pin.untyped_storage()._comfy_hostbuf = hostbuf
     stack.append((module, offset))
+    module._pin_registered = True
+    module._pin_stack_index = len(stack) - 1
+    stack_split[0] = max(stack_split[0], module._pin_stack_index)
+    comfy.model_management.TOTAL_MODEL_MEMORY += size
     comfy.model_management.TOTAL_PINNED_MEMORY += size
     return True
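The re-register branch above pins an existing host-buffer slice in place with cudaHostRegister rather than allocating a new one. A small hedged demo of that primitive on a plain CPU tensor (flag 1 matches the call in the diff); it only does anything on a CUDA build of PyTorch:

import torch

if torch.cuda.is_available():
    t = torch.empty(1024, 1024)                  # ordinary pageable CPU tensor
    nbytes = t.numel() * t.element_size()
    # Page-lock the existing allocation in place; cudart returns 0 on success.
    if torch.cuda.cudart().cudaHostRegister(t.data_ptr(), nbytes, 1) == 0:
        # ... memory is now pinned and can be transferred to the GPU asynchronously ...
        torch.cuda.cudart().cudaHostUnregister(t.data_ptr())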
@@ -728,6 +728,7 @@ class PromptExecutor:

         self._notify_prompt_lifecycle("start", prompt_id)
         ram_headroom = int(self.cache_args["ram"] * (1024 ** 3))
+        ram_inactive_headroom = int(self.cache_args["ram_inactive"] * (1024 ** 3))
         ram_release_callback = self.caches.outputs.ram_release if self.cache_type == CacheType.RAM_PRESSURE else None
         comfy.memory_management.set_ram_cache_release_state(ram_release_callback, ram_headroom)
@@ -781,7 +782,7 @@ class PromptExecutor:
             execution_list.complete_node_execution()

             if self.cache_type == CacheType.RAM_PRESSURE:
-                ram_release_callback(ram_headroom)
+                ram_release_callback(ram_inactive_headroom)
                 ram_shortfall = ram_headroom - psutil.virtual_memory().available
                 comfy.model_management.free_pins(ram_shortfall)
                 ram_release_callback(ram_headroom, free_active=True)
main.py
@@ -283,19 +283,25 @@ def _collect_output_absolute_paths(history_result: dict) -> list[str]:

 def prompt_worker(q, server_instance):
     current_time: float = 0.0
-    cache_ram = args.cache_ram
-    if cache_ram < 0:
+    cache_ram = 0
+    cache_ram_inactive = 0
+    if not args.cache_classic and not args.cache_none and args.cache_lru <= 0:
         cache_ram = min(32.0, max(4.0, comfy.model_management.total_ram * 0.25 / 1024.0))
+        cache_ram_inactive = min(96.0, max(12.0, comfy.model_management.total_ram * 0.75 / 1024.0))
+    if len(args.cache_ram) > 0:
+        cache_ram = args.cache_ram[0]
+    if len(args.cache_ram) > 1:
+        cache_ram_inactive = args.cache_ram[1]

-    cache_type = execution.CacheType.CLASSIC
-    if args.cache_lru > 0:
+    cache_type = execution.CacheType.RAM_PRESSURE
+    if args.cache_classic:
+        cache_type = execution.CacheType.CLASSIC
+    elif args.cache_lru > 0:
         cache_type = execution.CacheType.LRU
-    elif cache_ram > 0:
-        cache_type = execution.CacheType.RAM_PRESSURE
     elif args.cache_none:
         cache_type = execution.CacheType.NONE

-    e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_args={ "lru" : args.cache_lru, "ram" : cache_ram } )
+    e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_args={ "lru" : args.cache_lru, "ram" : cache_ram, "ram_inactive" : cache_ram_inactive } )
     last_gc_collect = 0
     need_gc = False
     gc_collect_interval = 10.0
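Worked defaults for the auto path above, assuming a hypothetical machine with 32GB of RAM (total_ram appears to be tracked in MB in this code path, hence the extra division by 1024.0):

total_ram = 32 * 1024.0   # MB, hypothetical

cache_ram = min(32.0, max(4.0, total_ram * 0.25 / 1024.0))            # 8.0 GB active headroom
cache_ram_inactive = min(96.0, max(12.0, total_ram * 0.75 / 1024.0))  # 24.0 GB inactive headroom
print(cache_ram, cache_ram_inactive)                                  # 8.0 24.0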
@@ -23,7 +23,7 @@ SQLAlchemy>=2.0.0
 filelock
 av>=14.2.0
 comfy-kitchen>=0.2.8
-comfy-aimdo==0.4.0
+comfy-aimdo==0.4.1
 requests
 simpleeval>=1.0.0
 blake3