From 18a74cb96ab6137f67229ffc0aa7e0f11a1e5ff3 Mon Sep 17 00:00:00 2001
From: Rattus
Date: Wed, 13 May 2026 22:15:54 +1000
Subject: [PATCH] cli_args/execution: Implement lower background cache-ram threshold

Limit the amount of RAM background intermediates can use, so that
switching workflows doesn't degrade performance too much.
---
 comfy/cli_args.py |  7 ++++---
 execution.py      |  3 ++-
 main.py           | 14 ++++++++++----
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/comfy/cli_args.py b/comfy/cli_args.py
index 9dadb0093..e0d7d4af4 100644
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@@ -110,13 +110,11 @@ parser.add_argument("--preview-method", type=LatentPreviewMethod, default=Latent
 parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.")
 
-CACHE_RAM_AUTO_GB = -1.0
-
 cache_group = parser.add_mutually_exclusive_group()
 cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.")
 cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
 cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.")
-cache_group.add_argument("--cache-ram", nargs='?', const=CACHE_RAM_AUTO_GB, type=float, default=0, help="Use RAM pressure caching with the specified headroom threshold. If available RAM drops below the threshold the cache removes large items to free RAM. Default (when no value is provided): 25%% of system RAM (min 4GB, max 32GB).")
+cache_group.add_argument("--cache-ram", nargs='*', type=float, default=None, metavar="GB", help="Use RAM pressure caching with the specified headroom thresholds. The first value sets the active-cache threshold; the optional second value sets the inactive-cache/pin threshold. Defaults when no values are provided: active 25%% of system RAM (min 4GB, max 32GB), inactive 75%% of system RAM (min 12GB, max 96GB).")
 
 attn_group = parser.add_mutually_exclusive_group()
 attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.")
@@ -246,6 +244,9 @@ if comfy.options.args_parsing:
 else:
     args = parser.parse_args([])
 
+if args.cache_ram is not None and len(args.cache_ram) > 2:
+    parser.error("--cache-ram accepts at most two values: active GB and inactive GB")
+
 if args.windows_standalone_build:
     args.auto_launch = True
 
diff --git a/execution.py b/execution.py
index 5605f09e7..9c3968810 100644
--- a/execution.py
+++ b/execution.py
@@ -728,6 +728,7 @@ class PromptExecutor:
         self._notify_prompt_lifecycle("start", prompt_id)
 
         ram_headroom = int(self.cache_args["ram"] * (1024 ** 3))
+        ram_inactive_headroom = int(self.cache_args["ram_inactive"] * (1024 ** 3))
         ram_release_callback = self.caches.outputs.ram_release if self.cache_type == CacheType.RAM_PRESSURE else None
         comfy.memory_management.set_ram_cache_release_state(ram_release_callback, ram_headroom)
 
@@ -781,7 +782,7 @@ class PromptExecutor:
                 execution_list.complete_node_execution()
 
         if self.cache_type == CacheType.RAM_PRESSURE:
-            ram_release_callback(ram_headroom)
+            ram_release_callback(ram_inactive_headroom)
             ram_shortfall = ram_headroom - psutil.virtual_memory().available
             comfy.model_management.free_pins(ram_shortfall)
             ram_release_callback(ram_headroom, free_active=True)
diff --git a/main.py b/main.py
index a6fdaf43c..ad9742252 100644
--- a/main.py
+++ b/main.py
@@ -283,19 +283,25 @@ def _collect_output_absolute_paths(history_result: dict) -> list[str]:
 def prompt_worker(q, server_instance):
     current_time: float = 0.0
 
-    cache_ram = args.cache_ram
-    if cache_ram < 0:
+    cache_ram = 0
+    cache_ram_inactive = 0
+    if args.cache_ram is not None:
         cache_ram = min(32.0, max(4.0, comfy.model_management.total_ram * 0.25 / 1024.0))
+        cache_ram_inactive = min(96.0, max(12.0, comfy.model_management.total_ram * 0.75 / 1024.0))
+        if len(args.cache_ram) > 0:
+            cache_ram = args.cache_ram[0]
+        if len(args.cache_ram) > 1:
+            cache_ram_inactive = args.cache_ram[1]
     cache_type = execution.CacheType.CLASSIC
     if args.cache_lru > 0:
         cache_type = execution.CacheType.LRU
-    elif cache_ram > 0:
+    elif max(cache_ram, cache_ram_inactive) > 0:
         cache_type = execution.CacheType.RAM_PRESSURE
     elif args.cache_none:
         cache_type = execution.CacheType.NONE
 
-    e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_args={ "lru" : args.cache_lru, "ram" : cache_ram } )
+    e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_args={ "lru" : args.cache_lru, "ram" : cache_ram, "ram_inactive" : cache_ram_inactive } )
     last_gc_collect = 0
     need_gc = False
     gc_collect_interval = 10.0