From d531e3fb2a885d675d5b6d3a496b4af5d9757af1 Mon Sep 17 00:00:00 2001
From: rattus <46076784+rattus128@users.noreply.github.com>
Date: Wed, 4 Mar 2026 07:47:44 -0800
Subject: [PATCH] model_patcher: Improve dynamic offload heuristic (#12759)

Define a threshold below which a weight loading takes priority. This
actually makes the offload consistent with non-dynamic, because what
happens, is when non-dynamic fills ints to_load list, it will fill-up
any left-over pieces that could fix large weights with small weights
and load them, even though they were lower priority. This actually
improves performance because the timy weights dont cost any VRAM and
arent worth the control overhead of the DMA etc.
---
 comfy/model_patcher.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py
index 70f78a089..168ce8430 100644
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -699,7 +699,7 @@ class ModelPatcher:
         for key in list(self.pinned):
             self.unpin_weight(key)
 
-    def _load_list(self, prio_comfy_cast_weights=False, default_device=None):
+    def _load_list(self, for_dynamic=False, default_device=None):
         loading = []
         for n, m in self.model.named_modules():
             default = False
@@ -727,8 +727,13 @@ class ModelPatcher:
                         return 0
                     module_offload_mem += check_module_offload_mem("{}.weight".format(n))
                     module_offload_mem += check_module_offload_mem("{}.bias".format(n))
-                prepend = (not hasattr(m, "comfy_cast_weights"),) if prio_comfy_cast_weights else ()
-                loading.append(prepend + (module_offload_mem, module_mem, n, m, params))
+                # Dynamic: small weights (<64KB) first, then larger weights prioritized by size.
+                # Non-dynamic: prioritize by module offload cost.
+                if for_dynamic:
+                    sort_criteria = (module_offload_mem >= 64 * 1024, -module_offload_mem)
+                else:
+                    sort_criteria = (module_offload_mem,)
+                loading.append(sort_criteria + (module_mem, n, m, params))
         return loading
 
     def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False, full_load=False):
@@ -1508,11 +1513,11 @@ class ModelPatcherDynamic(ModelPatcher):
             if vbar is not None:
                 vbar.prioritize()
 
-            loading = self._load_list(prio_comfy_cast_weights=True, default_device=device_to)
-            loading.sort(reverse=True)
+            loading = self._load_list(for_dynamic=True, default_device=device_to)
+            loading.sort()
 
             for x in loading:
-                _, _, _, n, m, params = x
+                *_, module_mem, n, m, params = x
 
                 def set_dirty(item, dirty):
                     if dirty or not hasattr(item, "_v_signature"):
@@ -1627,9 +1632,9 @@ class ModelPatcherDynamic(ModelPatcher):
         return freed
 
     def partially_unload_ram(self, ram_to_unload):
-        loading = self._load_list(prio_comfy_cast_weights=True, default_device=self.offload_device)
+        loading = self._load_list(for_dynamic=True, default_device=self.offload_device)
         for x in loading:
-            _, _, _, _, m, _ = x
+            *_, m, _ = x
             ram_to_unload -= comfy.pinned_memory.unpin_memory(m)
             if ram_to_unload <= 0:
                 return