From 37f711d4a1d429e6b390b01729510525155385e1 Mon Sep 17 00:00:00 2001
From: rattus <46076784+rattus128@users.noreply.github.com>
Date: Mon, 2 Feb 2026 14:34:46 -0800
Subject: [PATCH 1/3] mm: Fix cast buffers with intel offloading (#12229)

Intel has offloading support but there were some nvidia calls in the new
cast buffer stuff.
---
 comfy/model_management.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/comfy/model_management.py b/comfy/model_management.py
index 6b1166b94..2167f81bf 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -1112,11 +1112,11 @@ def get_cast_buffer(offload_stream, device, size, ref):
             return None
         if cast_buffer is not None and cast_buffer.numel() > 50 * (1024 ** 2):
             #I want my wrongly sized 50MB+ of VRAM back from the caching allocator right now
-            torch.cuda.synchronize()
+            synchronize()
             del STREAM_CAST_BUFFERS[offload_stream]
             del cast_buffer
             #FIXME: This doesn't work in Aimdo because mempool cant clear cache
-            torch.cuda.empty_cache()
+            soft_empty_cache()
         with wf_context:
             cast_buffer = torch.empty((size), dtype=torch.int8, device=device)
             STREAM_CAST_BUFFERS[offload_stream] = cast_buffer
@@ -1132,9 +1132,7 @@ def reset_cast_buffers():
     for offload_stream in STREAM_CAST_BUFFERS:
         offload_stream.synchronize()
     STREAM_CAST_BUFFERS.clear()
-    if comfy.memory_management.aimdo_allocator is None:
-        #Pytorch 2.7 and earlier crashes if you try and empty_cache when mempools exist
-        torch.cuda.empty_cache()
+    soft_empty_cache()
 
 def get_offload_stream(device):
     stream_counter = stream_counters.get(device, 0)
@@ -1284,7 +1282,7 @@ def discard_cuda_async_error():
         a = torch.tensor([1], dtype=torch.uint8, device=get_torch_device())
         b = torch.tensor([1], dtype=torch.uint8, device=get_torch_device())
         _ = a + b
-        torch.cuda.synchronize()
+        synchronize()
     except torch.AcceleratorError:
         #Dump it! We already know about it from the synchronous return
         pass
@@ -1688,6 +1686,12 @@ def lora_compute_dtype(device):
     LORA_COMPUTE_DTYPES[device] = dtype
     return dtype
 
+def synchronize():
+    if is_intel_xpu():
+        torch.xpu.synchronize()
+    elif torch.cuda.is_available():
+        torch.cuda.synchronize()
+
 def soft_empty_cache(force=False):
     global cpu_state
     if cpu_state == CPUState.MPS:
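
[Editor's note] The synchronize() helper added at the end of this patch is what makes the other hunks backend-agnostic. As a rough stand-alone sketch of the same dispatch, assuming a recent PyTorch with the torch.xpu backend and using a plain availability check in place of ComfyUI's is_intel_xpu() (the function name below is made up for illustration):

import torch

def synchronize_current_backend():
    # Wait for pending work on the Intel XPU queue when one is present,
    # otherwise fall back to CUDA; on CPU-only machines this is a no-op.
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        torch.xpu.synchronize()
    elif torch.cuda.is_available():
        torch.cuda.synchronize()

The torch.cuda.empty_cache() calls are swapped for soft_empty_cache() for the same reason: that existing helper already dispatches to the appropriate backend when clearing the allocator cache.
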
From de9ada6a4147f4626f903cd975a5d2c134af3915 Mon Sep 17 00:00:00 2001
From: rattus <46076784+rattus128@users.noreply.github.com>
Date: Mon, 2 Feb 2026 14:35:20 -0800
Subject: [PATCH 2/3] Dynamic VRAM unloading fix (#12227)

* mp: fix full dynamic unloading

Dynamic models were not being unloaded when a full unload was requested
via the unpatch() code path. This was fine if your workflow used only
dynamic models, but it caused large VRAM leaks whenever a regular
ModelPatcher needed to be fully unloaded.

This also fixes the "unload models" button.

* mm: load models outside of the Aimdo mempool

In dynamic_vram mode, escape the Aimdo mempool and load into the regular
mempool, using a dummy thread to do it.
---
 comfy/model_management.py | 29 ++++++++++++++++++++++-------
 comfy/model_patcher.py    |  2 +-
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/comfy/model_management.py b/comfy/model_management.py
index 2167f81bf..cd035f017 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -19,7 +19,8 @@ import psutil
 import logging
 from enum import Enum
 
-from comfy.cli_args import args, PerformanceFeature
+from comfy.cli_args import args, PerformanceFeature, enables_dynamic_vram
+import threading
 import torch
 import sys
 import platform
@@ -650,7 +651,7 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, ram_
     soft_empty_cache()
     return unloaded_models
 
-def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimum_memory_required=None, force_full_load=False):
+def load_models_gpu_orig(models, memory_required=0, force_patch_weights=False, minimum_memory_required=None, force_full_load=False):
     cleanup_models_gc()
     global vram_state
 
@@ -746,8 +747,25 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu
         current_loaded_models.insert(0, loaded_model)
     return
 
-def load_model_gpu(model):
-    return load_models_gpu([model])
+def load_models_gpu_thread(models, memory_required, force_patch_weights, minimum_memory_required, force_full_load):
+    with torch.inference_mode():
+        load_models_gpu_orig(models, memory_required, force_patch_weights, minimum_memory_required, force_full_load)
+    soft_empty_cache()
+
+def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimum_memory_required=None, force_full_load=False):
+    #Deliberately load models outside of the Aimdo mempool so they can be retained accross
+    #nodes. Use a dummy thread to do it as pytorch documents that mempool contexts are
+    #thread local. So exploit that to escape context
+    if enables_dynamic_vram():
+        t = threading.Thread(
+            target=load_models_gpu_thread,
+            args=(models, memory_required, force_patch_weights, minimum_memory_required, force_full_load)
+        )
+        t.start()
+        t.join()
+    else:
+        load_models_gpu_orig(models, memory_required=memory_required, force_patch_weights=force_patch_weights,
+                             minimum_memory_required=minimum_memory_required, force_full_load=force_full_load)
 
 def loaded_models(only_currently_used=False):
     output = []
@@ -1717,9 +1735,6 @@ def debug_memory_summary():
         return torch.cuda.memory.memory_summary()
     return ""
 
-#TODO: might be cleaner to put this somewhere else
-import threading
-
 class InterruptProcessingException(Exception):
     pass
 
diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py
index b70c031bf..cdf289395 100644
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -1597,7 +1597,7 @@ class ModelPatcherDynamic(ModelPatcher):
 
         if unpatch_weights:
             self.partially_unload_ram(1e32)
-            self.partially_unload(None)
+            self.partially_unload(None, 1e32)
 
     def partially_load(self, device_to, extra_memory=0, force_patch_weights=False):
         assert not force_patch_weights #See above
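
[Editor's note] The dummy-thread trick in this patch relies on PyTorch treating the mempool context (and inference mode) as thread-local state, which is what the comment in load_models_gpu() refers to. A rough stand-alone sketch of that escape hatch under the same assumption (the helper name is made up; ComfyUI's actual code is the load_models_gpu()/load_models_gpu_thread() pair in the hunk above):

import threading
import torch

def run_outside_caller_mempool(fn, *args, **kwargs):
    # Run fn in a short-lived worker thread. Because the mempool context is
    # thread-local, allocations made by fn go to the default caching-allocator
    # pool rather than to whatever mempool the calling thread has entered.
    result = {}

    def worker():
        # Inference mode is thread-local too, so it is re-entered here, the same
        # way load_models_gpu_thread() wraps its call in torch.inference_mode().
        with torch.inference_mode():
            result["value"] = fn(*args, **kwargs)

    t = threading.Thread(target=worker)
    t.start()
    t.join()  # block the caller, so this behaves like a direct synchronous call
    return result.get("value")

Joining immediately after start() keeps the call synchronous; the thread exists only for its fresh thread-local state, not for concurrency.
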
From c05a08ae66d8114c30f8d3606b9de3117cb61ef8 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Mon, 2 Feb 2026 16:52:07 -0800
Subject: [PATCH 3/3] Add back function. (#12234)

---
 comfy/model_management.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/comfy/model_management.py b/comfy/model_management.py
index cd035f017..72348258b 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -767,6 +767,9 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu
         load_models_gpu_orig(models, memory_required=memory_required, force_patch_weights=force_patch_weights,
                              minimum_memory_required=minimum_memory_required, force_full_load=force_full_load)
 
+def load_model_gpu(model):
+    return load_models_gpu([model])
+
 def loaded_models(only_currently_used=False):
     output = []
     for m in current_loaded_models: