diff --git a/comfy/model_management.py b/comfy/model_management.py
index 6f464e8f9..195d0bb9a 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -398,7 +398,7 @@ try:
                         ENABLE_PYTORCH_ATTENTION = True
                 if rocm_version >= (7, 0):
                     if any((a in arch) for a in ["gfx1201"]):
-                        ENABLE_PYTORCH_ATTENTION = True
+                        ENABLE_PYTORCH_ATTENTION = True
         if torch_version_numeric >= (2, 7) and rocm_version >= (6, 4):
             if any((a in arch) for a in ["gfx1200", "gfx1201", "gfx950"]): # TODO: more arches, "gfx942" gives error on pytorch nightly 2.10 1013 rocm7.0
                 SUPPORT_FP8_OPS = True
@@ -641,6 +641,42 @@ def minimum_inference_memory():
     return (1024 * 1024 * 1024) * 0.8 + extra_reserved_memory()
 
 
+def trim_memory() -> bool:
+    """
+    Trims memory usage, returning reserved memory to the system
+
+    Only supported on Windows and Linux
+    :return: True if memory was successfully trimmed, otherwise False
+    """
+    try:
+        if sys.platform.startswith('linux'):
+            import ctypes.util
+            libc_path = ctypes.util.find_library('c')
+            if not libc_path:
+                return False
+
+            libc = ctypes.CDLL(libc_path)
+
+            if hasattr(libc, 'malloc_trim'):
+                return libc.malloc_trim(0) == 1  # malloc_trim returns 1 if memory was released
+            else:
+                return False
+        elif sys.platform == 'win32':
+            import ctypes.wintypes
+            kernel32 = ctypes.WinDLL("kernel32")
+            EmptyWorkingSet = kernel32.K32EmptyWorkingSet  # EmptyWorkingSet is exported from kernel32 as K32EmptyWorkingSet
+            EmptyWorkingSet.argtypes = [ctypes.wintypes.HANDLE]
+            EmptyWorkingSet.restype = ctypes.wintypes.BOOL
+            handle = -1  # pseudo-handle for the current process (GetCurrentProcess())
+            success = EmptyWorkingSet(handle)
+            return bool(success)
+        else:
+            return False
+    except Exception as exc_info:
+        logger.warning("failed to trim", exc_info=exc_info)
+        return False
+
+
 @tracer.start_as_current_span("Free Memory")
 def free_memory(memory_required, device, keep_loaded=[]) -> List[LoadedModel]:
     span = get_current_span()
@@ -1593,6 +1629,7 @@ def _soft_empty_cache(force=False):
 def unload_all_models():
     with model_management_lock:
         free_memory(1e30, get_torch_device())
+        trim_memory()
 
 
 @_deprecate_method(version="*", message="The comfy.model_management.resolve_lowvram_weight function will be removed soon, please stop using it.")
diff --git a/comfy_extras/nodes/nodes_group_offloading.py b/comfy_extras/nodes/nodes_group_offloading.py
index 635367e53..9ff07025f 100644
--- a/comfy_extras/nodes/nodes_group_offloading.py
+++ b/comfy_extras/nodes/nodes_group_offloading.py
@@ -1,9 +1,10 @@
 import torch
+import logging
 
 from diffusers import HookRegistry
 from diffusers.hooks import apply_group_offloading, apply_layerwise_casting, ModelHook
 from comfy.language.transformers_model_management import TransformersManagedModel
-from comfy.model_management import vram_state, VRAMState
+from comfy.model_management import vram_state, VRAMState, unload_all_models, get_free_memory, get_torch_device
 from comfy.model_management_types import HooksSupport, ModelManageable
 from comfy.model_patcher import ModelPatcher
 from comfy.node_helpers import export_custom_nodes
@@ -14,6 +15,8 @@ from comfy.rmsnorm import RMSNorm
 
 _DISABLE_COMFYUI_CASTING_HOOK = "disable_comfyui_casting_hook"
 
+logger = logging.getLogger(__name__)
+
 
 class DisableComfyWeightCast(ModelHook):
     r"""
@@ -75,6 +78,10 @@ def prepare_group_offloading_factory(load_device: torch.device, offload_device:
     def wrapper(executor, model: ModelPatcher, *args, **kwargs):
         # this model will now just be loaded to CPU, since diffusers will manage moving to gpu
         model.load_device = offload_device
+
+        # unload (and trim) everything else first so that host memory can be pinned more effectively
+        unload_all_models()
+
         # loads the model, prepares everything
         inner_model, conds, models = executor(model, *args, **kwargs)
 
@@ -83,13 +90,23 @@ def prepare_group_offloading_factory(load_device: torch.device, offload_device:
             raise ValueError("manual casting operations, where the model is loaded in different weights than inference will occur, is not supported")
 
         # weights are patched, ready to go, inner model will be correctly deleted at the end of sampling
+        model_size = model.model_size()
+
+        model_too_large = model_size * 2 > get_free_memory(torch.device("cpu"))
+        low_vram_state = vram_state in (VRAMState.LOW_VRAM,)
+        is_cuda_device = load_device.type == 'cuda'
+
+        if model_too_large or low_vram_state:
+            logger.error(f"group offloading did not use memory pinning because model_too_large={model_too_large} low_vram_state={low_vram_state}")
+        if not is_cuda_device:
+            logger.error(f"group offloading did not use stream because load_device.type={load_device.type} != \"cuda\"")
         apply_group_offloading(
             inner_model.diffusion_model,
             load_device,
             offload_device,
-            use_stream=True,
-            record_stream=True,
-            low_cpu_mem_usage=vram_state in (VRAMState.LOW_VRAM,),
+            use_stream=is_cuda_device,
+            record_stream=is_cuda_device,
+            low_cpu_mem_usage=low_vram_state or model_too_large,
             num_blocks_per_group=1
         )
         # then the inputs will be ready on the correct device due to the wrapper factory
@@ -139,7 +156,7 @@ class GroupOffload(CustomNode):
                 num_blocks_per_group=1
             )
         elif isinstance(model, HooksSupport) and isinstance(model, ModelManageable):
-            model.add_wrapper(WrappersMP.PREPARE_SAMPLING, prepare_group_offloading_factory(model.load_device, model.offload_device))
+            model.add_wrapper_with_key(WrappersMP.PREPARE_SAMPLING, "group_offload", prepare_group_offloading_factory(model.load_device, model.offload_device))
 
         return model,
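
Reviewer note: the snippet below is a minimal standalone sketch, not part of the patch, for sanity-checking the trimming approach used by trim_memory() above. It reproduces the glibc malloc_trim call on Linux and the K32EmptyWorkingSet call on Windows and prints the process RSS before and after, so the effect is observable. It assumes psutil is available; the helper names rss_mb and trim are local to the sketch, not identifiers from the patch.

# Standalone sketch: exercises the same trimming calls as trim_memory() and
# reports process RSS so the effect can be observed. Assumes psutil is installed.
import ctypes.util
import sys

import psutil


def rss_mb() -> float:
    """Resident set size of the current process, in MiB."""
    return psutil.Process().memory_info().rss / (1024 * 1024)


def trim() -> bool:
    """Return freed-but-reserved allocator memory to the OS, as trim_memory() does."""
    if sys.platform.startswith('linux'):
        libc_path = ctypes.util.find_library('c')
        if not libc_path:
            return False
        libc = ctypes.CDLL(libc_path)
        if hasattr(libc, 'malloc_trim'):
            return libc.malloc_trim(0) == 1  # 1 means memory was released
        return False
    elif sys.platform == 'win32':
        import ctypes.wintypes
        kernel32 = ctypes.WinDLL("kernel32")
        empty_working_set = kernel32.K32EmptyWorkingSet
        empty_working_set.argtypes = [ctypes.wintypes.HANDLE]
        empty_working_set.restype = ctypes.wintypes.BOOL
        return bool(empty_working_set(-1))  # -1 is the current-process pseudo-handle
    return False


if __name__ == "__main__":
    baseline = rss_mb()
    # Allocate ~400 MiB in small heap blocks, then free them; the C allocator
    # typically keeps the freed pages reserved until they are explicitly trimmed.
    garbage = [bytes(4096) for _ in range(100_000)]
    del garbage
    print(f"rss baseline={baseline:.1f} MiB, before trim={rss_mb():.1f} MiB")
    trimmed = trim()
    print(f"after trim={rss_mb():.1f} MiB, trimmed={trimmed}")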