Merge d56887ac52 into cf758bd256

2026-05-23 15:37:27 +08:00 · 2026-05-01 21:51:06 +02:00
13 changed files with 103 additions and 282 deletions
--- a/.ci/windows_amd_base_files/run_amd_gpu_disable_smart_memory.bat
+++ b/.ci/windows_amd_base_files/run_amd_gpu_disable_smart_memory.bat
@ -1,2 +1,2 @@
-.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --enable-dynamic-vram
+.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --disable-smart-memory
 pause
--- a/2
+++ b/2
@ -1,2 +1,2 @@
 # Admins
-* @comfyanonymous @kosinkadink @guill @alexisrolland @rattus128 @kijai
+* @comfyanonymous @kosinkadink @guill @alexisrolland @rattus128
--- a/README.md
+++ b/README.md
@ -193,15 +193,13 @@ If you have trouble extracting it, right click the file -> properties -> unblock

 The portable above currently comes with python 3.13 and pytorch cuda 13.0. Update your Nvidia drivers if it doesn't start.

-#### All Official Portable Downloads:
+#### Alternative Downloads:

 [Portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z)

-[Portable for Intel GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_intel.7z)
+[Experimental portable for Intel GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_intel.7z)

-[Portable for Nvidia GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia.7z) (supports 20 series and above).
-
-[Portable for Nvidia GPUs with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs).
+[Portable with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs).

 #### How do I share models between another UI and ComfyUI?

--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@ -90,6 +90,7 @@ parser.add_argument("--force-channels-last", action="store_true", help="Force ch
 parser.add_argument("--directml", type=int, nargs="?", metavar="DIRECTML_DEVICE", const=-1, help="Use torch-directml.")

 parser.add_argument("--oneapi-device-selector", type=str, default=None, metavar="SELECTOR_STRING", help="Sets the oneAPI device(s) this instance will use.")
+parser.add_argument("--disable-ipex-optimize", action="store_true", help="Disables ipex.optimize default when loading models with Intel's Extension for Pytorch.")
 parser.add_argument("--supports-fp8-compute", action="store_true", help="ComfyUI will act like if the device supports fp8 compute.")

 class LatentPreviewMethod(enum.Enum):
--- a/comfy/ldm/lightricks/av_model.py
+++ b/comfy/ldm/lightricks/av_model.py
@ -16,7 +16,6 @@ from comfy.ldm.lightricks.model import (
 from comfy.ldm.lightricks.symmetric_patchifier import AudioPatchifier
 from comfy.ldm.lightricks.embeddings_connector import Embeddings1DConnector
 import comfy.ldm.common_dit
-import comfy.model_prefetch

 class CompressedTimestep:
    """Store video timestep embeddings in compressed form using per-frame indexing."""
@ -908,11 +907,9 @@ class LTXAVModel(LTXVModel):
        """Process transformer blocks for LTXAV."""
        patches_replace = transformer_options.get("patches_replace", {})
        blocks_replace = patches_replace.get("dit", {})
-        prefetch_queue = comfy.model_prefetch.make_prefetch_queue(list(self.transformer_blocks), vx.device, transformer_options)

        # Process transformer blocks
        for i, block in enumerate(self.transformer_blocks):
-            comfy.model_prefetch.prefetch_queue_pop(prefetch_queue, vx.device, block)
            if ("double_block", i) in blocks_replace:

                def block_wrap(args):
@ -985,8 +982,6 @@ class LTXAVModel(LTXVModel):
                    a_prompt_timestep=a_prompt_timestep,
                )

-        comfy.model_prefetch.prefetch_queue_pop(prefetch_queue, vx.device, None)
-
        return [vx, ax]

    def _process_output(self, x, embedded_timestep, keyframe_idxs, **kwargs):
--- a/comfy/lora.py
+++ b/comfy/lora.py
@ -17,7 +17,6 @@
 """

 from __future__ import annotations
-import comfy.memory_management
 import comfy.utils
 import comfy.model_management
 import comfy.model_base
@ -474,17 +473,3 @@ def calculate_weight(patches, weight, key, intermediate_dtype=torch.float32, ori
            weight = old_weight

    return weight
-
-def prefetch_prepared_value(value, allocate_buffer, stream):
-    if isinstance(value, torch.Tensor):
-        dest = allocate_buffer(comfy.memory_management.vram_aligned_size(value))
-        comfy.model_management.cast_to_gathered([value], dest, non_blocking=True, stream=stream)
-        return comfy.memory_management.interpret_gathered_like([value], dest)[0]
-    elif isinstance(value, weight_adapter.WeightAdapterBase):
-        return type(value)(value.loaded_keys, prefetch_prepared_value(value.weights, allocate_buffer, stream))
-    elif isinstance(value, tuple):
-        return tuple(prefetch_prepared_value(item, allocate_buffer, stream) for item in value)
-    elif isinstance(value, list):
-        return [prefetch_prepared_value(item, allocate_buffer, stream) for item in value]
-
-    return value
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@ -214,11 +214,6 @@ class BaseModel(torch.nn.Module):
        if "latent_shapes" in extra_conds:
            xc = utils.unpack_latents(xc, extra_conds.pop("latent_shapes"))

-        transformer_options = transformer_options.copy()
-        transformer_options["prefetch_dynamic_vbars"] = (
-            self.current_patcher is not None and self.current_patcher.is_dynamic()
-        )
-
        model_output = self.diffusion_model(xc, t, context=context, control=control, transformer_options=transformer_options, **extra_conds)
        if len(model_output) > 1 and not torch.is_tensor(model_output):
            model_output, _ = utils.pack_latents(model_output)
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@ -31,7 +31,6 @@ from contextlib import nullcontext
 import comfy.memory_management
 import comfy.utils
 import comfy.quant_ops
-import comfy_aimdo.vram_buffer

 class VRAMState(Enum):
    DISABLED = 0    #No vram present: no need to move models to vram
@ -113,6 +112,10 @@ if args.directml is not None:
    # torch_directml.disable_tiled_resources(True)
    lowvram_available = False #TODO: need to find a way to get free memory in directml before this can be enabled by default.

+try:
+    import intel_extension_for_pytorch as ipex  # noqa: F401
+except Exception:  # optional dependency: any import/load failure just leaves 'ipex' undefined; Ctrl-C/SystemExit still propagate
+    pass

 try:
    _ = torch.xpu.device_count()
@ -580,6 +583,9 @@ class LoadedModel:

        real_model = self.model.model

+        if is_intel_xpu() and not args.disable_ipex_optimize and 'ipex' in globals() and real_model is not None:
+            with torch.no_grad():
+                real_model = ipex.optimize(real_model.eval(), inplace=True, graph_mode=True, concat_linear=True)

        self.real_model = weakref.ref(real_model)
        self.model_finalizer = weakref.finalize(real_model, cleanup_models)
@ -1176,10 +1182,6 @@ stream_counters = {}

 STREAM_CAST_BUFFERS = {}
 LARGEST_CASTED_WEIGHT = (None, 0)
-STREAM_AIMDO_CAST_BUFFERS = {}
-LARGEST_AIMDO_CASTED_WEIGHT = (None, 0)
-
-DEFAULT_AIMDO_CAST_BUFFER_RESERVATION_SIZE = 16 * 1024 ** 3

 def get_cast_buffer(offload_stream, device, size, ref):
    global LARGEST_CASTED_WEIGHT
@ -1213,26 +1215,13 @@ def get_cast_buffer(offload_stream, device, size, ref):

    return cast_buffer

-def get_aimdo_cast_buffer(offload_stream, device):
-    cast_buffer = STREAM_AIMDO_CAST_BUFFERS.get(offload_stream, None)
-    if cast_buffer is None:
-        cast_buffer = comfy_aimdo.vram_buffer.VRAMBuffer(DEFAULT_AIMDO_CAST_BUFFER_RESERVATION_SIZE, device.index)
-        STREAM_AIMDO_CAST_BUFFERS[offload_stream] = cast_buffer
-
-    return cast_buffer
 def reset_cast_buffers():
    global LARGEST_CASTED_WEIGHT
-    global LARGEST_AIMDO_CASTED_WEIGHT
-
    LARGEST_CASTED_WEIGHT = (None, 0)
-    LARGEST_AIMDO_CASTED_WEIGHT = (None, 0)
-    for offload_stream in set(STREAM_CAST_BUFFERS) | set(STREAM_AIMDO_CAST_BUFFERS):
-        if offload_stream is not None:
-            offload_stream.synchronize()
+    for offload_stream in STREAM_CAST_BUFFERS:
+        offload_stream.synchronize()
    synchronize()
-
    STREAM_CAST_BUFFERS.clear()
-    STREAM_AIMDO_CAST_BUFFERS.clear()
    soft_empty_cache()

 def get_offload_stream(device):
@ -1592,7 +1581,10 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma
        return False

    if is_intel_xpu():
-        return torch.xpu.get_device_properties(device).has_fp16
+        if torch_version_numeric < (2, 3):
+            return True
+        else:
+            return torch.xpu.get_device_properties(device).has_fp16

    if is_ascend_npu():
        return True
@ -1658,7 +1650,10 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
        return False

    if is_intel_xpu():
-        return torch.xpu.is_bf16_supported()
+        if torch_version_numeric < (2, 3):
+            return True
+        else:
+            return torch.xpu.is_bf16_supported()

    if is_ascend_npu():
        return True
@ -1789,7 +1784,6 @@ def soft_empty_cache(force=False):
    if cpu_state == CPUState.MPS:
        torch.mps.empty_cache()
    elif is_intel_xpu():
-        torch.xpu.synchronize()
        torch.xpu.empty_cache()
    elif is_ascend_npu():
        torch.npu.empty_cache()
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@ -121,20 +121,9 @@ class LowVramPatch:
        self.patches = patches
        self.convert_func = convert_func # TODO: remove
        self.set_func = set_func
-        self.prepared_patches = None
-
-    def prepare(self, allocate_buffer, stream):
-        self.prepared_patches = [
-            (patch[0], comfy.lora.prefetch_prepared_value(patch[1], allocate_buffer, stream), patch[2], patch[3], patch[4])
-            for patch in self.patches[self.key]
-        ]
-
-    def clear_prepared(self):
-        self.prepared_patches = None

    def __call__(self, weight):
-        patches = self.prepared_patches if self.prepared_patches is not None else self.patches[self.key]
-        return comfy.lora.calculate_weight(patches, weight, self.key, intermediate_dtype=weight.dtype)
+        return comfy.lora.calculate_weight(self.patches[self.key], weight, self.key, intermediate_dtype=weight.dtype)

 LOWVRAM_PATCH_ESTIMATE_MATH_FACTOR = 2

--- a/comfy/model_prefetch.py
+++ b/comfy/model_prefetch.py
@ -1,65 +0,0 @@
-import comfy_aimdo.model_vbar
-import comfy.model_management
-import comfy.ops
-
-PREFETCH_QUEUES = []
-
-def cleanup_prefetched_modules(comfy_modules):
-    for s in comfy_modules:
-        prefetch = getattr(s, "_prefetch", None)
-        if prefetch is None:
-            continue
-        for param_key in ("weight", "bias"):
-            lowvram_fn = getattr(s, param_key + "_lowvram_function", None)
-            if lowvram_fn is not None:
-                lowvram_fn.clear_prepared()
-        if prefetch["signature"] is not None:
-            comfy_aimdo.model_vbar.vbar_unpin(s._v)
-        delattr(s, "_prefetch")
-
-def cleanup_prefetch_queues():
-    global PREFETCH_QUEUES
-
-    for queue in PREFETCH_QUEUES:
-        for entry in queue:
-            if entry is None or not isinstance(entry, tuple):
-                continue
-            _, prefetch_state = entry
-            comfy_modules = prefetch_state[1]
-            if comfy_modules is not None:
-                cleanup_prefetched_modules(comfy_modules)
-    PREFETCH_QUEUES = []
-
-def prefetch_queue_pop(queue, device, module):
-    if queue is None:
-        return
-
-    consumed = queue.pop(0)
-    if consumed is not None:
-        offload_stream, prefetch_state = consumed
-        offload_stream.wait_stream(comfy.model_management.current_stream(device))
-        _, comfy_modules = prefetch_state
-        if comfy_modules is not None:
-            cleanup_prefetched_modules(comfy_modules)
-
-    prefetch = queue[0]
-    if prefetch is not None:
-        comfy_modules = []
-        for s in prefetch.modules():
-            if hasattr(s, "_v"):
-                comfy_modules.append(s)
-
-        offload_stream = comfy.ops.cast_modules_with_vbar(comfy_modules, None, device, None, True)
-        comfy.model_management.sync_stream(device, offload_stream)
-        queue[0] = (offload_stream, (prefetch, comfy_modules))
-
-def make_prefetch_queue(queue, device, transformer_options):
-    if (not transformer_options.get("prefetch_dynamic_vbars", False)
-        or comfy.model_management.NUM_STREAMS == 0
-        or comfy.model_management.is_device_cpu(device)
-        or not comfy.model_management.device_supports_non_blocking(device)):
-        return None
-
-    queue = [None] + queue + [None]
-    PREFETCH_QUEUES.append(queue)
-    return queue
--- a/comfy/ops.py
+++ b/comfy/ops.py
@ -86,61 +86,38 @@ def materialize_meta_param(s, param_keys):
            setattr(s, param_key, torch.nn.Parameter(torch.zeros(param.shape, dtype=param.dtype), requires_grad=param.requires_grad))


-# FIXME: add n=1 cache hit fast path
-def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blocking):
-    offload_stream = None
-    cast_buffer = None
-    cast_buffer_offset = 0
-
-    def ensure_offload_stream(module, required_size, check_largest):
-        nonlocal offload_stream
-        nonlocal cast_buffer
-
-        if offload_stream is None:
-            offload_stream = comfy.model_management.get_offload_stream(device)
-        if offload_stream is None or not check_largest or len(comfy_modules) != 1:
-            return
-
-        current_size = 0 if cast_buffer is None else cast_buffer.size()
-        if current_size < required_size and module is comfy.model_management.LARGEST_AIMDO_CASTED_WEIGHT[0]:
-            offload_stream = comfy.model_management.get_offload_stream(device)
-            cast_buffer = None
-        if required_size > comfy.model_management.LARGEST_AIMDO_CASTED_WEIGHT[1]:
-            comfy.model_management.LARGEST_AIMDO_CASTED_WEIGHT = (module, required_size)
-
-    def get_cast_buffer(buffer_size):
-        nonlocal offload_stream
-        nonlocal cast_buffer
-        nonlocal cast_buffer_offset
-
-        if buffer_size == 0:
-            return None
-
-        if offload_stream is None:
-            return torch.empty((buffer_size,), dtype=torch.uint8, device=device)
-
-        cast_buffer = comfy.model_management.get_aimdo_cast_buffer(offload_stream, device)
-        buffer = comfy_aimdo.torch.aimdo_to_tensor(cast_buffer.get(buffer_size, cast_buffer_offset), device)
-        cast_buffer_offset += buffer_size
-        return buffer
-
-    for s in comfy_modules:
-        signature = comfy_aimdo.model_vbar.vbar_fault(s._v)
-        resident = comfy_aimdo.model_vbar.vbar_signature_compare(signature, s._v_signature)
-        prefetch = {
-            "signature": signature,
-            "resident": resident,
-        }
-
-        if resident:
-            s._prefetch = prefetch
-            continue
-
+def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compute_dtype, want_requant):
+    #vbar doesn't support CPU weights, but some custom nodes have weird paths
+    #that might switch the layer to the CPU and expect it to work. We have to take
+    #a clone conservatively as we are mmapped and some SFT files are packed misaligned
+    #If you are a custom node author reading this, please move your layer to the GPU
+    #or declare your ModelPatcher as CPU in the first place.
+    if comfy.model_management.is_device_cpu(device):
+        materialize_meta_param(s, ["weight", "bias"])
+        weight = s.weight.to(dtype=dtype, copy=True)
+        if isinstance(weight, QuantizedTensor):
+            weight = weight.dequantize()
+        bias = None
+        if s.bias is not None:
+            bias = s.bias.to(dtype=bias_dtype, copy=True)
+        return weight, bias, (None, None, None)
+
+    offload_stream = None
+    xfer_dest = None
+
+    signature = comfy_aimdo.model_vbar.vbar_fault(s._v)
+    resident = comfy_aimdo.model_vbar.vbar_signature_compare(signature, s._v_signature)
+    if signature is not None:
+        if resident:
+            weight = s._v_weight
+            bias = s._v_bias
+        else:
+            xfer_dest = comfy_aimdo.torch.aimdo_to_tensor(s._v, device)
+
+    if not resident:
        materialize_meta_param(s, ["weight", "bias"])
-        xfer_dest = comfy_aimdo.torch.aimdo_to_tensor(s._v, device) if signature is not None else None
        cast_geometry = comfy.memory_management.tensors_to_geometries([ s.weight, s.bias ])
        cast_dest = None
-        needs_cast = False

        xfer_source = [ s.weight, s.bias ]

@ -152,15 +129,22 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin
            if data is None:
                continue
            if data.dtype != geometry.dtype:
-                needs_cast = True
                cast_dest = xfer_dest
+                if cast_dest is None:
+                    cast_dest = torch.empty((comfy.memory_management.vram_aligned_size(cast_geometry),), dtype=torch.uint8, device=device)
                xfer_dest = None
                break

        dest_size = comfy.memory_management.vram_aligned_size(xfer_source)
-        ensure_offload_stream(s, dest_size if xfer_dest is None else 0, True)
+        offload_stream = comfy.model_management.get_offload_stream(device)
+        if xfer_dest is None and offload_stream is not None:
+                xfer_dest = comfy.model_management.get_cast_buffer(offload_stream, device, dest_size, s)
+                if xfer_dest is None:
+                    offload_stream = comfy.model_management.get_offload_stream(device)
+                    xfer_dest = comfy.model_management.get_cast_buffer(offload_stream, device, dest_size, s)
        if xfer_dest is None:
-            xfer_dest = get_cast_buffer(dest_size)
+            xfer_dest = torch.empty((dest_size,), dtype=torch.uint8, device=device)
+            offload_stream = None

        if signature is None and pin is None:
            comfy.pinned_memory.pin_memory(s)
@ -173,54 +157,27 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin
            xfer_source = [ pin ]
        #send it over
        comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=offload_stream)
+        comfy.model_management.sync_stream(device, offload_stream)

-        for param_key in ("weight", "bias"):
-            lowvram_fn = getattr(s, param_key + "_lowvram_function", None)
-            if lowvram_fn is not None:
-                ensure_offload_stream(s, cast_buffer_offset, False)
-                lowvram_fn.prepare(lambda size: get_cast_buffer(size), offload_stream)
-
-        prefetch["xfer_dest"] = xfer_dest
-        prefetch["cast_dest"] = cast_dest
-        prefetch["cast_geometry"] = cast_geometry
-        prefetch["needs_cast"] = needs_cast
-        s._prefetch = prefetch
-
-    return offload_stream
-
-
-def resolve_cast_module_with_vbar(s, dtype, device, bias_dtype, compute_dtype, want_requant):
-
-    prefetch = getattr(s, "_prefetch", None)
-
-    if prefetch["resident"]:
-        weight = s._v_weight
-        bias = s._v_bias
-    else:
-        xfer_dest = prefetch["xfer_dest"]
-        if prefetch["needs_cast"]:
-            cast_dest = prefetch["cast_dest"] if prefetch["cast_dest"] is not None else torch.empty((comfy.memory_management.vram_aligned_size(prefetch["cast_geometry"]),), dtype=torch.uint8, device=device)
+        if cast_dest is not None:
            for pre_cast, post_cast in zip(comfy.memory_management.interpret_gathered_like([s.weight, s.bias ], xfer_dest),
-                                           comfy.memory_management.interpret_gathered_like(prefetch["cast_geometry"], cast_dest)):
+                                           comfy.memory_management.interpret_gathered_like(cast_geometry, cast_dest)):
                if post_cast is not None:
                    post_cast.copy_(pre_cast)
            xfer_dest = cast_dest

-        params = comfy.memory_management.interpret_gathered_like(prefetch["cast_geometry"], xfer_dest)
+        params = comfy.memory_management.interpret_gathered_like(cast_geometry, xfer_dest)
        weight = params[0]
        bias = params[1]
-        if prefetch["signature"] is not None:
+        if signature is not None:
            s._v_weight = weight
            s._v_bias = bias
-        s._v_signature = prefetch["signature"]
+        s._v_signature=signature

    def post_cast(s, param_key, x, dtype, resident, update_weight):
        lowvram_fn = getattr(s, param_key + "_lowvram_function", None)
        fns = getattr(s, param_key + "_function", [])

-        if x is None:
-            return None
-
        orig = x

        def to_dequant(tensor, dtype):
@ -248,12 +205,14 @@ def resolve_cast_module_with_vbar(s, dtype, device, bias_dtype, compute_dtype, w
            x = f(x)
        return x

-    update_weight = prefetch["signature"] is not None
-    weight = post_cast(s, "weight", weight, dtype, prefetch["resident"], update_weight)
-    if bias is not None:
-        bias = post_cast(s, "bias", bias, bias_dtype, prefetch["resident"], update_weight)
+    update_weight = signature is not None

-    return weight, bias
+    weight = post_cast(s, "weight", weight, dtype, resident, update_weight)
+    if s.bias is not None:
+        bias = post_cast(s, "bias", bias, bias_dtype, resident, update_weight)
+
+    #FIXME: weird offload return protocol
+    return weight, bias, (offload_stream, device if signature is not None else None, None)


 def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, offloadable=False, compute_dtype=None, want_requant=False):
@ -271,46 +230,10 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of
        if device is None:
            device = input.device

-    def format_return(result, offloadable):
-        weight, bias, offload_stream = result
-        return (weight, bias, offload_stream) if offloadable else (weight, bias)
-
    non_blocking = comfy.model_management.device_supports_non_blocking(device)

    if hasattr(s, "_v"):
-
-        #vbar doesn't support CPU weights, but some custom nodes have weird paths
-        #that might switch the layer to the CPU and expect it to work. We have to take
-        #a clone conservatively as we are mmapped and some SFT files are packed misaligned
-        #If you are a custom node author reading this, please move your layer to the GPU
-        #or declare your ModelPatcher as CPU in the first place.
-        if comfy.model_management.is_device_cpu(device):
-            materialize_meta_param(s, ["weight", "bias"])
-            weight = s.weight.to(dtype=dtype, copy=True)
-            if isinstance(weight, QuantizedTensor):
-                weight = weight.dequantize()
-            bias = s.bias.to(dtype=bias_dtype, copy=True) if s.bias is not None else None
-            return format_return((weight, bias, (None, None, None)), offloadable)
-
-        prefetched = hasattr(s, "_prefetch")
-        offload_stream = None
-        offload_device = None
-        if not prefetched:
-            offload_stream = cast_modules_with_vbar([s], dtype, device, bias_dtype, non_blocking)
-            comfy.model_management.sync_stream(device, offload_stream)
-
-        weight, bias = resolve_cast_module_with_vbar(s, dtype, device, bias_dtype, compute_dtype, want_requant)
-
-        if not prefetched:
-            if getattr(s, "_prefetch")["signature"] is not None:
-                offload_device = device
-            for param_key in ("weight", "bias"):
-                lowvram_fn = getattr(s, param_key + "_lowvram_function", None)
-                if lowvram_fn is not None:
-                    lowvram_fn.clear_prepared()
-            delattr(s, "_prefetch")
-        return format_return((weight, bias, (offload_stream, offload_device, None)), offloadable)
-
+        return cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compute_dtype, want_requant)

    if offloadable and (device != s.weight.device or
                        (s.bias is not None and device != s.bias.device)):
@ -357,7 +280,11 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of
        for f in s.weight_function:
            weight = f(weight)

-    return format_return((weight, bias, (offload_stream, weight_a, bias_a)), offloadable)
+    if offloadable:
+        return weight, bias, (offload_stream, weight_a, bias_a)
+    else:
+        #Legacy function signature
+        return weight, bias


 def uncast_bias_weight(s, weight, bias, offload_stream):
--- a/comfy_extras/nodes_sdpose.py
+++ b/comfy_extras/nodes_sdpose.py
@ -459,23 +459,27 @@ class SDPoseKeypointExtractor(io.ComfyNode):
        total_images = image.shape[0]
        captured_feat = None

-        model_w = int(head.heatmap_size[0]) * 4   # 192 * 4 = 768
-        model_h = int(head.heatmap_size[1]) * 4   # 256 * 4 = 1024
+        model_h = int(head.heatmap_size[0]) * 4   # e.g. 192 * 4 = 768
+        model_w = int(head.heatmap_size[1]) * 4   # e.g. 256 * 4 = 1024

        def _resize_to_model(imgs):
-            """Stretch BHWC images to (model_h, model_w), model expects no aspect preservation."""
+            """Aspect-preserving resize + zero-pad BHWC images to (model_h, model_w). Returns (resized_bhwc, scale, pad_top, pad_left)."""
            h, w = imgs.shape[-3], imgs.shape[-2]
-            method = "area" if (model_h <= h and model_w <= w) else "bilinear"
+            scale = min(model_h / h, model_w / w)
+            sh, sw = int(round(h * scale)), int(round(w * scale))
+            pt, pl = (model_h - sh) // 2, (model_w - sw) // 2
            chw = imgs.permute(0, 3, 1, 2).float()
-            scaled = comfy.utils.common_upscale(chw, model_w, model_h, upscale_method=method, crop="disabled")
-            return scaled.permute(0, 2, 3, 1), model_w / w, model_h / h
+            scaled = comfy.utils.common_upscale(chw, sw, sh, upscale_method="bilinear", crop="disabled")
+            padded = torch.zeros(scaled.shape[0], scaled.shape[1], model_h, model_w, dtype=scaled.dtype, device=scaled.device)
+            padded[:, :, pt:pt + sh, pl:pl + sw] = scaled
+            return padded.permute(0, 2, 3, 1), scale, pt, pl

-        def _remap_keypoints(kp, scale_x, scale_y, offset_x=0, offset_y=0):
+        def _remap_keypoints(kp, scale, pad_top, pad_left, offset_x=0, offset_y=0):
            """Remap keypoints from model space back to original image space."""
            kp = kp.copy() if isinstance(kp, np.ndarray) else np.array(kp, dtype=np.float32)
            invalid = kp[..., 0] < 0
-            kp[..., 0] = kp[..., 0] / scale_x + offset_x
-            kp[..., 1] = kp[..., 1] / scale_y + offset_y
+            kp[..., 0] = (kp[..., 0] - pad_left) / scale + offset_x
+            kp[..., 1] = (kp[..., 1] - pad_top)  / scale + offset_y
            kp[invalid] = -1
            return kp

@ -525,18 +529,18 @@ class SDPoseKeypointExtractor(io.ComfyNode):
                            continue

                        crop = img[:, y1:y2, x1:x2, :]  # (1, crop_h, crop_w, C)
-                        crop_resized, sx, sy = _resize_to_model(crop)
+                        crop_resized, scale, pad_top, pad_left = _resize_to_model(crop)

                        latent_crop = vae.encode(crop_resized)
                        kp_batch, sc_batch = _run_on_latent(latent_crop)
-                        kp = _remap_keypoints(kp_batch[0], sx, sy, x1, y1)
+                        kp = _remap_keypoints(kp_batch[0], scale, pad_top, pad_left, x1, y1)
                        img_keypoints.append(kp)
                        img_scores.append(sc_batch[0])
                else:
-                    img_resized, sx, sy = _resize_to_model(img)
+                    img_resized, scale, pad_top, pad_left = _resize_to_model(img)
                    latent_img = vae.encode(img_resized)
                    kp_batch, sc_batch = _run_on_latent(latent_img)
-                    img_keypoints.append(_remap_keypoints(kp_batch[0], sx, sy))
+                    img_keypoints.append(_remap_keypoints(kp_batch[0], scale, pad_top, pad_left))
                    img_scores.append(sc_batch[0])

                all_keypoints.append(img_keypoints)
@ -545,12 +549,12 @@ class SDPoseKeypointExtractor(io.ComfyNode):

        else: # full-image mode, batched
            for batch_start in tqdm(range(0, total_images, batch_size), desc="Extracting keypoints"):
-                batch_resized, sx, sy = _resize_to_model(image[batch_start:batch_start + batch_size])
+                batch_resized, scale, pad_top, pad_left = _resize_to_model(image[batch_start:batch_start + batch_size])
                latent_batch = vae.encode(batch_resized)
                kp_batch, sc_batch = _run_on_latent(latent_batch)

                for kp, sc in zip(kp_batch, sc_batch):
-                    all_keypoints.append([_remap_keypoints(kp, sx, sy)])
+                    all_keypoints.append([_remap_keypoints(kp, scale, pad_top, pad_left)])
                    all_scores.append([sc])

                pbar.update(len(kp_batch))
@ -723,13 +727,13 @@ class CropByBBoxes(io.ComfyNode):
                scale = min(output_width / crop_w, output_height / crop_h)
                scaled_w = int(round(crop_w * scale))
                scaled_h = int(round(crop_h * scale))
-                scaled = comfy.utils.common_upscale(crop_chw, scaled_w, scaled_h, upscale_method="area", crop="disabled")
+                scaled = comfy.utils.common_upscale(crop_chw, scaled_w, scaled_h, upscale_method="bilinear", crop="disabled")
                pad_left = (output_width  - scaled_w) // 2
                pad_top  = (output_height - scaled_h) // 2
                resized = torch.zeros(1, num_ch, output_height, output_width, dtype=image.dtype, device=image.device)
                resized[:, :, pad_top:pad_top + scaled_h, pad_left:pad_left + scaled_w] = scaled
            else:  # "stretch"
-                resized = comfy.utils.common_upscale(crop_chw, output_width, output_height, upscale_method="area", crop="disabled")
+                resized = comfy.utils.common_upscale(crop_chw, output_width, output_height, upscale_method="bilinear", crop="disabled")
            crops.append(resized)

        if not crops:
--- a/execution.py
+++ b/execution.py
@ -15,7 +15,6 @@ import torch
 from comfy.cli_args import args
 import comfy.memory_management
 import comfy.model_management
-import comfy.model_prefetch
 import comfy_aimdo.model_vbar

 from latent_preview import set_preview_method
@ -538,7 +537,6 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed,
                    if args.verbose == "DEBUG":
                        comfy_aimdo.control.analyze()
                    comfy.model_management.reset_cast_buffers()
-                    comfy.model_prefetch.cleanup_prefetch_queues()
                    comfy_aimdo.model_vbar.vbars_reset_watermark_limits()

            if has_pending_tasks: