Merge 2beca418ad into cf9cbec596

2026-05-25 00:17:23 +08:00 · 2026-05-01 11:37:14 -04:00
26 changed files with 830 additions and 296 deletions
--- a/.ci/windows_amd_base_files/run_amd_gpu_disable_smart_memory.bat
+++ b/.ci/windows_amd_base_files/run_amd_gpu_disable_smart_memory.bat
@ -1,2 +1,2 @@
-.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --enable-dynamic-vram
+.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --disable-smart-memory
 pause
--- a/2
+++ b/2
@ -1,2 +1,2 @@
 # Admins
-* @comfyanonymous @kosinkadink @guill @alexisrolland @rattus128 @kijai
+* @comfyanonymous @kosinkadink @guill @alexisrolland @rattus128
--- a/README.md
+++ b/README.md
@ -193,15 +193,13 @@ If you have trouble extracting it, right click the file -> properties -> unblock

 The portable above currently comes with python 3.13 and pytorch cuda 13.0. Update your Nvidia drivers if it doesn't start.

-#### All Official Portable Downloads:
+#### Alternative Downloads:

 [Portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z)

-[Portable for Intel GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_intel.7z)
+[Experimental portable for Intel GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_intel.7z)

-[Portable for Nvidia GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia.7z) (supports 20 series and above).
-
-[Portable for Nvidia GPUs with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs).
+[Portable with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs).

 #### How do I share models between another UI and ComfyUI?

--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@ -90,6 +90,7 @@ parser.add_argument("--force-channels-last", action="store_true", help="Force ch
 parser.add_argument("--directml", type=int, nargs="?", metavar="DIRECTML_DEVICE", const=-1, help="Use torch-directml.")

 parser.add_argument("--oneapi-device-selector", type=str, default=None, metavar="SELECTOR_STRING", help="Sets the oneAPI device(s) this instance will use.")
+parser.add_argument("--disable-ipex-optimize", action="store_true", help="Disables ipex.optimize default when loading models with Intel's Extension for Pytorch.")
 parser.add_argument("--supports-fp8-compute", action="store_true", help="ComfyUI will act like if the device supports fp8 compute.")

 class LatentPreviewMethod(enum.Enum):
--- a/comfy/ldm/lightricks/av_model.py
+++ b/comfy/ldm/lightricks/av_model.py
@ -16,7 +16,6 @@ from comfy.ldm.lightricks.model import (
 from comfy.ldm.lightricks.symmetric_patchifier import AudioPatchifier
 from comfy.ldm.lightricks.embeddings_connector import Embeddings1DConnector
 import comfy.ldm.common_dit
-import comfy.model_prefetch

 class CompressedTimestep:
    """Store video timestep embeddings in compressed form using per-frame indexing."""
@ -908,11 +907,9 @@ class LTXAVModel(LTXVModel):
        """Process transformer blocks for LTXAV."""
        patches_replace = transformer_options.get("patches_replace", {})
        blocks_replace = patches_replace.get("dit", {})
-        prefetch_queue = comfy.model_prefetch.make_prefetch_queue(list(self.transformer_blocks), vx.device, transformer_options)

        # Process transformer blocks
        for i, block in enumerate(self.transformer_blocks):
-            comfy.model_prefetch.prefetch_queue_pop(prefetch_queue, vx.device, block)
            if ("double_block", i) in blocks_replace:

                def block_wrap(args):
@ -985,8 +982,6 @@ class LTXAVModel(LTXVModel):
                    a_prompt_timestep=a_prompt_timestep,
                )

-        comfy.model_prefetch.prefetch_queue_pop(prefetch_queue, vx.device, None)
-
        return [vx, ax]

    def _process_output(self, x, embedded_timestep, keyframe_idxs, **kwargs):
--- a/comfy/lora.py
+++ b/comfy/lora.py
@ -17,7 +17,6 @@
 """

 from __future__ import annotations
-import comfy.memory_management
 import comfy.utils
 import comfy.model_management
 import comfy.model_base
@ -474,17 +473,3 @@ def calculate_weight(patches, weight, key, intermediate_dtype=torch.float32, ori
            weight = old_weight

    return weight
-
-def prefetch_prepared_value(value, allocate_buffer, stream):
-    if isinstance(value, torch.Tensor):
-        dest = allocate_buffer(comfy.memory_management.vram_aligned_size(value))
-        comfy.model_management.cast_to_gathered([value], dest, non_blocking=True, stream=stream)
-        return comfy.memory_management.interpret_gathered_like([value], dest)[0]
-    elif isinstance(value, weight_adapter.WeightAdapterBase):
-        return type(value)(value.loaded_keys, prefetch_prepared_value(value.weights, allocate_buffer, stream))
-    elif isinstance(value, tuple):
-        return tuple(prefetch_prepared_value(item, allocate_buffer, stream) for item in value)
-    elif isinstance(value, list):
-        return [prefetch_prepared_value(item, allocate_buffer, stream) for item in value]
-
-    return value
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@ -214,11 +214,6 @@ class BaseModel(torch.nn.Module):
        if "latent_shapes" in extra_conds:
            xc = utils.unpack_latents(xc, extra_conds.pop("latent_shapes"))

-        transformer_options = transformer_options.copy()
-        transformer_options["prefetch_dynamic_vbars"] = (
-            self.current_patcher is not None and self.current_patcher.is_dynamic()
-        )
-
        model_output = self.diffusion_model(xc, t, context=context, control=control, transformer_options=transformer_options, **extra_conds)
        if len(model_output) > 1 and not torch.is_tensor(model_output):
            model_output, _ = utils.pack_latents(model_output)
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@ -31,7 +31,6 @@ from contextlib import nullcontext
 import comfy.memory_management
 import comfy.utils
 import comfy.quant_ops
-import comfy_aimdo.vram_buffer

 class VRAMState(Enum):
    DISABLED = 0    #No vram present: no need to move models to vram
@ -113,6 +112,10 @@ if args.directml is not None:
    # torch_directml.disable_tiled_resources(True)
    lowvram_available = False #TODO: need to find a way to get free memory in directml before this can be enabled by default.

+try:
+    import intel_extension_for_pytorch as ipex  # noqa: F401
+except Exception:  # IPEX is optional (later code checks 'ipex' in globals()); ignore import failures but let KeyboardInterrupt/SystemExit propagate
+    pass

 try:
    _ = torch.xpu.device_count()
@ -580,6 +583,9 @@ class LoadedModel:

        real_model = self.model.model

+        if is_intel_xpu() and not args.disable_ipex_optimize and 'ipex' in globals() and real_model is not None:
+            with torch.no_grad():
+                real_model = ipex.optimize(real_model.eval(), inplace=True, graph_mode=True, concat_linear=True)

        self.real_model = weakref.ref(real_model)
        self.model_finalizer = weakref.finalize(real_model, cleanup_models)
@ -1176,10 +1182,6 @@ stream_counters = {}

 STREAM_CAST_BUFFERS = {}
 LARGEST_CASTED_WEIGHT = (None, 0)
-STREAM_AIMDO_CAST_BUFFERS = {}
-LARGEST_AIMDO_CASTED_WEIGHT = (None, 0)
-
-DEFAULT_AIMDO_CAST_BUFFER_RESERVATION_SIZE = 16 * 1024 ** 3

 def get_cast_buffer(offload_stream, device, size, ref):
    global LARGEST_CASTED_WEIGHT
@ -1213,26 +1215,13 @@ def get_cast_buffer(offload_stream, device, size, ref):

    return cast_buffer

-def get_aimdo_cast_buffer(offload_stream, device):
-    cast_buffer = STREAM_AIMDO_CAST_BUFFERS.get(offload_stream, None)
-    if cast_buffer is None:
-        cast_buffer = comfy_aimdo.vram_buffer.VRAMBuffer(DEFAULT_AIMDO_CAST_BUFFER_RESERVATION_SIZE, device.index)
-        STREAM_AIMDO_CAST_BUFFERS[offload_stream] = cast_buffer
-
-    return cast_buffer
 def reset_cast_buffers():
    global LARGEST_CASTED_WEIGHT
-    global LARGEST_AIMDO_CASTED_WEIGHT
-
    LARGEST_CASTED_WEIGHT = (None, 0)
-    LARGEST_AIMDO_CASTED_WEIGHT = (None, 0)
-    for offload_stream in set(STREAM_CAST_BUFFERS) | set(STREAM_AIMDO_CAST_BUFFERS):
-        if offload_stream is not None:
-            offload_stream.synchronize()
+    for offload_stream in STREAM_CAST_BUFFERS:
+        offload_stream.synchronize()
    synchronize()
-
    STREAM_CAST_BUFFERS.clear()
-    STREAM_AIMDO_CAST_BUFFERS.clear()
    soft_empty_cache()

 def get_offload_stream(device):
@ -1592,7 +1581,10 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma
        return False

    if is_intel_xpu():
-        return torch.xpu.get_device_properties(device).has_fp16
+        if torch_version_numeric < (2, 3):
+            return True
+        else:
+            return torch.xpu.get_device_properties(device).has_fp16

    if is_ascend_npu():
        return True
@ -1658,7 +1650,10 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
        return False

    if is_intel_xpu():
-        return torch.xpu.is_bf16_supported()
+        if torch_version_numeric < (2, 3):
+            return True
+        else:
+            return torch.xpu.is_bf16_supported()

    if is_ascend_npu():
        return True
@ -1789,7 +1784,6 @@ def soft_empty_cache(force=False):
    if cpu_state == CPUState.MPS:
        torch.mps.empty_cache()
    elif is_intel_xpu():
-        torch.xpu.synchronize()
        torch.xpu.empty_cache()
    elif is_ascend_npu():
        torch.npu.empty_cache()
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@ -121,20 +121,9 @@ class LowVramPatch:
        self.patches = patches
        self.convert_func = convert_func # TODO: remove
        self.set_func = set_func
-        self.prepared_patches = None
-
-    def prepare(self, allocate_buffer, stream):
-        self.prepared_patches = [
-            (patch[0], comfy.lora.prefetch_prepared_value(patch[1], allocate_buffer, stream), patch[2], patch[3], patch[4])
-            for patch in self.patches[self.key]
-        ]
-
-    def clear_prepared(self):
-        self.prepared_patches = None

    def __call__(self, weight):
-        patches = self.prepared_patches if self.prepared_patches is not None else self.patches[self.key]
-        return comfy.lora.calculate_weight(patches, weight, self.key, intermediate_dtype=weight.dtype)
+        return comfy.lora.calculate_weight(self.patches[self.key], weight, self.key, intermediate_dtype=weight.dtype)

 LOWVRAM_PATCH_ESTIMATE_MATH_FACTOR = 2

--- a/comfy/model_prefetch.py
+++ b/comfy/model_prefetch.py
@ -1,65 +0,0 @@
-import comfy_aimdo.model_vbar
-import comfy.model_management
-import comfy.ops
-
-PREFETCH_QUEUES = []
-
-def cleanup_prefetched_modules(comfy_modules):
-    for s in comfy_modules:
-        prefetch = getattr(s, "_prefetch", None)
-        if prefetch is None:
-            continue
-        for param_key in ("weight", "bias"):
-            lowvram_fn = getattr(s, param_key + "_lowvram_function", None)
-            if lowvram_fn is not None:
-                lowvram_fn.clear_prepared()
-        if prefetch["signature"] is not None:
-            comfy_aimdo.model_vbar.vbar_unpin(s._v)
-        delattr(s, "_prefetch")
-
-def cleanup_prefetch_queues():
-    global PREFETCH_QUEUES
-
-    for queue in PREFETCH_QUEUES:
-        for entry in queue:
-            if entry is None or not isinstance(entry, tuple):
-                continue
-            _, prefetch_state = entry
-            comfy_modules = prefetch_state[1]
-            if comfy_modules is not None:
-                cleanup_prefetched_modules(comfy_modules)
-    PREFETCH_QUEUES = []
-
-def prefetch_queue_pop(queue, device, module):
-    if queue is None:
-        return
-
-    consumed = queue.pop(0)
-    if consumed is not None:
-        offload_stream, prefetch_state = consumed
-        offload_stream.wait_stream(comfy.model_management.current_stream(device))
-        _, comfy_modules = prefetch_state
-        if comfy_modules is not None:
-            cleanup_prefetched_modules(comfy_modules)
-
-    prefetch = queue[0]
-    if prefetch is not None:
-        comfy_modules = []
-        for s in prefetch.modules():
-            if hasattr(s, "_v"):
-                comfy_modules.append(s)
-
-        offload_stream = comfy.ops.cast_modules_with_vbar(comfy_modules, None, device, None, True)
-        comfy.model_management.sync_stream(device, offload_stream)
-        queue[0] = (offload_stream, (prefetch, comfy_modules))
-
-def make_prefetch_queue(queue, device, transformer_options):
-    if (not transformer_options.get("prefetch_dynamic_vbars", False)
-        or comfy.model_management.NUM_STREAMS == 0
-        or comfy.model_management.is_device_cpu(device)
-        or not comfy.model_management.device_supports_non_blocking(device)):
-        return None
-
-    queue = [None] + queue + [None]
-    PREFETCH_QUEUES.append(queue)
-    return queue
--- a/comfy/ops.py
+++ b/comfy/ops.py
@ -86,61 +86,38 @@ def materialize_meta_param(s, param_keys):
            setattr(s, param_key, torch.nn.Parameter(torch.zeros(param.shape, dtype=param.dtype), requires_grad=param.requires_grad))


-# FIXME: add n=1 cache hit fast path
-def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blocking):
-    offload_stream = None
-    cast_buffer = None
-    cast_buffer_offset = 0
-
-    def ensure_offload_stream(module, required_size, check_largest):
-        nonlocal offload_stream
-        nonlocal cast_buffer
-
-        if offload_stream is None:
-            offload_stream = comfy.model_management.get_offload_stream(device)
-        if offload_stream is None or not check_largest or len(comfy_modules) != 1:
-            return
-
-        current_size = 0 if cast_buffer is None else cast_buffer.size()
-        if current_size < required_size and module is comfy.model_management.LARGEST_AIMDO_CASTED_WEIGHT[0]:
-            offload_stream = comfy.model_management.get_offload_stream(device)
-            cast_buffer = None
-        if required_size > comfy.model_management.LARGEST_AIMDO_CASTED_WEIGHT[1]:
-            comfy.model_management.LARGEST_AIMDO_CASTED_WEIGHT = (module, required_size)
-
-    def get_cast_buffer(buffer_size):
-        nonlocal offload_stream
-        nonlocal cast_buffer
-        nonlocal cast_buffer_offset
-
-        if buffer_size == 0:
-            return None
-
-        if offload_stream is None:
-            return torch.empty((buffer_size,), dtype=torch.uint8, device=device)
-
-        cast_buffer = comfy.model_management.get_aimdo_cast_buffer(offload_stream, device)
-        buffer = comfy_aimdo.torch.aimdo_to_tensor(cast_buffer.get(buffer_size, cast_buffer_offset), device)
-        cast_buffer_offset += buffer_size
-        return buffer
-
-    for s in comfy_modules:
-        signature = comfy_aimdo.model_vbar.vbar_fault(s._v)
-        resident = comfy_aimdo.model_vbar.vbar_signature_compare(signature, s._v_signature)
-        prefetch = {
-            "signature": signature,
-            "resident": resident,
-        }
-
-        if resident:
-            s._prefetch = prefetch
-            continue
-
+def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compute_dtype, want_requant):
+    #vbar doesn't support CPU weights, but some custom nodes have weird paths
+    #that might switch the layer to the CPU and expect it to work. We have to take
+    #a clone conservatively as we are mmapped and some SFT files are packed misaligned
+    #If you are a custom node author reading this, please move your layer to the GPU
+    #or declare your ModelPatcher as CPU in the first place.
+    if comfy.model_management.is_device_cpu(device):
+        materialize_meta_param(s, ["weight", "bias"])
+        weight = s.weight.to(dtype=dtype, copy=True)
+        if isinstance(weight, QuantizedTensor):
+            weight = weight.dequantize()
+        bias = None
+        if s.bias is not None:
+            bias = s.bias.to(dtype=bias_dtype, copy=True)
+        return weight, bias, (None, None, None)
+
+    offload_stream = None
+    xfer_dest = None
+
+    signature = comfy_aimdo.model_vbar.vbar_fault(s._v)
+    resident = comfy_aimdo.model_vbar.vbar_signature_compare(signature, s._v_signature)
+    if signature is not None:
+        if resident:
+            weight = s._v_weight
+            bias = s._v_bias
+        else:
+            xfer_dest = comfy_aimdo.torch.aimdo_to_tensor(s._v, device)
+
+    if not resident:
        materialize_meta_param(s, ["weight", "bias"])
-        xfer_dest = comfy_aimdo.torch.aimdo_to_tensor(s._v, device) if signature is not None else None
        cast_geometry = comfy.memory_management.tensors_to_geometries([ s.weight, s.bias ])
        cast_dest = None
-        needs_cast = False

        xfer_source = [ s.weight, s.bias ]

@ -152,15 +129,22 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin
            if data is None:
                continue
            if data.dtype != geometry.dtype:
-                needs_cast = True
                cast_dest = xfer_dest
+                if cast_dest is None:
+                    cast_dest = torch.empty((comfy.memory_management.vram_aligned_size(cast_geometry),), dtype=torch.uint8, device=device)
                xfer_dest = None
                break

        dest_size = comfy.memory_management.vram_aligned_size(xfer_source)
-        ensure_offload_stream(s, dest_size if xfer_dest is None else 0, True)
+        offload_stream = comfy.model_management.get_offload_stream(device)
+        if xfer_dest is None and offload_stream is not None:
+                xfer_dest = comfy.model_management.get_cast_buffer(offload_stream, device, dest_size, s)
+                if xfer_dest is None:
+                    offload_stream = comfy.model_management.get_offload_stream(device)
+                    xfer_dest = comfy.model_management.get_cast_buffer(offload_stream, device, dest_size, s)
        if xfer_dest is None:
-            xfer_dest = get_cast_buffer(dest_size)
+            xfer_dest = torch.empty((dest_size,), dtype=torch.uint8, device=device)
+            offload_stream = None

        if signature is None and pin is None:
            comfy.pinned_memory.pin_memory(s)
@ -173,54 +157,27 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin
            xfer_source = [ pin ]
        #send it over
        comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=offload_stream)
+        comfy.model_management.sync_stream(device, offload_stream)

-        for param_key in ("weight", "bias"):
-            lowvram_fn = getattr(s, param_key + "_lowvram_function", None)
-            if lowvram_fn is not None:
-                ensure_offload_stream(s, cast_buffer_offset, False)
-                lowvram_fn.prepare(lambda size: get_cast_buffer(size), offload_stream)
-
-        prefetch["xfer_dest"] = xfer_dest
-        prefetch["cast_dest"] = cast_dest
-        prefetch["cast_geometry"] = cast_geometry
-        prefetch["needs_cast"] = needs_cast
-        s._prefetch = prefetch
-
-    return offload_stream
-
-
-def resolve_cast_module_with_vbar(s, dtype, device, bias_dtype, compute_dtype, want_requant):
-
-    prefetch = getattr(s, "_prefetch", None)
-
-    if prefetch["resident"]:
-        weight = s._v_weight
-        bias = s._v_bias
-    else:
-        xfer_dest = prefetch["xfer_dest"]
-        if prefetch["needs_cast"]:
-            cast_dest = prefetch["cast_dest"] if prefetch["cast_dest"] is not None else torch.empty((comfy.memory_management.vram_aligned_size(prefetch["cast_geometry"]),), dtype=torch.uint8, device=device)
+        if cast_dest is not None:
            for pre_cast, post_cast in zip(comfy.memory_management.interpret_gathered_like([s.weight, s.bias ], xfer_dest),
-                                           comfy.memory_management.interpret_gathered_like(prefetch["cast_geometry"], cast_dest)):
+                                           comfy.memory_management.interpret_gathered_like(cast_geometry, cast_dest)):
                if post_cast is not None:
                    post_cast.copy_(pre_cast)
            xfer_dest = cast_dest

-        params = comfy.memory_management.interpret_gathered_like(prefetch["cast_geometry"], xfer_dest)
+        params = comfy.memory_management.interpret_gathered_like(cast_geometry, xfer_dest)
        weight = params[0]
        bias = params[1]
-        if prefetch["signature"] is not None:
+        if signature is not None:
            s._v_weight = weight
            s._v_bias = bias
-        s._v_signature = prefetch["signature"]
+        s._v_signature=signature

    def post_cast(s, param_key, x, dtype, resident, update_weight):
        lowvram_fn = getattr(s, param_key + "_lowvram_function", None)
        fns = getattr(s, param_key + "_function", [])

-        if x is None:
-            return None
-
        orig = x

        def to_dequant(tensor, dtype):
@ -248,12 +205,14 @@ def resolve_cast_module_with_vbar(s, dtype, device, bias_dtype, compute_dtype, w
            x = f(x)
        return x

-    update_weight = prefetch["signature"] is not None
-    weight = post_cast(s, "weight", weight, dtype, prefetch["resident"], update_weight)
-    if bias is not None:
-        bias = post_cast(s, "bias", bias, bias_dtype, prefetch["resident"], update_weight)
+    update_weight = signature is not None

-    return weight, bias
+    weight = post_cast(s, "weight", weight, dtype, resident, update_weight)
+    if s.bias is not None:
+        bias = post_cast(s, "bias", bias, bias_dtype, resident, update_weight)
+
+    #FIXME: weird offload return protocol
+    return weight, bias, (offload_stream, device if signature is not None else None, None)


 def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, offloadable=False, compute_dtype=None, want_requant=False):
@ -271,46 +230,10 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of
        if device is None:
            device = input.device

-    def format_return(result, offloadable):
-        weight, bias, offload_stream = result
-        return (weight, bias, offload_stream) if offloadable else (weight, bias)
-
    non_blocking = comfy.model_management.device_supports_non_blocking(device)

    if hasattr(s, "_v"):
-
-        #vbar doesn't support CPU weights, but some custom nodes have weird paths
-        #that might switch the layer to the CPU and expect it to work. We have to take
-        #a clone conservatively as we are mmapped and some SFT files are packed misaligned
-        #If you are a custom node author reading this, please move your layer to the GPU
-        #or declare your ModelPatcher as CPU in the first place.
-        if comfy.model_management.is_device_cpu(device):
-            materialize_meta_param(s, ["weight", "bias"])
-            weight = s.weight.to(dtype=dtype, copy=True)
-            if isinstance(weight, QuantizedTensor):
-                weight = weight.dequantize()
-            bias = s.bias.to(dtype=bias_dtype, copy=True) if s.bias is not None else None
-            return format_return((weight, bias, (None, None, None)), offloadable)
-
-        prefetched = hasattr(s, "_prefetch")
-        offload_stream = None
-        offload_device = None
-        if not prefetched:
-            offload_stream = cast_modules_with_vbar([s], dtype, device, bias_dtype, non_blocking)
-            comfy.model_management.sync_stream(device, offload_stream)
-
-        weight, bias = resolve_cast_module_with_vbar(s, dtype, device, bias_dtype, compute_dtype, want_requant)
-
-        if not prefetched:
-            if getattr(s, "_prefetch")["signature"] is not None:
-                offload_device = device
-            for param_key in ("weight", "bias"):
-                lowvram_fn = getattr(s, param_key + "_lowvram_function", None)
-                if lowvram_fn is not None:
-                    lowvram_fn.clear_prepared()
-            delattr(s, "_prefetch")
-        return format_return((weight, bias, (offload_stream, offload_device, None)), offloadable)
-
+        return cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compute_dtype, want_requant)

    if offloadable and (device != s.weight.device or
                        (s.bias is not None and device != s.bias.device)):
@ -357,7 +280,11 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of
        for f in s.weight_function:
            weight = f(weight)

-    return format_return((weight, bias, (offload_stream, weight_a, bias_a)), offloadable)
+    if offloadable:
+        return weight, bias, (offload_stream, weight_a, bias_a)
+    else:
+        #Legacy function signature
+        return weight, bias


 def uncast_bias_weight(s, weight, bias, offload_stream):
--- a/comfy_api_nodes/apis/moonvalley.py
+++ b/comfy_api_nodes/apis/moonvalley.py
@ -0,0 +1,152 @@
+from enum import Enum
+from typing import Optional, Dict, Any
+
+from pydantic import BaseModel, Field, StrictBytes
+
+
+class MoonvalleyPromptResponse(BaseModel):  # Pydantic model for a Moonvalley prompt response; every field is optional and defaults to None.
+    error: Optional[Dict[str, Any]] = None  # untyped str-keyed dict; its schema is not defined in this module
+    frame_conditioning: Optional[Dict[str, Any]] = None
+    id: Optional[str] = None
+    inference_params: Optional[Dict[str, Any]] = None  # untyped dict here, unlike the typed *InferenceParams models used by the request classes
+    meta: Optional[Dict[str, Any]] = None
+    model_params: Optional[Dict[str, Any]] = None  # NOTE(review): some pydantic v2 releases warn on field names starting with "model_" — confirm for the pinned version
+    output_url: Optional[str] = None  # presumably the URL of the generated output — TODO confirm against the API
+    prompt_text: Optional[str] = None
+    status: Optional[str] = None  # plain string, not an enum; valid values are not defined in this module
+
+
+class MoonvalleyTextToVideoInferenceParams(BaseModel):  # Tuning parameters for a text-to-video request; every field has a default, so an empty instance is valid.
+    add_quality_guidance: Optional[bool] = Field(
+        True, description='Whether to add quality guidance'
+    )
+    caching_coefficient: Optional[float] = Field(
+        0.3, description='Caching coefficient for optimization'
+    )
+    caching_cooldown: Optional[int] = Field(
+        3, description='Number of caching cooldown steps'
+    )
+    caching_warmup: Optional[int] = Field(
+        3, description='Number of caching warmup steps'
+    )
+    clip_value: Optional[float] = Field(  # default is written as the int literal 3 although the field is annotated float
+        3, description='CLIP value for generation control'
+    )
+    conditioning_frame_index: Optional[int] = Field(
+        0, description='Index of the conditioning frame'
+    )
+    cooldown_steps: Optional[int] = Field(  # default 75 here vs 36 in MoonvalleyVideoToVideoInferenceParams
+        75, description='Number of cooldown steps (calculated based on num_frames)'
+    )
+    fps: Optional[int] = Field(  # fps, height, num_frames and width exist only in this model, not in the video-to-video params
+        24, description='Frames per second of the generated video'
+    )
+    guidance_scale: Optional[float] = Field(  # default 10 here vs 15 in MoonvalleyVideoToVideoInferenceParams
+        10, description='Guidance scale for generation control'
+    )
+    height: Optional[int] = Field(
+        1080, description='Height of the generated video in pixels'
+    )
+    negative_prompt: Optional[str] = Field(None, description='Negative prompt text')
+    num_frames: Optional[int] = Field(64, description='Number of frames to generate')
+    seed: Optional[int] = Field(  # None is the default; per the description a random seed is used in that case
+        None, description='Random seed for generation (default: random)'
+    )
+    shift_value: Optional[float] = Field(
+        3, description='Shift value for generation control'
+    )
+    steps: Optional[int] = Field(80, description='Number of denoising steps')
+    use_guidance_schedule: Optional[bool] = Field(
+        True, description='Whether to use guidance scheduling'
+    )
+    use_negative_prompts: Optional[bool] = Field(  # False by default even though negative_prompt can be supplied separately
+        False, description='Whether to use negative prompts'
+    )
+    use_timestep_transform: Optional[bool] = Field(
+        True, description='Whether to use timestep transformation'
+    )
+    warmup_steps: Optional[int] = Field(  # default 0 here vs 24 in MoonvalleyVideoToVideoInferenceParams
+        0, description='Number of warmup steps (calculated based on num_frames)'
+    )
+    width: Optional[int] = Field(
+        1920, description='Width of the generated video in pixels'
+    )
+
+
+class MoonvalleyTextToVideoRequest(BaseModel):  # Request body model for text-to-video; all four fields are optional and default to None.
+    image_url: Optional[str] = None  # presumably an optional conditioning image — TODO confirm against the API
+    inference_params: Optional[MoonvalleyTextToVideoInferenceParams] = None  # typed tuning parameters; None leaves them unset in the request
+    prompt_text: Optional[str] = None  # optional here, whereas MoonvalleyVideoToVideoRequest requires prompt_text
+    webhook_url: Optional[str] = None
+
+
+class MoonvalleyUploadFileRequest(BaseModel):  # Model wrapping the raw bytes of a file to upload.
+    file: Optional[StrictBytes] = None  # pydantic StrictBytes; presumably rejects non-bytes input without coercion — confirm for the installed pydantic version
+
+
+class MoonvalleyUploadFileResponse(BaseModel):  # Model for the reply to a file upload.
+    access_url: Optional[str] = None  # presumably the URL at which the uploaded file can be fetched — TODO confirm; None when absent
+
+
+class MoonvalleyVideoToVideoInferenceParams(BaseModel):  # Tuning parameters for a video-to-video request; every field has a default. Has no fps/height/num_frames/width fields.
+    add_quality_guidance: Optional[bool] = Field(
+        True, description='Whether to add quality guidance'
+    )
+    caching_coefficient: Optional[float] = Field(
+        0.3, description='Caching coefficient for optimization'
+    )
+    caching_cooldown: Optional[int] = Field(
+        3, description='Number of caching cooldown steps'
+    )
+    caching_warmup: Optional[int] = Field(
+        3, description='Number of caching warmup steps'
+    )
+    clip_value: Optional[float] = Field(  # default is written as the int literal 3 although the field is annotated float
+        3, description='CLIP value for generation control'
+    )
+    conditioning_frame_index: Optional[int] = Field(
+        0, description='Index of the conditioning frame'
+    )
+    cooldown_steps: Optional[int] = Field(  # default 36 here vs 75 in MoonvalleyTextToVideoInferenceParams
+        36, description='Number of cooldown steps (calculated based on num_frames)'
+    )
+    guidance_scale: Optional[float] = Field(  # default 15 here vs 10 in MoonvalleyTextToVideoInferenceParams
+        15, description='Guidance scale for generation control'
+    )
+    negative_prompt: Optional[str] = Field(None, description='Negative prompt text')
+    seed: Optional[int] = Field(  # None is the default; per the description a random seed is used in that case
+        None, description='Random seed for generation (default: random)'
+    )
+    shift_value: Optional[float] = Field(
+        3, description='Shift value for generation control'
+    )
+    steps: Optional[int] = Field(80, description='Number of denoising steps')
+    use_guidance_schedule: Optional[bool] = Field(
+        True, description='Whether to use guidance scheduling'
+    )
+    use_negative_prompts: Optional[bool] = Field(
+        False, description='Whether to use negative prompts'
+    )
+    use_timestep_transform: Optional[bool] = Field(
+        True, description='Whether to use timestep transformation'
+    )
+    warmup_steps: Optional[int] = Field(  # default 24 here vs 0 in MoonvalleyTextToVideoInferenceParams
+        24, description='Number of warmup steps (calculated based on num_frames)'
+    )
+
+
+class ControlType(str, Enum):  # str-mixin Enum: members are str instances whose value equals the member name; used by MoonvalleyVideoToVideoRequest.control_type
+    motion_control = 'motion_control'
+    pose_control = 'pose_control'
+
+
+class MoonvalleyVideoToVideoRequest(BaseModel):  # Request body model for video-to-video; control_type, prompt_text and video_url are required (Field(...)).
+    control_type: ControlType = Field(  # required; must be one of the ControlType members
+        ..., description='Supported types for video control'
+    )
+    inference_params: Optional[MoonvalleyVideoToVideoInferenceParams] = None  # optional typed tuning parameters
+    prompt_text: str = Field(..., description='Describes the video to generate')
+    video_url: str = Field(..., description='Url to control video')
+    webhook_url: Optional[str] = Field(  # optional; defaults to None
+        None, description='Optional webhook URL for notifications'
+    )
--- a/comfy_api_nodes/nodes_bytedance.py
+++ b/comfy_api_nodes/nodes_bytedance.py
@ -1403,6 +1403,7 @@ class ByteDance2TextToVideoNode(IO.ComfyNode):
            status_extractor=lambda r: r.status,
            price_extractor=_seedance2_price_extractor(model_id, has_video_input=False),
            poll_interval=9,
+            max_poll_attempts=180,
        )
        return IO.NodeOutput(await download_url_to_video_output(response.content.video_url))

@ -1584,6 +1585,7 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode):
            status_extractor=lambda r: r.status,
            price_extractor=_seedance2_price_extractor(model_id, has_video_input=False),
            poll_interval=9,
+            max_poll_attempts=180,
        )
        return IO.NodeOutput(await download_url_to_video_output(response.content.video_url))

@ -1905,6 +1907,7 @@ class ByteDance2ReferenceNode(IO.ComfyNode):
            status_extractor=lambda r: r.status,
            price_extractor=_seedance2_price_extractor(model_id, has_video_input=has_video_input),
            poll_interval=9,
+            max_poll_attempts=180,
        )
        return IO.NodeOutput(await download_url_to_video_output(response.content.video_url))

--- a/comfy_api_nodes/nodes_hitpaw.py
+++ b/comfy_api_nodes/nodes_hitpaw.py
@ -178,6 +178,7 @@ class HitPawGeneralImageEnhance(IO.ComfyNode):
            status_extractor=lambda x: x.data.status,
            price_extractor=lambda x: request_price,
            poll_interval=10.0,
+            max_poll_attempts=480,
        )
        return IO.NodeOutput(await download_url_to_image_tensor(final_response.data.res_url))

@ -323,6 +324,7 @@ class HitPawVideoEnhance(IO.ComfyNode):
            status_extractor=lambda x: x.data.status,
            price_extractor=lambda x: request_price,
            poll_interval=10.0,
+            max_poll_attempts=320,
        )
        return IO.NodeOutput(await download_url_to_video_output(final_response.data.res_url))

--- a/comfy_api_nodes/nodes_kling.py
+++ b/comfy_api_nodes/nodes_kling.py
@ -276,6 +276,7 @@ async def finish_omni_video_task(cls: type[IO.ComfyNode], response: TaskStatusRe
        cls,
        ApiEndpoint(path=f"/proxy/kling/v1/videos/omni-video/{response.data.task_id}"),
        response_model=TaskStatusResponse,
+        max_poll_attempts=280,
        status_extractor=lambda r: (r.data.task_status if r.data else None),
    )
    return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url))
@ -3061,6 +3062,7 @@ class KlingVideoNode(IO.ComfyNode):
            cls,
            ApiEndpoint(path=poll_path),
            response_model=TaskStatusResponse,
+            max_poll_attempts=280,
            status_extractor=lambda r: (r.data.task_status if r.data else None),
        )
        return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url))
@ -3186,6 +3188,7 @@ class KlingFirstLastFrameNode(IO.ComfyNode):
            cls,
            ApiEndpoint(path=f"/proxy/kling/v1/videos/image2video/{response.data.task_id}"),
            response_model=TaskStatusResponse,
+            max_poll_attempts=280,
            status_extractor=lambda r: (r.data.task_status if r.data else None),
        )
        return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url))
--- a/comfy_api_nodes/nodes_magnific.py
+++ b/comfy_api_nodes/nodes_magnific.py
@ -230,6 +230,7 @@ class MagnificImageUpscalerCreativeNode(IO.ComfyNode):
            status_extractor=lambda x: x.status,
            price_extractor=lambda _: price_usd,
            poll_interval=10.0,
+            max_poll_attempts=480,
        )
        return IO.NodeOutput(await download_url_to_image_tensor(final_response.generated[0]))

@ -390,6 +391,7 @@ class MagnificImageUpscalerPreciseV2Node(IO.ComfyNode):
            status_extractor=lambda x: x.status,
            price_extractor=lambda _: price_usd,
            poll_interval=10.0,
+            max_poll_attempts=480,
        )
        return IO.NodeOutput(await download_url_to_image_tensor(final_response.generated[0]))

@ -539,6 +541,7 @@ class MagnificImageStyleTransferNode(IO.ComfyNode):
            response_model=TaskResponse,
            status_extractor=lambda x: x.status,
            poll_interval=10.0,
+            max_poll_attempts=480,
        )
        return IO.NodeOutput(await download_url_to_image_tensor(final_response.generated[0]))

@ -779,6 +782,7 @@ class MagnificImageRelightNode(IO.ComfyNode):
            response_model=TaskResponse,
            status_extractor=lambda x: x.status,
            poll_interval=10.0,
+            max_poll_attempts=480,
        )
        return IO.NodeOutput(await download_url_to_image_tensor(final_response.generated[0]))

@ -920,6 +924,7 @@ class MagnificImageSkinEnhancerNode(IO.ComfyNode):
            response_model=TaskResponse,
            status_extractor=lambda x: x.status,
            poll_interval=10.0,
+            max_poll_attempts=480,
        )
        return IO.NodeOutput(await download_url_to_image_tensor(final_response.generated[0]))

--- a/comfy_api_nodes/nodes_moonvalley.py
+++ b/comfy_api_nodes/nodes_moonvalley.py
@ -0,0 +1,534 @@
+import logging
+
+from typing_extensions import override
+
+from comfy_api.latest import IO, ComfyExtension, Input
+from comfy_api_nodes.apis.moonvalley import (
+    MoonvalleyPromptResponse,
+    MoonvalleyTextToVideoInferenceParams,
+    MoonvalleyTextToVideoRequest,
+    MoonvalleyVideoToVideoInferenceParams,
+    MoonvalleyVideoToVideoRequest,
+)
+from comfy_api_nodes.util import (
+    ApiEndpoint,
+    download_url_to_video_output,
+    poll_op,
+    sync_op,
+    trim_video,
+    upload_images_to_comfyapi,
+    upload_video_to_comfyapi,
+    validate_container_format_is_mp4,
+    validate_image_dimensions,
+    validate_string,
+)
+
+# Moonvalley routes, all under the "/proxy/moonvalley" prefix.
+# NOTE(review): API_UPLOADS_ENDPOINT is not referenced anywhere else in this module.
+API_UPLOADS_ENDPOINT = "/proxy/moonvalley/uploads"
+# Base path; get_response() appends "/{task_id}" to it when polling a task.
+API_PROMPTS_ENDPOINT = "/proxy/moonvalley/prompts"
+# Task-creation routes, one per node type.
+API_VIDEO2VIDEO_ENDPOINT = "/proxy/moonvalley/prompts/video-to-video"
+API_TXT2VIDEO_ENDPOINT = "/proxy/moonvalley/prompts/text-to-video"
+API_IMG2VIDEO_ENDPOINT = "/proxy/moonvalley/prompts/image-to-video"
+
+# Lower bounds for input images.
+# NOTE(review): not referenced in this module; the image-to-video node passes literal 300s instead.
+MIN_WIDTH = 300
+MIN_HEIGHT = 300
+
+# Upper bounds for input images, passed to validate_image_dimensions() by the image-to-video node.
+MAX_WIDTH = 10000
+MAX_HEIGHT = 10000
+
+# NOTE(review): the four *_VID_* bounds below are not referenced in this module;
+# video validation checks against a fixed list of resolutions instead.
+MIN_VID_WIDTH = 300
+MIN_VID_HEIGHT = 300
+
+MAX_VID_WIDTH = 10000
+MAX_VID_HEIGHT = 10000
+
+# NOTE(review): MAX_VIDEO_SIZE is not referenced in this module.
+MAX_VIDEO_SIZE = 1024 * 1024 * 1024  # 1 GB max for in-memory video processing
+
+# Maximum length applied to both the prompt and the negative prompt by every node here.
+MOONVALLEY_MAREY_MAX_PROMPT_LENGTH = 5000
+
+
+def is_valid_task_creation_response(response: MoonvalleyPromptResponse) -> bool:
+    """Return True when the task-creation response carries a truthy ``id``."""
+    task_id = response.id
+    return True if task_id else False
+
+
+def validate_task_creation_response(response) -> None:
+    """Raise when the task-creation response does not contain a task ID.
+
+    The failure message is logged at error level and then raised.
+
+    Raises:
+        RuntimeError: If ``is_valid_task_creation_response(response)`` is False.
+    """
+    if is_valid_task_creation_response(response):
+        return
+    failure = f"Moonvalley Marey API: Initial request failed. Code: {response.code}, Message: {response.message}, Data: {response}"
+    logging.error(failure)
+    raise RuntimeError(failure)
+
+
+def validate_video_to_video_input(video: Input.Video) -> Input.Video:
+    """
+    Check a control video for Moonvalley Video-to-Video and trim it if needed.
+
+    Checks run in this order: dimensions, MP4 container, duration.
+
+    Args:
+        video: Input video to validate
+
+    Returns:
+        The input video, or a trimmed version when it is longer than 5 seconds
+
+    Raises:
+        ValueError: If the dimensions cannot be read, the resolution is not
+            in the supported list, or the video is shorter than 5 seconds
+    """
+    dimensions = _get_video_dimensions(video)
+    width, height = dimensions
+    _validate_video_dimensions(width, height)
+    validate_container_format_is_mp4(video)
+
+    checked_video = _validate_and_trim_duration(video)
+    return checked_video
+
+
+def _get_video_dimensions(video: Input.Video) -> tuple[int, int]:
+    """Return ``video.get_dimensions()``; any failure is logged and re-raised as ValueError."""
+    try:
+        dimensions = video.get_dimensions()
+    except Exception as exc:
+        logging.error("Error getting dimensions of video: %s", exc)
+        raise ValueError(f"Cannot get video dimensions: {exc}") from exc
+    return dimensions
+
+
+def _validate_video_dimensions(width: int, height: int) -> None:
+    """Reject any width/height pair outside the Moonvalley V2V resolution list.
+
+    Raises:
+        ValueError: Naming the rejected resolution and the supported ones.
+    """
+    allowed = {
+        (1920, 1080),
+        (1080, 1920),
+        (1152, 1152),
+        (1536, 1152),
+        (1152, 1536),
+    }
+    if (width, height) in allowed:
+        return
+
+    # Sorted so the message is stable regardless of set iteration order.
+    supported_list = ", ".join(f"{w}x{h}" for w, h in sorted(allowed))
+    raise ValueError(f"Resolution {width}x{height} not supported. Supported: {supported_list}")
+
+
+def _validate_and_trim_duration(video: Input.Video) -> Input.Video:
+    """Enforce the 5-second minimum, then cut longer videos down to 5 seconds."""
+    length = video.get_duration()
+    _validate_minimum_duration(length)
+    trimmed = _trim_if_too_long(video, length)
+    return trimmed
+
+
+def _validate_minimum_duration(duration: float) -> None:
+    """Raise ValueError when ``duration`` is below 5 seconds."""
+    too_short = duration < 5
+    if too_short:
+        raise ValueError("Input video must be at least 5 seconds long.")
+
+
+def _trim_if_too_long(video: Input.Video, duration: float) -> Input.Video:
+    """Return ``video`` unchanged unless it exceeds 5 seconds; otherwise trim it to 5."""
+    return trim_video(video, 5) if duration > 5 else video
+
+
+def parse_width_height_from_res(resolution: str) -> dict[str, int]:
+    """Translate a resolution label into pixel dimensions.
+
+    Args:
+        resolution: Label such as "16:9 (1920 x 1080)".
+
+    Returns:
+        A new dict with "width" and "height" keys. Labels missing from the
+        table (currently including "21:9 (2560 x 1080)") fall back to
+        1920 x 1080; the fallback is logged so the substitution is not silent.
+    """
+    res_map = {
+        "16:9 (1920 x 1080)": {"width": 1920, "height": 1080},
+        "9:16 (1080 x 1920)": {"width": 1080, "height": 1920},
+        "1:1 (1152 x 1152)": {"width": 1152, "height": 1152},
+        "4:3 (1536 x 1152)": {"width": 1536, "height": 1152},
+        "3:4 (1152 x 1536)": {"width": 1152, "height": 1536},
+        # "21:9 (2560 x 1080)": {"width": 2560, "height": 1080},
+    }
+    dimensions = res_map.get(resolution)
+    if dimensions is None:
+        # Keep the historical default, but make the substitution visible in the logs.
+        logging.warning("Moonvalley: unknown resolution %r, falling back to 1920 x 1080", resolution)
+        dimensions = {"width": 1920, "height": 1080}
+    return dimensions
+
+
+def parse_control_parameter(value):
+    """Map a UI control label to the API control-type string.
+
+    Labels that are not in the table resolve to the "Motion Transfer" mapping.
+
+    NOTE(review): "canny_control" and "depth_control" do not appear among the
+    ControlType members of the request model - confirm they are still accepted.
+    """
+    labels_to_api = {
+        "Motion Transfer": "motion_control",
+        "Canny": "canny_control",
+        "Pose Transfer": "pose_control",
+        "Depth": "depth_control",
+    }
+    if value in labels_to_api:
+        return labels_to_api[value]
+    return labels_to_api["Motion Transfer"]
+
+
+async def get_response(cls: type[IO.ComfyNode], task_id: str) -> MoonvalleyPromptResponse:
+    """Poll the prompts endpoint for ``task_id`` and return what ``poll_op`` yields.
+
+    ``poll_op`` is called with poll_interval=16.0 and max_poll_attempts=240.
+    """
+
+    def _status_of(reply):
+        # A falsy reply or a falsy status is reported to poll_op as None.
+        if reply and reply.status:
+            return reply.status
+        return None
+
+    poll_target = ApiEndpoint(path=f"{API_PROMPTS_ENDPOINT}/{task_id}")
+    return await poll_op(
+        cls,
+        poll_target,
+        response_model=MoonvalleyPromptResponse,
+        status_extractor=_status_of,
+        poll_interval=16.0,
+        max_poll_attempts=240,
+    )
+
+
+class MoonvalleyImg2VideoNode(IO.ComfyNode):
+    """API node that generates a video from one reference image and a prompt via Moonvalley Marey."""
+
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        """Describe the node: its inputs, single video output, hidden auth fields and price badge."""
+        return IO.Schema(
+            node_id="MoonvalleyImg2VideoNode",
+            display_name="Moonvalley Marey Image to Video",
+            category="api node/video/Moonvalley Marey",
+            description="Moonvalley Marey Image to Video Node",
+            inputs=[
+                IO.Image.Input(
+                    "image",
+                    tooltip="The reference image used to generate the video",
+                ),
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                ),
+                IO.String.Input(
+                    "negative_prompt",
+                    multiline=True,
+                    default="<synthetic> <scene cut> gopro, bright, contrast, static, overexposed, vignette, "
+                    "artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, "
+                    "flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, "
+                    "cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, "
+                    "blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, "
+                    "wobbly, weird, low quality, plastic, stock footage, video camera, boring",
+                    tooltip="Negative prompt text",
+                ),
+                # The labels below must match the keys of parse_width_height_from_res().
+                IO.Combo.Input(
+                    "resolution",
+                    options=[
+                        "16:9 (1920 x 1080)",
+                        "9:16 (1080 x 1920)",
+                        "1:1 (1152 x 1152)",
+                        "4:3 (1536 x 1152)",
+                        "3:4 (1152 x 1536)",
+                        # "21:9 (2560 x 1080)",
+                    ],
+                    default="16:9 (1920 x 1080)",
+                    tooltip="Resolution of the output video",
+                ),
+                # Forwarded to the API as guidance_scale in execute().
+                IO.Float.Input(
+                    "prompt_adherence",
+                    default=4.5,
+                    min=1.0,
+                    max=20.0,
+                    step=1.0,
+                    tooltip="Guidance scale for generation control",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=9,
+                    min=0,
+                    max=4294967295,
+                    step=1,
+                    display_mode=IO.NumberDisplay.number,
+                    tooltip="Random seed value",
+                    control_after_generate=True,
+                ),
+                IO.Int.Input(
+                    "steps",
+                    default=80,
+                    min=75,  # steps should be greater or equal to cooldown_steps(75) + warmup_steps(0)
+                    max=100,
+                    step=1,
+                    tooltip="Number of denoising steps",
+                ),
+            ],
+            outputs=[IO.Video.Output()],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            # Fixed price: the badge expression has no input dependencies.
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(),
+                expr="""{"type":"usd","usd": 1.5}""",
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        image: Input.Image,
+        prompt: str,
+        negative_prompt: str,
+        resolution: str,
+        prompt_adherence: float,
+        seed: int,
+        steps: int,
+    ) -> IO.NodeOutput:
+        """Validate inputs, upload the image, create the task, poll it and download the video.
+
+        Raises:
+            RuntimeError: If the task-creation response has no task ID
+                (raised by validate_task_creation_response).
+        """
+        # NOTE(review): the literal 300s duplicate the module-level MIN_WIDTH / MIN_HEIGHT constants.
+        validate_image_dimensions(image, min_width=300, min_height=300, max_height=MAX_HEIGHT, max_width=MAX_WIDTH)
+        validate_string(prompt, min_length=1, max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH)
+        validate_string(negative_prompt, field_name="negative_prompt", max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH)
+        width_height = parse_width_height_from_res(resolution)
+
+        # The text-to-video parameter and request models are reused for image-to-video;
+        # the image is supplied separately through image_url on the request.
+        inference_params = MoonvalleyTextToVideoInferenceParams(
+            negative_prompt=negative_prompt,
+            steps=steps,
+            seed=seed,
+            guidance_scale=prompt_adherence,
+            width=width_height["width"],
+            height=width_height["height"],
+            use_negative_prompts=True,
+        )
+
+        # Get MIME type from tensor - assuming PNG format for image tensors
+        mime_type = "image/png"
+        # Only one image is uploaded (max_images=1); its URL is the first element of the result.
+        image_url = (await upload_images_to_comfyapi(cls, image, max_images=1, mime_type=mime_type))[0]
+        task_creation_response = await sync_op(
+            cls,
+            endpoint=ApiEndpoint(path=API_IMG2VIDEO_ENDPOINT, method="POST"),
+            response_model=MoonvalleyPromptResponse,
+            data=MoonvalleyTextToVideoRequest(
+                image_url=image_url, prompt_text=prompt, inference_params=inference_params
+            ),
+        )
+        validate_task_creation_response(task_creation_response)
+        # Poll the created task, then fetch the video from the final response's output_url.
+        final_response = await get_response(cls, task_creation_response.id)
+        video = await download_url_to_video_output(final_response.output_url)
+        return IO.NodeOutput(video)
+
+
+class MoonvalleyVideo2VideoNode(IO.ComfyNode):
+    """API node that generates a video from a control video and a prompt via Moonvalley Marey."""
+
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        """Describe the node: its inputs, single video output, hidden auth fields and price badge."""
+        return IO.Schema(
+            node_id="MoonvalleyVideo2VideoNode",
+            display_name="Moonvalley Marey Video to Video",
+            category="api node/video/Moonvalley Marey",
+            description="",
+            inputs=[
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    tooltip="Describes the video to generate",
+                ),
+                IO.String.Input(
+                    "negative_prompt",
+                    multiline=True,
+                    default="<synthetic> <scene cut> gopro, bright, contrast, static, overexposed, vignette, "
+                    "artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, "
+                    "flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, "
+                    "cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, "
+                    "blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, "
+                    "wobbly, weird, low quality, plastic, stock footage, video camera, boring",
+                    tooltip="Negative prompt text",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=9,
+                    min=0,
+                    max=4294967295,
+                    step=1,
+                    display_mode=IO.NumberDisplay.number,
+                    tooltip="Random seed value",
+                    control_after_generate=False,
+                ),
+                IO.Video.Input(
+                    "video",
+                    tooltip="The reference video used to generate the output video. Must be at least 5 seconds long. "
+                    "Videos longer than 5s will be automatically trimmed. Only MP4 format supported.",
+                ),
+                # Only the two labels offered here are ever passed to parse_control_parameter() by this node.
+                IO.Combo.Input(
+                    "control_type",
+                    options=["Motion Transfer", "Pose Transfer"],
+                    default="Motion Transfer",
+                    optional=True,
+                ),
+                IO.Int.Input(
+                    "motion_intensity",
+                    default=100,
+                    min=0,
+                    max=100,
+                    step=1,
+                    tooltip="Only used if control_type is 'Motion Transfer'",
+                    optional=True,
+                ),
+                IO.Int.Input(
+                    "steps",
+                    default=60,
+                    min=60,  # steps should be greater or equal to cooldown_steps(36) + warmup_steps(24)
+                    max=100,
+                    step=1,
+                    display_mode=IO.NumberDisplay.number,
+                    tooltip="Number of inference steps",
+                ),
+            ],
+            outputs=[IO.Video.Output()],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            # Fixed price: the badge expression has no input dependencies.
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(),
+                expr="""{"type":"usd","usd": 2.25}""",
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        prompt: str,
+        negative_prompt: str,
+        seed: int,
+        video: Input.Video | None = None,
+        control_type: str = "Motion Transfer",
+        motion_intensity: int | None = 100,
+        steps=60,
+        prompt_adherence=4.5,
+    ) -> IO.NodeOutput:
+        """Validate inputs, upload the control video, create the task, poll it and download the result.
+
+        ``prompt_adherence`` has no matching schema input, so it keeps its default
+        unless a caller passes it explicitly; it is sent as guidance_scale.
+
+        Raises:
+            ValueError: If no video is supplied, or the video fails the checks in
+                validate_video_to_video_input.
+            RuntimeError: If the task-creation response has no task ID
+                (raised by validate_task_creation_response).
+        """
+        if video is None:
+            raise ValueError("A reference video is required for Moonvalley Video to Video.")
+
+        # Cheap text checks run first so a bad prompt is rejected before the video
+        # is trimmed and uploaded.
+        validate_string(prompt, min_length=1, max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH)
+        validate_string(negative_prompt, field_name="negative_prompt", max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH)
+
+        validated_video = validate_video_to_video_input(video)
+        video_url = await upload_video_to_comfyapi(cls, validated_video)
+
+        # Only include motion_intensity for Motion Transfer
+        control_params = {}
+        if control_type == "Motion Transfer" and motion_intensity is not None:
+            control_params["motion_intensity"] = motion_intensity
+
+        inference_params = MoonvalleyVideoToVideoInferenceParams(
+            negative_prompt=negative_prompt,
+            seed=seed,
+            control_params=control_params,
+            steps=steps,
+            guidance_scale=prompt_adherence,
+        )
+
+        task_creation_response = await sync_op(
+            cls,
+            endpoint=ApiEndpoint(path=API_VIDEO2VIDEO_ENDPOINT, method="POST"),
+            response_model=MoonvalleyPromptResponse,
+            data=MoonvalleyVideoToVideoRequest(
+                control_type=parse_control_parameter(control_type),
+                video_url=video_url,
+                prompt_text=prompt,
+                inference_params=inference_params,
+            ),
+        )
+        validate_task_creation_response(task_creation_response)
+        # Poll the created task, then fetch the video from the final response's output_url.
+        final_response = await get_response(cls, task_creation_response.id)
+        return IO.NodeOutput(await download_url_to_video_output(final_response.output_url))
+
+
+class MoonvalleyTxt2VideoNode(IO.ComfyNode):
+    """API node that generates a video from a text prompt via Moonvalley Marey."""
+
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        """Describe the node: its inputs, single video output, hidden auth fields and price badge."""
+        return IO.Schema(
+            node_id="MoonvalleyTxt2VideoNode",
+            display_name="Moonvalley Marey Text to Video",
+            category="api node/video/Moonvalley Marey",
+            description="",
+            inputs=[
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                ),
+                IO.String.Input(
+                    "negative_prompt",
+                    multiline=True,
+                    default="<synthetic> <scene cut> gopro, bright, contrast, static, overexposed, vignette, "
+                    "artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, "
+                    "flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, "
+                    "cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, "
+                    "blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, "
+                    "wobbly, weird, low quality, plastic, stock footage, video camera, boring",
+                    tooltip="Negative prompt text",
+                ),
+                IO.Combo.Input(
+                    "resolution",
+                    options=[
+                        "16:9 (1920 x 1080)",
+                        "9:16 (1080 x 1920)",
+                        "1:1 (1152 x 1152)",
+                        "4:3 (1536 x 1152)",
+                        "3:4 (1152 x 1536)",
+                        # NOTE(review): parse_width_height_from_res() has no entry for the label
+                        # below, so selecting it currently produces 1920 x 1080.
+                        "21:9 (2560 x 1080)",
+                    ],
+                    default="16:9 (1920 x 1080)",
+                    tooltip="Resolution of the output video",
+                ),
+                # Forwarded to the API as guidance_scale in execute().
+                IO.Float.Input(
+                    "prompt_adherence",
+                    default=4.0,
+                    min=1.0,
+                    max=20.0,
+                    step=1.0,
+                    tooltip="Guidance scale for generation control",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=9,
+                    min=0,
+                    max=4294967295,
+                    step=1,
+                    display_mode=IO.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="Random seed value",
+                ),
+                IO.Int.Input(
+                    "steps",
+                    default=80,
+                    min=75,  # steps should be greater or equal to cooldown_steps(75) + warmup_steps(0)
+                    max=100,
+                    step=1,
+                    tooltip="Inference steps",
+                ),
+            ],
+            outputs=[IO.Video.Output()],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            # Fixed price: the badge expression has no input dependencies.
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(),
+                expr="""{"type":"usd","usd": 1.5}""",
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        prompt: str,
+        negative_prompt: str,
+        resolution: str,
+        prompt_adherence: float,
+        seed: int,
+        steps: int,
+    ) -> IO.NodeOutput:
+        """Validate the prompts, create the text-to-video task, poll it and download the video.
+
+        Raises:
+            RuntimeError: If the task-creation response has no task ID
+                (raised by validate_task_creation_response).
+        """
+        validate_string(prompt, min_length=1, max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH)
+        validate_string(negative_prompt, field_name="negative_prompt", max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH)
+        width_height = parse_width_height_from_res(resolution)
+
+        # num_frames is fixed at 128; it is not exposed as a node input.
+        # NOTE(review): use_negative_prompts is not set here, unlike MoonvalleyImg2VideoNode -
+        # confirm the negative prompt is still applied with the model's default.
+        inference_params = MoonvalleyTextToVideoInferenceParams(
+            negative_prompt=negative_prompt,
+            steps=steps,
+            seed=seed,
+            guidance_scale=prompt_adherence,
+            num_frames=128,
+            width=width_height["width"],
+            height=width_height["height"],
+        )
+
+        task_creation_response = await sync_op(
+            cls,
+            endpoint=ApiEndpoint(path=API_TXT2VIDEO_ENDPOINT, method="POST"),
+            response_model=MoonvalleyPromptResponse,
+            data=MoonvalleyTextToVideoRequest(prompt_text=prompt, inference_params=inference_params),
+        )
+        validate_task_creation_response(task_creation_response)
+        # Poll the created task, then fetch the video from the final response's output_url.
+        final_response = await get_response(cls, task_creation_response.id)
+        return IO.NodeOutput(await download_url_to_video_output(final_response.output_url))
+
+
+class MoonvalleyExtension(ComfyExtension):
+    """Extension exposing the three Moonvalley Marey nodes defined in this module."""
+
+    @override
+    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
+        """Return the node classes provided by this extension."""
+        node_classes = [
+            MoonvalleyImg2VideoNode,
+            MoonvalleyTxt2VideoNode,
+            MoonvalleyVideo2VideoNode,
+        ]
+        return node_classes
+
+
+async def comfy_entrypoint() -> MoonvalleyExtension:
+    """Create and return a new MoonvalleyExtension instance."""
+    extension = MoonvalleyExtension()
+    return extension
--- a/comfy_api_nodes/nodes_topaz.py
+++ b/comfy_api_nodes/nodes_topaz.py
@ -453,6 +453,7 @@ class TopazVideoEnhance(IO.ComfyNode):
            progress_extractor=lambda x: getattr(x, "progress", 0),
            price_extractor=lambda x: (x.estimates.cost[0] * 0.08 if x.estimates and x.estimates.cost[0] else None),
            poll_interval=10.0,
+            max_poll_attempts=320,
        )
        return IO.NodeOutput(await download_url_to_video_output(final_response.download.url))

--- a/comfy_api_nodes/nodes_vidu.py
+++ b/comfy_api_nodes/nodes_vidu.py
@ -38,7 +38,7 @@ async def execute_task(
    cls: type[IO.ComfyNode],
    vidu_endpoint: str,
    payload: TaskCreationRequest | TaskExtendCreationRequest | TaskMultiFrameCreationRequest,
-    max_poll_attempts: int = 480,
+    max_poll_attempts: int = 320,
 ) -> list[TaskResult]:
    task_creation_response = await sync_op(
        cls,
@ -1097,6 +1097,7 @@ class ViduExtendVideoNode(IO.ComfyNode):
                video_url=await upload_video_to_comfyapi(cls, video, wait_label="Uploading video"),
                images=[image_url] if image_url else None,
            ),
+            max_poll_attempts=480,
        )
        return IO.NodeOutput(await download_url_to_video_output(results[0].url))

--- a/comfy_api_nodes/nodes_wan.py
+++ b/comfy_api_nodes/nodes_wan.py
@ -818,6 +818,7 @@ class WanReferenceVideoApi(IO.ComfyNode):
            response_model=VideoTaskStatusResponse,
            status_extractor=lambda x: x.output.task_status,
            poll_interval=6,
+            max_poll_attempts=280,
        )
        return IO.NodeOutput(await download_url_to_video_output(response.output.video_url))

--- a/comfy_api_nodes/nodes_wavespeed.py
+++ b/comfy_api_nodes/nodes_wavespeed.py
@ -84,6 +84,7 @@ class WavespeedFlashVSRNode(IO.ComfyNode):
            response_model=TaskResultResponse,
            status_extractor=lambda x: "failed" if x.data is None else x.data.status,
            poll_interval=10.0,
+            max_poll_attempts=480,
        )
        if final_response.code != 200:
            raise ValueError(
@ -155,6 +156,7 @@ class WavespeedImageUpscaleNode(IO.ComfyNode):
            response_model=TaskResultResponse,
            status_extractor=lambda x: "failed" if x.data is None else x.data.status,
            poll_interval=10.0,
+            max_poll_attempts=480,
        )
        if final_response.code != 200:
            raise ValueError(
--- a/comfy_api_nodes/util/client.py
+++ b/comfy_api_nodes/util/client.py
@ -148,7 +148,7 @@ async def poll_op(
    queued_statuses: list[str | int] | None = None,
    data: BaseModel | None = None,
    poll_interval: float = 5.0,
-    max_poll_attempts: int = 480,
+    max_poll_attempts: int = 160,
    timeout_per_poll: float = 120.0,
    max_retries_per_poll: int = 10,
    retry_delay_per_poll: float = 1.0,
@ -254,7 +254,7 @@ async def poll_op_raw(
    queued_statuses: list[str | int] | None = None,
    data: dict[str, Any] | BaseModel | None = None,
    poll_interval: float = 5.0,
-    max_poll_attempts: int = 480,
+    max_poll_attempts: int = 160,
    timeout_per_poll: float = 120.0,
    max_retries_per_poll: int = 10,
    retry_delay_per_poll: float = 1.0,
--- a/comfy_extras/nodes_sdpose.py
+++ b/comfy_extras/nodes_sdpose.py
@ -459,23 +459,27 @@ class SDPoseKeypointExtractor(io.ComfyNode):
        total_images = image.shape[0]
        captured_feat = None

-        model_w = int(head.heatmap_size[0]) * 4   # 192 * 4 = 768
-        model_h = int(head.heatmap_size[1]) * 4   # 256 * 4 = 1024
+        model_h = int(head.heatmap_size[0]) * 4   # e.g. 192 * 4 = 768
+        model_w = int(head.heatmap_size[1]) * 4   # e.g. 256 * 4 = 1024

        def _resize_to_model(imgs):
-            """Stretch BHWC images to (model_h, model_w), model expects no aspect preservation."""
+            """Aspect-preserving resize + zero-pad BHWC images to (model_h, model_w). Returns (resized_bhwc, scale, pad_top, pad_left)."""
            h, w = imgs.shape[-3], imgs.shape[-2]
-            method = "area" if (model_h <= h and model_w <= w) else "bilinear"
+            scale = min(model_h / h, model_w / w)
+            sh, sw = int(round(h * scale)), int(round(w * scale))
+            pt, pl = (model_h - sh) // 2, (model_w - sw) // 2
            chw = imgs.permute(0, 3, 1, 2).float()
-            scaled = comfy.utils.common_upscale(chw, model_w, model_h, upscale_method=method, crop="disabled")
-            return scaled.permute(0, 2, 3, 1), model_w / w, model_h / h
+            scaled = comfy.utils.common_upscale(chw, sw, sh, upscale_method="bilinear", crop="disabled")
+            padded = torch.zeros(scaled.shape[0], scaled.shape[1], model_h, model_w, dtype=scaled.dtype, device=scaled.device)
+            padded[:, :, pt:pt + sh, pl:pl + sw] = scaled
+            return padded.permute(0, 2, 3, 1), scale, pt, pl

-        def _remap_keypoints(kp, scale_x, scale_y, offset_x=0, offset_y=0):
+        def _remap_keypoints(kp, scale, pad_top, pad_left, offset_x=0, offset_y=0):
            """Remap keypoints from model space back to original image space."""
            kp = kp.copy() if isinstance(kp, np.ndarray) else np.array(kp, dtype=np.float32)
            invalid = kp[..., 0] < 0
-            kp[..., 0] = kp[..., 0] / scale_x + offset_x
-            kp[..., 1] = kp[..., 1] / scale_y + offset_y
+            kp[..., 0] = (kp[..., 0] - pad_left) / scale + offset_x
+            kp[..., 1] = (kp[..., 1] - pad_top)  / scale + offset_y
            kp[invalid] = -1
            return kp

@ -525,18 +529,18 @@ class SDPoseKeypointExtractor(io.ComfyNode):
                            continue

                        crop = img[:, y1:y2, x1:x2, :]  # (1, crop_h, crop_w, C)
-                        crop_resized, sx, sy = _resize_to_model(crop)
+                        crop_resized, scale, pad_top, pad_left = _resize_to_model(crop)

                        latent_crop = vae.encode(crop_resized)
                        kp_batch, sc_batch = _run_on_latent(latent_crop)
-                        kp = _remap_keypoints(kp_batch[0], sx, sy, x1, y1)
+                        kp = _remap_keypoints(kp_batch[0], scale, pad_top, pad_left, x1, y1)
                        img_keypoints.append(kp)
                        img_scores.append(sc_batch[0])
                else:
-                    img_resized, sx, sy = _resize_to_model(img)
+                    img_resized, scale, pad_top, pad_left = _resize_to_model(img)
                    latent_img = vae.encode(img_resized)
                    kp_batch, sc_batch = _run_on_latent(latent_img)
-                    img_keypoints.append(_remap_keypoints(kp_batch[0], sx, sy))
+                    img_keypoints.append(_remap_keypoints(kp_batch[0], scale, pad_top, pad_left))
                    img_scores.append(sc_batch[0])

                all_keypoints.append(img_keypoints)
@ -545,12 +549,12 @@ class SDPoseKeypointExtractor(io.ComfyNode):

        else: # full-image mode, batched
            for batch_start in tqdm(range(0, total_images, batch_size), desc="Extracting keypoints"):
-                batch_resized, sx, sy = _resize_to_model(image[batch_start:batch_start + batch_size])
+                batch_resized, scale, pad_top, pad_left = _resize_to_model(image[batch_start:batch_start + batch_size])
                latent_batch = vae.encode(batch_resized)
                kp_batch, sc_batch = _run_on_latent(latent_batch)

                for kp, sc in zip(kp_batch, sc_batch):
-                    all_keypoints.append([_remap_keypoints(kp, sx, sy)])
+                    all_keypoints.append([_remap_keypoints(kp, scale, pad_top, pad_left)])
                    all_scores.append([sc])

                pbar.update(len(kp_batch))
@ -723,13 +727,13 @@ class CropByBBoxes(io.ComfyNode):
                scale = min(output_width / crop_w, output_height / crop_h)
                scaled_w = int(round(crop_w * scale))
                scaled_h = int(round(crop_h * scale))
-                scaled = comfy.utils.common_upscale(crop_chw, scaled_w, scaled_h, upscale_method="area", crop="disabled")
+                scaled = comfy.utils.common_upscale(crop_chw, scaled_w, scaled_h, upscale_method="bilinear", crop="disabled")
                pad_left = (output_width  - scaled_w) // 2
                pad_top  = (output_height - scaled_h) // 2
                resized = torch.zeros(1, num_ch, output_height, output_width, dtype=image.dtype, device=image.device)
                resized[:, :, pad_top:pad_top + scaled_h, pad_left:pad_left + scaled_w] = scaled
            else:  # "stretch"
-                resized = comfy.utils.common_upscale(crop_chw, output_width, output_height, upscale_method="area", crop="disabled")
+                resized = comfy.utils.common_upscale(crop_chw, output_width, output_height, upscale_method="bilinear", crop="disabled")
            crops.append(resized)

        if not crops:
--- a/execution.py
+++ b/execution.py
@ -15,7 +15,6 @@ import torch
 from comfy.cli_args import args
 import comfy.memory_management
 import comfy.model_management
-import comfy.model_prefetch
 import comfy_aimdo.model_vbar

 from latent_preview import set_preview_method
@ -538,7 +537,6 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed,
                    if args.verbose == "DEBUG":
                        comfy_aimdo.control.analyze()
                    comfy.model_management.reset_cast_buffers()
-                    comfy.model_prefetch.cleanup_prefetch_queues()
                    comfy_aimdo.model_vbar.vbars_reset_watermark_limits()

            if has_pending_tasks:
--- a/nodes.py
+++ b/nodes.py
@ -1694,27 +1694,26 @@ class LoadImage:

    RETURN_TYPES = ("IMAGE", "MASK")
    FUNCTION = "load_image"
-
    def load_image(self, image):
        image_path = folder_paths.get_annotated_filepath(image)

-        dtype = comfy.model_management.intermediate_dtype()
-        device = comfy.model_management.intermediate_device()
-
        components = InputImpl.VideoFromFile(image_path).get_components()
        if components.images.shape[0] > 0:
-            return (components.images.to(device=device, dtype=dtype), (1.0 - components.alpha[..., -1]).to(device=device, dtype=dtype) if components.alpha is not None else torch.zeros((components.images.shape[0], 64, 64), dtype=dtype, device=device))
+            return (components.images, 1.0 - components.alpha[..., -1] if components.alpha is not None else torch.zeros((components.images.shape[0], 64, 64), dtype=torch.float32, device="cpu"))

-        # This code is left here to handle animated webp which pyav does not support loading
        img = node_helpers.pillow(Image.open, image_path)

        output_images = []
        output_masks = []
        w, h = None, None

+        dtype = comfy.model_management.intermediate_dtype()
+
        for i in ImageSequence.Iterator(img):
            i = node_helpers.pillow(ImageOps.exif_transpose, i)

+            if i.mode == 'I':
+                i = i.point(lambda i: i * (1 / 255))
            image = i.convert("RGB")

            if len(output_images) == 0:
@ -1729,15 +1728,25 @@ class LoadImage:
            if 'A' in i.getbands():
                mask = np.array(i.getchannel('A')).astype(np.float32) / 255.0
                mask = 1. - torch.from_numpy(mask)
+            elif i.mode == 'P' and 'transparency' in i.info:
+                mask = np.array(i.convert('RGBA').getchannel('A')).astype(np.float32) / 255.0
+                mask = 1. - torch.from_numpy(mask)
            else:
-                mask = torch.zeros((64, 64), dtype=torch.float32, device="cpu")
+                mask = torch.zeros((64,64), dtype=torch.float32, device="cpu")
            output_images.append(image.to(dtype=dtype))
            output_masks.append(mask.unsqueeze(0).to(dtype=dtype))

-        output_image = torch.cat(output_images, dim=0)
-        output_mask = torch.cat(output_masks, dim=0)
+            if img.format == "MPO":
+                break  # ignore all frames except the first one for MPO format

-        return (output_image.to(device=device, dtype=dtype), output_mask.to(device=device, dtype=dtype))
+        if len(output_images) > 1:
+            output_image = torch.cat(output_images, dim=0)
+            output_mask = torch.cat(output_masks, dim=0)
+        else:
+            output_image = output_images[0]
+            output_mask = output_masks[0]
+
+        return (output_image, output_mask)

    @classmethod
    def IS_CHANGED(s, image):
--- a/requirements.txt
+++ b/requirements.txt
@ -1,5 +1,5 @@
 comfyui-frontend-package==1.42.15
-comfyui-workflow-templates==0.9.66
+comfyui-workflow-templates==0.9.65
 comfyui-embedded-docs==0.4.4
 torch
 torchsde