Merge branch 'master' into dr-support-pip-cm

2026-02-22 11:57:44 +08:00 · 2025-07-12 09:19:32 +09:00 · 2025-07-12 09:19:32 +09:00 · 16a0b24da4
commit 16a0b24da4
parent 552fe9df02 b43916a134
39 changed files with 3528 additions and 781 deletions
--- a/README.md
+++ b/README.md
@ -178,10 +178,6 @@ If you have trouble extracting it, right click the file -> properties -> unblock

 See the [Config file](extra_model_paths.yaml.example) to set the search paths for models. In the standalone windows build you can find this file in the ComfyUI directory. Rename this file to extra_model_paths.yaml and edit it with your favorite text editor.

-## Jupyter Notebook
-
-To run it on services like paperspace, kaggle or colab you can use my [Jupyter Notebook](notebooks/comfyui_colab.ipynb)
-

 ## [comfy-cli](https://docs.comfy.org/comfy-cli/getting-started)

--- a/comfy/k_diffusion/sa_solver.py
+++ b/comfy/k_diffusion/sa_solver.py
@ -0,0 +1,121 @@
+# SA-Solver: Stochastic Adams Solver (NeurIPS 2023, arXiv:2309.05019)
+# Conference: https://proceedings.neurips.cc/paper_files/paper/2023/file/f4a6806490d31216a3ba667eb240c897-Paper-Conference.pdf
+# Codebase ref: https://github.com/scxue/SA-Solver
+
+import math
+from typing import Union, Callable
+import torch
+
+
+def compute_exponential_coeffs(s: torch.Tensor, t: torch.Tensor, solver_order: int, tau_t: float) -> torch.Tensor:
+    """Compute (1 + tau^2) * integral of exp((1 + tau^2) * x) * x^p dx from s to t with exp((1 + tau^2) * t) factored out, using integration by parts.
+
+    Integral of exp((1 + tau^2) * x) * x^p dx
+        = product_terms[p] - (p / (1 + tau^2)) * integral of exp((1 + tau^2) * x) * x^(p-1) dx,
+    with base case p=0 where integral equals product_terms[0].
+
+    where
+        product_terms[p] = x^p * exp((1 + tau^2) * x) / (1 + tau^2).
+
+    The recursion is unrolled into a lower triangular matrix with entries (-1)^(i-j) * i! / (j! * (1 + tau^2)^(i-j)) for j <= i, so one matrix-vector product yields all integral terms up to p = (solver_order - 1).
+    Return coefficients used by the SA-Solver in data prediction mode.
+
+    Args:
+        s: Lower integration limit s; its dtype and device are used for the intermediate tensors.
+        t: Upper integration limit t.
+        solver_order: Number of powers p = 0 .. solver_order - 1 to compute.
+        tau_t: Stochastic strength parameter in the SDE.
+
+    Returns:
+        Exponential coefficients used in data prediction, with exp((1 + tau^2) * t) factored out, ordered from p=0 to p=solver_order-1, shape (solver_order,).
+    """
+    tau_mul = 1 + tau_t ** 2  # the (1 + tau^2) factor used throughout the docstring
+    h = t - s  # length of the integration interval
+    p = torch.arange(solver_order, dtype=s.dtype, device=s.device)  # powers 0 .. solver_order - 1
+
+    # Boundary terms [x^p * exp((1 + tau^2) * x)] from s to t, divided by exp((1 + tau^2) * t):
+    # the 1 / (1 + tau^2) of product_terms cancels against the (1 + tau^2) factor outside the integral
+    product_terms_factored = (t ** p - s ** p * (-tau_mul * h).exp())
+
+    # Lower triangular recursive coefficient matrix, assembled in log space
+    # Row i, column j accumulates the factors picked up when recursing from power i down to power j
+    recursive_depth_mat = p.unsqueeze(1) - p.unsqueeze(0)  # entry [i, j] = i - j
+    log_factorial = (p + 1).lgamma()  # lgamma(p + 1) = log(p!)
+    recursive_coeff_mat = log_factorial.unsqueeze(1) - log_factorial.unsqueeze(0)  # entry [i, j] = log(i! / j!)
+    if tau_t > 0:  # with tau_t == 0, log(tau_mul) == log(1) == 0 and the subtraction would change nothing
+        recursive_coeff_mat = recursive_coeff_mat - (recursive_depth_mat * math.log(tau_mul))
+    signs = torch.where(recursive_depth_mat % 2 == 0, 1.0, -1.0)  # (-1)^(i - j)
+    recursive_coeff_mat = (recursive_coeff_mat.exp() * signs).tril()  # tril() zeroes the j > i entries
+
+    return recursive_coeff_mat @ product_terms_factored
+
+
+def compute_simple_stochastic_adams_b_coeffs(sigma_next: torch.Tensor, curr_lambdas: torch.Tensor, lambda_s: torch.Tensor, lambda_t: torch.Tensor, tau_t: float, is_corrector_step: bool = False) -> torch.Tensor:
+    """Compute simple order-2 b coefficients from SA-Solver paper (Appendix D. Implementation Details). Returns torch.stack([b_2, b_1]); curr_lambdas is only read (at index -2) on the predictor path."""
+    tau_mul = 1 + tau_t ** 2  # (1 + tau^2)
+    h = lambda_t - lambda_s  # step size in lambda
+    alpha_t = sigma_next * lambda_t.exp()  # alpha_t = sigma_t * exp(lambda_t)
+    if is_corrector_step:
+        # Simplified 1-step (order-2) corrector
+        b_1 = alpha_t * (0.5 * tau_mul * h)
+        b_2 = alpha_t * (-h * tau_mul).expm1().neg() - b_1  # so b_1 + b_2 = alpha_t * (1 - exp(-h * (1 + tau^2)))
+    else:
+        # Simplified 2-step predictor
+        b_2 = alpha_t * (0.5 * tau_mul * h ** 2) / (curr_lambdas[-2] - lambda_s)
+        b_1 = alpha_t * (-h * tau_mul).expm1().neg() - b_2  # same sum as in the corrector branch
+    return torch.stack([b_2, b_1])  # note the order: b_2 first, then b_1
+
+
+def compute_stochastic_adams_b_coeffs(sigma_next: torch.Tensor, curr_lambdas: torch.Tensor, lambda_s: torch.Tensor, lambda_t: torch.Tensor, tau_t: float, simple_order_2: bool = False, is_corrector_step: bool = False) -> torch.Tensor:
+    """Compute b_i coefficients for the SA-Solver (see eqs. 15 and 18).
+
+    The solver order corresponds to the number of input lambdas (half-logSNR points).
+
+    Args:
+        sigma_next: Sigma at end time t.
+        curr_lambdas: Lambda time points used to construct the Lagrange basis, shape (N,).
+        lambda_s: Lambda at start time s.
+        lambda_t: Lambda at end time t.
+        tau_t: Stochastic strength parameter in the SDE.
+        simple_order_2: Whether to enable the simple order-2 scheme; it is only taken when N == 2.
+        is_corrector_step: Flag for corrector step in simple order-2 mode; not read on the general path.
+
+    Returns:
+        b_i coefficients for the SA-Solver, shape (N,), where N is the solver order.
+    """
+    num_timesteps = curr_lambdas.shape[0]  # N, used as the solver order
+
+    if simple_order_2 and num_timesteps == 2:
+        return compute_simple_stochastic_adams_b_coeffs(sigma_next, curr_lambdas, lambda_s, lambda_t, tau_t, is_corrector_step)
+
+    # Compute coefficients by solving a linear system from Lagrange basis interpolation
+    exp_integral_coeffs = compute_exponential_coeffs(lambda_s, lambda_t, num_timesteps, tau_t)  # one entry per power p
+    vandermonde_matrix_T = torch.vander(curr_lambdas, num_timesteps, increasing=True).T  # row p holds curr_lambdas ** p
+    lagrange_integrals = torch.linalg.solve(vandermonde_matrix_T, exp_integral_coeffs)  # sum_i lambda_i^p * w_i = coeff[p]
+
+    # (sigma_t * exp(-tau^2 * lambda_t)) * exp((1 + tau^2) * lambda_t)
+    # = sigma_t * exp(lambda_t) = alpha_t
+    # exp((1 + tau^2) * lambda_t) is the factor that compute_exponential_coeffs left out of the integral
+    alpha_t = sigma_next * lambda_t.exp()
+    return alpha_t * lagrange_integrals
+
+
+def get_tau_interval_func(start_sigma: float, end_sigma: float, eta: float = 1.0) -> Callable[[Union[torch.Tensor, float]], float]:
+    """Return tau_func(sigma), which controls the stochasticity of SA-Solver at a given sigma.
+
+    tau_func returns eta when end_sigma <= sigma <= start_sigma (both ends inclusive) and 0.0 otherwise; when eta <= 0 it always returns 0.0, so SA-Solver runs as ODE.
+    The official approach uses time t to determine the SDE interval, while here we use sigma instead.
+
+    See:
+        https://github.com/scxue/SA-Solver/blob/main/README.md
+    """
+
+    def tau_func(sigma: Union[torch.Tensor, float]) -> float:
+        if eta <= 0:
+            return 0.0  # ODE
+
+        if isinstance(sigma, torch.Tensor):
+            sigma = sigma.item()  # Tensor.item() needs a single-element tensor
+        return eta if start_sigma >= sigma >= end_sigma else 0.0  # inclusive on both ends
+
+    return tau_func
--- a/comfy/k_diffusion/sampling.py
+++ b/comfy/k_diffusion/sampling.py
@ -9,6 +9,7 @@ from tqdm.auto import trange, tqdm

 from . import utils
 from . import deis
+from . import sa_solver
 import comfy.model_patcher
 import comfy.model_sampling

@ -1648,3 +1649,113 @@ def sample_seeds_3(model, x, sigmas, extra_args=None, callback=None, disable=Non
            if inject_noise:
                x = x + sigmas[i + 1] * (noise_coeff_3 * noise_1 + noise_coeff_2 * noise_2 + noise_coeff_1 * noise_3) * s_noise
    return x
+
+
+@torch.no_grad()
+def sample_sa_solver(model, x, sigmas, extra_args=None, callback=None, disable=False, tau_func=None, s_noise=1.0, noise_sampler=None, predictor_order=3, corrector_order=4, use_pece=False, simple_order_2=False):
+    """Stochastic Adams Solver with predictor-corrector method (NeurIPS 2023). Each step evaluates the model at the predicted state, optionally corrects the current state, then predicts the next state; use_pece=True re-evaluates the model on the corrected state."""
+    if len(sigmas) <= 1:  # no step to take
+        return x
+    extra_args = {} if extra_args is None else extra_args
+    seed = extra_args.get("seed", None)
+    noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
+    s_in = x.new_ones([x.shape[0]])  # one entry per batch element, multiplied by the current sigma for the model call
+
+    model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling")
+    sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)  # helper defined outside this hunk
+    lambdas = sigma_to_half_log_snr(sigmas, model_sampling=model_sampling)  # presumably half-logSNR per sigma, going by the helper's name - TODO confirm
+
+    if tau_func is None:
+        # Use default interval for stochastic sampling: eta=1.0 between the sigmas at 20% and 80%
+        start_sigma = model_sampling.percent_to_sigma(0.2)
+        end_sigma = model_sampling.percent_to_sigma(0.8)
+        tau_func = sa_solver.get_tau_interval_func(start_sigma, end_sigma, eta=1.0)
+
+    max_used_order = max(predictor_order, corrector_order)  # number of past model outputs worth keeping
+    x_pred = x  # x: current state, x_pred: predicted next state
+
+    h = 0.0  # lambda step of the latest predictor step; set in the predictor before any corrector reads it
+    tau_t = 0.0  # tau of the latest predictor step
+    noise = 0.0  # noise injected by the latest stochastic predictor step
+    pred_list = []  # most recent model outputs, oldest first
+
+    # Lower order near the end to improve stability
+    lower_order_to_end = sigmas[-1].item() == 0
+
+    for i in trange(len(sigmas) - 1, disable=disable):
+        # Evaluation
+        denoised = model(x_pred, sigmas[i] * s_in, **extra_args)
+        if callback is not None:
+            callback({"x": x_pred, "i": i, "sigma": sigmas[i], "sigma_hat": sigmas[i], "denoised": denoised})  # reports the predicted state as "x"
+        pred_list.append(denoised)
+        pred_list = pred_list[-max_used_order:]  # drop outputs older than any order can use
+
+        predictor_order_used = min(predictor_order, len(pred_list))
+        if i == 0 or (sigmas[i + 1] == 0 and not use_pece):  # nothing to correct on the first step; final denoising step is only corrected in PECE mode
+            corrector_order_used = 0
+        else:
+            corrector_order_used = min(corrector_order, len(pred_list))
+
+        if lower_order_to_end:  # len(sigmas) - 1 - i steps remain, counting the current one
+            predictor_order_used = min(predictor_order_used, len(sigmas) - 2 - i)
+            corrector_order_used = min(corrector_order_used, len(sigmas) - 1 - i)
+
+        # Corrector
+        if corrector_order_used == 0:
+            # Update by the predicted state
+            x = x_pred
+        else:
+            curr_lambdas = lambdas[i - corrector_order_used + 1:i + 1]  # the last corrector_order_used lambdas, ending at index i
+            b_coeffs = sa_solver.compute_stochastic_adams_b_coeffs(
+                sigmas[i],  # sigma_next
+                curr_lambdas,
+                lambdas[i - 1],  # lambda_s
+                lambdas[i],  # lambda_t
+                tau_t,  # still the value chosen by the previous predictor step
+                simple_order_2,
+                is_corrector_step=True,
+            )
+            pred_mat = torch.stack(pred_list[-corrector_order_used:], dim=1)    # (B, K, ...)
+            corr_res = torch.tensordot(pred_mat, b_coeffs, dims=([1], [0]))  # (B, ...)
+            x = sigmas[i] / sigmas[i - 1] * (-(tau_t ** 2) * h).exp() * x + corr_res  # redo the step from i - 1 to i, now including the output at i
+
+            if tau_t > 0 and s_noise > 0:
+                # The noise from the previous predictor step
+                x = x + noise
+
+            if use_pece:
+                # Evaluate the corrected state
+                denoised = model(x, sigmas[i] * s_in, **extra_args)
+                pred_list[-1] = denoised  # replaces the output computed at the predicted state
+
+        # Predictor
+        if sigmas[i + 1] == 0:
+            # Denoising step
+            x = denoised
+        else:
+            tau_t = tau_func(sigmas[i + 1])  # stochastic strength for this step, chosen from the target sigma
+            curr_lambdas = lambdas[i - predictor_order_used + 1:i + 1]  # the last predictor_order_used lambdas, ending at index i
+            b_coeffs = sa_solver.compute_stochastic_adams_b_coeffs(
+                sigmas[i + 1],  # sigma_next
+                curr_lambdas,
+                lambdas[i],  # lambda_s
+                lambdas[i + 1],  # lambda_t
+                tau_t,
+                simple_order_2,
+                is_corrector_step=False,
+            )
+            pred_mat = torch.stack(pred_list[-predictor_order_used:], dim=1)    # (B, K, ...)
+            pred_res = torch.tensordot(pred_mat, b_coeffs, dims=([1], [0]))  # (B, ...)
+            h = lambdas[i + 1] - lambdas[i]
+            x_pred = sigmas[i + 1] / sigmas[i] * (-(tau_t ** 2) * h).exp() * x + pred_res
+
+            if tau_t > 0 and s_noise > 0:
+                noise = noise_sampler(sigmas[i], sigmas[i + 1]) * sigmas[i + 1] * (-2 * tau_t ** 2 * h).expm1().neg().sqrt() * s_noise  # kept so the next corrector step can re-add it
+                x_pred = x_pred + noise
+    return x  # NOTE(review): when sigmas[-1] != 0 the last x_pred (state at sigmas[-1]) is not returned - confirm this is intended
+
+
+@torch.no_grad()
+def sample_sa_solver_pece(model, x, sigmas, extra_args=None, callback=None, disable=False, tau_func=None, s_noise=1.0, noise_sampler=None, predictor_order=3, corrector_order=4, simple_order_2=False):
+    """Stochastic Adams Solver with PECE (Predict-Evaluate-Correct-Evaluate) mode (NeurIPS 2023). Thin wrapper: forwards every argument to sample_sa_solver with use_pece=True."""
+    return sample_sa_solver(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, tau_func=tau_func, s_noise=s_noise, noise_sampler=noise_sampler, predictor_order=predictor_order, corrector_order=corrector_order, use_pece=True, simple_order_2=simple_order_2)
--- a/comfy/ldm/chroma/model.py
+++ b/comfy/ldm/chroma/model.py
@ -254,13 +254,12 @@ class Chroma(nn.Module):

    def forward(self, x, timestep, context, guidance, control=None, transformer_options={}, **kwargs):
        bs, c, h, w = x.shape
-        patch_size = 2
-        x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size))
+        x = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))

-        img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
+        img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=self.patch_size, pw=self.patch_size)

-        h_len = ((h + (patch_size // 2)) // patch_size)
-        w_len = ((w + (patch_size // 2)) // patch_size)
+        h_len = ((h + (self.patch_size // 2)) // self.patch_size)
+        w_len = ((w + (self.patch_size // 2)) // self.patch_size)
        img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype)
        img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1)
        img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
@ -268,4 +267,4 @@ class Chroma(nn.Module):

        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
        out = self.forward_orig(img, img_ids, context, txt_ids, timestep, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None))
-        return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)[:,:,:h,:w]
+        return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=self.patch_size, pw=self.patch_size)[:,:,:h,:w]
--- a/comfy/ldm/pixart/pixartms.py
+++ b/comfy/ldm/pixart/pixartms.py
@ -1,256 +1,256 @@
-# Based on:
-# https://github.com/PixArt-alpha/PixArt-alpha [Apache 2.0 license]
-# https://github.com/PixArt-alpha/PixArt-sigma [Apache 2.0 license]
-import torch
-import torch.nn as nn
-
-from .blocks import (
-    t2i_modulate,
-    CaptionEmbedder,
-    AttentionKVCompress,
-    MultiHeadCrossAttention,
-    T2IFinalLayer,
-    SizeEmbedder,
-)
-from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder, PatchEmbed, Mlp, get_1d_sincos_pos_embed_from_grid_torch
-
-
-def get_2d_sincos_pos_embed_torch(embed_dim, w, h, pe_interpolation=1.0, base_size=16, device=None, dtype=torch.float32):
-    grid_h, grid_w = torch.meshgrid(
-        torch.arange(h, device=device, dtype=dtype) / (h/base_size) / pe_interpolation,
-        torch.arange(w, device=device, dtype=dtype) / (w/base_size) / pe_interpolation,
-        indexing='ij'
-    )
-    emb_h = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_h, device=device, dtype=dtype)
-    emb_w = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_w, device=device, dtype=dtype)
-    emb = torch.cat([emb_w, emb_h], dim=1)  # (H*W, D)
-    return emb
-
-class PixArtMSBlock(nn.Module):
-    """
-    A PixArt block with adaptive layer norm zero (adaLN-Zero) conditioning.
-    """
-    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, drop_path=0., input_size=None,
-                 sampling=None, sr_ratio=1, qk_norm=False, dtype=None, device=None, operations=None, **block_kwargs):
-        super().__init__()
-        self.hidden_size = hidden_size
-        self.norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.attn = AttentionKVCompress(
-            hidden_size, num_heads=num_heads, qkv_bias=True, sampling=sampling, sr_ratio=sr_ratio,
-            qk_norm=qk_norm, dtype=dtype, device=device, operations=operations, **block_kwargs
-        )
-        self.cross_attn = MultiHeadCrossAttention(
-            hidden_size, num_heads, dtype=dtype, device=device, operations=operations, **block_kwargs
-        )
-        self.norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        # to be compatible with lower version pytorch
-        approx_gelu = lambda: nn.GELU(approximate="tanh")
-        self.mlp = Mlp(
-            in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu,
-            dtype=dtype, device=device, operations=operations
-        )
-        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size ** 0.5)
-
-    def forward(self, x, y, t, mask=None, HW=None, **kwargs):
-        B, N, C = x.shape
-
-        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None].to(dtype=x.dtype, device=x.device) + t.reshape(B, 6, -1)).chunk(6, dim=1)
-        x = x + (gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa), HW=HW))
-        x = x + self.cross_attn(x, y, mask)
-        x = x + (gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)))
-
-        return x
-
-
-### Core PixArt Model ###
-class PixArtMS(nn.Module):
-    """
-    Diffusion model with a Transformer backbone.
-    """
-    def __init__(
-            self,
-            input_size=32,
-            patch_size=2,
-            in_channels=4,
-            hidden_size=1152,
-            depth=28,
-            num_heads=16,
-            mlp_ratio=4.0,
-            class_dropout_prob=0.1,
-            learn_sigma=True,
-            pred_sigma=True,
-            drop_path: float = 0.,
-            caption_channels=4096,
-            pe_interpolation=None,
-            pe_precision=None,
-            config=None,
-            model_max_length=120,
-            micro_condition=True,
-            qk_norm=False,
-            kv_compress_config=None,
-            dtype=None,
-            device=None,
-            operations=None,
-            **kwargs,
-    ):
-        nn.Module.__init__(self)
-        self.dtype = dtype
-        self.pred_sigma = pred_sigma
-        self.in_channels = in_channels
-        self.out_channels = in_channels * 2 if pred_sigma else in_channels
-        self.patch_size = patch_size
-        self.num_heads = num_heads
-        self.pe_interpolation = pe_interpolation
-        self.pe_precision = pe_precision
-        self.hidden_size = hidden_size
-        self.depth = depth
-
-        approx_gelu = lambda: nn.GELU(approximate="tanh")
-        self.t_block = nn.Sequential(
-            nn.SiLU(),
-            operations.Linear(hidden_size, 6 * hidden_size, bias=True, dtype=dtype, device=device)
-        )
-        self.x_embedder = PatchEmbed(
-            patch_size=patch_size,
-            in_chans=in_channels,
-            embed_dim=hidden_size,
-            bias=True,
-            dtype=dtype,
-            device=device,
-            operations=operations
-        )
-        self.t_embedder = TimestepEmbedder(
-            hidden_size, dtype=dtype, device=device, operations=operations,
-        )
-        self.y_embedder = CaptionEmbedder(
-            in_channels=caption_channels, hidden_size=hidden_size, uncond_prob=class_dropout_prob,
-            act_layer=approx_gelu, token_num=model_max_length,
-            dtype=dtype, device=device, operations=operations,
-        )
-
-        self.micro_conditioning = micro_condition
-        if self.micro_conditioning:
-            self.csize_embedder = SizeEmbedder(hidden_size//3, dtype=dtype, device=device, operations=operations)
-            self.ar_embedder = SizeEmbedder(hidden_size//3, dtype=dtype, device=device, operations=operations)
-
-        # For fixed sin-cos embedding:
-        # num_patches = (input_size // patch_size) * (input_size // patch_size)
-        # self.base_size = input_size // self.patch_size
-        # self.register_buffer("pos_embed", torch.zeros(1, num_patches, hidden_size))
-
-        drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)]  # stochastic depth decay rule
-        if kv_compress_config is None:
-            kv_compress_config = {
-                'sampling': None,
-                'scale_factor': 1,
-                'kv_compress_layer': [],
-            }
-        self.blocks = nn.ModuleList([
-            PixArtMSBlock(
-                hidden_size, num_heads, mlp_ratio=mlp_ratio, drop_path=drop_path[i],
-                sampling=kv_compress_config['sampling'],
-                sr_ratio=int(kv_compress_config['scale_factor']) if i in kv_compress_config['kv_compress_layer'] else 1,
-                qk_norm=qk_norm,
-                dtype=dtype,
-                device=device,
-                operations=operations,
-            )
-            for i in range(depth)
-        ])
-        self.final_layer = T2IFinalLayer(
-            hidden_size, patch_size, self.out_channels, dtype=dtype, device=device, operations=operations
-        )
-
-    def forward_orig(self, x, timestep, y, mask=None, c_size=None, c_ar=None, **kwargs):
-        """
-        Original forward pass of PixArt.
-        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
-        t: (N,) tensor of diffusion timesteps
-        y: (N, 1, 120, C) conditioning
-        ar: (N, 1): aspect ratio
-        cs: (N ,2) size conditioning for height/width
-        """
-        B, C, H, W = x.shape
-        c_res = (H + W) // 2
-        pe_interpolation = self.pe_interpolation
-        if pe_interpolation is None or self.pe_precision is not None:
-            # calculate pe_interpolation on-the-fly
-            pe_interpolation = round(c_res / (512/8.0), self.pe_precision or 0)
-
-        pos_embed = get_2d_sincos_pos_embed_torch(
-            self.hidden_size,
-            h=(H // self.patch_size),
-            w=(W // self.patch_size),
-            pe_interpolation=pe_interpolation,
-            base_size=((round(c_res / 64) * 64) // self.patch_size),
-            device=x.device,
-            dtype=x.dtype,
-        ).unsqueeze(0)
-
-        x = self.x_embedder(x) + pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
-        t = self.t_embedder(timestep, x.dtype)  # (N, D)
-
-        if self.micro_conditioning and (c_size is not None and c_ar is not None):
-            bs = x.shape[0]
-            c_size = self.csize_embedder(c_size, bs)  # (N, D)
-            c_ar = self.ar_embedder(c_ar, bs)  # (N, D)
-            t = t + torch.cat([c_size, c_ar], dim=1)
-
-        t0 = self.t_block(t)
-        y = self.y_embedder(y, self.training)  # (N, D)
-
-        if mask is not None:
-            if mask.shape[0] != y.shape[0]:
-                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
-            mask = mask.squeeze(1).squeeze(1)
-            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
-            y_lens = mask.sum(dim=1).tolist()
-        else:
-            y_lens = None
-            y = y.squeeze(1).view(1, -1, x.shape[-1])
-        for block in self.blocks:
-            x = block(x, y, t0, y_lens, (H, W), **kwargs)  # (N, T, D)
-
-        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
-        x = self.unpatchify(x, H, W)  # (N, out_channels, H, W)
-
-        return x
-
-    def forward(self, x, timesteps, context, c_size=None, c_ar=None, **kwargs):
-        B, C, H, W = x.shape
-
-        # Fallback for missing microconds
-        if self.micro_conditioning:
-            if c_size is None:
-                c_size = torch.tensor([H*8, W*8], dtype=x.dtype, device=x.device).repeat(B, 1)
-
-            if c_ar is None:
-                c_ar = torch.tensor([H/W], dtype=x.dtype, device=x.device).repeat(B, 1)
-
-        ## Still accepts the input w/o that dim but returns garbage
-        if len(context.shape) == 3:
-            context = context.unsqueeze(1)
-
-        ## run original forward pass
-        out = self.forward_orig(x, timesteps, context, c_size=c_size, c_ar=c_ar)
-
-        ## only return EPS
-        if self.pred_sigma:
-            return out[:, :self.in_channels]
-        return out
-
-    def unpatchify(self, x, h, w):
-        """
-        x: (N, T, patch_size**2 * C)
-        imgs: (N, H, W, C)
-        """
-        c = self.out_channels
-        p = self.x_embedder.patch_size[0]
-        h = h // self.patch_size
-        w = w // self.patch_size
-        assert h * w == x.shape[1]
-
-        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
-        x = torch.einsum('nhwpqc->nchpwq', x)
-        imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p))
-        return imgs
+# Based on:
+# https://github.com/PixArt-alpha/PixArt-alpha [Apache 2.0 license]
+# https://github.com/PixArt-alpha/PixArt-sigma [Apache 2.0 license]
+import torch
+import torch.nn as nn
+
+from .blocks import (
+    t2i_modulate,
+    CaptionEmbedder,
+    AttentionKVCompress,
+    MultiHeadCrossAttention,
+    T2IFinalLayer,
+    SizeEmbedder,
+)
+from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder, PatchEmbed, Mlp, get_1d_sincos_pos_embed_from_grid_torch
+
+
+def get_2d_sincos_pos_embed_torch(embed_dim, w, h, pe_interpolation=1.0, base_size=16, device=None, dtype=torch.float32):  # positional order is (w, h); PixArtMS.forward_orig passes both by keyword
+    grid_h, grid_w = torch.meshgrid(
+        torch.arange(h, device=device, dtype=dtype) / (h/base_size) / pe_interpolation,  # row index, rescaled by (h / base_size) and pe_interpolation
+        torch.arange(w, device=device, dtype=dtype) / (w/base_size) / pe_interpolation,  # column index, rescaled the same way
+        indexing='ij'
+    )
+    emb_h = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_h, device=device, dtype=dtype)  # half of embed_dim for each axis
+    emb_w = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_w, device=device, dtype=dtype)
+    emb = torch.cat([emb_w, emb_h], dim=1)  # (H*W, D); width embedding first, then height
+    return emb
+
+class PixArtMSBlock(nn.Module):
+    """
+    A PixArt block with adaptive layer norm zero (adaLN-Zero) conditioning: self-attention, cross-attention and an MLP, each added residually.
+    """
+    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, drop_path=0., input_size=None,
+                 sampling=None, sr_ratio=1, qk_norm=False, dtype=None, device=None, operations=None, **block_kwargs):  # drop_path and input_size are accepted but not referenced in this constructor
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+        self.attn = AttentionKVCompress(
+            hidden_size, num_heads=num_heads, qkv_bias=True, sampling=sampling, sr_ratio=sr_ratio,
+            qk_norm=qk_norm, dtype=dtype, device=device, operations=operations, **block_kwargs
+        )
+        self.cross_attn = MultiHeadCrossAttention(
+            hidden_size, num_heads, dtype=dtype, device=device, operations=operations, **block_kwargs
+        )
+        self.norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
+        # to be compatible with lower version pytorch
+        approx_gelu = lambda: nn.GELU(approximate="tanh")
+        self.mlp = Mlp(
+            in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), act_layer=approx_gelu,
+            dtype=dtype, device=device, operations=operations
+        )
+        self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size ** 0.5)  # one learned row per modulation term (shift/scale/gate for attention and MLP)
+
+    def forward(self, x, y, t, mask=None, HW=None, **kwargs):  # kwargs are accepted but not used in this method
+        B, N, C = x.shape
+
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None].to(dtype=x.dtype, device=x.device) + t.reshape(B, 6, -1)).chunk(6, dim=1)  # table plus t viewed as (B, 6, -1), split into six terms along dim 1
+        x = x + (gate_msa * self.attn(t2i_modulate(self.norm1(x), shift_msa, scale_msa), HW=HW))  # gated self-attention on the modulated, normalized input
+        x = x + self.cross_attn(x, y, mask)  # cross-attention over the conditioning y; not gated or modulated
+        x = x + (gate_mlp * self.mlp(t2i_modulate(self.norm2(x), shift_mlp, scale_mlp)))  # gated MLP on the modulated, normalized input
+
+        return x
+
+
+### Core PixArt Model ###
+class PixArtMS(nn.Module):
+    """
+    Diffusion model with a Transformer backbone.
+    """
+    def __init__(
+            self,
+            input_size=32,  # only mentioned in the commented-out fixed-embedding code below
+            patch_size=2,
+            in_channels=4,
+            hidden_size=1152,
+            depth=28,
+            num_heads=16,
+            mlp_ratio=4.0,
+            class_dropout_prob=0.1,
+            learn_sigma=True,  # not referenced in this constructor
+            pred_sigma=True,
+            drop_path: float = 0.,
+            caption_channels=4096,
+            pe_interpolation=None,
+            pe_precision=None,
+            config=None,  # not referenced in this constructor
+            model_max_length=120,
+            micro_condition=True,
+            qk_norm=False,
+            kv_compress_config=None,
+            dtype=None,
+            device=None,
+            operations=None,
+            **kwargs,
+    ):
+        nn.Module.__init__(self)
+        self.dtype = dtype
+        self.pred_sigma = pred_sigma
+        self.in_channels = in_channels
+        self.out_channels = in_channels * 2 if pred_sigma else in_channels  # twice the channels when pred_sigma is set; forward() keeps only the first in_channels
+        self.patch_size = patch_size
+        self.num_heads = num_heads
+        self.pe_interpolation = pe_interpolation
+        self.pe_precision = pe_precision
+        self.hidden_size = hidden_size
+        self.depth = depth
+
+        approx_gelu = lambda: nn.GELU(approximate="tanh")
+        self.t_block = nn.Sequential(
+            nn.SiLU(),
+            operations.Linear(hidden_size, 6 * hidden_size, bias=True, dtype=dtype, device=device)  # 6 * hidden_size: reshaped to (B, 6, -1) inside each block
+        )
+        self.x_embedder = PatchEmbed(
+            patch_size=patch_size,
+            in_chans=in_channels,
+            embed_dim=hidden_size,
+            bias=True,
+            dtype=dtype,
+            device=device,
+            operations=operations
+        )
+        self.t_embedder = TimestepEmbedder(
+            hidden_size, dtype=dtype, device=device, operations=operations,
+        )
+        self.y_embedder = CaptionEmbedder(
+            in_channels=caption_channels, hidden_size=hidden_size, uncond_prob=class_dropout_prob,
+            act_layer=approx_gelu, token_num=model_max_length,
+            dtype=dtype, device=device, operations=operations,
+        )
+
+        self.micro_conditioning = micro_condition
+        if self.micro_conditioning:  # the size / aspect-ratio embedders only exist when micro conditioning is on
+            self.csize_embedder = SizeEmbedder(hidden_size//3, dtype=dtype, device=device, operations=operations)
+            self.ar_embedder = SizeEmbedder(hidden_size//3, dtype=dtype, device=device, operations=operations)
+
+        # For fixed sin-cos embedding:
+        # num_patches = (input_size // patch_size) * (input_size // patch_size)
+        # self.base_size = input_size // self.patch_size
+        # self.register_buffer("pos_embed", torch.zeros(1, num_patches, hidden_size))
+
+        drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)]  # stochastic depth decay rule
+        if kv_compress_config is None:  # default: no layer is listed for KV compression
+            kv_compress_config = {
+                'sampling': None,
+                'scale_factor': 1,
+                'kv_compress_layer': [],
+            }
+        self.blocks = nn.ModuleList([
+            PixArtMSBlock(
+                hidden_size, num_heads, mlp_ratio=mlp_ratio, drop_path=drop_path[i],
+                sampling=kv_compress_config['sampling'],
+                sr_ratio=int(kv_compress_config['scale_factor']) if i in kv_compress_config['kv_compress_layer'] else 1,  # scale_factor only for the listed layers
+                qk_norm=qk_norm,
+                dtype=dtype,
+                device=device,
+                operations=operations,
+            )
+            for i in range(depth)
+        ])
+        self.final_layer = T2IFinalLayer(
+            hidden_size, patch_size, self.out_channels, dtype=dtype, device=device, operations=operations
+        )
+
+    def forward_orig(self, x, timestep, y, mask=None, c_size=None, c_ar=None, **kwargs):
+        """
+        Original forward pass of PixArt.
+        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
+        timestep: (N,) tensor of diffusion timesteps
+        y: (N, 1, 120, C) conditioning
+        c_ar: (N, 1): aspect ratio
+        c_size: (N ,2) size conditioning for height/width
+        """
+        B, C, H, W = x.shape
+        c_res = (H + W) // 2  # mean of height and width, used as the resolution for the positional embedding
+        pe_interpolation = self.pe_interpolation
+        if pe_interpolation is None or self.pe_precision is not None:  # a configured pe_precision forces the on-the-fly value even when pe_interpolation is set
+            # calculate pe_interpolation on-the-fly
+            pe_interpolation = round(c_res / (512/8.0), self.pe_precision or 0)  # 512/8.0 == 64.0
+
+        pos_embed = get_2d_sincos_pos_embed_torch(
+            self.hidden_size,
+            h=(H // self.patch_size),
+            w=(W // self.patch_size),
+            pe_interpolation=pe_interpolation,
+            base_size=((round(c_res / 64) * 64) // self.patch_size),  # c_res rounded to a multiple of 64, in patches
+            device=x.device,
+            dtype=x.dtype,
+        ).unsqueeze(0)
+
+        x = self.x_embedder(x) + pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
+        t = self.t_embedder(timestep, x.dtype)  # (N, D)
+
+        if self.micro_conditioning and (c_size is not None and c_ar is not None):  # both micro conditions must be present to be applied
+            bs = x.shape[0]
+            c_size = self.csize_embedder(c_size, bs)  # (N, D)
+            c_ar = self.ar_embedder(c_ar, bs)  # (N, D)
+            t = t + torch.cat([c_size, c_ar], dim=1)
+
+        t0 = self.t_block(t)  # modulation input for the blocks; the final layer receives t itself
+        y = self.y_embedder(y, self.training)  # (N, D)
+
+        if mask is not None:
+            if mask.shape[0] != y.shape[0]:
+                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
+            mask = mask.squeeze(1).squeeze(1)
+            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])  # keep only tokens with a non-zero mask, flattened into one sequence
+            y_lens = mask.sum(dim=1).tolist()  # per-sample count of kept tokens
+        else:
+            y_lens = None
+            y = y.squeeze(1).view(1, -1, x.shape[-1])
+        for block in self.blocks:
+            x = block(x, y, t0, y_lens, (H, W), **kwargs)  # (N, T, D); y_lens is received by the block as its mask argument
+
+        x = self.final_layer(x, t)  # (N, T, patch_size ** 2 * out_channels)
+        x = self.unpatchify(x, H, W)  # (N, out_channels, H, W)
+
+        return x
+
+    def forward(self, x, timesteps, context, c_size=None, c_ar=None, **kwargs):
+        B, C, H, W = x.shape
+
+        # Fallback for missing microconds
+        if self.micro_conditioning:
+            if c_size is None:
+                c_size = torch.tensor([H*8, W*8], dtype=x.dtype, device=x.device).repeat(B, 1)  # (B, 2): input height and width times 8
+
+            if c_ar is None:
+                c_ar = torch.tensor([H/W], dtype=x.dtype, device=x.device).repeat(B, 1)  # (B, 1): height / width
+
+        ## Still accepts the input w/o that dim but returns garbage
+        if len(context.shape) == 3:
+            context = context.unsqueeze(1)
+
+        ## run original forward pass
+        out = self.forward_orig(x, timesteps, context, c_size=c_size, c_ar=c_ar)  # NOTE(review): no mask and no **kwargs are passed on here - confirm intended
+
+        ## only return EPS
+        if self.pred_sigma:
+            return out[:, :self.in_channels]
+        return out
+
+    def unpatchify(self, x, h, w):
+        """
+        x: (N, T, patch_size**2 * C)
+        imgs: (N, C, H, W), with C = self.out_channels
+        """
+        c = self.out_channels
+        p = self.x_embedder.patch_size[0]  # patch edge taken from the embedder, while h and w below use self.patch_size
+        h = h // self.patch_size
+        w = w // self.patch_size
+        assert h * w == x.shape[1]
+
+        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
+        x = torch.einsum('nhwpqc->nchpwq', x)  # channels first, each patch row/column placed next to its grid row/column
+        imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p))
+        return imgs
--- a/comfy/samplers.py
+++ b/comfy/samplers.py
@ -720,7 +720,7 @@ KSAMPLER_NAMES = ["euler", "euler_cfg_pp", "euler_ancestral", "euler_ancestral_c
                  "lms", "dpm_fast", "dpm_adaptive", "dpmpp_2s_ancestral", "dpmpp_2s_ancestral_cfg_pp", "dpmpp_sde", "dpmpp_sde_gpu",
                  "dpmpp_2m", "dpmpp_2m_cfg_pp", "dpmpp_2m_sde", "dpmpp_2m_sde_gpu", "dpmpp_3m_sde", "dpmpp_3m_sde_gpu", "ddpm", "lcm",
                  "ipndm", "ipndm_v", "deis", "res_multistep", "res_multistep_cfg_pp", "res_multistep_ancestral", "res_multistep_ancestral_cfg_pp",
-                  "gradient_estimation", "gradient_estimation_cfg_pp", "er_sde", "seeds_2", "seeds_3"]
+                  "gradient_estimation", "gradient_estimation_cfg_pp", "er_sde", "seeds_2", "seeds_3", "sa_solver", "sa_solver_pece"]

 class KSAMPLER(Sampler):
    def __init__(self, sampler_function, extra_options={}, inpaint_options={}):
--- a/comfy/text_encoders/pixart_t5.py
+++ b/comfy/text_encoders/pixart_t5.py
@ -1,42 +1,42 @@
-import os
-
-from comfy import sd1_clip
-import comfy.text_encoders.t5
-import comfy.text_encoders.sd3_clip
-from comfy.sd1_clip import gen_empty_tokens
-
-from transformers import T5TokenizerFast
-
-class T5XXLModel(comfy.text_encoders.sd3_clip.T5XXLModel):
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
-    def gen_empty_tokens(self, special_tokens, *args, **kwargs):
-        # PixArt expects the negative to be all pad tokens
-        special_tokens = special_tokens.copy()
-        special_tokens.pop("end")
-        return gen_empty_tokens(special_tokens, *args, **kwargs)
-
-class PixArtT5XXL(sd1_clip.SD1ClipModel):
-    def __init__(self, device="cpu", dtype=None, model_options={}):
-        super().__init__(device=device, dtype=dtype, name="t5xxl", clip_model=T5XXLModel, model_options=model_options)
-
-class T5XXLTokenizer(sd1_clip.SDTokenizer):
-    def __init__(self, embedding_directory=None, tokenizer_data={}):
-        tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer")
-        super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_data=tokenizer_data) # no padding
-
-class PixArtTokenizer(sd1_clip.SD1Tokenizer):
-    def __init__(self, embedding_directory=None, tokenizer_data={}):
-        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="t5xxl", tokenizer=T5XXLTokenizer)
-
-def pixart_te(dtype_t5=None, t5xxl_scaled_fp8=None):
-    class PixArtTEModel_(PixArtT5XXL):
-        def __init__(self, device="cpu", dtype=None, model_options={}):
-            if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options:
-                model_options = model_options.copy()
-                model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
-            if dtype is None:
-                dtype = dtype_t5
-            super().__init__(device=device, dtype=dtype, model_options=model_options)
-    return PixArtTEModel_
+import os
+
+from comfy import sd1_clip
+import comfy.text_encoders.t5
+import comfy.text_encoders.sd3_clip
+from comfy.sd1_clip import gen_empty_tokens
+
+from transformers import T5TokenizerFast
+
+class T5XXLModel(comfy.text_encoders.sd3_clip.T5XXLModel):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def gen_empty_tokens(self, special_tokens, *args, **kwargs):
+        # PixArt expects the negative to be all pad tokens
+        special_tokens = special_tokens.copy()
+        special_tokens.pop("end")
+        return gen_empty_tokens(special_tokens, *args, **kwargs)
+
+class PixArtT5XXL(sd1_clip.SD1ClipModel):
+    def __init__(self, device="cpu", dtype=None, model_options={}):
+        super().__init__(device=device, dtype=dtype, name="t5xxl", clip_model=T5XXLModel, model_options=model_options)
+
+class T5XXLTokenizer(sd1_clip.SDTokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer")
+        super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_data=tokenizer_data) # no padding
+
+class PixArtTokenizer(sd1_clip.SD1Tokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="t5xxl", tokenizer=T5XXLTokenizer)
+
+def pixart_te(dtype_t5=None, t5xxl_scaled_fp8=None):
+    class PixArtTEModel_(PixArtT5XXL):
+        def __init__(self, device="cpu", dtype=None, model_options={}):
+            if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options:
+                model_options = model_options.copy()
+                model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
+            if dtype is None:
+                dtype = dtype_t5
+            super().__init__(device=device, dtype=dtype, model_options=model_options)
+    return PixArtTEModel_
--- a/comfy/utils.py
+++ b/comfy/utils.py
@ -998,11 +998,12 @@ def set_progress_bar_global_hook(function):
    PROGRESS_BAR_HOOK = function

 class ProgressBar:
-    def __init__(self, total):
+    def __init__(self, total, node_id=None):
        global PROGRESS_BAR_HOOK
        self.total = total
        self.current = 0
        self.hook = PROGRESS_BAR_HOOK
+        self.node_id = node_id

    def update_absolute(self, value, total=None, preview=None):
        if total is not None:
@ -1011,7 +1012,7 @@ class ProgressBar:
            value = self.total
        self.current = value
        if self.hook is not None:
-            self.hook(self.current, self.total, preview)
+            self.hook(self.current, self.total, preview, node_id=self.node_id)

    def update(self, value):
        self.update_absolute(self.current + value)
--- a/comfy_api/feature_flags.py
+++ b/comfy_api/feature_flags.py
@ -0,0 +1,69 @@
+"""
+Feature flags module for ComfyUI WebSocket protocol negotiation.
+
+This module handles capability negotiation between frontend and backend,
+allowing graceful protocol evolution while maintaining backward compatibility.
+"""
+
+from typing import Any, Dict
+
+from comfy.cli_args import args
+
+# Default server capabilities
+# Built once at import time from the parsed CLI args; get_server_features()
+# returns copies of this dict.
+SERVER_FEATURE_FLAGS: Dict[str, Any] = {
+    "supports_preview_metadata": True,
+    "max_upload_size": args.max_upload_size * 1024 * 1024, # Convert MB to bytes
+}
+
+
+def get_connection_feature(
+    sockets_metadata: Dict[str, Dict[str, Any]],
+    sid: str,
+    feature_name: str,
+    default: Any = False
+) -> Any:
+    """
+    Get a feature flag value for a specific connection.
+
+    Args:
+        sockets_metadata: Dictionary of socket metadata
+        sid: Session ID of the connection
+        feature_name: Name of the feature to check
+        default: Default value if feature not found
+
+    Returns:
+        Feature value or default if not found
+    """
+    if sid not in sockets_metadata:
+        return default
+
+    return sockets_metadata[sid].get("feature_flags", {}).get(feature_name, default)
+
+
+def supports_feature(
+    sockets_metadata: Dict[str, Dict[str, Any]],
+    sid: str,
+    feature_name: str
+) -> bool:
+    """
+    Check if a connection supports a specific feature.
+
+    Args:
+        sockets_metadata: Dictionary of socket metadata
+        sid: Session ID of the connection
+        feature_name: Name of the feature to check
+
+    Returns:
+        Boolean indicating if feature is supported
+    """
+    return get_connection_feature(sockets_metadata, sid, feature_name, False) is True
+
+
+def get_server_features() -> Dict[str, Any]:
+    """
+    Get the server's feature flags.
+
+    Returns:
+        Dictionary of server feature flags
+    """
+    return SERVER_FEATURE_FLAGS.copy()
--- a/comfy_api/input/video_types.py
+++ b/comfy_api/input/video_types.py
@ -1,6 +1,7 @@
 from __future__ import annotations
 from abc import ABC, abstractmethod
-from typing import Optional
+from typing import Optional, Union
+import io
 from comfy_api.util import VideoContainer, VideoCodec, VideoComponents

 class VideoInput(ABC):
@ -31,6 +32,22 @@ class VideoInput(ABC):
        """
        pass

+    def get_stream_source(self) -> Union[str, io.BytesIO]:
+        """
+        Get a streamable source for the video. This allows processing without
+        loading the entire video into memory.
+
+        Returns:
+            Either a file path (str) or a BytesIO object that can be opened with av.
+
+        Default implementation creates a BytesIO buffer, but subclasses should
+        override this for better performance when possible.
+        """
+        buffer = io.BytesIO()
+        self.save_to(buffer)
+        buffer.seek(0)
+        return buffer
+
    # Provide a default implementation, but subclasses can provide optimized versions
    # if possible.
    def get_dimensions(self) -> tuple[int, int]:
--- a/comfy_api/input_impl/video_types.py
+++ b/comfy_api/input_impl/video_types.py
@ -64,6 +64,15 @@ class VideoFromFile(VideoInput):
        """
        self.__file = file

+    def get_stream_source(self) -> str | io.BytesIO:
+        """
+        Return the underlying file source for efficient streaming.
+        This avoids unnecessary memory copies when the source is already a file path.
+        """
+        if isinstance(self.__file, io.BytesIO):
+            self.__file.seek(0)
+        return self.__file
+
    def get_dimensions(self) -> tuple[int, int]:
        """
        Returns the dimensions of the video input.
--- a/comfy_api_nodes/apis/init.py
+++ b/comfy_api_nodes/apis/init.py
@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  filtered-openapi.yaml
-#   timestamp: 2025-05-19T21:38:55+00:00
+#   timestamp: 2025-07-06T09:47:31+00:00

 from __future__ import annotations

@ -1355,6 +1355,158 @@ class ModelResponseProperties(BaseModel):
    )


+# Value type of MoonvalleyImageToVideoRequest.keyframes; holds one optional image URL.
+class Keyframes(BaseModel):
+    image_url: Optional[str] = None
+
+
+# Response body for a Moonvalley prompt/task. Every field is optional, so
+# consumers must handle None (nodes_moonvalley reads id, status and output_url).
+class MoonvalleyPromptResponse(BaseModel):
+    error: Optional[Dict[str, Any]] = None
+    frame_conditioning: Optional[Dict[str, Any]] = None
+    id: Optional[str] = None
+    inference_params: Optional[Dict[str, Any]] = None
+    meta: Optional[Dict[str, Any]] = None
+    model_params: Optional[Dict[str, Any]] = None
+    output_url: Optional[str] = None
+    prompt_text: Optional[str] = None
+    status: Optional[str] = None
+
+
+# Inference parameters for text-to-video requests. All fields are optional and
+# carry the defaults declared below. Generated code: regenerate from the
+# OpenAPI spec rather than editing by hand.
+class MoonvalleyTextToVideoInferenceParams(BaseModel):
+    add_quality_guidance: Optional[bool] = Field(
+        True, description='Whether to add quality guidance'
+    )
+    caching_coefficient: Optional[float] = Field(
+        0.3, description='Caching coefficient for optimization'
+    )
+    caching_cooldown: Optional[int] = Field(
+        3, description='Number of caching cooldown steps'
+    )
+    caching_warmup: Optional[int] = Field(
+        3, description='Number of caching warmup steps'
+    )
+    clip_value: Optional[float] = Field(
+        3, description='CLIP value for generation control'
+    )
+    conditioning_frame_index: Optional[int] = Field(
+        0, description='Index of the conditioning frame'
+    )
+    cooldown_steps: Optional[int] = Field(
+        None, description='Number of cooldown steps (calculated based on num_frames)'
+    )
+    fps: Optional[int] = Field(
+        24, description='Frames per second of the generated video'
+    )
+    guidance_scale: Optional[float] = Field(
+        12.5, description='Guidance scale for generation control'
+    )
+    height: Optional[int] = Field(
+        1080, description='Height of the generated video in pixels'
+    )
+    negative_prompt: Optional[str] = Field(None, description='Negative prompt text')
+    num_frames: Optional[int] = Field(64, description='Number of frames to generate')
+    seed: Optional[int] = Field(
+        None, description='Random seed for generation (default: random)'
+    )
+    shift_value: Optional[float] = Field(
+        3, description='Shift value for generation control'
+    )
+    steps: Optional[int] = Field(80, description='Number of denoising steps')
+    use_guidance_schedule: Optional[bool] = Field(
+        True, description='Whether to use guidance scheduling'
+    )
+    use_negative_prompts: Optional[bool] = Field(
+        False, description='Whether to use negative prompts'
+    )
+    use_timestep_transform: Optional[bool] = Field(
+        True, description='Whether to use timestep transformation'
+    )
+    warmup_steps: Optional[int] = Field(
+        None, description='Number of warmup steps (calculated based on num_frames)'
+    )
+    width: Optional[int] = Field(
+        1920, description='Width of the generated video in pixels'
+    )
+
+
+# Request body for text-to-video generation; also the base class of
+# MoonvalleyImageToVideoRequest. All fields are optional at the schema level.
+class MoonvalleyTextToVideoRequest(BaseModel):
+    image_url: Optional[str] = None
+    inference_params: Optional[MoonvalleyTextToVideoInferenceParams] = None
+    prompt_text: Optional[str] = None
+    webhook_url: Optional[str] = None
+
+
+# Upload request carrying the raw file content as strict bytes.
+class MoonvalleyUploadFileRequest(BaseModel):
+    file: Optional[StrictBytes] = None
+
+
+# Upload response; access_url may be None.
+class MoonvalleyUploadFileResponse(BaseModel):
+    access_url: Optional[str] = None
+
+
+# Inference parameters for video-to-video requests. Same fields as the
+# text-to-video parameters except that fps, height, num_frames and width are
+# not declared here. Generated code: regenerate rather than editing by hand.
+class MoonvalleyVideoToVideoInferenceParams(BaseModel):
+    add_quality_guidance: Optional[bool] = Field(
+        True, description='Whether to add quality guidance'
+    )
+    caching_coefficient: Optional[float] = Field(
+        0.3, description='Caching coefficient for optimization'
+    )
+    caching_cooldown: Optional[int] = Field(
+        3, description='Number of caching cooldown steps'
+    )
+    caching_warmup: Optional[int] = Field(
+        3, description='Number of caching warmup steps'
+    )
+    clip_value: Optional[float] = Field(
+        3, description='CLIP value for generation control'
+    )
+    conditioning_frame_index: Optional[int] = Field(
+        0, description='Index of the conditioning frame'
+    )
+    cooldown_steps: Optional[int] = Field(
+        None, description='Number of cooldown steps (calculated based on num_frames)'
+    )
+    guidance_scale: Optional[float] = Field(
+        12.5, description='Guidance scale for generation control'
+    )
+    negative_prompt: Optional[str] = Field(None, description='Negative prompt text')
+    seed: Optional[int] = Field(
+        None, description='Random seed for generation (default: random)'
+    )
+    shift_value: Optional[float] = Field(
+        3, description='Shift value for generation control'
+    )
+    steps: Optional[int] = Field(80, description='Number of denoising steps')
+    use_guidance_schedule: Optional[bool] = Field(
+        True, description='Whether to use guidance scheduling'
+    )
+    use_negative_prompts: Optional[bool] = Field(
+        False, description='Whether to use negative prompts'
+    )
+    use_timestep_transform: Optional[bool] = Field(
+        True, description='Whether to use timestep transformation'
+    )
+    warmup_steps: Optional[int] = Field(
+        None, description='Number of warmup steps (calculated based on num_frames)'
+    )
+
+
+# Control modes accepted by MoonvalleyVideoToVideoRequest.control_type.
+# NOTE(review): only motion and pose are declared here, while
+# BaseMoonvalleyVideoNode.parseControlParameter in nodes_moonvalley.py can also
+# produce "canny_control" and "depth_control" -- confirm those are never sent.
+class ControlType(str, Enum):
+    motion_control = 'motion_control'
+    pose_control = 'pose_control'
+
+
+# Request body for video-to-video generation. control_type, prompt_text and
+# video_url are required (declared with `...`); the rest are optional.
+class MoonvalleyVideoToVideoRequest(BaseModel):
+    control_type: ControlType = Field(
+        ..., description='Supported types for video control'
+    )
+    inference_params: Optional[MoonvalleyVideoToVideoInferenceParams] = None
+    prompt_text: str = Field(..., description='Describes the video to generate')
+    video_url: str = Field(..., description='Url to control video')
+    webhook_url: Optional[str] = Field(
+        None, description='Optional webhook URL for notifications'
+    )
+
+
 class Moderation(str, Enum):
    low = 'low'
    auto = 'auto'
@ -3107,6 +3259,23 @@ class LumaUpscaleVideoGenerationRequest(BaseModel):
    resolution: Optional[LumaVideoModelOutputResolution] = None


+# Text-to-video request extended with an optional mapping of string keys to
+# Keyframes entries.
+class MoonvalleyImageToVideoRequest(MoonvalleyTextToVideoRequest):
+    keyframes: Optional[Dict[str, Keyframes]] = None
+
+
+# Video-to-video request extended with three optional integer lists, each
+# constrained to exactly two elements (min_length == max_length == 2).
+class MoonvalleyResizeVideoRequest(MoonvalleyVideoToVideoRequest):
+    frame_position: Optional[List[int]] = Field(None, max_length=2, min_length=2)
+    frame_resolution: Optional[List[int]] = Field(None, max_length=2, min_length=2)
+    scale: Optional[List[int]] = Field(None, max_length=2, min_length=2)
+
+
+# Request body for text-to-image generation. Declares the same fields as
+# MoonvalleyTextToVideoRequest and reuses the text-to-video inference params.
+class MoonvalleyTextToImageRequest(BaseModel):
+    image_url: Optional[str] = None
+    inference_params: Optional[MoonvalleyTextToVideoInferenceParams] = None
+    prompt_text: Optional[str] = None
+    webhook_url: Optional[str] = None
+
+
 class OutputContent(RootModel[Union[OutputTextContent, OutputAudioContent]]):
    root: Union[OutputTextContent, OutputAudioContent]

--- a/comfy_api_nodes/nodes_kling.py
+++ b/comfy_api_nodes/nodes_kling.py
@ -132,6 +132,8 @@ def poll_until_finished(
        result_url_extractor=result_url_extractor,
        estimated_duration=estimated_duration,
        node_id=node_id,
+        poll_interval=16.0,
+        max_poll_attempts=256,
    ).execute()


--- a/comfy_api_nodes/nodes_moonvalley.py
+++ b/comfy_api_nodes/nodes_moonvalley.py
@ -0,0 +1,639 @@
+import logging
+from typing import Any, Callable, Optional, TypeVar
+import random
+import torch
+from comfy_api_nodes.util.validation_utils import get_image_dimensions, validate_image_dimensions, validate_video_dimensions
+
+
+from comfy_api_nodes.apis import (
+    MoonvalleyTextToVideoRequest,
+    MoonvalleyTextToVideoInferenceParams,
+    MoonvalleyVideoToVideoInferenceParams,
+    MoonvalleyVideoToVideoRequest,
+    MoonvalleyPromptResponse
+)
+from comfy_api_nodes.apis.client import (
+    ApiEndpoint,
+    HttpMethod,
+    SynchronousOperation,
+    PollingOperation,
+    EmptyRequest,
+)
+from comfy_api_nodes.apinode_utils import (
+    download_url_to_video_output,
+    upload_images_to_comfyapi,
+    upload_video_to_comfyapi,
+)
+from comfy_api_nodes.mapper_utils import model_field_to_node_input
+
+from comfy_api.input.video_types import VideoInput
+from comfy.comfy_types.node_typing import IO
+from comfy_api.input_impl import VideoFromFile
+import av
+import io
+
+# Proxy routes for the Moonvalley API.
+API_UPLOADS_ENDPOINT = "/proxy/moonvalley/uploads"
+API_PROMPTS_ENDPOINT = "/proxy/moonvalley/prompts"
+API_VIDEO2VIDEO_ENDPOINT = "/proxy/moonvalley/prompts/video-to-video"
+API_TXT2VIDEO_ENDPOINT = "/proxy/moonvalley/prompts/text-to-video"
+API_IMG2VIDEO_ENDPOINT = "/proxy/moonvalley/prompts/image-to-video"
+
+# Accepted input image size bounds, in pixels.
+MIN_WIDTH = 300
+MIN_HEIGHT = 300
+
+MAX_WIDTH = 10000
+MAX_HEIGHT = 10000
+
+# Accepted input video size bounds, in pixels.
+MIN_VID_WIDTH = 300
+MIN_VID_HEIGHT = 300
+
+MAX_VID_WIDTH = 10000
+MAX_VID_HEIGHT = 10000
+
+MAX_VIDEO_SIZE = 1024 * 1024 * 1024  # 1 GB max for in-memory video processing
+
+# Maximum number of characters allowed in a prompt (default for validate_prompts).
+MOONVALLEY_MAREY_MAX_PROMPT_LENGTH = 5000
+# Response type produced by the polled endpoint in poll_until_finished.
+R = TypeVar("R")
+class MoonvalleyApiError(Exception):
+    """Base exception for Moonvalley API errors."""
+    pass
+
+def is_valid_task_creation_response(response: MoonvalleyPromptResponse) -> bool:
+    """Verifies that the initial response contains a task ID."""
+    return bool(response.id)
+
+def validate_task_creation_response(response) -> None:
+    if not is_valid_task_creation_response(response):
+        error_msg = f"Moonvalley Marey API: Initial request failed. Code: {response.code}, Message: {response.message}, Data: {response}"
+        logging.error(error_msg)
+        raise MoonvalleyApiError(error_msg)
+
+def get_video_from_response(response):
+    video = response.output_url
+    logging.info(
+        "Moonvalley Marey API: Task %s succeeded. Video URL: %s", response.id, video
+    )
+    return video
+
+
+def get_video_url_from_response(response) -> Optional[str]:
+    """Returns the first video url from the Moonvalley video generation task result.
+    Will not raise an error if the response is not valid.
+    """
+    if response:
+        return str(get_video_from_response(response))
+    else:
+        return None
+
+
+def poll_until_finished(
+    auth_kwargs: dict[str, str],
+    api_endpoint: ApiEndpoint[Any, R],
+    result_url_extractor: Optional[Callable[[R], str]] = None,
+    node_id: Optional[str] = None,
+) -> R:
+    """Polls the Moonvalley API endpoint until the task reaches a terminal state, then returns the response."""
+    return PollingOperation(
+        poll_endpoint=api_endpoint,
+        completed_statuses=[
+            "completed",
+        ],
+        max_poll_attempts=240,  # 64 minutes with 16s interval
+        poll_interval=16.0,
+        failed_statuses=["error"],
+        status_extractor=lambda response: (
+            response.status
+            if response and response.status
+            else None
+        ),
+        auth_kwargs=auth_kwargs,
+        result_url_extractor=result_url_extractor,
+        node_id=node_id,
+    ).execute()
+
+def validate_prompts(prompt:str, negative_prompt: str, max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH):
+    """Verifies that the prompt isn't empty and that neither prompt is too long."""
+    if not prompt:
+        raise ValueError("Positive prompt is empty")
+    if len(prompt) > max_length:
+        raise ValueError(f"Positive prompt is too long: {len(prompt)} characters")
+    if negative_prompt and len(negative_prompt) > max_length:
+        raise ValueError(
+            f"Negative prompt is too long: {len(negative_prompt)} characters"
+        )
+    return True
+
+def validate_input_media(width, height, with_frame_conditioning, num_frames_in=None):
+        # inference validation
+        # T = num_frames
+        # in all cases, the following must be true: T divisible by 16 and H,W by 8. in addition...
+        # with image conditioning: H*W must be divisible by 8192
+        # without image conditioning: T divisible by 32
+    if num_frames_in and not num_frames_in % 16 == 0 :
+        return False, (
+            "The input video total frame count must be divisible by 16!"
+        )
+
+    if height % 8 != 0 or width % 8 != 0:
+        return False, (
+            f"Height ({height}) and width ({width}) must be " "divisible by 8"
+        )
+
+    if with_frame_conditioning:
+        if (height * width) % 8192 != 0:
+            return False, (
+                f"Height * width ({height * width}) must be "
+                "divisible by 8192 for frame conditioning"
+            )
+    else:
+        if num_frames_in and not num_frames_in % 32 == 0 :
+            return False, (
+                "The input video total frame count must be divisible by 32!"
+            )
+
+
+def validate_input_image(image: torch.Tensor, with_frame_conditioning: bool=False) -> None:
+    """
+    Validates the input image adheres to the expectations of the API:
+    - The image resolution should not be less than 300*300px
+    - The aspect ratio of the image should be between 1:2.5 ~ 2.5:1
+
+    """
+    height, width = get_image_dimensions(image)
+    validate_input_media(width, height, with_frame_conditioning )
+    validate_image_dimensions(image, min_width=300, min_height=300, max_height=MAX_HEIGHT, max_width=MAX_WIDTH)
+
+def validate_input_video(video: VideoInput, num_frames_out: int, with_frame_conditioning: bool=False):
+    try:
+        width, height = video.get_dimensions()
+    except Exception as e:
+        logging.error("Error getting dimensions of video: %s", e)
+        raise ValueError(f"Cannot get video dimensions: {e}") from e
+
+    validate_input_media(width, height, with_frame_conditioning)
+    validate_video_dimensions(video, min_width=MIN_VID_WIDTH, min_height=MIN_VID_HEIGHT, max_width=MAX_VID_WIDTH, max_height=MAX_VID_HEIGHT)
+
+    trimmed_video = validate_input_video_length(video, num_frames_out)
+    return trimmed_video
+
+
+def validate_input_video_length(video: VideoInput, num_frames: int):
+
+    if video.get_duration() > 60:
+        raise MoonvalleyApiError("Input Video lenth should be less than 1min. Please trim.")
+
+    if num_frames == 128:
+       if video.get_duration() < 5:
+           raise MoonvalleyApiError("Input Video length is less than 5s. Please use a video longer than or equal to 5s.")
+       if video.get_duration() > 5:
+        #    trim video to 5s
+        video = trim_video(video, 5)
+    if num_frames == 256:
+        if video.get_duration() < 10:
+            raise MoonvalleyApiError("Input Video length is less than 10s. Please use a video longer than or equal to 10s.")
+        if video.get_duration() > 10:
+            # trim video to 10s
+            video = trim_video(video, 10)
+    return video
+
+def trim_video(video: VideoInput, duration_sec: float) -> VideoInput:
+    """
+    Returns a new VideoInput object trimmed from the beginning to the specified duration,
+    using av to avoid loading entire video into memory.
+
+    The video is re-encoded (h264 / yuv420p video, aac audio) into an mp4 that
+    is written to an in-memory BytesIO. The number of video frames kept is
+    int(duration_sec * fps) rounded down to a multiple of 32.
+
+    Args:
+        video: Input video to trim
+        duration_sec: Duration in seconds to keep from the beginning
+
+    Returns:
+        VideoFromFile object that owns the output buffer
+
+    Raises:
+        RuntimeError: on any failure. Every exception raised inside the body,
+            including the "Video too short" ValueError, is re-raised as
+            RuntimeError with the original as its cause.
+    """
+    output_buffer = io.BytesIO()
+
+    input_container = None
+    output_container = None
+
+    try:
+        # Get the stream source - this avoids loading entire video into memory
+        # when the source is already a file path
+        input_source = video.get_stream_source()
+
+        # Open containers
+        input_container = av.open(input_source, mode='r')
+        output_container = av.open(output_buffer, mode='w', format='mp4')
+
+        # Set up output streams for re-encoding
+        video_stream = None
+        audio_stream = None
+
+        # NOTE(review): an output stream is added for every input video/audio
+        # stream, but only the last one of each kind is kept in
+        # video_stream / audio_stream and encoded below.
+        for stream in input_container.streams:
+            logging.info(f"Found stream: type={stream.type}, class={type(stream)}")
+            if isinstance(stream, av.VideoStream):
+                # Create output video stream with same parameters
+                video_stream = output_container.add_stream('h264', rate=stream.average_rate)
+                video_stream.width = stream.width
+                video_stream.height = stream.height
+                video_stream.pix_fmt = 'yuv420p'
+                logging.info(f"Added video stream: {stream.width}x{stream.height} @ {stream.average_rate}fps")
+            elif isinstance(stream, av.AudioStream):
+                # Create output audio stream with same parameters
+                audio_stream = output_container.add_stream('aac', rate=stream.sample_rate)
+                audio_stream.sample_rate = stream.sample_rate
+                audio_stream.layout = stream.layout
+                logging.info(f"Added audio stream: {stream.sample_rate}Hz, {stream.channels} channels")
+
+        # Calculate target frame count that's divisible by 32
+        # NOTE(review): indexing video[0] presumably raises when the input has
+        # no video stream; that surfaces as RuntimeError via the handler below.
+        fps = input_container.streams.video[0].average_rate
+        estimated_frames = int(duration_sec * fps)
+        target_frames = (estimated_frames // 32) * 32  # Round down to nearest multiple of 32
+
+        if target_frames == 0:
+            raise ValueError("Video too short: need at least 32 frames for Moonvalley")
+
+        frame_count = 0
+        audio_frame_count = 0
+
+        # Decode and re-encode video frames
+        # NOTE(review): all video packets are muxed before any audio packet is
+        # written -- confirm the resulting interleaving is acceptable.
+        if video_stream:
+            for frame in input_container.decode(video=0):
+                if frame_count >= target_frames:
+                    break
+
+                # Re-encode frame
+                for packet in video_stream.encode(frame):
+                    output_container.mux(packet)
+                frame_count += 1
+
+            # Flush encoder
+            for packet in video_stream.encode():
+                output_container.mux(packet)
+
+            logging.info(f"Encoded {frame_count} video frames (target: {target_frames})")
+
+        # Decode and re-encode audio frames
+        # Audio is cut by timestamp (duration_sec), not by the rounded-down
+        # video frame count, so the two tracks can differ slightly in length.
+        if audio_stream:
+            input_container.seek(0)  # Reset to beginning for audio
+            for frame in input_container.decode(audio=0):
+                if frame.time >= duration_sec:
+                    break
+
+                # Re-encode frame
+                for packet in audio_stream.encode(frame):
+                    output_container.mux(packet)
+                audio_frame_count += 1
+
+            # Flush encoder
+            for packet in audio_stream.encode():
+                output_container.mux(packet)
+
+            logging.info(f"Encoded {audio_frame_count} audio frames")
+
+        # Close containers
+        output_container.close()
+        input_container.close()
+
+
+        # Return as VideoFromFile using the buffer
+        output_buffer.seek(0)
+        return VideoFromFile(output_buffer)
+
+    except Exception as e:
+        # Clean up on error
+        if input_container is not None:
+            input_container.close()
+        if output_container is not None:
+            output_container.close()
+        raise RuntimeError(f"Failed to trim video: {str(e)}") from e
+
+# --- BaseMoonvalleyVideoNode ---
+class BaseMoonvalleyVideoNode:
+    def parseWidthHeightFromRes(self, resolution: str):
+        # Accepts a string like "16:9 (1920 x 1080)" and returns width, height as a dict
+        res_map = {
+            "16:9 (1920 x 1080)": {"width": 1920, "height": 1080},
+            "9:16 (1080 x 1920)": {"width": 1080, "height": 1920},
+            "1:1 (1152 x 1152)": {"width": 1152, "height": 1152},
+            "4:3 (1440 x 1080)": {"width": 1440, "height": 1080},
+            "3:4 (1080 x 1440)": {"width": 1080, "height": 1440},
+            "21:9 (2560 x 1080)": {"width": 2560, "height": 1080},
+        }
+        if resolution in res_map:
+            return res_map[resolution]
+        else:
+            # Default to 1920x1080 if unknown
+            return {"width": 1920, "height": 1080}
+
+    def parseControlParameter(self, value):
+        control_map = {
+            "Motion Transfer": "motion_control",
+            "Canny": "canny_control",
+            "Pose Transfer": "pose_control",
+            "Depth": "depth_control"
+        }
+        if value in control_map:
+            return control_map[value]
+        else:
+            return control_map["Motion Transfer"]
+
+    def get_response(
+        self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None
+    ) -> MoonvalleyPromptResponse:
+        return poll_until_finished(
+            auth_kwargs,
+            ApiEndpoint(
+                path=f"{API_PROMPTS_ENDPOINT}/{task_id}",
+                method=HttpMethod.GET,
+                request_model=EmptyRequest,
+                response_model=MoonvalleyPromptResponse,
+            ),
+            result_url_extractor=get_video_url_from_response,
+            node_id=node_id,
+        )
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        """Build the input schema shared by the Moonvalley video nodes.
+
+        Returns a dict with three sections: "required" (prompt, negative_prompt,
+        resolution, prompt_adherence, seed, steps), "hidden" (auth_token,
+        comfy_api_key, unique_id) and "optional" (image). A new dict is built on
+        every call, so callers may mutate the result.
+        """
+        return {
+            "required": {
+                "prompt": model_field_to_node_input(
+                    IO.STRING, MoonvalleyTextToVideoRequest, "prompt_text",
+                    multiline=True
+                ),
+                "negative_prompt": model_field_to_node_input(
+                    IO.STRING,
+                    MoonvalleyTextToVideoInferenceParams,
+                    "negative_prompt",
+                    multiline=True,
+                    default="gopro, bright, contrast, static, overexposed, bright, vignette, artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, flare, saturation, distorted, warped, wide angle, contrast, saturated, vibrant, glowing, cross dissolve, texture, videogame, saturation, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, blown out, horrible, blurry, worst quality, bad, transition, dissolve, cross-dissolve, melt, fade in, fade out, wobbly, weird, low quality, plastic, stock footage, video camera, boring, static",
+                ),
+
+                # NOTE(review): these labels are passed to parseWidthHeightFromRes by the
+                # subclasses' generate(); presumably they must stay parseable by it - confirm.
+                "resolution": (IO.COMBO, {
+                        "options": ["16:9 (1920 x 1080)",
+                                    "9:16 (1080 x 1920)",
+                                    "1:1 (1152 x 1152)",
+                                    "4:3 (1440 x 1080)",
+                                    "3:4 (1080 x 1440)",
+                                    "21:9 (2560 x 1080)"],
+                        "default": "16:9 (1920 x 1080)",
+                        "tooltip": "Resolution of the output video",
+                    }),
+                # "length": (IO.COMBO,{"options":['5s','10s'], "default": '5s'}),
+                # Exposed to the user as "prompt_adherence"; backed by the model's "guidance_scale" field.
+                "prompt_adherence": model_field_to_node_input(IO.FLOAT,MoonvalleyTextToVideoInferenceParams,"guidance_scale",default=7.0, step=1, min=1, max=20),
+                # random.randint runs each time INPUT_TYPES() is called, so the default seed differs per call.
+                "seed": model_field_to_node_input(IO.INT,MoonvalleyTextToVideoInferenceParams, "seed", default=random.randint(0, 2**32 - 1), min=0, max=4294967295, step=1, display="number", tooltip="Random seed value", control_after_generate=True),
+                "steps": model_field_to_node_input(IO.INT, MoonvalleyTextToVideoInferenceParams, "steps", default=100, min=1, max=100),
+            },
+            # NOTE(review): hidden entries look like host-populated values (auth credentials and
+            # the node's unique id) rather than user inputs - confirm against the node loader.
+            "hidden": {
+                "auth_token": "AUTH_TOKEN_COMFY_ORG",
+                "comfy_api_key": "API_KEY_COMFY_ORG",
+                "unique_id": "UNIQUE_ID",
+            },
+            "optional": {
+                "image": model_field_to_node_input(
+                    IO.IMAGE,
+                    MoonvalleyTextToVideoRequest,
+                    "image_url",
+                    tooltip="The reference image used to generate the video",
+                ),
+            }
+        }
+
+    # Node metadata. FUNCTION names the method below as this node's entry point.
+    # NOTE(review): presumably these attributes are read by the ComfyUI node loader - confirm.
+    RETURN_TYPES = ("STRING",)
+    FUNCTION = "generate"
+    CATEGORY = "api node/video/Moonvalley Marey"
+    API_NODE = True
+
+    def generate(self, **kwargs):
+        """Placeholder entry point: accepts any keyword arguments and returns None.
+
+        The Moonvalley node subclasses in this file define their own ``generate``.
+        """
+        return None
+
+# --- MoonvalleyImg2VideoNode ---
+class MoonvalleyImg2VideoNode(BaseMoonvalleyVideoNode):
+    """Image-to-video node: uploads a reference image and requests a video built from it."""
+
+    RETURN_TYPES = ("VIDEO",)
+    RETURN_NAMES = ("video",)
+    DESCRIPTION = "Moonvalley Marey Image to Video Node"
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        """Expose the base node's input schema unchanged."""
+        return super().INPUT_TYPES()
+
+    def generate(self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs):
+        """Create an image-to-video task, wait for it, and return the resulting video.
+
+        Args:
+            prompt: Positive prompt text.
+            negative_prompt: Negative prompt text.
+            unique_id: Node id forwarded to the polling helper.
+            **kwargs: Remaining node inputs ("image", "resolution", "steps", "seed",
+                "prompt_adherence"); the whole mapping is also passed as ``auth_kwargs``.
+
+        Returns:
+            A one-element tuple holding the downloaded video.
+
+        Raises:
+            MoonvalleyApiError: If no "image" input was supplied.
+        """
+        reference_image = kwargs.get("image")
+        if reference_image is None:
+            raise MoonvalleyApiError("image is required")
+
+        validate_input_image(reference_image, True)
+        validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH)
+        dimensions = self.parseWidthHeightFromRes(kwargs.get("resolution"))
+
+        params = MoonvalleyTextToVideoInferenceParams(
+            negative_prompt=negative_prompt,
+            steps=kwargs.get("steps"),
+            seed=kwargs.get("seed"),
+            guidance_scale=kwargs.get("prompt_adherence"),
+            num_frames=get_total_frames_from_length(),
+            width=dimensions.get("width"),
+            height=dimensions.get("height"),
+            use_negative_prompts=True,
+        )
+
+        # Upload image to comfy backend to have a URL available for further processing.
+        # The MIME type is fixed: image tensors are assumed to be in PNG format.
+        uploaded_urls = upload_images_to_comfyapi(
+            reference_image, max_images=1, auth_kwargs=kwargs, mime_type="image/png"
+        )
+        payload = MoonvalleyTextToVideoRequest(
+            image_url=uploaded_urls[0],
+            prompt_text=prompt,
+            inference_params=params,
+        )
+
+        create_task = SynchronousOperation(
+            endpoint=ApiEndpoint(
+                path=API_IMG2VIDEO_ENDPOINT,
+                method=HttpMethod.POST,
+                request_model=MoonvalleyTextToVideoRequest,
+                response_model=MoonvalleyPromptResponse,
+            ),
+            request=payload,
+            auth_kwargs=kwargs,
+        )
+        created = create_task.execute()
+        validate_task_creation_response(created)
+
+        finished = self.get_response(created.id, auth_kwargs=kwargs, node_id=unique_id)
+        return (download_url_to_video_output(finished.output_url), )
+
+# --- MoonvalleyVid2VidNode ---
+class MoonvalleyVideo2VideoNode(BaseMoonvalleyVideoNode):
+    """Video-to-video node: uploads a reference video and requests a re-generated video."""
+
+    RETURN_TYPES = ("VIDEO",)
+    RETURN_NAMES = ("video",)
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        """Return the base schema without "resolution"/"image" and with video controls.
+
+        The "optional" section is replaced wholesale by "video", "control_type"
+        and "motion_intensity", which also drops the base node's optional "image".
+        """
+        input_types = super().INPUT_TYPES()
+        for param in ("resolution", "image"):
+            input_types["required"].pop(param, None)
+        input_types["optional"] = {
+                "video": (IO.VIDEO, {"default": "", "multiline": False, "tooltip": "The reference video used to generate the output video. Input a 5s video for 128 frames and a 10s video for 256 frames. Longer videos will be trimmed automatically."}),
+                "control_type": (
+                    ["Motion Transfer", "Pose Transfer"],
+                    {"default": "Motion Transfer"},
+                ),
+                "motion_intensity": (
+                    "INT",
+                    {
+                        "default": 100,
+                        "step": 1,
+                        "min": 0,
+                        "max": 100,
+                        "tooltip": "Only used if control_type is 'Motion Transfer'",
+                    },
+                )
+            }
+
+        return input_types
+
+    def generate(self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs):
+        """Create a video-to-video task, wait for it, and return the resulting video.
+
+        Args:
+            prompt: Positive prompt text.
+            negative_prompt: Negative prompt text.
+            unique_id: Node id forwarded to the polling helper.
+            **kwargs: Remaining node inputs ("video", "control_type", "motion_intensity",
+                "steps", "seed", "prompt_adherence"); the whole mapping is also passed
+                as ``auth_kwargs``.
+
+        Returns:
+            A one-element tuple holding the downloaded video.
+
+        Raises:
+            MoonvalleyApiError: If no "video" input was supplied.
+        """
+        video = kwargs.get("video")
+        if not video:
+            raise MoonvalleyApiError("video is required")
+
+        # Check the prompts and build the inference parameters before the upload,
+        # so that a bad prompt or parameter fails before any video is sent.
+        # The length limit matches the one the sibling Marey nodes pass.
+        validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH)
+        inference_params = MoonvalleyVideoToVideoInferenceParams(
+            negative_prompt=negative_prompt,
+            steps=kwargs.get("steps"),
+            seed=kwargs.get("seed"),
+            guidance_scale=kwargs.get("prompt_adherence"),
+            control_params={'motion_intensity': kwargs.get("motion_intensity")},
+        )
+        control = self.parseControlParameter(kwargs.get("control_type"))
+
+        # Validate the video input, then upload it to obtain a URL for the request.
+        num_frames = get_total_frames_from_length()
+        validated_video = validate_input_video(video, num_frames, False)
+        video_url = upload_video_to_comfyapi(validated_video, auth_kwargs=kwargs)
+
+        request = MoonvalleyVideoToVideoRequest(
+            control_type=control,
+            video_url=video_url,
+            prompt_text=prompt,
+            inference_params=inference_params,
+        )
+
+        initial_operation = SynchronousOperation(
+            endpoint=ApiEndpoint(
+                path=API_VIDEO2VIDEO_ENDPOINT,
+                method=HttpMethod.POST,
+                request_model=MoonvalleyVideoToVideoRequest,
+                response_model=MoonvalleyPromptResponse,
+            ),
+            request=request,
+            auth_kwargs=kwargs,
+        )
+        task_creation_response = initial_operation.execute()
+        validate_task_creation_response(task_creation_response)
+        task_id = task_creation_response.id
+
+        final_response = self.get_response(
+            task_id, auth_kwargs=kwargs, node_id=unique_id
+        )
+        return (download_url_to_video_output(final_response.output_url), )
+
+# --- MoonvalleyTxt2VideoNode ---
+class MoonvalleyTxt2VideoNode(BaseMoonvalleyVideoNode):
+    """Text-to-video node: requests a video from the prompts alone."""
+
+    RETURN_TYPES = ("VIDEO",)
+    RETURN_NAMES = ("video",)
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        """Return the base schema with the optional "image" input removed."""
+        schema = super().INPUT_TYPES()
+        # Remove image-specific parameters
+        schema["optional"].pop("image", None)
+        return schema
+
+    def generate(self, prompt, negative_prompt, unique_id: Optional[str] = None, **kwargs):
+        """Create a text-to-video task, wait for it, and return the resulting video.
+
+        Args:
+            prompt: Positive prompt text.
+            negative_prompt: Negative prompt text.
+            unique_id: Node id forwarded to the polling helper.
+            **kwargs: Remaining node inputs ("resolution", "steps", "seed",
+                "prompt_adherence"); the whole mapping is also passed as ``auth_kwargs``.
+
+        Returns:
+            A one-element tuple holding the downloaded video.
+        """
+        validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH)
+        dimensions = self.parseWidthHeightFromRes(kwargs.get("resolution"))
+
+        params = MoonvalleyTextToVideoInferenceParams(
+            negative_prompt=negative_prompt,
+            steps=kwargs.get("steps"),
+            seed=kwargs.get("seed"),
+            guidance_scale=kwargs.get("prompt_adherence"),
+            num_frames=get_total_frames_from_length(),
+            width=dimensions.get("width"),
+            height=dimensions.get("height"),
+        )
+        payload = MoonvalleyTextToVideoRequest(
+            prompt_text=prompt,
+            inference_params=params,
+        )
+
+        create_task = SynchronousOperation(
+            endpoint=ApiEndpoint(
+                path=API_TXT2VIDEO_ENDPOINT,
+                method=HttpMethod.POST,
+                request_model=MoonvalleyTextToVideoRequest,
+                response_model=MoonvalleyPromptResponse,
+            ),
+            request=payload,
+            auth_kwargs=kwargs,
+        )
+        created = create_task.execute()
+        validate_task_creation_response(created)
+
+        finished = self.get_response(created.id, auth_kwargs=kwargs, node_id=unique_id)
+        return (download_url_to_video_output(finished.output_url), )
+
+
+
+# Maps node identifiers to the classes defined in this module.
+# The video-to-video entry is commented out, so that node is not part of this mapping.
+NODE_CLASS_MAPPINGS = {
+    "MoonvalleyImg2VideoNode": MoonvalleyImg2VideoNode,
+    "MoonvalleyTxt2VideoNode": MoonvalleyTxt2VideoNode,
+    # "MoonvalleyVideo2VideoNode": MoonvalleyVideo2VideoNode,
+}
+
+
+# Human-readable names for the same node identifiers as NODE_CLASS_MAPPINGS.
+NODE_DISPLAY_NAME_MAPPINGS = {
+    "MoonvalleyImg2VideoNode": "Moonvalley Marey Image to Video",
+    "MoonvalleyTxt2VideoNode": "Moonvalley Marey Text to Video",
+    # "MoonvalleyVideo2VideoNode": "Moonvalley Marey Video to Video",
+}
+
+def get_total_frames_from_length(length="5s"):
+    """Return the number of frames to request.
+
+    ``length`` is accepted but currently ignored: the result is always 128.
+    A per-length mapping ('5s' -> 128, '10s' -> 256, otherwise an error) is
+    not active.
+    """
+    total_frames = 128
+    return total_frames
--- a/comfy_execution/caching.py
+++ b/comfy_execution/caching.py
@ -1,6 +1,7 @@
 import itertools
 from typing import Sequence, Mapping, Dict
 from comfy_execution.graph import DynamicPrompt
+from abc import ABC, abstractmethod

 import nodes

@ -16,12 +17,13 @@ def include_unique_id_in_input(class_type: str) -> bool:
    NODE_CLASS_CONTAINS_UNIQUE_ID[class_type] = "UNIQUE_ID" in class_def.INPUT_TYPES().get("hidden", {}).values()
    return NODE_CLASS_CONTAINS_UNIQUE_ID[class_type]

-class CacheKeySet:
+class CacheKeySet(ABC):
    def __init__(self, dynprompt, node_ids, is_changed_cache):
        self.keys = {}
        self.subcache_keys = {}

-    def add_keys(self, node_ids):
+    @abstractmethod
+    async def add_keys(self, node_ids):
        raise NotImplementedError()

    def all_node_ids(self):
@ -60,9 +62,8 @@ class CacheKeySetID(CacheKeySet):
    def __init__(self, dynprompt, node_ids, is_changed_cache):
        super().__init__(dynprompt, node_ids, is_changed_cache)
        self.dynprompt = dynprompt
-        self.add_keys(node_ids)

-    def add_keys(self, node_ids):
+    async def add_keys(self, node_ids):
        for node_id in node_ids:
            if node_id in self.keys:
                continue
@ -77,37 +78,36 @@ class CacheKeySetInputSignature(CacheKeySet):
        super().__init__(dynprompt, node_ids, is_changed_cache)
        self.dynprompt = dynprompt
        self.is_changed_cache = is_changed_cache
-        self.add_keys(node_ids)

    def include_node_id_in_input(self) -> bool:
        return False

-    def add_keys(self, node_ids):
+    async def add_keys(self, node_ids):
        for node_id in node_ids:
            if node_id in self.keys:
                continue
            if not self.dynprompt.has_node(node_id):
                continue
            node = self.dynprompt.get_node(node_id)
-            self.keys[node_id] = self.get_node_signature(self.dynprompt, node_id)
+            self.keys[node_id] = await self.get_node_signature(self.dynprompt, node_id)
            self.subcache_keys[node_id] = (node_id, node["class_type"])

-    def get_node_signature(self, dynprompt, node_id):
+    async def get_node_signature(self, dynprompt, node_id):
        signature = []
        ancestors, order_mapping = self.get_ordered_ancestry(dynprompt, node_id)
-        signature.append(self.get_immediate_node_signature(dynprompt, node_id, order_mapping))
+        signature.append(await self.get_immediate_node_signature(dynprompt, node_id, order_mapping))
        for ancestor_id in ancestors:
-            signature.append(self.get_immediate_node_signature(dynprompt, ancestor_id, order_mapping))
+            signature.append(await self.get_immediate_node_signature(dynprompt, ancestor_id, order_mapping))
        return to_hashable(signature)

-    def get_immediate_node_signature(self, dynprompt, node_id, ancestor_order_mapping):
+    async def get_immediate_node_signature(self, dynprompt, node_id, ancestor_order_mapping):
        if not dynprompt.has_node(node_id):
            # This node doesn't exist -- we can't cache it.
            return [float("NaN")]
        node = dynprompt.get_node(node_id)
        class_type = node["class_type"]
        class_def = nodes.NODE_CLASS_MAPPINGS[class_type]
-        signature = [class_type, self.is_changed_cache.get(node_id)]
+        signature = [class_type, await self.is_changed_cache.get(node_id)]
        if self.include_node_id_in_input() or (hasattr(class_def, "NOT_IDEMPOTENT") and class_def.NOT_IDEMPOTENT) or include_unique_id_in_input(class_type):
            signature.append(node_id)
        inputs = node["inputs"]
@ -150,9 +150,10 @@ class BasicCache:
        self.cache = {}
        self.subcaches = {}

-    def set_prompt(self, dynprompt, node_ids, is_changed_cache):
+    async def set_prompt(self, dynprompt, node_ids, is_changed_cache):
        self.dynprompt = dynprompt
        self.cache_key_set = self.key_class(dynprompt, node_ids, is_changed_cache)
+        await self.cache_key_set.add_keys(node_ids)
        self.is_changed_cache = is_changed_cache
        self.initialized = True

@ -201,13 +202,13 @@ class BasicCache:
        else:
            return None

-    def _ensure_subcache(self, node_id, children_ids):
+    async def _ensure_subcache(self, node_id, children_ids):
        subcache_key = self.cache_key_set.get_subcache_key(node_id)
        subcache = self.subcaches.get(subcache_key, None)
        if subcache is None:
            subcache = BasicCache(self.key_class)
            self.subcaches[subcache_key] = subcache
-        subcache.set_prompt(self.dynprompt, children_ids, self.is_changed_cache)
+        await subcache.set_prompt(self.dynprompt, children_ids, self.is_changed_cache)
        return subcache

    def _get_subcache(self, node_id):
@ -259,10 +260,10 @@ class HierarchicalCache(BasicCache):
        assert cache is not None
        cache._set_immediate(node_id, value)

-    def ensure_subcache_for(self, node_id, children_ids):
+    async def ensure_subcache_for(self, node_id, children_ids):
        cache = self._get_cache_for(node_id)
        assert cache is not None
-        return cache._ensure_subcache(node_id, children_ids)
+        return await cache._ensure_subcache(node_id, children_ids)

 class LRUCache(BasicCache):
    def __init__(self, key_class, max_size=100):
@ -273,8 +274,8 @@ class LRUCache(BasicCache):
        self.used_generation = {}
        self.children = {}

-    def set_prompt(self, dynprompt, node_ids, is_changed_cache):
-        super().set_prompt(dynprompt, node_ids, is_changed_cache)
+    async def set_prompt(self, dynprompt, node_ids, is_changed_cache):
+        await super().set_prompt(dynprompt, node_ids, is_changed_cache)
        self.generation += 1
        for node_id in node_ids:
            self._mark_used(node_id)
@ -303,11 +304,11 @@ class LRUCache(BasicCache):
        self._mark_used(node_id)
        return self._set_immediate(node_id, value)

-    def ensure_subcache_for(self, node_id, children_ids):
+    async def ensure_subcache_for(self, node_id, children_ids):
        # Just uses subcaches for tracking 'live' nodes
-        super()._ensure_subcache(node_id, children_ids)
+        await super()._ensure_subcache(node_id, children_ids)

-        self.cache_key_set.add_keys(children_ids)
+        await self.cache_key_set.add_keys(children_ids)
        self._mark_used(node_id)
        cache_key = self.cache_key_set.get_data_key(node_id)
        self.children[cache_key] = []
@ -337,7 +338,7 @@ class DependencyAwareCache(BasicCache):
        self.ancestors = {}    # Maps node_id -> set of ancestor node_ids
        self.executed_nodes = set()  # Tracks nodes that have been executed

-    def set_prompt(self, dynprompt, node_ids, is_changed_cache):
+    async def set_prompt(self, dynprompt, node_ids, is_changed_cache):
        """
        Clear the entire cache and rebuild the dependency graph.

@ -354,7 +355,7 @@ class DependencyAwareCache(BasicCache):
        self.executed_nodes.clear()

        # Call the parent method to initialize the cache with the new prompt
-        super().set_prompt(dynprompt, node_ids, is_changed_cache)
+        await super().set_prompt(dynprompt, node_ids, is_changed_cache)

        # Rebuild the dependency graph
        self._build_dependency_graph(dynprompt, node_ids)
@ -405,7 +406,7 @@ class DependencyAwareCache(BasicCache):
        """
        return self._get_immediate(node_id)

-    def ensure_subcache_for(self, node_id, children_ids):
+    async def ensure_subcache_for(self, node_id, children_ids):
        """
        Ensure a subcache exists for a node and update dependencies.

@ -416,7 +417,7 @@ class DependencyAwareCache(BasicCache):
        Returns:
            The subcache object for the node.
        """
-        subcache = super()._ensure_subcache(node_id, children_ids)
+        subcache = await super()._ensure_subcache(node_id, children_ids)
        for child_id in children_ids:
            self.descendants[node_id].add(child_id)
            self.ancestors[child_id].add(node_id)
--- a/comfy_execution/graph.py
+++ b/comfy_execution/graph.py
@ -2,6 +2,8 @@ from __future__ import annotations
 from typing import Type, Literal

 import nodes
+import asyncio
+import inspect
 from comfy_execution.graph_utils import is_link
 from comfy.comfy_types.node_typing import ComfyNodeABC, InputTypeDict, InputTypeOptions

@ -100,6 +102,8 @@ class TopologicalSort:
        self.pendingNodes = {}
        self.blockCount = {} # Number of nodes this node is directly blocked by
        self.blocking = {} # Which nodes are blocked by this node
+        self.externalBlocks = 0
+        self.unblockedEvent = asyncio.Event()

    def get_input_info(self, unique_id, input_name):
        class_type = self.dynprompt.get_node(unique_id)["class_type"]
@ -153,6 +157,16 @@ class TopologicalSort:
        for link in links:
            self.add_strong_link(*link)

+    def add_external_block(self, node_id):
+        """Add an external block to ``node_id`` and return a callable that releases it.
+
+        Increments both ``externalBlocks`` and ``blockCount[node_id]``. The returned
+        ``unblock`` callable decrements both again and sets ``unblockedEvent``.
+        ``unblock`` has no guard against repeated calls: each call decrements again.
+
+        The node must already have an entry in ``blockCount`` (checked with ``assert``).
+        """
+        assert node_id in self.blockCount, "Can't add external block to a node that isn't pending"
+        self.externalBlocks += 1
+        self.blockCount[node_id] += 1
+        def unblock():
+            self.externalBlocks -= 1
+            self.blockCount[node_id] -= 1
+            # Wake any waiter on the event (ExecutionList.stage_node_execution awaits it
+            # while external blocks are outstanding).
+            self.unblockedEvent.set()
+        return unblock
+
    def is_cached(self, node_id):
        return False

@ -181,11 +195,16 @@ class ExecutionList(TopologicalSort):
    def is_cached(self, node_id):
        return self.output_cache.get(node_id) is not None

-    def stage_node_execution(self):
+    async def stage_node_execution(self):
        assert self.staged_node_id is None
        if self.is_empty():
            return None, None, None
        available = self.get_ready_nodes()
+        while len(available) == 0 and self.externalBlocks > 0:
+            # Wait for an external block to be released
+            await self.unblockedEvent.wait()
+            self.unblockedEvent.clear()
+            available = self.get_ready_nodes()
        if len(available) == 0:
            cycled_nodes = self.get_nodes_in_cycle()
            # Because cycles composed entirely of static nodes are caught during initial validation,
@ -221,8 +240,15 @@ class ExecutionList(TopologicalSort):
                return True
            return False

+        # If an available node is async, do that first.
+        # This will execute the asynchronous function earlier, reducing the overall time.
+        def is_async(node_id):
+            class_type = self.dynprompt.get_node(node_id)["class_type"]
+            class_def = nodes.NODE_CLASS_MAPPINGS[class_type]
+            return inspect.iscoroutinefunction(getattr(class_def, class_def.FUNCTION))
+
        for node_id in node_list:
-            if is_output(node_id):
+            if is_output(node_id) or is_async(node_id):
                return node_id

        #This should handle the VAEDecode -> preview case
--- a/comfy_execution/progress.py
+++ b/comfy_execution/progress.py
@ -0,0 +1,347 @@
+from typing import TypedDict, Dict, Optional
+from typing_extensions import override
+from PIL import Image
+from enum import Enum
+from abc import ABC
+from tqdm import tqdm
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from comfy_execution.graph import DynamicPrompt
+from protocol import BinaryEventTypes
+from comfy_api import feature_flags
+
+
+class NodeState(Enum):
+    """States a node's progress entry can be in; the values are the strings sent to clients."""
+    # Initial state of a newly created registry entry.
+    Pending = "pending"
+    # Set by ProgressRegistry.start_progress and update_progress.
+    Running = "running"
+    # Set by ProgressRegistry.finish_progress.
+    Finished = "finished"
+    # Not assigned anywhere in this module.
+    Error = "error"
+
+
+class NodeProgressState(TypedDict):
+    """
+    Progress record for a single node, as stored in ProgressRegistry.nodes.
+    """
+
+    # Current state of the node (see NodeState).
+    state: NodeState
+    # Progress reported so far; ProgressRegistry.finish_progress sets it equal to ``max``.
+    value: float
+    # Progress value at which the node is complete.
+    max: float
+
+
+class ProgressHandler(ABC):
+    """
+    Base class for objects that are notified about node progress.
+
+    Every notification hook is a no-op here; subclasses override the ones they
+    need. Each handler has a ``name`` (the key ProgressRegistry stores it under)
+    and an ``enabled`` flag that starts out True.
+    """
+
+    def __init__(self, name: str):
+        self.name, self.enabled = name, True
+
+    def set_registry(self, registry: "ProgressRegistry"):
+        """Receive the registry this handler is attached to; ignored by default."""
+
+    def start_handler(self, node_id: str, state: NodeProgressState, prompt_id: str):
+        """Hook for the start of a node's processing; no-op by default."""
+
+    def update_handler(
+        self,
+        node_id: str,
+        value: float,
+        max_value: float,
+        state: NodeProgressState,
+        prompt_id: str,
+        image: Optional[Image.Image] = None,
+    ):
+        """Hook for a progress update on a node; no-op by default."""
+
+    def finish_handler(self, node_id: str, state: NodeProgressState, prompt_id: str):
+        """Hook for the end of a node's processing; no-op by default."""
+
+    def reset(self):
+        """Hook for a reset of the progress registry; no-op by default."""
+
+    def enable(self):
+        """Turn this handler on."""
+        self.enabled = True
+
+    def disable(self):
+        """Turn this handler off."""
+        self.enabled = False
+
+
+class CLIProgressHandler(ProgressHandler):
+    """
+    Progress handler that renders one tqdm bar per node on the command line.
+    """
+
+    def __init__(self):
+        super().__init__("cli")
+        self.progress_bars: Dict[str, tqdm] = {}
+
+    def _open_bar(self, node_id: str, total: float) -> tqdm:
+        """Create a bar for ``node_id``, store it in ``progress_bars`` and return it."""
+        bar = tqdm(
+            total=total,
+            desc=f"Node {node_id}",
+            unit="steps",
+            leave=True,
+            # Position is the number of bars that are open before this one is added.
+            position=len(self.progress_bars),
+        )
+        self.progress_bars[node_id] = bar
+        return bar
+
+    @override
+    def start_handler(self, node_id: str, state: NodeProgressState, prompt_id: str):
+        # Open a bar unless this node already has one.
+        if node_id not in self.progress_bars:
+            self._open_bar(node_id, state["max"])
+
+    @override
+    def update_handler(
+        self,
+        node_id: str,
+        value: float,
+        max_value: float,
+        state: NodeProgressState,
+        prompt_id: str,
+        image: Optional[Image.Image] = None,
+    ):
+        bar = self.progress_bars.get(node_id)
+        if bar is None:
+            # start_handler was never called for this node: open the bar now
+            # and advance it straight to the reported value.
+            self._open_bar(node_id, max_value).update(value)
+            return
+
+        if bar.total != max_value:
+            bar.total = max_value
+        # tqdm advances by increments, so convert the absolute value into a delta.
+        delta = value - bar.n
+        if delta > 0:
+            bar.update(delta)
+
+    @override
+    def finish_handler(self, node_id: str, state: NodeProgressState, prompt_id: str):
+        bar = self.progress_bars.get(node_id)
+        if bar is None:
+            return
+        # Fill the bar up to its maximum before closing it.
+        missing = state["max"] - bar.n
+        if missing > 0:
+            bar.update(missing)
+        bar.close()
+        del self.progress_bars[node_id]
+
+    @override
+    def reset(self):
+        # Close every open bar, then forget them all.
+        for open_bar in self.progress_bars.values():
+            open_bar.close()
+        self.progress_bars.clear()
+
+
+class WebUIProgressHandler(ProgressHandler):
+    """
+    Handler that sends progress updates to the WebUI via WebSockets.
+
+    The handler needs a registry (see ``set_registry``) to know the state of all
+    nodes; until one is set, the callbacks send nothing.
+    """
+
+    def __init__(self, server_instance):
+        super().__init__("webui")
+        self.server_instance = server_instance
+        # Initialised here so the callbacks can test it before set_registry() runs,
+        # instead of failing with AttributeError.
+        self.registry: Optional["ProgressRegistry"] = None
+
+    def set_registry(self, registry: "ProgressRegistry"):
+        """Attach the registry whose node states and dynprompt this handler reports."""
+        self.registry = registry
+
+    def _node_id_fields(self, node_id: str) -> dict:
+        """Return the display/parent/real id fields that the registry's dynprompt reports for ``node_id``."""
+        dynprompt = self.registry.dynprompt
+        return {
+            "display_node_id": dynprompt.get_display_node_id(node_id),
+            "parent_node_id": dynprompt.get_parent_node_id(node_id),
+            "real_node_id": dynprompt.get_real_node_id(node_id),
+        }
+
+    def _send_progress_state(self, prompt_id: str, nodes: Dict[str, NodeProgressState]):
+        """Send the current progress state to the client"""
+        if self.server_instance is None:
+            return
+
+        # Only send info for non-pending nodes
+        active_nodes = {
+            node_id: {
+                "value": state["value"],
+                "max": state["max"],
+                "state": state["state"].value,
+                "node_id": node_id,
+                "prompt_id": prompt_id,
+                **self._node_id_fields(node_id),
+            }
+            for node_id, state in nodes.items()
+            if state["state"] != NodeState.Pending
+        }
+
+        # Send a combined progress_state message with all node states
+        self.server_instance.send_sync(
+            "progress_state", {"prompt_id": prompt_id, "nodes": active_nodes}
+        )
+
+    def _broadcast_registry_state(self, prompt_id: str) -> None:
+        """Send the state of all registry nodes, if a registry has been attached."""
+        if self.registry is not None:
+            self._send_progress_state(prompt_id, self.registry.nodes)
+
+    @override
+    def start_handler(self, node_id: str, state: NodeProgressState, prompt_id: str):
+        self._broadcast_registry_state(prompt_id)
+
+    @override
+    def update_handler(
+        self,
+        node_id: str,
+        value: float,
+        max_value: float,
+        state: NodeProgressState,
+        prompt_id: str,
+        image: Optional[Image.Image] = None,
+    ):
+        self._broadcast_registry_state(prompt_id)
+
+        if not image:
+            return
+        # A preview needs both a server to send through and a registry to
+        # resolve the node ids; without either there is nothing to send.
+        if self.server_instance is None or self.registry is None:
+            return
+        # Only send new format if client supports it
+        if not feature_flags.supports_feature(
+            self.server_instance.sockets_metadata,
+            self.server_instance.client_id,
+            "supports_preview_metadata",
+        ):
+            return
+
+        metadata = {
+            "node_id": node_id,
+            "prompt_id": prompt_id,
+            **self._node_id_fields(node_id),
+        }
+        self.server_instance.send_sync(
+            BinaryEventTypes.PREVIEW_IMAGE_WITH_METADATA,
+            (image, metadata),
+            self.server_instance.client_id,
+        )
+
+    @override
+    def finish_handler(self, node_id: str, state: NodeProgressState, prompt_id: str):
+        self._broadcast_registry_state(prompt_id)
+
+
+class ProgressRegistry:
+    """
+    Holds the progress entry of every node for one prompt and forwards each
+    change to the registered handlers that are enabled.
+    """
+
+    def __init__(self, prompt_id: str, dynprompt: "DynamicPrompt"):
+        self.prompt_id = prompt_id
+        self.dynprompt = dynprompt
+        self.nodes: Dict[str, NodeProgressState] = {}
+        self.handlers: Dict[str, ProgressHandler] = {}
+
+    def _enabled_handlers(self):
+        """Lazily yield the registered handlers whose ``enabled`` flag is set."""
+        for candidate in self.handlers.values():
+            if candidate.enabled:
+                yield candidate
+
+    def register_handler(self, handler: ProgressHandler) -> None:
+        """Store ``handler`` under its name, replacing any handler with the same name."""
+        self.handlers[handler.name] = handler
+
+    def unregister_handler(self, handler_name: str) -> None:
+        """Reset and remove the named handler; unknown names are ignored."""
+        if handler_name not in self.handlers:
+            return
+        # Give the handler a chance to release its resources before dropping it.
+        self.handlers[handler_name].reset()
+        del self.handlers[handler_name]
+
+    def enable_handler(self, handler_name: str) -> None:
+        """Enable the named handler; unknown names are ignored."""
+        if handler_name not in self.handlers:
+            return
+        self.handlers[handler_name].enable()
+
+    def disable_handler(self, handler_name: str) -> None:
+        """Disable the named handler; unknown names are ignored."""
+        if handler_name not in self.handlers:
+            return
+        self.handlers[handler_name].disable()
+
+    def ensure_entry(self, node_id: str) -> NodeProgressState:
+        """Return the node's entry, creating a pending 0/1 entry if there is none."""
+        try:
+            return self.nodes[node_id]
+        except KeyError:
+            fresh = NodeProgressState(state=NodeState.Pending, value=0, max=1)
+            self.nodes[node_id] = fresh
+            return fresh
+
+    def start_progress(self, node_id: str) -> None:
+        """Mark the node as running at 0.0 of 1.0 and notify the enabled handlers."""
+        entry = self.ensure_entry(node_id)
+        entry["state"] = NodeState.Running
+        entry["value"] = 0.0
+        entry["max"] = 1.0
+
+        for listener in self._enabled_handlers():
+            listener.start_handler(node_id, entry, self.prompt_id)
+
+    def update_progress(
+        self, node_id: str, value: float, max_value: float, image: Optional[Image.Image]
+    ) -> None:
+        """Record a new value/max for the running node and notify the enabled handlers."""
+        entry = self.ensure_entry(node_id)
+        entry["state"] = NodeState.Running
+        entry["value"] = value
+        entry["max"] = max_value
+
+        for listener in self._enabled_handlers():
+            listener.update_handler(
+                node_id, value, max_value, entry, self.prompt_id, image
+            )
+
+    def finish_progress(self, node_id: str) -> None:
+        """Mark the node as finished at its maximum and notify the enabled handlers."""
+        entry = self.ensure_entry(node_id)
+        entry["state"] = NodeState.Finished
+        entry["value"] = entry["max"]
+
+        for listener in self._enabled_handlers():
+            listener.finish_handler(node_id, entry, self.prompt_id)
+
+    def reset_handlers(self) -> None:
+        """Call ``reset`` on every registered handler, enabled or not."""
+        for registered in self.handlers.values():
+            registered.reset()
+
+# Global registry instance. Stays None until reset_progress_state() or
+# get_progress_state() creates a registry.
+global_progress_registry: Optional[ProgressRegistry] = None
+
+def reset_progress_state(prompt_id: str, dynprompt: "DynamicPrompt") -> None:
+    """Replace the global registry with a new one for ``prompt_id``.
+
+    The handlers of the outgoing registry, if any, are reset first. The new
+    registry starts with no handlers registered.
+    """
+    global global_progress_registry
+
+    outgoing = global_progress_registry
+    if outgoing is not None:
+        outgoing.reset_handlers()
+
+    global_progress_registry = ProgressRegistry(prompt_id, dynprompt)
+
+
+def add_progress_handler(handler: ProgressHandler) -> None:
+    """Attach ``handler`` to the current global registry.
+
+    The handler is handed the registry first and then registered with it.
+    """
+    active_registry = get_progress_state()
+    handler.set_registry(active_registry)
+    active_registry.register_handler(handler)
+
+
+def get_progress_state() -> ProgressRegistry:
+    """Return the global registry, creating one on first use.
+
+    The registry created on first use has an empty prompt id and a
+    DynamicPrompt built from an empty dict.
+    """
+    global global_progress_registry
+    if global_progress_registry is not None:
+        return global_progress_registry
+
+    # Imported here rather than at module level (the module-level import of
+    # DynamicPrompt is under TYPE_CHECKING only).
+    from comfy_execution.graph import DynamicPrompt
+
+    global_progress_registry = ProgressRegistry(
+        prompt_id="", dynprompt=DynamicPrompt({})
+    )
+    return global_progress_registry
--- a/comfy_execution/utils.py
+++ b/comfy_execution/utils.py
@ -0,0 +1,46 @@
+import contextvars
+from typing import Optional, NamedTuple
+
+class ExecutionContext(NamedTuple):
+    """
+    Context information about the currently executing node.
+
+    Attributes:
+        prompt_id: The ID of the prompt the node is being executed for
+        node_id: The ID of the currently executing node
+        list_index: The index in a list being processed (for operations on batches/lists);
+            may be None
+    """
+    prompt_id: str
+    node_id: str
+    list_index: Optional[int]
+
+current_executing_context: contextvars.ContextVar[Optional[ExecutionContext]] = contextvars.ContextVar("current_executing_context", default=None)
+
+def get_executing_context() -> Optional[ExecutionContext]:
+    """Return the execution context active for the caller, or ``None``.
+
+    ``None`` is returned when no context has been set.
+    """
+    # The context variable is declared with default=None, so a plain get()
+    # yields None when nothing has been set.
+    return current_executing_context.get()
+
+class CurrentNodeContext:
+    """
+    Context manager for setting the current executing node context.
+
+    Sets the current_executing_context on enter and resets it on exit.
+
+    Example:
+        with CurrentNodeContext(prompt_id="abc", node_id="123", list_index=0):
+            # Code that should run with the current node context set
+            process_image()
+    """
+    def __init__(self, prompt_id: str, node_id: str, list_index: Optional[int] = None):
+        # The context value is built once here and installed on every __enter__.
+        self.context = ExecutionContext(
+            prompt_id= prompt_id,
+            node_id= node_id,
+            list_index= list_index
+        )
+        # Token returned by ContextVar.set(); None until __enter__ has run.
+        self.token = None
+
+    def __enter__(self):
+        # Keep the token so __exit__ can restore the previous value.
+        self.token = current_executing_context.set(self.context)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # Restores the value that was active before __enter__; exceptions from
+        # the with-body are not suppressed (nothing truthy is returned).
+        if self.token is not None:
+            current_executing_context.reset(self.token)
--- a/comfy_extras/nodes_custom_sampler.py
+++ b/comfy_extras/nodes_custom_sampler.py
@ -2,6 +2,7 @@ import math
 import comfy.samplers
 import comfy.sample
 from comfy.k_diffusion import sampling as k_diffusion_sampling
+from comfy.k_diffusion import sa_solver
 from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict
 import latent_preview
 import torch
@ -521,6 +522,49 @@ class SamplerER_SDE(ComfyNodeABC):
        return (sampler,)


+class SamplerSASolver(ComfyNodeABC):
+    """Node that builds an ``sa_solver`` sampler object from user settings."""
+
+    @classmethod
+    def INPUT_TYPES(cls) -> InputTypeDict:
+        """Describe the node inputs; all of them are required."""
+        required_inputs = {
+            "model": (IO.MODEL, {}),
+            "eta": (IO.FLOAT, {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01, "round": False},),
+            "sde_start_percent": (IO.FLOAT, {"default": 0.2, "min": 0.0, "max": 1.0, "step": 0.001},),
+            "sde_end_percent": (IO.FLOAT, {"default": 0.8, "min": 0.0, "max": 1.0, "step": 0.001},),
+            "s_noise": (IO.FLOAT, {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.01, "round": False},),
+            "predictor_order": (IO.INT, {"default": 3, "min": 1, "max": 6}),
+            "corrector_order": (IO.INT, {"default": 4, "min": 0, "max": 6}),
+            "use_pece": (IO.BOOLEAN, {}),
+            "simple_order_2": (IO.BOOLEAN, {}),
+        }
+        return {"required": required_inputs}
+
+    RETURN_TYPES = (IO.SAMPLER,)
+    CATEGORY = "sampling/custom_sampling/samplers"
+
+    FUNCTION = "get_sampler"
+
+    def get_sampler(self, model, eta, sde_start_percent, sde_end_percent, s_noise, predictor_order, corrector_order, use_pece, simple_order_2):
+        """Create the sampler.
+
+        The two SDE percentages are converted to sigmas with the model's
+        ``model_sampling`` object and, together with ``eta``, turned into a
+        tau function; the remaining settings are forwarded unchanged as
+        sampler options.
+
+        Returns:
+            A one-element tuple holding the sampler.
+        """
+        sampling = model.get_model_object("model_sampling")
+        sigma_start = sampling.percent_to_sigma(sde_start_percent)
+        sigma_end = sampling.percent_to_sigma(sde_end_percent)
+
+        sampler_options = {
+            "tau_func": sa_solver.get_tau_interval_func(sigma_start, sigma_end, eta=eta),
+            "s_noise": s_noise,
+            "predictor_order": predictor_order,
+            "corrector_order": corrector_order,
+            "use_pece": use_pece,
+            "simple_order_2": simple_order_2,
+        }
+        return (comfy.samplers.ksampler("sa_solver", sampler_options),)
+
+
 class Noise_EmptyNoise:
    def __init__(self):
        self.seed = 0
@ -829,6 +873,7 @@ NODE_CLASS_MAPPINGS = {
    "SamplerDPMPP_2S_Ancestral": SamplerDPMPP_2S_Ancestral,
    "SamplerDPMAdaptative": SamplerDPMAdaptative,
    "SamplerER_SDE": SamplerER_SDE,
+    "SamplerSASolver": SamplerSASolver,
    "SplitSigmas": SplitSigmas,
    "SplitSigmasDenoise": SplitSigmasDenoise,
    "FlipSigmas": FlipSigmas,
--- a/comfy_extras/nodes_fresca.py
+++ b/comfy_extras/nodes_fresca.py
@ -71,8 +71,11 @@ class FreSca:
    DESCRIPTION = "Applies frequency-dependent scaling to the guidance"
    def patch(self, model, scale_low, scale_high, freq_cutoff):
        def custom_cfg_function(args):
-            cond = args["conds_out"][0]
-            uncond = args["conds_out"][1]
+            conds_out = args["conds_out"]
+            if len(conds_out) <= 1 or None in args["conds"][:2]:
+                return conds_out
+            cond = conds_out[0]
+            uncond = conds_out[1]

            guidance = cond - uncond
            filtered_guidance = Fourier_filter(
@ -83,7 +86,7 @@ class FreSca:
            )
            filtered_cond = filtered_guidance + uncond

-            return [filtered_cond, uncond]
+            return [filtered_cond, uncond] + conds_out[2:]

        m = model.clone()
        m.set_model_sampler_pre_cfg_function(custom_cfg_function)
--- a/comfy_extras/nodes_train.py
+++ b/comfy_extras/nodes_train.py
@ -23,38 +23,78 @@ from comfy.comfy_types.node_typing import IO
 from comfy.weight_adapter import adapters


+def make_batch_extra_option_dict(d, indicies, full_size=None):
+    """Return a copy of ``d`` with per-sample values restricted to ``indicies``.
+
+    Nested dicts are processed recursively. A tensor is indexed with
+    ``indicies`` when ``full_size`` is None or its first dimension equals
+    ``full_size``. A list or tuple is subset (into a new list) only when its
+    length equals ``full_size``. Every other value is passed through as is.
+    """
+    def pick(value):
+        if isinstance(value, dict):
+            return make_batch_extra_option_dict(value, indicies, full_size=full_size)
+        if isinstance(value, torch.Tensor):
+            if full_size is None or value.size(0) == full_size:
+                return value[indicies]
+            return value
+        if isinstance(value, (list, tuple)) and len(value) == full_size:
+            return [value[idx] for idx in indicies]
+        return value
+
+    return {key: pick(value) for key, value in d.items()}
+
+
 class TrainSampler(comfy.samplers.Sampler):
+    # Sampler whose sample() runs an optimisation loop (forward pass, loss,
+    # backward, optimizer step) for total_steps iterations.

-    def __init__(self, loss_fn, optimizer, loss_callback=None):
+    def __init__(self, loss_fn, optimizer, loss_callback=None, batch_size=1, total_steps=1, seed=0, training_dtype=torch.bfloat16):
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.loss_callback = loss_callback
+        # batch_size: number of dataset entries drawn per step
+        # total_steps: number of optimisation steps run by sample()
+        # seed: base seed for the per-step noise generators
+        # training_dtype: dtype used for autocast around the forward pass
+        self.batch_size = batch_size
+        self.total_steps = total_steps
+        self.seed = seed
+        self.training_dtype = training_dtype

    def sample(self, model_wrap, sigmas, extra_args, callback, noise, latent_image=None, denoise_mask=None, disable_pbar=False):
-        self.optimizer.zero_grad()
-        noise = model_wrap.inner_model.model_sampling.noise_scaling(sigmas, noise, latent_image, False)
-        latent = model_wrap.inner_model.model_sampling.noise_scaling(
-            torch.zeros_like(sigmas),
-            torch.zeros_like(noise, requires_grad=True),
-            latent_image,
-            False
-        )
+        # Full conditioning list; a per-batch subset is written back each step.
+        cond = model_wrap.conds["positive"]
+        # Only the length of `sigmas` is used here: it gives the dataset size.
+        dataset_size = sigmas.size(0)
+        torch.cuda.empty_cache()
+        for i in (pbar:=tqdm.trange(self.total_steps, desc="Training LoRA", smoothing=0.01, disable=not comfy.utils.PROGRESS_BAR_ENABLED)):
+            # New noise generator every step, seeded from the base seed and step index.
+            noisegen = comfy_extras.nodes_custom_sampler.Noise_RandomNoise(self.seed + i * 1000)
+            # Up to batch_size distinct dataset indices in random order.
+            indicies = torch.randperm(dataset_size)[:self.batch_size].tolist()

-        # Ensure model is in training mode and computing gradients
-        # x0 pred
-        denoised = model_wrap(noise, sigmas, **extra_args)
-        try:
-            loss = self.loss_fn(denoised, latent.clone())
-        except RuntimeError as e:
-            if "does not require grad and does not have a grad_fn" in str(e):
-                logging.info("WARNING: This is likely due to the model is loaded in inference mode.")
-        loss.backward()
-        if self.loss_callback:
-            self.loss_callback(loss.item())
+            batch_latent = torch.stack([latent_image[i] for i in indicies])
+            batch_noise = noisegen.generate_noise({"samples": batch_latent}).to(batch_latent.device)
+            # One sigma per batch element, each from an independent uniform random percent.
+            batch_sigmas = [
+                model_wrap.inner_model.model_sampling.percent_to_sigma(
+                    torch.rand((1,)).item()
+                ) for _ in range(min(self.batch_size, dataset_size))
+            ]
+            batch_sigmas = torch.tensor(batch_sigmas).to(batch_latent.device)

-        self.optimizer.step()
-        # torch.cuda.memory._dump_snapshot("trainn.pickle")
-        # torch.cuda.memory._record_memory_history(enabled=None)
+            # xt: model input produced by noise_scaling from the batch sigmas, noise and latents.
+            xt = model_wrap.inner_model.model_sampling.noise_scaling(
+                batch_sigmas,
+                batch_noise,
+                batch_latent,
+                False
+            )
+            # x0: the same call with zero sigmas and zero noise; used as the loss target below.
+            x0 = model_wrap.inner_model.model_sampling.noise_scaling(
+                torch.zeros_like(batch_sigmas),
+                torch.zeros_like(batch_noise),
+                batch_latent,
+                False
+            )
+
+            # NOTE(review): conds["positive"] is overwritten with the batch subset and is
+            # not restored to `cond` after the loop - confirm callers do not rely on it.
+            model_wrap.conds["positive"] = [
+                cond[i] for i in indicies
+            ]
+            batch_extra_args = make_batch_extra_option_dict(extra_args, indicies, full_size=dataset_size)
+
+            with torch.autocast(xt.device.type, dtype=self.training_dtype):
+                x0_pred = model_wrap(xt, batch_sigmas, **batch_extra_args)
+                loss = self.loss_fn(x0_pred, x0)
+            loss.backward()
+            if self.loss_callback:
+                self.loss_callback(loss.item())
+            pbar.set_postfix({"loss": f"{loss.item():.4f}"})
+
+            # Gradients are cleared after each step, ready for the next iteration.
+            self.optimizer.step()
+            self.optimizer.zero_grad()
+        torch.cuda.empty_cache()
+        # The return value is a zero tensor shaped like latent_image.
        return torch.zeros_like(latent_image)


@ -75,7 +115,7 @@ class BiasDiff(torch.nn.Module):
        return self.passive_memory_usage()


-def load_and_process_images(image_files, input_dir, resize_method="None"):
+def load_and_process_images(image_files, input_dir, resize_method="None", w=None, h=None):
    """Utility function to load and process a list of images.

    Args:
@ -90,7 +130,6 @@ def load_and_process_images(image_files, input_dir, resize_method="None"):
        raise ValueError("No valid images found in input")

    output_images = []
-    w, h = None, None

    for file in image_files:
        image_path = os.path.join(input_dir, file)
@ -206,6 +245,103 @@ class LoadImageSetFromFolderNode:
        return (output_tensor,)


+class LoadImageTextSetFromFolderNode:
+    """Node that loads an image dataset together with per-image captions.
+
+    Images are collected from the chosen input subfolder and from its direct
+    subdirectories. A subdirectory named ``<number>_<anything>`` has its
+    images repeated ``<number>`` times (kohya-ss/sd-scripts layout). Each
+    image's caption is read from a ``.txt`` file with the same stem; a missing
+    or empty caption file yields the encoding of the empty prompt.
+    """
+
+    @classmethod
+    def INPUT_TYPES(s):
+        return {
+            "required": {
+                "folder": (folder_paths.get_input_subfolders(), {"tooltip": "The folder to load images from."}),
+                "clip": (IO.CLIP, {"tooltip": "The CLIP model used for encoding the text."}),
+            },
+            "optional": {
+                "resize_method": (
+                    ["None", "Stretch", "Crop", "Pad"],
+                    {"default": "None"},
+                ),
+                "width": (
+                    IO.INT,
+                    {
+                        "default": -1,
+                        "min": -1,
+                        "max": 10000,
+                        "step": 1,
+                        "tooltip": "The width to resize the images to. -1 means use the original width.",
+                    },
+                ),
+                "height": (
+                    IO.INT,
+                    {
+                        "default": -1,
+                        "min": -1,
+                        "max": 10000,
+                        "step": 1,
+                        "tooltip": "The height to resize the images to. -1 means use the original height.",
+                    },
+                )
+            },
+        }
+
+    RETURN_TYPES = ("IMAGE", IO.CONDITIONING,)
+    FUNCTION = "load_images"
+    CATEGORY = "loaders"
+    EXPERIMENTAL = True
+    DESCRIPTION = "Loads a batch of images and caption from a directory for training."
+
+    def load_images(self, folder, clip, resize_method="None", width=None, height=None):
+        """Load the images of ``folder`` and encode one caption per image.
+
+        Args:
+            folder: Subfolder of the input directory to scan.
+            clip: CLIP model used to tokenize and encode the captions.
+            resize_method: Forwarded to ``load_and_process_images``. It is an
+                optional input, so it defaults to ``"None"`` when omitted.
+            width: Target width; ``-1`` or ``None`` is forwarded as ``None``.
+            height: Target height; ``-1`` or ``None`` is forwarded as ``None``.
+
+        Returns:
+            ``(images, conditioning)``. The conditioning list is built by
+            extending it with the encoder output for every caption, in the
+            same order as the images.
+
+        Raises:
+            RuntimeError: If ``clip`` is ``None``.
+        """
+        if clip is None:
+            raise RuntimeError("ERROR: clip input is invalid: None\n\nIf the clip is from a checkpoint loader node your checkpoint does not contain a valid clip or text encoder model.")
+
+        logging.info(f"Loading images from folder: {folder}")
+
+        sub_input_dir = os.path.join(folder_paths.get_input_directory(), folder)
+        valid_extensions = (".png", ".jpg", ".jpeg", ".webp")
+
+        image_files = []
+        for item in os.listdir(sub_input_dir):
+            path = os.path.join(sub_input_dir, item)
+            if item.lower().endswith(valid_extensions):
+                image_files.append(path)
+            elif os.path.isdir(path):
+                # Support kohya-ss/sd-scripts folder structure
+                repeat = 1
+                prefix = item.split("_")[0]
+                if prefix.isdigit():
+                    repeat = int(prefix)
+                image_files.extend([
+                    os.path.join(path, f) for f in os.listdir(path) if f.lower().endswith(valid_extensions)
+                ] * repeat)
+
+        captions = []
+        for image_file in image_files:
+            # Swap only the final extension. Replacing the extension text
+            # anywhere in the path would also rewrite a directory name that
+            # happens to contain it (e.g. "set.png/a.png").
+            caption_file = os.path.splitext(image_file)[0] + ".txt"
+            caption_path = os.path.join(sub_input_dir, caption_file)
+            if os.path.exists(caption_path):
+                with open(caption_path, "r", encoding="utf-8") as f:
+                    captions.append(f.read().strip())
+            else:
+                captions.append("")
+
+        width = width if width != -1 else None
+        height = height if height != -1 else None
+        output_tensor = load_and_process_images(image_files, sub_input_dir, resize_method, width, height)
+
+        logging.info(f"Loaded {len(output_tensor)} images from {sub_input_dir}.")
+
+        logging.info(f"Encoding captions from {sub_input_dir}.")
+        conditions = []
+        empty_cond = clip.encode_from_tokens_scheduled(clip.tokenize(""))
+        for text in captions:
+            if text == "":
+                # Reuse the pre-computed empty-prompt encoding. It must be
+                # merged with extend() like every other caption, and the
+                # caption must not be encoded a second time, otherwise the
+                # number of conditioning entries stops matching the images.
+                conditions.extend(empty_cond)
+                continue
+            tokens = clip.tokenize(text)
+            conditions.extend(clip.encode_from_tokens_scheduled(tokens))
+        logging.info(f"Encoded {len(conditions)} captions from {sub_input_dir}.")
+        return (output_tensor, conditions)
+
+
 def draw_loss_graph(loss_map, steps):
    width, height = 500, 300
    img = Image.new("RGB", (width, height), "white")
@ -381,6 +517,13 @@ class TrainLoraNode:

        latents = latents["samples"].to(dtype)
        num_images = latents.shape[0]
+        logging.info(f"Total Images: {num_images}, Total Captions: {len(positive)}")
+        if len(positive) == 1 and num_images > 1:
+            positive = positive * num_images
+        elif len(positive) != num_images:
+            raise ValueError(
+                f"Number of positive conditions ({len(positive)}) does not match number of images ({num_images})."
+            )

        with torch.inference_mode(False):
            lora_sd = {}
@ -474,43 +617,41 @@ class TrainLoraNode:
            # setup models
            for m in find_all_highest_child_module_with_forward(mp.model.diffusion_model):
                patch(m)
+            mp.model.requires_grad_(False)
            comfy.model_management.load_models_gpu([mp], memory_required=1e20, force_full_load=True)

            # Setup sampler and guider like in test script
            loss_map = {"loss": []}
            def loss_callback(loss):
                loss_map["loss"].append(loss)
-                pbar.set_postfix({"loss": f"{loss:.4f}"})
            train_sampler = TrainSampler(
-                criterion, optimizer, loss_callback=loss_callback
+                criterion,
+                optimizer,
+                loss_callback=loss_callback,
+                batch_size=batch_size,
+                total_steps=steps,
+                seed=seed,
+                training_dtype=dtype
            )
            guider = comfy_extras.nodes_custom_sampler.Guider_Basic(mp)
            guider.set_conds(positive)  # Set conditioning from input
-            ss = comfy_extras.nodes_custom_sampler.SamplerCustomAdvanced()
-
-            # yoland: this currently resize to the first image in the dataset

            # Training loop
-            torch.cuda.empty_cache()
            try:
-                for step in (pbar:=tqdm.trange(steps, desc="Training LoRA", smoothing=0.01, disable=not comfy.utils.PROGRESS_BAR_ENABLED)):
-                    # Generate random sigma
-                    sigma = mp.model.model_sampling.percent_to_sigma(
-                        torch.rand((1,)).item()
-                    )
-                    sigma = torch.tensor([sigma])
-
-                    noise = comfy_extras.nodes_custom_sampler.Noise_RandomNoise(step * 1000 + seed)
-
-                    indices = torch.randperm(num_images)[:batch_size]
-                    ss.sample(
-                        noise, guider, train_sampler, sigma, {"samples": latents[indices].clone()}
-                    )
+                # Generate dummy sigmas and noise
+                sigmas = torch.tensor(range(num_images))
+                noise = comfy_extras.nodes_custom_sampler.Noise_RandomNoise(seed)
+                guider.sample(
+                    noise.generate_noise({"samples": latents}),
+                    latents,
+                    train_sampler,
+                    sigmas,
+                    seed=noise.seed
+                )
            finally:
                for m in mp.model.modules():
                    unpatch(m)
-            del ss, train_sampler, optimizer
-            torch.cuda.empty_cache()
+            del train_sampler, optimizer

            for adapter in all_weight_adapters:
                adapter.requires_grad_(False)
@ -697,6 +838,7 @@ NODE_CLASS_MAPPINGS = {
    "SaveLoRANode": SaveLoRA,
    "LoraModelLoader": LoraModelLoader,
    "LoadImageSetFromFolderNode": LoadImageSetFromFolderNode,
+    "LoadImageTextSetFromFolderNode": LoadImageTextSetFromFolderNode,
    "LossGraphNode": LossGraphNode,
 }

@ -705,5 +847,6 @@ NODE_DISPLAY_NAME_MAPPINGS = {
    "SaveLoRANode": "Save LoRA Weights",
    "LoraModelLoader": "Load LoRA Model",
    "LoadImageSetFromFolderNode": "Load Image Dataset from Folder",
+    "LoadImageTextSetFromFolderNode": "Load Image and Text Dataset from Folder",
    "LossGraphNode": "Plot Loss Graph",
 }
--- a/comfyui_version.py
+++ b/comfyui_version.py
@ -1,3 +1,3 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.3.43"
+__version__ = "0.3.44"
--- a/execution.py
+++ b/execution.py
@ -8,12 +8,14 @@ import time
 import traceback
 from enum import Enum
 from typing import List, Literal, NamedTuple, Optional
+import asyncio

 import torch

 import comfy.model_management
 import nodes
 from comfy_execution.caching import (
+    BasicCache,
    CacheKeySetID,
    CacheKeySetInputSignature,
    DependencyAwareCache,
@ -28,6 +30,8 @@ from comfy_execution.graph import (
 )
 from comfy_execution.graph_utils import GraphBuilder, is_link
 from comfy_execution.validation import validate_node_input
+from comfy_execution.progress import get_progress_state, reset_progress_state, add_progress_handler, WebUIProgressHandler
+from comfy_execution.utils import CurrentNodeContext


 class ExecutionResult(Enum):
@ -39,12 +43,13 @@ class DuplicateNodeError(Exception):
    pass

 class IsChangedCache:
-    def __init__(self, dynprompt, outputs_cache):
+    def __init__(self, prompt_id: str, dynprompt: DynamicPrompt, outputs_cache: BasicCache):
+        self.prompt_id = prompt_id
        self.dynprompt = dynprompt
        self.outputs_cache = outputs_cache
        self.is_changed = {}

-    def get(self, node_id):
+    async def get(self, node_id):
        if node_id in self.is_changed:
            return self.is_changed[node_id]

@ -62,7 +67,8 @@ class IsChangedCache:
        # Intentionally do not use cached outputs here. We only want constants in IS_CHANGED
        input_data_all, _ = get_input_data(node["inputs"], class_def, node_id, None)
        try:
-            is_changed = _map_node_over_list(class_def, input_data_all, "IS_CHANGED")
+            is_changed = await _async_map_node_over_list(self.prompt_id, node_id, class_def, input_data_all, "IS_CHANGED")
+            is_changed = await resolve_map_node_over_list_results(is_changed)
            node["is_changed"] = [None if isinstance(x, ExecutionBlocker) else x for x in is_changed]
        except Exception as e:
            logging.warning("WARNING: {}".format(e))
@ -164,7 +170,19 @@ def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, e

 map_node_over_list = None #Don't hook this please

-def _map_node_over_list(obj, input_data_all, func, allow_interrupt=False, execution_block_cb=None, pre_execute_cb=None):
+async def resolve_map_node_over_list_results(results):
+    """Turn a list that may contain asyncio tasks into a list of plain values.
+
+    Unfinished tasks are awaited together. If any of those raised, its
+    exception is re-raised. Otherwise each task is replaced by its result
+    and non-task items are returned unchanged, in the original order.
+    """
+    unfinished = [item for item in results if isinstance(item, asyncio.Task) and not item.done()]
+    if len(unfinished) != 0:
+        finished, _still_pending = await asyncio.wait(unfinished)
+        for finished_task in finished:
+            error = finished_task.exception()
+            if error is not None:
+                raise error
+    return [item.result() if isinstance(item, asyncio.Task) else item for item in results]
+
+async def _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, func, allow_interrupt=False, execution_block_cb=None, pre_execute_cb=None):
    # check if node wants the lists
    input_is_list = getattr(obj, "INPUT_IS_LIST", False)

@ -178,7 +196,7 @@ def _map_node_over_list(obj, input_data_all, func, allow_interrupt=False, execut
        return {k: v[i if len(v) > i else -1] for k, v in d.items()}

    results = []
-    def process_inputs(inputs, index=None, input_is_list=False):
+    async def process_inputs(inputs, index=None, input_is_list=False):
        if allow_interrupt:
            nodes.before_node_execution()
        execution_block = None
@ -194,20 +212,37 @@ def _map_node_over_list(obj, input_data_all, func, allow_interrupt=False, execut
        if execution_block is None:
            if pre_execute_cb is not None and index is not None:
                pre_execute_cb(index)
-            results.append(getattr(obj, func)(**inputs))
+            f = getattr(obj, func)
+            if inspect.iscoroutinefunction(f):
+                async def async_wrapper(f, prompt_id, unique_id, list_index, args):
+                    with CurrentNodeContext(prompt_id, unique_id, list_index):
+                        return await f(**args)
+                task = asyncio.create_task(async_wrapper(f, prompt_id, unique_id, index, args=inputs))
+                # Give the task a chance to execute without yielding
+                await asyncio.sleep(0)
+                if task.done():
+                    result = task.result()
+                    results.append(result)
+                else:
+                    results.append(task)
+            else:
+                with CurrentNodeContext(prompt_id, unique_id, index):
+                    result = f(**inputs)
+                results.append(result)
        else:
            results.append(execution_block)

    if input_is_list:
-        process_inputs(input_data_all, 0, input_is_list=input_is_list)
+        await process_inputs(input_data_all, 0, input_is_list=input_is_list)
    elif max_len_input == 0:
-        process_inputs({})
+        await process_inputs({})
    else:
        for i in range(max_len_input):
            input_dict = slice_dict(input_data_all, i)
-            process_inputs(input_dict, i)
+            await process_inputs(input_dict, i)
    return results

+
 def merge_result_data(results, obj):
    # check which outputs need concatenating
    output = []
@ -229,11 +264,18 @@ def merge_result_data(results, obj):
            output.append([o[i] for o in results])
    return output

-def get_output_data(obj, input_data_all, execution_block_cb=None, pre_execute_cb=None):
+async def get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=None, pre_execute_cb=None):
+    """Run the node's FUNCTION over its inputs and merge the results.
+
+    Returns a 4-tuple ``(output, ui, has_subgraph, has_pending_task)``. When
+    any returned item is an asyncio task that has not finished, the raw
+    return values are handed back unmerged as ``output`` with an empty ui
+    dict, ``has_subgraph`` False and ``has_pending_task`` True.
+    """
+    raw_returns = await _async_map_node_over_list(
+        prompt_id,
+        unique_id,
+        obj,
+        input_data_all,
+        obj.FUNCTION,
+        allow_interrupt=True,
+        execution_block_cb=execution_block_cb,
+        pre_execute_cb=pre_execute_cb,
+    )
+    for item in raw_returns:
+        if isinstance(item, asyncio.Task) and not item.done():
+            # At least one call is still running: merging has to wait
+            return raw_returns, {}, False, True
+    merged_output, merged_ui, expanded = get_output_from_returns(raw_returns, obj)
+    return merged_output, merged_ui, expanded, False
+
+def get_output_from_returns(return_values, obj):
    results = []
    uis = []
    subgraph_results = []
-    return_values = _map_node_over_list(obj, input_data_all, obj.FUNCTION, allow_interrupt=True, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb)
    has_subgraph = False
    for i in range(len(return_values)):
        r = return_values[i]
@ -267,6 +309,10 @@ def get_output_data(obj, input_data_all, execution_block_cb=None, pre_execute_cb
    else:
        output = []
    ui = dict()
+    # TODO: Think there's an existing bug here
+    # If we're performing a subgraph expansion, we probably shouldn't be returning UI values yet.
+    # They'll get cached without the completed subgraphs. It's an edge case and I'm not aware of
+    # any nodes that use both subgraph expansion and custom UI outputs, but might be a problem in the future.
    if len(uis) > 0:
        ui = {k: [y for x in uis for y in x[k]] for k in uis[0].keys()}
    return output, ui, has_subgraph
@ -279,7 +325,7 @@ def format_value(x):
    else:
        return str(x)

-def execute(server, dynprompt, caches, current_item, extra_data, executed, prompt_id, execution_list, pending_subgraph_results):
+async def execute(server, dynprompt, caches, current_item, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes):
    unique_id = current_item
    real_node_id = dynprompt.get_real_node_id(unique_id)
    display_node_id = dynprompt.get_display_node_id(unique_id)
@ -291,11 +337,26 @@ def execute(server, dynprompt, caches, current_item, extra_data, executed, promp
        if server.client_id is not None:
            cached_output = caches.ui.get(unique_id) or {}
            server.send_sync("executed", { "node": unique_id, "display_node": display_node_id, "output": cached_output.get("output",None), "prompt_id": prompt_id }, server.client_id)
+        get_progress_state().finish_progress(unique_id)
        return (ExecutionResult.SUCCESS, None, None)

    input_data_all = None
    try:
-        if unique_id in pending_subgraph_results:
+        if unique_id in pending_async_nodes:
+            results = []
+            for r in pending_async_nodes[unique_id]:
+                if isinstance(r, asyncio.Task):
+                    try:
+                        results.append(r.result())
+                    except Exception as ex:
+                        # An async task failed - propagate the exception up
+                        del pending_async_nodes[unique_id]
+                        raise ex
+                else:
+                    results.append(r)
+            del pending_async_nodes[unique_id]
+            output_data, output_ui, has_subgraph = get_output_from_returns(results, class_def)
+        elif unique_id in pending_subgraph_results:
            cached_results = pending_subgraph_results[unique_id]
            resolved_outputs = []
            for is_subgraph, result in cached_results:
@ -317,6 +378,7 @@ def execute(server, dynprompt, caches, current_item, extra_data, executed, promp
            output_ui = []
            has_subgraph = False
        else:
+            get_progress_state().start_progress(unique_id)
            input_data_all, missing_keys = get_input_data(inputs, class_def, unique_id, caches.outputs, dynprompt, extra_data)
            if server.client_id is not None:
                server.last_node_id = display_node_id
@ -328,7 +390,8 @@ def execute(server, dynprompt, caches, current_item, extra_data, executed, promp
                caches.objects.set(unique_id, obj)

            if hasattr(obj, "check_lazy_status"):
-                required_inputs = _map_node_over_list(obj, input_data_all, "check_lazy_status", allow_interrupt=True)
+                required_inputs = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, "check_lazy_status", allow_interrupt=True)
+                required_inputs = await resolve_map_node_over_list_results(required_inputs)
                required_inputs = set(sum([r for r in required_inputs if isinstance(r,list)], []))
                required_inputs = [x for x in required_inputs if isinstance(x,str) and (
                    x not in input_data_all or x in missing_keys
@ -357,8 +420,18 @@ def execute(server, dynprompt, caches, current_item, extra_data, executed, promp
                else:
                    return block
            def pre_execute_cb(call_index):
+                # TODO - How to handle this with async functions without contextvars (which requires Python 3.12)?
                GraphBuilder.set_default_prefix(unique_id, call_index, 0)
-            output_data, output_ui, has_subgraph = get_output_data(obj, input_data_all, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb)
+            output_data, output_ui, has_subgraph, has_pending_tasks = await get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb)
+            if has_pending_tasks:
+                pending_async_nodes[unique_id] = output_data
+                unblock = execution_list.add_external_block(unique_id)
+                async def await_completion():
+                    tasks = [x for x in output_data if isinstance(x, asyncio.Task)]
+                    await asyncio.gather(*tasks, return_exceptions=True)
+                    unblock()
+                asyncio.create_task(await_completion())
+                return (ExecutionResult.PENDING, None, None)
        if len(output_ui) > 0:
            caches.ui.set(unique_id, {
                "meta": {
@ -401,7 +474,8 @@ def execute(server, dynprompt, caches, current_item, extra_data, executed, promp
                    cached_outputs.append((True, node_outputs))
            new_node_ids = set(new_node_ids)
            for cache in caches.all:
-                cache.ensure_subcache_for(unique_id, new_node_ids).clean_unused()
+                subcache = await cache.ensure_subcache_for(unique_id, new_node_ids)
+                subcache.clean_unused()
            for node_id in new_output_ids:
                execution_list.add_node(node_id)
            for link in new_output_links:
@ -446,6 +520,7 @@ def execute(server, dynprompt, caches, current_item, extra_data, executed, promp

        return (ExecutionResult.FAILURE, error_details, ex)

+    get_progress_state().finish_progress(unique_id)
    executed.add(unique_id)

    return (ExecutionResult.SUCCESS, None, None)
@ -500,6 +575,11 @@ class PromptExecutor:
            self.add_message("execution_error", mes, broadcast=False)

    def execute(self, prompt, prompt_id, extra_data={}, execute_outputs=[]):
+        asyncio_loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(asyncio_loop)
+        asyncio.run(self.execute_async(prompt, prompt_id, extra_data, execute_outputs))
+
+    async def execute_async(self, prompt, prompt_id, extra_data={}, execute_outputs=[]):
        nodes.interrupt_processing(False)

        if "client_id" in extra_data:
@ -512,9 +592,11 @@ class PromptExecutor:

        with torch.inference_mode():
            dynamic_prompt = DynamicPrompt(prompt)
-            is_changed_cache = IsChangedCache(dynamic_prompt, self.caches.outputs)
+            reset_progress_state(prompt_id, dynamic_prompt)
+            add_progress_handler(WebUIProgressHandler(self.server))
+            is_changed_cache = IsChangedCache(prompt_id, dynamic_prompt, self.caches.outputs)
            for cache in self.caches.all:
-                cache.set_prompt(dynamic_prompt, prompt.keys(), is_changed_cache)
+                await cache.set_prompt(dynamic_prompt, prompt.keys(), is_changed_cache)
                cache.clean_unused()

            cached_nodes = []
@ -527,6 +609,7 @@ class PromptExecutor:
                          { "nodes": cached_nodes, "prompt_id": prompt_id},
                          broadcast=False)
            pending_subgraph_results = {}
+            pending_async_nodes = {} # TODO - Unify this with pending_subgraph_results
            executed = set()
            execution_list = ExecutionList(dynamic_prompt, self.caches.outputs)
            current_outputs = self.caches.outputs.all_node_ids()
@ -534,12 +617,13 @@ class PromptExecutor:
                execution_list.add_node(node_id)

            while not execution_list.is_empty():
-                node_id, error, ex = execution_list.stage_node_execution()
+                node_id, error, ex = await execution_list.stage_node_execution()
                if error is not None:
                    self.handle_execution_error(prompt_id, dynamic_prompt.original_prompt, current_outputs, executed, error, ex)
                    break

-                result, error, ex = execute(self.server, dynamic_prompt, self.caches, node_id, extra_data, executed, prompt_id, execution_list, pending_subgraph_results)
+                assert node_id is not None, "Node ID should not be None at this point"
+                result, error, ex = await execute(self.server, dynamic_prompt, self.caches, node_id, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes)
                self.success = result != ExecutionResult.FAILURE
                if result == ExecutionResult.FAILURE:
                    self.handle_execution_error(prompt_id, dynamic_prompt.original_prompt, current_outputs, executed, error, ex)
@ -569,7 +653,7 @@ class PromptExecutor:
                comfy.model_management.unload_all_models()


-def validate_inputs(prompt, item, validated):
+async def validate_inputs(prompt_id, prompt, item, validated):
    unique_id = item
    if unique_id in validated:
        return validated[unique_id]
@ -646,7 +730,7 @@ def validate_inputs(prompt, item, validated):
                errors.append(error)
                continue
            try:
-                r = validate_inputs(prompt, o_id, validated)
+                r = await validate_inputs(prompt_id, prompt, o_id, validated)
                if r[0] is False:
                    # `r` will be set in `validated[o_id]` already
                    valid = False
@ -771,7 +855,8 @@ def validate_inputs(prompt, item, validated):
            input_filtered['input_types'] = [received_types]

        #ret = obj_class.VALIDATE_INPUTS(**input_filtered)
-        ret = _map_node_over_list(obj_class, input_filtered, "VALIDATE_INPUTS")
+        ret = await _async_map_node_over_list(prompt_id, unique_id, obj_class, input_filtered, "VALIDATE_INPUTS")
+        ret = await resolve_map_node_over_list_results(ret)
        for x in input_filtered:
            for i, r in enumerate(ret):
                if r is not True and not isinstance(r, ExecutionBlocker):
@ -804,7 +889,7 @@ def full_type_name(klass):
        return klass.__qualname__
    return module + '.' + klass.__qualname__

-def validate_prompt(prompt):
+async def validate_prompt(prompt_id, prompt):
    outputs = set()
    for x in prompt:
        if 'class_type' not in prompt[x]:
@ -847,7 +932,7 @@ def validate_prompt(prompt):
        valid = False
        reasons = []
        try:
-            m = validate_inputs(prompt, o, validated)
+            m = await validate_inputs(prompt_id, prompt, o, validated)
            valid = m[0]
            reasons = m[1]
        except Exception as ex:
--- a/main.py
+++ b/main.py
@ -11,6 +11,9 @@ import itertools
 import utils.extra_config
 import logging
 import sys
+from comfy_execution.progress import get_progress_state
+from comfy_execution.utils import get_executing_context
+from comfy_api import feature_flags

 if not args.disable_manager:
    import comfyui_manager
@ -139,11 +142,14 @@ if __name__ == "__main__":

    import cuda_malloc

+if 'torch' in sys.modules:
+    logging.warning("WARNING: Potential Error in code: Torch already imported, torch should never be imported before this point.")
+
 import comfy.utils

 import execution
 import server
-from server import BinaryEventTypes
+from protocol import BinaryEventTypes
 import nodes
 import comfy.model_management
 import comfyui_version
@ -239,15 +245,34 @@ async def run(server_instance, address='', port=8188, verbose=True, call_on_star
        server_instance.start_multi_address(addresses, call_on_start, verbose), server_instance.publish_loop()
    )

-
 def hijack_progress(server_instance):
+    # Installs `hook` as the global progress-bar hook (see the final statement).
+    # `server_instance` is used to send the progress and preview messages.
-    def hook(value, total, preview_image):
+    def hook(value, total, preview_image, prompt_id=None, node_id=None):
+        # prompt_id / node_id are resolved in order of precedence: explicit
+        # argument, then the current executing context (when there is one),
+        # then server_instance.last_prompt_id / last_node_id further below.
+        executing_context = get_executing_context()
+        if prompt_id is None and executing_context is not None:
+            prompt_id = executing_context.prompt_id
+        if node_id is None and executing_context is not None:
+            node_id = executing_context.node_id
        comfy.model_management.throw_exception_if_processing_interrupted()
-        progress = {"value": value, "max": total, "prompt_id": server_instance.last_prompt_id, "node": server_instance.last_node_id}
+        if prompt_id is None:
+            prompt_id = server_instance.last_prompt_id
+        if node_id is None:
+            node_id = server_instance.last_node_id
+        progress = {"value": value, "max": total, "prompt_id": prompt_id, "node": node_id}
+        # Record the update in the progress state in addition to the "progress"
+        # message sent below.
+        get_progress_state().update_progress(node_id, value, total, preview_image)

        server_instance.send_sync("progress", progress, server_instance.client_id)
        if preview_image is not None:
-            server_instance.send_sync(BinaryEventTypes.UNENCODED_PREVIEW_IMAGE, preview_image, server_instance.client_id)
+            # Only send old method if client doesn't support preview metadata
+            if not feature_flags.supports_feature(
+                server_instance.sockets_metadata,
+                server_instance.client_id,
+                "supports_preview_metadata",
+            ):
+                server_instance.send_sync(
+                    BinaryEventTypes.UNENCODED_PREVIEW_IMAGE,
+                    preview_image,
+                    server_instance.client_id,
+                )

    comfy.utils.set_progress_bar_global_hook(hook)

--- a/nodes.py
+++ b/nodes.py
@ -2319,6 +2319,7 @@ def init_builtin_api_nodes():
        "nodes_pika.py",
        "nodes_runway.py",
        "nodes_tripo.py",
+        "nodes_moonvalley.py",
        "nodes_rodin.py",
        "nodes_gemini.py",
    ]
--- a/notebooks/comfyui_colab.ipynb
+++ b/notebooks/comfyui_colab.ipynb
@ -1,322 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "aaaaaaaaaa"
-   },
-   "source": [
-    "Git clone the repo and install the requirements. (ignore the pip errors about protobuf)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "bbbbbbbbbb"
-   },
-   "outputs": [],
-   "source": [
-    "#@title Environment Setup\n",
-    "\n",
-    "\n",
-    "OPTIONS = {}\n",
-    "\n",
-    "USE_GOOGLE_DRIVE = False  #@param {type:\"boolean\"}\n",
-    "UPDATE_COMFY_UI = True  #@param {type:\"boolean\"}\n",
-    "WORKSPACE = 'ComfyUI'\n",
-    "OPTIONS['USE_GOOGLE_DRIVE'] = USE_GOOGLE_DRIVE\n",
-    "OPTIONS['UPDATE_COMFY_UI'] = UPDATE_COMFY_UI\n",
-    "\n",
-    "if OPTIONS['USE_GOOGLE_DRIVE']:\n",
-    "    !echo \"Mounting Google Drive...\"\n",
-    "    %cd /\n",
-    "    \n",
-    "    from google.colab import drive\n",
-    "    drive.mount('/content/drive')\n",
-    "\n",
-    "    WORKSPACE = \"/content/drive/MyDrive/ComfyUI\"\n",
-    "    %cd /content/drive/MyDrive\n",
-    "\n",
-    "![ ! -d $WORKSPACE ] && echo -= Initial setup ComfyUI =- && git clone https://github.com/comfyanonymous/ComfyUI\n",
-    "%cd $WORKSPACE\n",
-    "\n",
-    "if OPTIONS['UPDATE_COMFY_UI']:\n",
-    "  !echo -= Updating ComfyUI =-\n",
-    "  !git pull\n",
-    "\n",
-    "!echo -= Install dependencies =-\n",
-    "!pip install xformers!=0.0.18 -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu121 --extra-index-url https://download.pytorch.org/whl/cu118 --extra-index-url https://download.pytorch.org/whl/cu117"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "cccccccccc"
-   },
-   "source": [
-    "Download some models/checkpoints/vae or custom comfyui nodes (uncomment the commands for the ones you want)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "dddddddddd"
-   },
-   "outputs": [],
-   "source": [
-    "# Checkpoints\n",
-    "\n",
-    "### SDXL\n",
-    "### I recommend these workflow examples: https://comfyanonymous.github.io/ComfyUI_examples/sdxl/\n",
-    "\n",
-    "#!wget -c https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors -P ./models/checkpoints/\n",
-    "#!wget -c https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0/resolve/main/sd_xl_refiner_1.0.safetensors -P ./models/checkpoints/\n",
-    "\n",
-    "# SDXL ReVision\n",
-    "#!wget -c https://huggingface.co/comfyanonymous/clip_vision_g/resolve/main/clip_vision_g.safetensors -P ./models/clip_vision/\n",
-    "\n",
-    "# SD1.5\n",
-    "!wget -c https://huggingface.co/Comfy-Org/stable-diffusion-v1-5-archive/resolve/main/v1-5-pruned-emaonly-fp16.safetensors -P ./models/checkpoints/\n",
-    "\n",
-    "# SD2\n",
-    "#!wget -c https://huggingface.co/stabilityai/stable-diffusion-2-1-base/resolve/main/v2-1_512-ema-pruned.safetensors -P ./models/checkpoints/\n",
-    "#!wget -c https://huggingface.co/stabilityai/stable-diffusion-2-1/resolve/main/v2-1_768-ema-pruned.safetensors -P ./models/checkpoints/\n",
-    "\n",
-    "# Some SD1.5 anime style\n",
-    "#!wget -c https://huggingface.co/WarriorMama777/OrangeMixs/resolve/main/Models/AbyssOrangeMix2/AbyssOrangeMix2_hard.safetensors -P ./models/checkpoints/\n",
-    "#!wget -c https://huggingface.co/WarriorMama777/OrangeMixs/resolve/main/Models/AbyssOrangeMix3/AOM3A1_orangemixs.safetensors -P ./models/checkpoints/\n",
-    "#!wget -c https://huggingface.co/WarriorMama777/OrangeMixs/resolve/main/Models/AbyssOrangeMix3/AOM3A3_orangemixs.safetensors -P ./models/checkpoints/\n",
-    "#!wget -c https://huggingface.co/Linaqruf/anything-v3.0/resolve/main/anything-v3-fp16-pruned.safetensors -P ./models/checkpoints/\n",
-    "\n",
-    "# Waifu Diffusion 1.5 (anime style SD2.x 768-v)\n",
-    "#!wget -c https://huggingface.co/waifu-diffusion/wd-1-5-beta3/resolve/main/wd-illusion-fp16.safetensors -P ./models/checkpoints/\n",
-    "\n",
-    "\n",
-    "# unCLIP models\n",
-    "#!wget -c https://huggingface.co/comfyanonymous/illuminatiDiffusionV1_v11_unCLIP/resolve/main/illuminatiDiffusionV1_v11-unclip-h-fp16.safetensors -P ./models/checkpoints/\n",
-    "#!wget -c https://huggingface.co/comfyanonymous/wd-1.5-beta2_unCLIP/resolve/main/wd-1-5-beta2-aesthetic-unclip-h-fp16.safetensors -P ./models/checkpoints/\n",
-    "\n",
-    "\n",
-    "# VAE\n",
-    "!wget -c https://huggingface.co/stabilityai/sd-vae-ft-mse-original/resolve/main/vae-ft-mse-840000-ema-pruned.safetensors -P ./models/vae/\n",
-    "#!wget -c https://huggingface.co/WarriorMama777/OrangeMixs/resolve/main/VAEs/orangemix.vae.pt -P ./models/vae/\n",
-    "#!wget -c https://huggingface.co/hakurei/waifu-diffusion-v1-4/resolve/main/vae/kl-f8-anime2.ckpt -P ./models/vae/\n",
-    "\n",
-    "\n",
-    "# Loras\n",
-    "#!wget -c https://civitai.com/api/download/models/10350 -O ./models/loras/theovercomer8sContrastFix_sd21768.safetensors #theovercomer8sContrastFix SD2.x 768-v\n",
-    "#!wget -c https://civitai.com/api/download/models/10638 -O ./models/loras/theovercomer8sContrastFix_sd15.safetensors #theovercomer8sContrastFix SD1.x\n",
-    "#!wget -c https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_offset_example-lora_1.0.safetensors -P ./models/loras/ #SDXL offset noise lora\n",
-    "\n",
-    "\n",
-    "# T2I-Adapter\n",
-    "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_depth_sd14v1.pth -P ./models/controlnet/\n",
-    "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_seg_sd14v1.pth -P ./models/controlnet/\n",
-    "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_sketch_sd14v1.pth -P ./models/controlnet/\n",
-    "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_keypose_sd14v1.pth -P ./models/controlnet/\n",
-    "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_openpose_sd14v1.pth -P ./models/controlnet/\n",
-    "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_color_sd14v1.pth -P ./models/controlnet/\n",
-    "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_canny_sd14v1.pth -P ./models/controlnet/\n",
-    "\n",
-    "# T2I Styles Model\n",
-    "#!wget -c https://huggingface.co/TencentARC/T2I-Adapter/resolve/main/models/t2iadapter_style_sd14v1.pth -P ./models/style_models/\n",
-    "\n",
-    "# CLIPVision model (needed for styles model)\n",
-    "#!wget -c https://huggingface.co/openai/clip-vit-large-patch14/resolve/main/pytorch_model.bin -O ./models/clip_vision/clip_vit14.bin\n",
-    "\n",
-    "\n",
-    "# ControlNet\n",
-    "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11e_sd15_ip2p_fp16.safetensors -P ./models/controlnet/\n",
-    "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11e_sd15_shuffle_fp16.safetensors -P ./models/controlnet/\n",
-    "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_canny_fp16.safetensors -P ./models/controlnet/\n",
-    "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11f1p_sd15_depth_fp16.safetensors -P ./models/controlnet/\n",
-    "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_inpaint_fp16.safetensors -P ./models/controlnet/\n",
-    "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_lineart_fp16.safetensors -P ./models/controlnet/\n",
-    "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_mlsd_fp16.safetensors -P ./models/controlnet/\n",
-    "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_normalbae_fp16.safetensors -P ./models/controlnet/\n",
-    "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_openpose_fp16.safetensors -P ./models/controlnet/\n",
-    "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_scribble_fp16.safetensors -P ./models/controlnet/\n",
-    "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_seg_fp16.safetensors -P ./models/controlnet/\n",
-    "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15_softedge_fp16.safetensors -P ./models/controlnet/\n",
-    "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11p_sd15s2_lineart_anime_fp16.safetensors -P ./models/controlnet/\n",
-    "#!wget -c https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors/resolve/main/control_v11u_sd15_tile_fp16.safetensors -P ./models/controlnet/\n",
-    "\n",
-    "# ControlNet SDXL\n",
-    "#!wget -c https://huggingface.co/stabilityai/control-lora/resolve/main/control-LoRAs-rank256/control-lora-canny-rank256.safetensors -P ./models/controlnet/\n",
-    "#!wget -c https://huggingface.co/stabilityai/control-lora/resolve/main/control-LoRAs-rank256/control-lora-depth-rank256.safetensors -P ./models/controlnet/\n",
-    "#!wget -c https://huggingface.co/stabilityai/control-lora/resolve/main/control-LoRAs-rank256/control-lora-recolor-rank256.safetensors -P ./models/controlnet/\n",
-    "#!wget -c https://huggingface.co/stabilityai/control-lora/resolve/main/control-LoRAs-rank256/control-lora-sketch-rank256.safetensors -P ./models/controlnet/\n",
-    "\n",
-    "# Controlnet Preprocessor nodes by Fannovel16\n",
-    "#!cd custom_nodes && git clone https://github.com/Fannovel16/comfy_controlnet_preprocessors; cd comfy_controlnet_preprocessors && python install.py\n",
-    "\n",
-    "\n",
-    "# GLIGEN\n",
-    "#!wget -c https://huggingface.co/comfyanonymous/GLIGEN_pruned_safetensors/resolve/main/gligen_sd14_textbox_pruned_fp16.safetensors -P ./models/gligen/\n",
-    "\n",
-    "\n",
-    "# ESRGAN upscale model\n",
-    "#!wget -c https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth -P ./models/upscale_models/\n",
-    "#!wget -c https://huggingface.co/sberbank-ai/Real-ESRGAN/resolve/main/RealESRGAN_x2.pth -P ./models/upscale_models/\n",
-    "#!wget -c https://huggingface.co/sberbank-ai/Real-ESRGAN/resolve/main/RealESRGAN_x4.pth -P ./models/upscale_models/\n",
-    "\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "kkkkkkkkkkkkkkk"
-   },
-   "source": [
-    "### Run ComfyUI with cloudflared (Recommended Way)\n",
-    "\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "jjjjjjjjjjjjjj"
-   },
-   "outputs": [],
-   "source": [
-    "!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.deb\n",
-    "!dpkg -i cloudflared-linux-amd64.deb\n",
-    "\n",
-    "import subprocess\n",
-    "import threading\n",
-    "import time\n",
-    "import socket\n",
-    "import urllib.request\n",
-    "\n",
-    "def iframe_thread(port):\n",
-    "  while True:\n",
-    "      time.sleep(0.5)\n",
-    "      sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n",
-    "      result = sock.connect_ex(('127.0.0.1', port))\n",
-    "      if result == 0:\n",
-    "        break\n",
-    "      sock.close()\n",
-    "  print(\"\\nComfyUI finished loading, trying to launch cloudflared (if it gets stuck here cloudflared is having issues)\\n\")\n",
-    "\n",
-    "  p = subprocess.Popen([\"cloudflared\", \"tunnel\", \"--url\", \"http://127.0.0.1:{}\".format(port)], stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n",
-    "  for line in p.stderr:\n",
-    "    l = line.decode()\n",
-    "    if \"trycloudflare.com \" in l:\n",
-    "      print(\"This is the URL to access ComfyUI:\", l[l.find(\"http\"):], end='')\n",
-    "    #print(l, end='')\n",
-    "\n",
-    "\n",
-    "threading.Thread(target=iframe_thread, daemon=True, args=(8188,)).start()\n",
-    "\n",
-    "!python main.py --dont-print-server"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "kkkkkkkkkkkkkk"
-   },
-   "source": [
-    "### Run ComfyUI with localtunnel\n",
-    "\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "jjjjjjjjjjjjj"
-   },
-   "outputs": [],
-   "source": [
-    "!npm install -g localtunnel\n",
-    "\n",
-    "import threading\n",
-    "\n",
-    "def iframe_thread(port):\n",
-    "  while True:\n",
-    "      time.sleep(0.5)\n",
-    "      sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n",
-    "      result = sock.connect_ex(('127.0.0.1', port))\n",
-    "      if result == 0:\n",
-    "        break\n",
-    "      sock.close()\n",
-    "  print(\"\\nComfyUI finished loading, trying to launch localtunnel (if it gets stuck here localtunnel is having issues)\\n\")\n",
-    "\n",
-    "  print(\"The password/enpoint ip for localtunnel is:\", urllib.request.urlopen('https://ipv4.icanhazip.com').read().decode('utf8').strip(\"\\n\"))\n",
-    "  p = subprocess.Popen([\"lt\", \"--port\", \"{}\".format(port)], stdout=subprocess.PIPE)\n",
-    "  for line in p.stdout:\n",
-    "    print(line.decode(), end='')\n",
-    "\n",
-    "\n",
-    "threading.Thread(target=iframe_thread, daemon=True, args=(8188,)).start()\n",
-    "\n",
-    "!python main.py --dont-print-server"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "gggggggggg"
-   },
-   "source": [
-    "### Run ComfyUI with colab iframe (use only in case the previous way with localtunnel doesn't work)\n",
-    "\n",
-    "You should see the ui appear in an iframe. If you get a 403 error, it's your firefox settings or an extension that's messing things up.\n",
-    "\n",
-    "If you want to open it in another window use the link.\n",
-    "\n",
-    "Note that some UI features like live image previews won't work because the colab iframe blocks websockets."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "hhhhhhhhhh"
-   },
-   "outputs": [],
-   "source": [
-    "import threading\n",
-    "def iframe_thread(port):\n",
-    "  while True:\n",
-    "      time.sleep(0.5)\n",
-    "      sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n",
-    "      result = sock.connect_ex(('127.0.0.1', port))\n",
-    "      if result == 0:\n",
-    "        break\n",
-    "      sock.close()\n",
-    "  from google.colab import output\n",
-    "  output.serve_kernel_port_as_iframe(port, height=1024)\n",
-    "  print(\"to open it in a window you can open this link here:\")\n",
-    "  output.serve_kernel_port_as_window(port)\n",
-    "\n",
-    "threading.Thread(target=iframe_thread, daemon=True, args=(8188,)).start()\n",
-    "\n",
-    "!python main.py --dont-print-server"
-   ]
-  }
- ],
- "metadata": {
-  "accelerator": "GPU",
-  "colab": {
-   "provenance": []
-  },
-  "gpuClass": "standard",
-  "kernelspec": {
-   "display_name": "Python 3",
-   "name": "python3"
-  },
-  "language_info": {
-   "name": "python"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
--- a/protocol.py
+++ b/protocol.py
@ -0,0 +1,7 @@
+
+class BinaryEventTypes:
+    # Integer event-type identifiers passed as the `event` argument of the
+    # server's send() / send_bytes() methods.
+
+    # Used by the server when forwarding already-encoded preview image bytes.
+    PREVIEW_IMAGE = 1
+    # send() routes this event to send_image().
+    UNENCODED_PREVIEW_IMAGE = 2
+    TEXT = 3
+    # send() routes this event to send_image_with_metadata(); the data is a
+    # (preview_image, metadata) pair.
+    PREVIEW_IMAGE_WITH_METADATA = 4
+
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [project]
 name = "ComfyUI"
-version = "0.3.43"
+version = "0.3.44"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.9"
--- a/requirements.txt
+++ b/requirements.txt
@ -1,5 +1,5 @@
 comfyui-frontend-package==1.23.4
-comfyui-workflow-templates==0.1.33
+comfyui-workflow-templates==0.1.35
 comfyui-embedded-docs==0.2.4
 comfyui_manager
 torch
--- a/server.py
+++ b/server.py
@ -26,6 +26,7 @@ import mimetypes
 from comfy.cli_args import args
 import comfy.utils
 import comfy.model_management
+from comfy_api import feature_flags
 import node_helpers
 from comfyui_version import __version__
 from app.frontend_management import FrontendManager
@ -35,15 +36,11 @@ from app.model_manager import ModelFileManager
 from app.custom_node_manager import CustomNodeManager
 from typing import Optional, Union
 from api_server.routes.internal.internal_routes import InternalRoutes
+from protocol import BinaryEventTypes

 if not args.disable_manager:
    import comfyui_manager

-class BinaryEventTypes:
-    PREVIEW_IMAGE = 1
-    UNENCODED_PREVIEW_IMAGE = 2
-    TEXT = 3
-
 async def send_socket_catch_exception(function, message):
    try:
        await function(message)
@ -184,6 +181,7 @@ class PromptServer():
        max_upload_size = round(args.max_upload_size * 1024 * 1024)
        self.app = web.Application(client_max_size=max_upload_size, middlewares=middlewares)
        self.sockets = dict()
+        self.sockets_metadata = dict()
        self.web_root = (
            FrontendManager.init_frontend(args.front_end_version)
            if args.front_end_root is None
@ -208,20 +206,53 @@ class PromptServer():
            else:
                sid = uuid.uuid4().hex

+            # Store WebSocket for backward compatibility
            self.sockets[sid] = ws
+            # Store metadata separately
+            self.sockets_metadata[sid] = {"feature_flags": {}}

            try:
                # Send initial state to the new client
-                await self.send("status", { "status": self.get_queue_info(), 'sid': sid }, sid)
+                await self.send("status", {"status": self.get_queue_info(), "sid": sid}, sid)
                # On reconnect if we are the currently executing client send the current node
                if self.client_id == sid and self.last_node_id is not None:
                    await self.send("executing", { "node": self.last_node_id }, sid)

+                # Flag to track if we've received the first message
+                first_message = True
+
                async for msg in ws:
                    if msg.type == aiohttp.WSMsgType.ERROR:
                        logging.warning('ws connection closed with exception %s' % ws.exception())
+                    elif msg.type == aiohttp.WSMsgType.TEXT:
+                        try:
+                            data = json.loads(msg.data)
+                            # Check if first message is feature flags
+                            if first_message and data.get("type") == "feature_flags":
+                                # Store client feature flags
+                                client_flags = data.get("data", {})
+                                self.sockets_metadata[sid]["feature_flags"] = client_flags
+
+                                # Send server feature flags in response
+                                await self.send(
+                                    "feature_flags",
+                                    feature_flags.get_server_features(),
+                                    sid,
+                                )
+
+                                logging.info(
+                                    f"Feature flags negotiated for client {sid}: {client_flags}"
+                                )
+                            first_message = False
+                        except json.JSONDecodeError:
+                            logging.warning(
+                                f"Invalid JSON received from client {sid}: {msg.data}"
+                            )
+                        except Exception as e:
+                            logging.error(f"Error processing WebSocket message: {e}")
            finally:
                self.sockets.pop(sid, None)
+                self.sockets_metadata.pop(sid, None)
            return ws

        @routes.get("/")
@ -554,6 +585,10 @@ class PromptServer():
            }
            return web.json_response(system_stats)

+        @routes.get("/features")
+        async def get_features(request):
+            """Return the server's feature flags as a JSON response."""
+            server_features = feature_flags.get_server_features()
+            return web.json_response(server_features)
+
        @routes.get("/prompt")
        async def get_prompt(request):
            return web.json_response(self.get_queue_info())
@ -649,7 +684,8 @@ class PromptServer():

            if "prompt" in json_data:
                prompt = json_data["prompt"]
-                valid = execution.validate_prompt(prompt)
+                prompt_id = str(uuid.uuid4())
+                valid = await execution.validate_prompt(prompt_id, prompt)
                extra_data = {}
                if "extra_data" in json_data:
                    extra_data = json_data["extra_data"]
@ -657,7 +693,6 @@ class PromptServer():
                if "client_id" in json_data:
                    extra_data["client_id"] = json_data["client_id"]
                if valid[0]:
-                    prompt_id = str(uuid.uuid4())
                    outputs_to_execute = valid[2]
                    self.prompt_queue.put((number, prompt_id, prompt, extra_data, outputs_to_execute))
                    response = {"prompt_id": prompt_id, "number": number, "node_errors": valid[3]}
@ -772,6 +807,10 @@ class PromptServer():
    async def send(self, event, data, sid=None):
        if event == BinaryEventTypes.UNENCODED_PREVIEW_IMAGE:
            await self.send_image(data, sid=sid)
+        elif event == BinaryEventTypes.PREVIEW_IMAGE_WITH_METADATA:
+            # data is (preview_image, metadata)
+            preview_image, metadata = data
+            await self.send_image_with_metadata(preview_image, metadata, sid=sid)
        elif isinstance(data, (bytes, bytearray)):
            await self.send_bytes(event, data, sid)
        else:
@ -810,6 +849,43 @@ class PromptServer():
        preview_bytes = bytesIO.getvalue()
        await self.send_bytes(BinaryEventTypes.PREVIEW_IMAGE, preview_bytes, sid=sid)

+    async def send_image_with_metadata(self, image_data, metadata=None, sid=None):
+        """Send a preview image together with a JSON metadata header.
+
+        The binary payload handed to ``send_bytes`` is laid out as a 4-byte
+        big-endian unsigned length of the metadata, the UTF-8 encoded JSON
+        metadata, then the encoded image bytes.
+
+        Args:
+            image_data: Indexable as ``(image_type, image, max_size)``.
+                ``image_type`` is passed to ``image.save(format=...)``. When
+                ``max_size`` is not None the image is first resized with
+                ``ImageOps.contain`` to fit within a ``max_size`` square.
+            metadata: Optional dict of JSON-serializable values. It is copied,
+                not modified; an ``"image_type"`` key holding the MIME type is
+                added to the copy that is sent.
+            sid: Forwarded to ``send_bytes``.
+        """
+        image_type = image_data[0]
+        image = image_data[1]
+        max_size = image_data[2]
+        if max_size is not None:
+            # Pillow versions without the ``Image.Resampling`` enum expose the
+            # filters as module-level constants, so fall back to the module
+            # itself instead of dereferencing the missing attribute.
+            resampling = getattr(Image, "Resampling", Image).BILINEAR
+            image = ImageOps.contain(image, (max_size, max_size), resampling)
+
+        mimetype = "image/png" if image_type == "PNG" else "image/jpeg"
+
+        # Work on a copy so the caller's dict is not mutated.
+        metadata = dict(metadata) if metadata is not None else {}
+        metadata["image_type"] = mimetype
+        metadata_json = json.dumps(metadata).encode('utf-8')
+
+        # Encode the image.
+        bytesIO = BytesIO()
+        image.save(bytesIO, format=image_type, quality=95, compress_level=1)
+        image_bytes = bytesIO.getvalue()
+
+        # Length-prefixed metadata followed by the image bytes.
+        combined_data = bytearray(struct.pack(">I", len(metadata_json)))
+        combined_data.extend(metadata_json)
+        combined_data.extend(image_bytes)
+
+        await self.send_bytes(BinaryEventTypes.PREVIEW_IMAGE_WITH_METADATA, combined_data, sid=sid)
+
    async def send_bytes(self, event, data, sid=None):
        message = self.encode_bytes(event, data)

@ -851,10 +927,10 @@ class PromptServer():
        ssl_ctx = None
        scheme = "http"
        if args.tls_keyfile and args.tls_certfile:
-                ssl_ctx = ssl.SSLContext(protocol=ssl.PROTOCOL_TLS_SERVER, verify_mode=ssl.CERT_NONE)
-                ssl_ctx.load_cert_chain(certfile=args.tls_certfile,
+            ssl_ctx = ssl.SSLContext(protocol=ssl.PROTOCOL_TLS_SERVER, verify_mode=ssl.CERT_NONE)
+            ssl_ctx.load_cert_chain(certfile=args.tls_certfile,
                                keyfile=args.tls_keyfile)
-                scheme = "https"
+            scheme = "https"

        if verbose:
            logging.info("Starting server\n")
--- a/tests-unit/feature_flags_test.py
+++ b/tests-unit/feature_flags_test.py
@ -0,0 +1,98 @@
+"""Tests for feature flags functionality."""
+
+from comfy_api.feature_flags import (
+    get_connection_feature,
+    supports_feature,
+    get_server_features,
+    SERVER_FEATURE_FLAGS,
+)
+
+
+class TestFeatureFlags:
+    """Test suite for feature flags functions."""
+
+    def test_get_server_features_returns_copy(self):
+        """Test that get_server_features returns a copy of the server flags."""
+        features = get_server_features()
+        # Verify it's a copy by modifying it
+        features["test_flag"] = True
+        # Original should be unchanged
+        assert "test_flag" not in SERVER_FEATURE_FLAGS
+
+    def test_get_server_features_contains_expected_flags(self):
+        """Test that server features contain expected flags."""
+        features = get_server_features()
+        assert "supports_preview_metadata" in features
+        assert features["supports_preview_metadata"] is True
+        assert "max_upload_size" in features
+        assert isinstance(features["max_upload_size"], (int, float))
+
+    def test_get_connection_feature_with_missing_sid(self):
+        """Test getting feature for non-existent session ID."""
+        sockets_metadata = {}
+        result = get_connection_feature(sockets_metadata, "missing_sid", "some_feature")
+        assert result is False  # Default value
+
+    def test_get_connection_feature_with_custom_default(self):
+        """Test getting feature with custom default value."""
+        sockets_metadata = {}
+        result = get_connection_feature(
+            sockets_metadata, "missing_sid", "some_feature", default="custom_default"
+        )
+        assert result == "custom_default"
+
+    def test_get_connection_feature_with_feature_flags(self):
+        """Test getting feature from connection with feature flags."""
+        sockets_metadata = {
+            "sid1": {
+                "feature_flags": {
+                    "supports_preview_metadata": True,
+                    "custom_feature": "value",
+                },
+            }
+        }
+        result = get_connection_feature(sockets_metadata, "sid1", "supports_preview_metadata")
+        assert result is True
+
+        result = get_connection_feature(sockets_metadata, "sid1", "custom_feature")
+        assert result == "value"
+
+    def test_get_connection_feature_missing_feature(self):
+        """Test getting non-existent feature from connection."""
+        sockets_metadata = {
+            "sid1": {"feature_flags": {"existing_feature": True}}
+        }
+        result = get_connection_feature(sockets_metadata, "sid1", "missing_feature")
+        assert result is False
+
+    def test_supports_feature_returns_boolean(self):
+        """Test that supports_feature always returns boolean."""
+        sockets_metadata = {
+            "sid1": {
+                "feature_flags": {
+                    "bool_feature": True,
+                    "string_feature": "value",
+                    "none_feature": None,
+                },
+            }
+        }
+
+        # True boolean feature
+        assert supports_feature(sockets_metadata, "sid1", "bool_feature") is True
+
+        # Non-boolean values should return False
+        assert supports_feature(sockets_metadata, "sid1", "string_feature") is False
+        assert supports_feature(sockets_metadata, "sid1", "none_feature") is False
+        assert supports_feature(sockets_metadata, "sid1", "missing_feature") is False
+
+    def test_supports_feature_with_missing_connection(self):
+        """Test supports_feature with missing connection."""
+        sockets_metadata = {}
+        assert supports_feature(sockets_metadata, "missing_sid", "any_feature") is False
+
+    def test_empty_feature_flags_dict(self):
+        """Test connection with empty feature flags dictionary."""
+        sockets_metadata = {"sid1": {"feature_flags": {}}}
+        result = get_connection_feature(sockets_metadata, "sid1", "any_feature")
+        assert result is False
+        assert supports_feature(sockets_metadata, "sid1", "any_feature") is False
--- a/tests-unit/requirements.txt
+++ b/tests-unit/requirements.txt
@ -1,3 +1,4 @@
 pytest>=7.8.0
 pytest-aiohttp
 pytest-asyncio
+websocket-client
--- a/tests-unit/websocket_feature_flags_test.py
+++ b/tests-unit/websocket_feature_flags_test.py
@ -0,0 +1,77 @@
+"""Simplified tests for WebSocket feature flags functionality."""
+from comfy_api import feature_flags
+
+
+class TestWebSocketFeatureFlags:
+    """Test suite for WebSocket feature flags integration."""
+
+    def test_server_feature_flags_response(self):
+        """Test server feature flags are properly formatted."""
+        features = feature_flags.get_server_features()
+
+        # Check expected server features
+        assert "supports_preview_metadata" in features
+        assert features["supports_preview_metadata"] is True
+        assert "max_upload_size" in features
+        assert isinstance(features["max_upload_size"], (int, float))
+
+    def test_progress_py_checks_feature_flags(self):
+        """Test that progress.py checks feature flags before sending metadata."""
+        # This simulates the check in progress.py
+        client_id = "test_client"
+        sockets_metadata = {"test_client": {"feature_flags": {}}}
+
+        # The actual check would be in progress.py
+        supports_metadata = feature_flags.supports_feature(
+            sockets_metadata, client_id, "supports_preview_metadata"
+        )
+
+        assert supports_metadata is False
+
+    def test_multiple_clients_different_features(self):
+        """Test handling multiple clients with different feature support."""
+        sockets_metadata = {
+            "modern_client": {
+                "feature_flags": {"supports_preview_metadata": True}
+            },
+            "legacy_client": {
+                "feature_flags": {}
+            }
+        }
+
+        # Check modern client
+        assert feature_flags.supports_feature(
+            sockets_metadata, "modern_client", "supports_preview_metadata"
+        ) is True
+
+        # Check legacy client
+        assert feature_flags.supports_feature(
+            sockets_metadata, "legacy_client", "supports_preview_metadata"
+        ) is False
+
+    def test_feature_negotiation_message_format(self):
+        """Test the format of feature negotiation messages."""
+        # Client message format
+        client_message = {
+            "type": "feature_flags",
+            "data": {
+                "supports_preview_metadata": True,
+                "api_version": "1.0.0"
+            }
+        }
+
+        # Verify structure
+        assert client_message["type"] == "feature_flags"
+        assert "supports_preview_metadata" in client_message["data"]
+
+        # Server response format (what would be sent)
+        server_features = feature_flags.get_server_features()
+        server_message = {
+            "type": "feature_flags",
+            "data": server_features
+        }
+
+        # Verify structure
+        assert server_message["type"] == "feature_flags"
+        assert "supports_preview_metadata" in server_message["data"]
+        assert server_message["data"]["supports_preview_metadata"] is True
--- a/tests/inference/extra_model_paths.yaml
+++ b/tests/inference/extra_model_paths.yaml
@ -1,4 +1,4 @@
 # Config for testing nodes
 testing:
-    custom_nodes: tests/inference/testing_nodes
+    custom_nodes: testing_nodes

--- a/tests/inference/test_async_nodes.py
+++ b/tests/inference/test_async_nodes.py
@ -0,0 +1,410 @@
+import pytest
+import time
+import torch
+import urllib.error
+import numpy as np
+import subprocess
+
+from pytest import fixture
+from comfy_execution.graph_utils import GraphBuilder
+from tests.inference.test_execution import ComfyClient
+
+
+@pytest.mark.execution
+class TestAsyncNodes:
+    @fixture(scope="class", autouse=True, params=[
+        (False, 0),
+        (True, 0),
+        (True, 100),
+    ])
+    def _server(self, args_pytest, request):
+        pargs = [
+            'python','main.py',
+            '--output-directory', args_pytest["output_dir"],
+            '--listen', args_pytest["listen"],
+            '--port', str(args_pytest["port"]),
+            '--extra-model-paths-config', 'tests/inference/extra_model_paths.yaml',
+        ]
+        use_lru, lru_size = request.param
+        if use_lru:
+            pargs += ['--cache-lru', str(lru_size)]
+        # Running server with args: pargs
+        p = subprocess.Popen(pargs)
+        yield
+        p.kill()
+        torch.cuda.empty_cache()
+
+    @fixture(scope="class", autouse=True)
+    def shared_client(self, args_pytest, _server):
+        client = ComfyClient()
+        n_tries = 5
+        for i in range(n_tries):
+            time.sleep(4)
+            try:
+                client.connect(listen=args_pytest["listen"], port=args_pytest["port"])
+            except ConnectionRefusedError:
+                # Retrying...
+                pass
+            else:
+                break
+        yield client
+        del client
+        torch.cuda.empty_cache()
+
+    @fixture
+    def client(self, shared_client, request):
+        shared_client.set_test_name(f"async_nodes[{request.node.name}]")
+        yield shared_client
+
+    @fixture
+    def builder(self, request):
+        yield GraphBuilder(prefix=request.node.name)
+
+    # Happy Path Tests
+
+    def test_basic_async_execution(self, client: ComfyClient, builder: GraphBuilder):
+        """Test that a basic async node executes correctly."""
+        g = builder
+        image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1)
+        sleep_node = g.node("TestSleep", value=image.out(0), seconds=0.1)
+        output = g.node("SaveImage", images=sleep_node.out(0))
+
+        result = client.run(g)
+
+        # Verify execution completed
+        assert result.did_run(sleep_node), "Async sleep node should have executed"
+        assert result.did_run(output), "Output node should have executed"
+
+        # Verify the image passed through correctly
+        result_images = result.get_images(output)
+        assert len(result_images) == 1, "Should have 1 image"
+        assert np.array(result_images[0]).min() == 0 and np.array(result_images[0]).max() == 0, "Image should be black"
+
+    def test_multiple_async_parallel_execution(self, client: ComfyClient, builder: GraphBuilder):
+        """Test that multiple async nodes execute in parallel."""
+        g = builder
+        image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1)
+
+        # Create multiple async sleep nodes with different durations
+        sleep1 = g.node("TestSleep", value=image.out(0), seconds=0.3)
+        sleep2 = g.node("TestSleep", value=image.out(0), seconds=0.4)
+        sleep3 = g.node("TestSleep", value=image.out(0), seconds=0.5)
+
+        # Add outputs for each
+        _output1 = g.node("PreviewImage", images=sleep1.out(0))
+        _output2 = g.node("PreviewImage", images=sleep2.out(0))
+        _output3 = g.node("PreviewImage", images=sleep3.out(0))
+
+        start_time = time.time()
+        result = client.run(g)
+        elapsed_time = time.time() - start_time
+
+        # Should take ~0.5s (max duration) not 1.2s (sum of durations)
+        assert elapsed_time < 0.8, f"Parallel execution took {elapsed_time}s, expected < 0.8s"
+
+        # Verify all nodes executed
+        assert result.did_run(sleep1) and result.did_run(sleep2) and result.did_run(sleep3)
+
+    def test_async_with_dependencies(self, client: ComfyClient, builder: GraphBuilder):
+        """Test that a node consuming two async results runs after both and sees their outputs."""
+        g = builder
+        image1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1)
+        image2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1)
+
+        # Two independent async sleeps, one per input image (not chained to each other)
+        sleep1 = g.node("TestSleep", value=image1.out(0), seconds=0.2)
+        sleep2 = g.node("TestSleep", value=image2.out(0), seconds=0.2)
+
+        # Average depends on both async results
+        average = g.node("TestVariadicAverage", input1=sleep1.out(0), input2=sleep2.out(0))
+        output = g.node("SaveImage", images=average.out(0))
+
+        result = client.run(g)
+
+        # Verify every node ran (this checks execution, not relative ordering)
+        assert result.did_run(sleep1) and result.did_run(sleep2)
+        assert result.did_run(average) and result.did_run(output)
+
+        # Black (0) averaged with white (255) should give a mean pixel value of ~127.5
+        result_images = result.get_images(output)
+        avg_value = np.array(result_images[0]).mean()
+        assert abs(avg_value - 127.5) < 1, f"Average value {avg_value} should be ~127.5"
+
+    def test_async_validate_inputs(self, client: ComfyClient, builder: GraphBuilder):
+        """Test async VALIDATE_INPUTS function."""
+        g = builder
+        # Create a test node with async validation
+        validation_node = g.node("TestAsyncValidation", value=5.0, threshold=10.0)
+        g.node("SaveImage", images=validation_node.out(0))
+
+        # Should pass validation
+        result = client.run(g)
+        assert result.did_run(validation_node)
+
+        # Test validation failure
+        validation_node.inputs['threshold'] = 3.0  # Will fail since value > threshold
+        with pytest.raises(urllib.error.HTTPError):
+            client.run(g)
+
+    def test_async_lazy_evaluation(self, client: ComfyClient, builder: GraphBuilder):
+        """Test async nodes with lazy evaluation."""
+        g = builder
+        input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1)
+        input2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1)
+        mask = g.node("StubMask", value=0.0, height=512, width=512, batch_size=1)
+
+        # Create async nodes that will be evaluated lazily
+        sleep1 = g.node("TestSleep", value=input1.out(0), seconds=0.3)
+        sleep2 = g.node("TestSleep", value=input2.out(0), seconds=0.3)
+
+        # Use lazy mix that only needs sleep1 (mask=0.0)
+        lazy_mix = g.node("TestLazyMixImages", image1=sleep1.out(0), image2=sleep2.out(0), mask=mask.out(0))
+        g.node("SaveImage", images=lazy_mix.out(0))
+
+        start_time = time.time()
+        result = client.run(g)
+        elapsed_time = time.time() - start_time
+
+        # Should only execute sleep1, not sleep2
+        assert elapsed_time < 0.5, f"Should skip sleep2, took {elapsed_time}s"
+        assert result.did_run(sleep1), "Sleep1 should have executed"
+        assert not result.did_run(sleep2), "Sleep2 should have been skipped"
+
+    def test_async_check_lazy_status(self, client: ComfyClient, builder: GraphBuilder):
+        """Test async check_lazy_status function."""
+        g = builder
+        # Create a node with async check_lazy_status
+        lazy_node = g.node("TestAsyncLazyCheck",
+                          input1="value1",
+                          input2="value2",
+                          condition=True)
+        g.node("SaveImage", images=lazy_node.out(0))
+
+        result = client.run(g)
+        assert result.did_run(lazy_node)
+
+    # Error Handling Tests
+
+    def test_async_execution_error(self, client: ComfyClient, builder: GraphBuilder):
+        """Test that an error raised inside an async node is reported for that node."""
+        g = builder
+        image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1)
+        # Create an async node that will error
+        error_node = g.node("TestAsyncError", value=image.out(0), error_after=0.1)
+        g.node("SaveImage", images=error_node.out(0))
+
+        # pytest.raises reports a missing error directly, instead of routing a
+        # "should have raised" sentinel AssertionError through an except branch.
+        with pytest.raises(Exception) as exc_info:
+            client.run(g)
+        assert 'prompt_id' in exc_info.value.args[0], f"Did not get proper error message: {exc_info.value}"
+        assert exc_info.value.args[0]['node_id'] == error_node.id, "Error should be from async error node"
+
+    def test_async_validation_error(self, client: ComfyClient, builder: GraphBuilder):
+        """Test async validation error handling."""
+        g = builder
+        # Node with async validation that will fail
+        validation_node = g.node("TestAsyncValidationError", value=15.0, max_value=10.0)
+        g.node("SaveImage", images=validation_node.out(0))
+
+        with pytest.raises(urllib.error.HTTPError) as exc_info:
+            client.run(g)
+        # Verify it's a validation error
+        assert exc_info.value.code == 400
+
+    def test_async_timeout_handling(self, client: ComfyClient, builder: GraphBuilder):
+        """Test that an async operation exceeding its timeout fails the run with a timeout error."""
+        g = builder
+        image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1)
+        # operation_time (2.0) exceeds timeout (0.5), so the node is expected to time out
+        timeout_node = g.node("TestAsyncTimeout", value=image.out(0), timeout=0.5, operation_time=2.0)
+        g.node("SaveImage", images=timeout_node.out(0))
+
+        # Not try/except + assert False: that sentinel's own message contained
+        # "timeout", so a run that raised nothing still satisfied the check below.
+        with pytest.raises(Exception) as exc_info:
+            client.run(g)
+        assert 'timeout' in str(exc_info.value).lower(), f"Expected timeout error, got: {exc_info.value}"
+
+    def test_concurrent_async_error_recovery(self, client: ComfyClient, builder: GraphBuilder):
+        """Test that workflow can recover after async errors."""
+        g = builder
+        image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1)
+
+        # First run with error
+        error_node = g.node("TestAsyncError", value=image.out(0), error_after=0.1)
+        g.node("SaveImage", images=error_node.out(0))
+
+        try:
+            client.run(g)
+        except Exception:
+            pass  # Expected
+
+        # Second run should succeed
+        g2 = GraphBuilder(prefix="recovery_test")
+        image2 = g2.node("StubImage", content="WHITE", height=512, width=512, batch_size=1)
+        sleep_node = g2.node("TestSleep", value=image2.out(0), seconds=0.1)
+        g2.node("SaveImage", images=sleep_node.out(0))
+
+        result = client.run(g2)
+        assert result.did_run(sleep_node), "Should be able to run after error"
+
+    def test_sync_error_during_async_execution(self, client: ComfyClient, builder: GraphBuilder):
+        """Test handling when sync node errors while async node is executing."""
+        g = builder
+        image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1)
+
+        # Async node that takes time
+        sleep_node = g.node("TestSleep", value=image.out(0), seconds=0.5)
+
+        # Sync node that will error immediately
+        error_node = g.node("TestSyncError", value=image.out(0))
+
+        # Both feed into output
+        g.node("PreviewImage", images=sleep_node.out(0))
+        g.node("PreviewImage", images=error_node.out(0))
+
+        # pytest.raises fails the test outright when nothing is raised, rather
+        # than catching a sentinel AssertionError in the except branch.
+        with pytest.raises(Exception) as exc_info:
+            client.run(g)
+        # Verify the sync error was caught even though async was running
+        assert 'prompt_id' in exc_info.value.args[0]
+
+    # Edge Cases
+
+    def test_async_with_execution_blocker(self, client: ComfyClient, builder: GraphBuilder):
+        """Test async nodes with execution blockers."""
+        g = builder
+        image1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1)
+        image2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1)
+
+        # Async sleep nodes
+        sleep1 = g.node("TestSleep", value=image1.out(0), seconds=0.2)
+        sleep2 = g.node("TestSleep", value=image2.out(0), seconds=0.2)
+
+        # Create list of images
+        image_list = g.node("TestMakeListNode", value1=sleep1.out(0), value2=sleep2.out(0))
+
+        # Create list of blocking conditions - [False, True] to block only the second item
+        int1 = g.node("StubInt", value=1)
+        int2 = g.node("StubInt", value=2)
+        block_list = g.node("TestMakeListNode", value1=int1.out(0), value2=int2.out(0))
+
+        # Compare each value against 2, so first is False (1 != 2) and second is True (2 == 2)
+        compare = g.node("TestIntConditions", a=block_list.out(0), b=2, operation="==")
+
+        # Block based on the comparison results
+        blocker = g.node("TestExecutionBlocker", input=image_list.out(0), block=compare.out(0), verbose=False)
+
+        output = g.node("PreviewImage", images=blocker.out(0))
+
+        result = client.run(g)
+        images = result.get_images(output)
+        assert len(images) == 1, "Should have blocked second image"
+
+    def test_async_caching_behavior(self, client: ComfyClient, builder: GraphBuilder):
+        """Test that async nodes are properly cached."""
+        g = builder
+        image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1)
+        sleep_node = g.node("TestSleep", value=image.out(0), seconds=0.2)
+        g.node("SaveImage", images=sleep_node.out(0))
+
+        # First run
+        result1 = client.run(g)
+        assert result1.did_run(sleep_node), "Should run first time"
+
+        # Second run - should be cached
+        start_time = time.time()
+        result2 = client.run(g)
+        elapsed_time = time.time() - start_time
+
+        assert not result2.did_run(sleep_node), "Should be cached"
+        assert elapsed_time < 0.1, f"Cached run took {elapsed_time}s, should be instant"
+
+    def test_async_with_dynamic_prompts(self, client: ComfyClient, builder: GraphBuilder):
+        """Test async nodes within dynamically generated prompts."""
+        g = builder
+        image1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1)
+        image2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1)
+
+        # Node that generates async nodes dynamically
+        dynamic_async = g.node("TestDynamicAsyncGeneration",
+                              image1=image1.out(0),
+                              image2=image2.out(0),
+                              num_async_nodes=3,
+                              sleep_duration=0.2)
+        g.node("SaveImage", images=dynamic_async.out(0))
+
+        start_time = time.time()
+        result = client.run(g)
+        elapsed_time = time.time() - start_time
+
+        # Should execute async nodes in parallel within dynamic prompt
+        assert elapsed_time < 0.5, f"Dynamic async execution took {elapsed_time}s"
+        assert result.did_run(dynamic_async)
+
+    def test_async_resource_cleanup(self, client: ComfyClient, builder: GraphBuilder):
+        """Test that async resources are properly cleaned up."""
+        g = builder
+        image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1)
+
+        # Create multiple async nodes that use resources
+        resource_nodes = []
+        for i in range(5):
+            node = g.node("TestAsyncResourceUser",
+                         value=image.out(0),
+                         resource_id=f"resource_{i}",
+                         duration=0.1)
+            resource_nodes.append(node)
+            g.node("PreviewImage", images=node.out(0))
+
+        result = client.run(g)
+
+        # Verify all nodes executed
+        for node in resource_nodes:
+            assert result.did_run(node)
+
+        # Run again to ensure resources were cleaned up
+        result2 = client.run(g)
+        # Should be cached but not error due to resource conflicts
+        for node in resource_nodes:
+            assert not result2.did_run(node), "Should be cached"
+
+    def test_async_cancellation(self, client: ComfyClient, builder: GraphBuilder):
+        """Test cancellation of async operations."""
+        # This would require implementing cancellation in the client
+        # For now, we'll test that long-running async operations can be interrupted
+        pass  # TODO: Implement when cancellation API is available
+
+    def test_mixed_sync_async_execution(self, client: ComfyClient, builder: GraphBuilder):
+        """Test a workflow that alternates sync and async nodes in a single dependency chain."""
+        g = builder
+        image1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1)
+        image2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1)
+        mask = g.node("StubMask", value=0.5, height=512, width=512, batch_size=1)
+
+        # Each node below consumes the previous node's output: sync -> async -> sync -> async
+        # Sync: lazy mix images
+        sync_op1 = g.node("TestLazyMixImages", image1=image1.out(0), image2=image2.out(0), mask=mask.out(0))
+        # Async: sleep
+        async_op1 = g.node("TestSleep", value=sync_op1.out(0), seconds=0.2)
+        # Sync: custom validation
+        sync_op2 = g.node("TestCustomValidation1", input1=async_op1.out(0), input2=0.5)
+        # Async: sleep again
+        async_op2 = g.node("TestSleep", value=sync_op2.out(0), seconds=0.2)
+
+        output = g.node("SaveImage", images=async_op2.out(0))
+
+        result = client.run(g)
+
+        # Verify all four nodes executed (did_run does not check relative ordering)
+        assert result.did_run(sync_op1)
+        assert result.did_run(async_op1)
+        assert result.did_run(sync_op2)
+        assert result.did_run(async_op2)
+
+        # Expected ~63.75: presumably the 50% black/white mix (~127.5) scaled by input2=0.5 - TODO confirm
+        result_images = result.get_images(output)
+        avg_value = np.array(result_images[0]).mean()
+        assert abs(avg_value - 63.75) < 5, f"Average value {avg_value} should be ~63.75"
--- a/tests/inference/test_execution.py
+++ b/tests/inference/test_execution.py
@ -252,7 +252,7 @@ class TestExecution:

    @pytest.mark.parametrize("test_type, test_value", [
        ("StubInt", 5),
-        ("StubFloat", 5.0)
+        ("StubMask", 5.0)
    ])
    def test_validation_error_edge1(self, test_type, test_value, client: ComfyClient, builder: GraphBuilder):
        g = builder
@ -497,6 +497,69 @@ class TestExecution:
        assert numpy.array(images[0]).min() == 63 and numpy.array(images[0]).max() == 63, "Image should have value 0.25"
        assert not result.did_run(test_node), "The execution should have been cached"

+    def test_parallel_sleep_nodes(self, client: ComfyClient, builder: GraphBuilder):
+        """Test that independent TestSleep nodes overlap instead of running back to back."""
+        g = builder
+        image = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1)
+
+        # Create sleep nodes for each duration
+        sleep_node1 = g.node("TestSleep", value=image.out(0), seconds=2.8)
+        sleep_node2 = g.node("TestSleep", value=image.out(0), seconds=2.9)
+        sleep_node3 = g.node("TestSleep", value=image.out(0), seconds=3.0)
+
+        # Add outputs to verify the execution
+        _output1 = g.node("PreviewImage", images=sleep_node1.out(0))
+        _output2 = g.node("PreviewImage", images=sleep_node2.out(0))
+        _output3 = g.node("PreviewImage", images=sleep_node3.out(0))
+
+        start_time = time.time()
+        result = client.run(g)
+        elapsed_time = time.time() - start_time
+
+        # Parallel execution takes about 3.0s (the longest sleep) plus overhead;
+        # sequential execution would need the sum of all sleeps (8.7s).
+        assert elapsed_time < 4.0, f"Parallel execution took {elapsed_time}s, expected less than 4.0s"
+
+        # Verify that all nodes executed
+        assert result.did_run(sleep_node1), "Sleep node 1 should have run"
+        assert result.did_run(sleep_node2), "Sleep node 2 should have run"
+        assert result.did_run(sleep_node3), "Sleep node 3 should have run"
+
+    def test_parallel_sleep_expansion(self, client: ComfyClient, builder: GraphBuilder):
+        g = builder
+        # Create input images with different values
+        image1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1)
+        image2 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1)
+        image3 = g.node("StubImage", content="WHITE", height=512, width=512, batch_size=1)
+
+        # Create a TestParallelSleep node that expands into multiple TestSleep nodes
+        parallel_sleep = g.node("TestParallelSleep",
+                                image1=image1.out(0),
+                                image2=image2.out(0),
+                                image3=image3.out(0),
+                                sleep1=0.4,
+                                sleep2=0.5,
+                                sleep3=0.6)
+        output = g.node("SaveImage", images=parallel_sleep.out(0))
+
+        start_time = time.time()
+        result = client.run(g)
+        elapsed_time = time.time() - start_time
+
+        # Similar to the previous test, expect parallel execution of the sleep nodes
+        # which should complete in less than the sum of all sleeps
+        assert elapsed_time < 0.8, f"Expansion execution took {elapsed_time}s, expected less than 0.8s"
+
+        # Verify the parallel sleep node executed
+        assert result.did_run(parallel_sleep), "ParallelSleep node should have run"
+
+        # Verify we get an image as output (blend of the three input images)
+        result_images = result.get_images(output)
+        assert len(result_images) == 1, "Should have 1 image"
+        # Average pixel value should be around 170 (255 * 2 // 3)
+        avg_value = numpy.array(result_images[0]).mean()
+        assert avg_value == 170, f"Image average value {avg_value} should be 170"
+
    # This tests that nodes with OUTPUT_IS_LIST function correctly when they receive an ExecutionBlocker
    # as input. We also test that when that list (containing an ExecutionBlocker) is passed to a node,
    # only that one entry in the list is blocked.
--- a/tests/inference/testing_nodes/testing-pack/init.py
+++ b/tests/inference/testing_nodes/testing-pack/init.py
@ -1,23 +1,26 @@
-from .specific_tests import TEST_NODE_CLASS_MAPPINGS, TEST_NODE_DISPLAY_NAME_MAPPINGS
-from .flow_control import FLOW_CONTROL_NODE_CLASS_MAPPINGS, FLOW_CONTROL_NODE_DISPLAY_NAME_MAPPINGS
-from .util import UTILITY_NODE_CLASS_MAPPINGS, UTILITY_NODE_DISPLAY_NAME_MAPPINGS
-from .conditions import CONDITION_NODE_CLASS_MAPPINGS, CONDITION_NODE_DISPLAY_NAME_MAPPINGS
-from .stubs import TEST_STUB_NODE_CLASS_MAPPINGS, TEST_STUB_NODE_DISPLAY_NAME_MAPPINGS
-
-# NODE_CLASS_MAPPINGS = GENERAL_NODE_CLASS_MAPPINGS.update(COMPONENT_NODE_CLASS_MAPPINGS)
-# NODE_DISPLAY_NAME_MAPPINGS = GENERAL_NODE_DISPLAY_NAME_MAPPINGS.update(COMPONENT_NODE_DISPLAY_NAME_MAPPINGS)
-
-NODE_CLASS_MAPPINGS = {}
-NODE_CLASS_MAPPINGS.update(TEST_NODE_CLASS_MAPPINGS)
-NODE_CLASS_MAPPINGS.update(FLOW_CONTROL_NODE_CLASS_MAPPINGS)
-NODE_CLASS_MAPPINGS.update(UTILITY_NODE_CLASS_MAPPINGS)
-NODE_CLASS_MAPPINGS.update(CONDITION_NODE_CLASS_MAPPINGS)
-NODE_CLASS_MAPPINGS.update(TEST_STUB_NODE_CLASS_MAPPINGS)
-
-NODE_DISPLAY_NAME_MAPPINGS = {}
-NODE_DISPLAY_NAME_MAPPINGS.update(TEST_NODE_DISPLAY_NAME_MAPPINGS)
-NODE_DISPLAY_NAME_MAPPINGS.update(FLOW_CONTROL_NODE_DISPLAY_NAME_MAPPINGS)
-NODE_DISPLAY_NAME_MAPPINGS.update(UTILITY_NODE_DISPLAY_NAME_MAPPINGS)
-NODE_DISPLAY_NAME_MAPPINGS.update(CONDITION_NODE_DISPLAY_NAME_MAPPINGS)
-NODE_DISPLAY_NAME_MAPPINGS.update(TEST_STUB_NODE_DISPLAY_NAME_MAPPINGS)
-
+from .specific_tests import TEST_NODE_CLASS_MAPPINGS, TEST_NODE_DISPLAY_NAME_MAPPINGS
+from .flow_control import FLOW_CONTROL_NODE_CLASS_MAPPINGS, FLOW_CONTROL_NODE_DISPLAY_NAME_MAPPINGS
+from .util import UTILITY_NODE_CLASS_MAPPINGS, UTILITY_NODE_DISPLAY_NAME_MAPPINGS
+from .conditions import CONDITION_NODE_CLASS_MAPPINGS, CONDITION_NODE_DISPLAY_NAME_MAPPINGS
+from .stubs import TEST_STUB_NODE_CLASS_MAPPINGS, TEST_STUB_NODE_DISPLAY_NAME_MAPPINGS
+from .async_test_nodes import ASYNC_TEST_NODE_CLASS_MAPPINGS, ASYNC_TEST_NODE_DISPLAY_NAME_MAPPINGS
+
+# NODE_CLASS_MAPPINGS = GENERAL_NODE_CLASS_MAPPINGS.update(COMPONENT_NODE_CLASS_MAPPINGS)
+# NODE_DISPLAY_NAME_MAPPINGS = GENERAL_NODE_DISPLAY_NAME_MAPPINGS.update(COMPONENT_NODE_DISPLAY_NAME_MAPPINGS)
+
+NODE_CLASS_MAPPINGS = {}
+NODE_CLASS_MAPPINGS.update(TEST_NODE_CLASS_MAPPINGS)
+NODE_CLASS_MAPPINGS.update(FLOW_CONTROL_NODE_CLASS_MAPPINGS)
+NODE_CLASS_MAPPINGS.update(UTILITY_NODE_CLASS_MAPPINGS)
+NODE_CLASS_MAPPINGS.update(CONDITION_NODE_CLASS_MAPPINGS)
+NODE_CLASS_MAPPINGS.update(TEST_STUB_NODE_CLASS_MAPPINGS)
+NODE_CLASS_MAPPINGS.update(ASYNC_TEST_NODE_CLASS_MAPPINGS)
+
+NODE_DISPLAY_NAME_MAPPINGS = {}
+NODE_DISPLAY_NAME_MAPPINGS.update(TEST_NODE_DISPLAY_NAME_MAPPINGS)
+NODE_DISPLAY_NAME_MAPPINGS.update(FLOW_CONTROL_NODE_DISPLAY_NAME_MAPPINGS)
+NODE_DISPLAY_NAME_MAPPINGS.update(UTILITY_NODE_DISPLAY_NAME_MAPPINGS)
+NODE_DISPLAY_NAME_MAPPINGS.update(CONDITION_NODE_DISPLAY_NAME_MAPPINGS)
+NODE_DISPLAY_NAME_MAPPINGS.update(TEST_STUB_NODE_DISPLAY_NAME_MAPPINGS)
+NODE_DISPLAY_NAME_MAPPINGS.update(ASYNC_TEST_NODE_DISPLAY_NAME_MAPPINGS)
+
--- a/tests/inference/testing_nodes/testing-pack/async_test_nodes.py
+++ b/tests/inference/testing_nodes/testing-pack/async_test_nodes.py
@ -0,0 +1,343 @@
+import torch
+import asyncio
+from typing import Dict
+from comfy.utils import ProgressBar
+from comfy_execution.graph_utils import GraphBuilder
+from comfy.comfy_types.node_typing import ComfyNodeABC
+from comfy.comfy_types import IO
+
+
+class TestAsyncValidation(ComfyNodeABC):
+    """Test node whose VALIDATE_INPUTS hook is a coroutine.
+
+    Validation reports an error when ``value`` is greater than ``threshold``;
+    ``process`` returns a 1x512x512x3 tensor filled with ``value / 10.0``.
+    """
+
+    RETURN_TYPES = ("IMAGE",)
+    FUNCTION = "process"
+    CATEGORY = "_for_testing/async"
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        required = {
+            "value": ("FLOAT", {"default": 5.0}),
+            "threshold": ("FLOAT", {"default": 10.0}),
+        }
+        return {"required": required}
+
+    @classmethod
+    async def VALIDATE_INPUTS(cls, value, threshold):
+        """Return True when the inputs are acceptable, else an error message."""
+        # Short pause standing in for an asynchronous check (e.g. a remote call).
+        await asyncio.sleep(0.05)
+        return f"Value {value} exceeds threshold {threshold}" if value > threshold else True
+
+    def process(self, value, threshold):
+        """Return a one-element tuple holding the constant-valued image tensor."""
+        fill = value / 10.0
+        return (torch.ones([1, 512, 512, 3]) * fill,)
+
+
+class TestAsyncError(ComfyNodeABC):
+    """Test node whose coroutine waits and then always raises."""
+
+    RETURN_TYPES = (IO.ANY,)
+    FUNCTION = "error_execution"
+    CATEGORY = "_for_testing/async"
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        delay_spec = ("FLOAT", {"default": 0.1, "min": 0.0, "max": 10.0})
+        return {
+            "required": {
+                "value": (IO.ANY, {}),
+                "error_after": delay_spec,
+            },
+        }
+
+    async def error_execution(self, value, error_after):
+        """Sleep for ``error_after`` seconds, then raise RuntimeError."""
+        await asyncio.sleep(error_after)
+        raise RuntimeError("Intentional async execution error for testing")
+
+
+class TestAsyncValidationError(ComfyNodeABC):
+    """Test node whose async validation rejects ``value > max_value``.
+
+    When validation passes, ``process`` returns a 1x512x512x3 tensor filled
+    with ``value / max_value``.
+    """
+
+    RETURN_TYPES = ("IMAGE",)
+    FUNCTION = "process"
+    CATEGORY = "_for_testing/async"
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        required = {
+            "value": ("FLOAT", {"default": 5.0}),
+            "max_value": ("FLOAT", {"default": 10.0}),
+        }
+        return {"required": required}
+
+    @classmethod
+    async def VALIDATE_INPUTS(cls, value, max_value):
+        """Return True when valid, otherwise a failure message string."""
+        await asyncio.sleep(0.05)
+        # Validation fails only for values strictly above max_value.
+        return f"Async validation failed: {value} > {max_value}" if value > max_value else True
+
+    def process(self, value, max_value):
+        """Return a one-element tuple holding the scaled constant image tensor."""
+        ratio = value / max_value
+        return (torch.ones([1, 512, 512, 3]) * ratio,)
+
+
+class TestAsyncTimeout(ComfyNodeABC):
+    """Test node that simulates timeout scenarios.
+
+    The node sleeps for ``operation_time`` seconds under an
+    ``asyncio.wait_for`` deadline of ``timeout`` seconds.
+    """
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        return {
+            "required": {
+                "value": (IO.ANY, {}),
+                "timeout": ("FLOAT", {"default": 1.0, "min": 0.1, "max": 10.0}),
+                "operation_time": ("FLOAT", {"default": 2.0, "min": 0.1, "max": 10.0}),
+            },
+        }
+
+    RETURN_TYPES = (IO.ANY,)
+    FUNCTION = "timeout_execution"
+    CATEGORY = "_for_testing/async"
+
+    async def timeout_execution(self, value, timeout, operation_time):
+        """Pass ``value`` through if the simulated work beats the deadline.
+
+        Returns:
+            A one-element tuple containing ``value`` unchanged.
+
+        Raises:
+            RuntimeError: if the sleep does not finish within ``timeout``
+                seconds; the original timeout error is attached as the cause.
+        """
+        try:
+            # Only the awaited call can time out, so keep the try body to it.
+            await asyncio.wait_for(asyncio.sleep(operation_time), timeout=timeout)
+        except asyncio.TimeoutError as exc:
+            # Chain explicitly so the traceback shows the timeout as the cause
+            # rather than as an error raised while handling another error.
+            raise RuntimeError(f"Operation timed out after {timeout} seconds") from exc
+        return (value,)
+
+
+class TestSyncError(ComfyNodeABC):
+    """Test node with a plain (non-async) function that always raises."""
+
+    RETURN_TYPES = (IO.ANY,)
+    FUNCTION = "sync_error"
+    CATEGORY = "_for_testing/async"
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        return {"required": {"value": (IO.ANY, {})}}
+
+    def sync_error(self, value):
+        """Raise RuntimeError unconditionally; ``value`` is ignored."""
+        raise RuntimeError("Intentional sync execution error for testing")
+
+
+class TestAsyncLazyCheck(ComfyNodeABC):
+    """Test node whose check_lazy_status hook is a coroutine."""
+
+    RETURN_TYPES = ("IMAGE",)
+    FUNCTION = "process"
+    CATEGORY = "_for_testing/async"
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        lazy_any = {"lazy": True}
+        return {
+            "required": {
+                "input1": (IO.ANY, dict(lazy_any)),
+                "input2": (IO.ANY, dict(lazy_any)),
+                "condition": ("BOOLEAN", {"default": True}),
+            },
+        }
+
+    async def check_lazy_status(self, condition, input1, input2):
+        """Return the names of lazy inputs that still need a value.
+
+        ``condition`` selects which input matters: ``input1`` when truthy,
+        ``input2`` otherwise. The selected name is returned only while its
+        value is still None.
+        """
+        # Short pause standing in for an asynchronous lookup.
+        await asyncio.sleep(0.05)
+
+        if condition:
+            name, current = "input1", input1
+        else:
+            name, current = "input2", input2
+        return [name] if current is None else []
+
+    def process(self, input1, input2, condition):
+        """Return a fixed all-ones 1x512x512x3 tensor; the inputs are unused."""
+        return (torch.ones([1, 512, 512, 3]),)
+
+
+class TestDynamicAsyncGeneration(ComfyNodeABC):
+    """Test node that expands into a graph of TestSleep nodes at run time."""
+
+    RETURN_TYPES = ("IMAGE",)
+    FUNCTION = "generate_async_workflow"
+    CATEGORY = "_for_testing/async"
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        required = {
+            "image1": ("IMAGE",),
+            "image2": ("IMAGE",),
+            "num_async_nodes": ("INT", {"default": 3, "min": 1, "max": 10}),
+            "sleep_duration": ("FLOAT", {"default": 0.2, "min": 0.1, "max": 1.0}),
+        }
+        return {"required": required}
+
+    def generate_async_workflow(self, image1, image2, num_async_nodes, sleep_duration):
+        """Build ``num_async_nodes`` TestSleep nodes and combine their outputs.
+
+        Even-indexed sleep nodes receive ``image1`` and odd-indexed ones
+        ``image2``. A single node is returned directly; several are fed into
+        one TestVariadicAverage node as ``input1``, ``input2``, ...
+        """
+        builder = GraphBuilder()
+
+        sources = (image1, image2)
+        sleepers = [
+            builder.node("TestSleep", value=sources[index % 2], seconds=sleep_duration)
+            for index in range(num_async_nodes)
+        ]
+
+        if len(sleepers) == 1:
+            tail = sleepers[0]
+        else:
+            average_inputs = {"input1": sleepers[0].out(0)}
+            average_inputs.update(
+                {f"input{slot}": sleeper.out(0) for slot, sleeper in enumerate(sleepers[1:], 2)}
+            )
+            tail = builder.node("TestVariadicAverage", **average_inputs)
+
+        return {
+            "result": (tail.out(0),),
+            "expand": builder.finalize(),
+        }
+
+
+class TestAsyncResourceUser(ComfyNodeABC):
+    """Test node that marks a named resource busy while its coroutine runs."""
+
+    # Shared across all instances: resource id -> whether it is currently held.
+    _active_resources: Dict[str, bool] = {}
+
+    RETURN_TYPES = (IO.ANY,)
+    FUNCTION = "use_resource"
+    CATEGORY = "_for_testing/async"
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        required = {
+            "value": (IO.ANY, {}),
+            "resource_id": ("STRING", {"default": "resource_0"}),
+            "duration": ("FLOAT", {"default": 0.1, "min": 0.0, "max": 1.0}),
+        }
+        return {"required": required}
+
+    async def use_resource(self, value, resource_id, duration):
+        """Hold ``resource_id`` for ``duration`` seconds and return ``value``.
+
+        Raises:
+            RuntimeError: if the resource is already flagged as in use.
+        """
+        held = self._active_resources
+        if held.get(resource_id, False):
+            raise RuntimeError(f"Resource {resource_id} is already in use!")
+
+        held[resource_id] = True
+        try:
+            # Stand-in for real work performed while the resource is held.
+            await asyncio.sleep(duration)
+        finally:
+            # Clear the flag even if the sleep is cancelled or fails.
+            held[resource_id] = False
+        return (value,)
+
+
+class TestAsyncBatchProcessing(ComfyNodeABC):
+    """Test node that processes a batch one item at a time, awaiting between items."""
+
+    RETURN_TYPES = ("IMAGE",)
+    FUNCTION = "process_batch"
+    CATEGORY = "_for_testing/async"
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        required = {
+            "images": ("IMAGE",),
+            "process_time_per_item": ("FLOAT", {"default": 0.1, "min": 0.01, "max": 1.0}),
+        }
+        hidden = {"unique_id": "UNIQUE_ID"}
+        return {"required": required, "hidden": hidden}
+
+    async def process_batch(self, images, process_time_per_item, unique_id):
+        """Return ``1.0 - images``, computed slice by slice along dim 0.
+
+        Each slice is preceded by a sleep of ``process_time_per_item`` seconds
+        and followed by a one-step progress update.
+        """
+        count = images.shape[0]
+        progress = ProgressBar(count, node_id=unique_id)
+
+        inverted = []
+        for index in range(count):
+            # Stand-in for per-item asynchronous work.
+            await asyncio.sleep(process_time_per_item)
+            # Slice with a range so the leading batch dimension is kept.
+            inverted.append(1.0 - images[index:index + 1])
+            progress.update(1)
+
+        return (torch.cat(inverted, dim=0),)
+
+
+class TestAsyncConcurrentLimit(ComfyNodeABC):
+    """Test node that caps how many of its executions run at the same time.
+
+    At most ``_MAX_CONCURRENT`` calls to ``limited_execution`` sleep
+    concurrently on a given event loop; further calls wait for a free slot.
+    """
+
+    # Maximum number of concurrent executions per event loop.
+    _MAX_CONCURRENT = 2
+    # Event loop -> semaphore, filled lazily. An asyncio.Semaphore becomes tied
+    # to the loop it first has to wait on, so one class-level instance created
+    # at import time raises RuntimeError when contended from a different loop.
+    # NOTE(review): assumes executions may run on more than one loop - confirm.
+    _semaphores = {}
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        return {
+            "required": {
+                "value": (IO.ANY, {}),
+                "duration": ("FLOAT", {"default": 0.5, "min": 0.1, "max": 2.0}),
+                "node_id": ("INT", {"default": 0}),
+            },
+        }
+
+    RETURN_TYPES = (IO.ANY,)
+    FUNCTION = "limited_execution"
+    CATEGORY = "_for_testing/async"
+
+    @classmethod
+    def _semaphore_for_running_loop(cls):
+        """Return the semaphore for the running event loop, creating it on first use."""
+        loop = asyncio.get_running_loop()
+        semaphore = cls._semaphores.get(loop)
+        if semaphore is None:
+            # Drop entries for loops that are already closed so the table
+            # does not grow with every new loop.
+            for stale in [known for known in cls._semaphores if known.is_closed()]:
+                del cls._semaphores[stale]
+            semaphore = asyncio.Semaphore(cls._MAX_CONCURRENT)
+            cls._semaphores[loop] = semaphore
+        return semaphore
+
+    async def limited_execution(self, value, duration, node_id):
+        """Sleep for ``duration`` seconds while holding a slot, then return ``value``.
+
+        ``node_id`` is accepted only to tell nodes apart in a workflow; it is
+        not used here.
+        """
+        async with self._semaphore_for_running_loop():
+            await asyncio.sleep(duration)
+            return (value,)
+
+
+# Node class table for this module; each class is keyed by its own class name.
+ASYNC_TEST_NODE_CLASS_MAPPINGS = {
+    node_class.__name__: node_class
+    for node_class in (
+        TestAsyncValidation,
+        TestAsyncError,
+        TestAsyncValidationError,
+        TestAsyncTimeout,
+        TestSyncError,
+        TestAsyncLazyCheck,
+        TestDynamicAsyncGeneration,
+        TestAsyncResourceUser,
+        TestAsyncBatchProcessing,
+        TestAsyncConcurrentLimit,
+    )
+}
+
+# Human-readable display name for each node key defined in this module.
+ASYNC_TEST_NODE_DISPLAY_NAME_MAPPINGS = dict(
+    TestAsyncValidation="Test Async Validation",
+    TestAsyncError="Test Async Error",
+    TestAsyncValidationError="Test Async Validation Error",
+    TestAsyncTimeout="Test Async Timeout",
+    TestSyncError="Test Sync Error",
+    TestAsyncLazyCheck="Test Async Lazy Check",
+    TestDynamicAsyncGeneration="Test Dynamic Async Generation",
+    TestAsyncResourceUser="Test Async Resource User",
+    TestAsyncBatchProcessing="Test Async Batch Processing",
+    TestAsyncConcurrentLimit="Test Async Concurrent Limit",
+)
--- a/tests/inference/testing_nodes/testing-pack/specific_tests.py
+++ b/tests/inference/testing_nodes/testing-pack/specific_tests.py
@ -1,6 +1,11 @@
 import torch
+import time
+import asyncio
+from comfy.utils import ProgressBar
 from .tools import VariantSupport
 from comfy_execution.graph_utils import GraphBuilder
+from comfy.comfy_types.node_typing import ComfyNodeABC
+from comfy.comfy_types import IO

 class TestLazyMixImages:
    @classmethod
@ -333,6 +338,131 @@ class TestMixedExpansionReturns:
                "expand": g.finalize(),
            }

+class TestSamplingInExpansion:
+    """Test node that expands into a text-to-image sampling subgraph."""
+
+    RETURN_TYPES = ("IMAGE",)
+    FUNCTION = "sampling_in_expansion"
+
+    CATEGORY = "Testing/Nodes"
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        required = {
+            "model": ("MODEL",),
+            "clip": ("CLIP",),
+            "vae": ("VAE",),
+            "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
+            "steps": ("INT", {"default": 20, "min": 1, "max": 100}),
+            "cfg": ("FLOAT", {"default": 7.0, "min": 0.0, "max": 30.0}),
+            "prompt": ("STRING", {"multiline": True, "default": "a beautiful landscape with mountains and trees"}),
+            "negative_prompt": ("STRING", {"multiline": True, "default": "blurry, bad quality, worst quality"}),
+        }
+        return {"required": required}
+
+    def sampling_in_expansion(self, model, clip, vae, seed, steps, cfg, prompt, negative_prompt):
+        """Build the subgraph: encode prompts, sample an empty latent, decode it.
+
+        Returns a dict whose "result" is the decode node's first output and
+        whose "expand" is the finalized graph.
+        """
+        builder = GraphBuilder()
+
+        # Text conditioning from the supplied CLIP model, positive first.
+        positive_cond = builder.node("CLIPTextEncode", text=prompt, clip=clip)
+        negative_cond = builder.node("CLIPTextEncode", text=negative_prompt, clip=clip)
+
+        # Fixed-size 512x512 single-item starting latent.
+        latent = builder.node("EmptyLatentImage", width=512, height=512, batch_size=1)
+
+        # Sampler wired to the conditioning and latent nodes above.
+        sampler_inputs = {
+            "model": model,
+            "positive": positive_cond.out(0),
+            "negative": negative_cond.out(0),
+            "latent_image": latent.out(0),
+            "seed": seed,
+            "steps": steps,
+            "cfg": cfg,
+            "sampler_name": "euler_ancestral",
+            "scheduler": "normal",
+        }
+        sampled = builder.node("KSampler", **sampler_inputs)
+
+        # Decode the sampled latent with the supplied VAE.
+        decoded = builder.node("VAEDecode", samples=sampled.out(0), vae=vae)
+
+        return {
+            "result": (decoded.out(0),),
+            "expand": builder.finalize(),
+        }
+
+class TestSleep(ComfyNodeABC):
+    """Test node that waits asynchronously for a number of seconds, then passes its input through."""
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        return {
+            "required": {
+                "value": (IO.ANY, {}),
+                "seconds": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 9999.0, "step": 0.01, "tooltip": "The amount of seconds to sleep."}),
+            },
+            "hidden": {
+                "unique_id": "UNIQUE_ID",
+            },
+        }
+    RETURN_TYPES = (IO.ANY,)
+    FUNCTION = "sleep"
+
+    CATEGORY = "_for_testing"
+
+    async def sleep(self, value, seconds, unique_id):
+        """Wait until at least ``seconds`` of wall-clock time have elapsed.
+
+        Progress is reported on every polling step as the elapsed time in
+        seconds. Returns a one-element tuple containing ``value`` unchanged.
+        When ``seconds`` is not positive the loop body never runs.
+        """
+        # The progress bar is created with the sleep length as its first
+        # argument - presumably the total; confirm against ProgressBar.
+        pbar = ProgressBar(seconds, node_id=unique_id)
+        start = time.time()
+        expiration = start + seconds
+        now = start
+        # The condition tests the timestamp taken before the previous 10 ms
+        # sleep, so the loop can run one step past the deadline.
+        while now < expiration:
+            now = time.time()
+            pbar.update_absolute(now - start)
+            # Yield to the event loop between polls instead of blocking.
+            await asyncio.sleep(0.01)
+        return (value,)
+
+class TestParallelSleep(ComfyNodeABC):
+    """Test node that expands into three TestSleep nodes feeding one average node."""
+
+    RETURN_TYPES = ("IMAGE",)
+    FUNCTION = "parallel_sleep"
+    CATEGORY = "_for_testing"
+    OUTPUT_NODE = True
+
+    @classmethod
+    def INPUT_TYPES(cls):
+        required = {
+            "image1": ("IMAGE", ),
+            "image2": ("IMAGE", ),
+            "image3": ("IMAGE", ),
+            "sleep1": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 10.0, "step": 0.01}),
+            "sleep2": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 10.0, "step": 0.01}),
+            "sleep3": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 10.0, "step": 0.01}),
+        }
+        hidden = {"unique_id": "UNIQUE_ID"}
+        return {"required": required, "hidden": hidden}
+
+    def parallel_sleep(self, image1, image2, image3, sleep1, sleep2, sleep3, unique_id):
+        """Pair each image with its sleep duration and average the three results.
+
+        Returns a dict whose "result" is the averaging node's first output and
+        whose "expand" is the finalized graph. ``unique_id`` is not used.
+        """
+        builder = GraphBuilder()
+
+        # One TestSleep node per (image, duration) pair, created in input order.
+        pairs = ((image1, sleep1), (image2, sleep2), (image3, sleep3))
+        sleepers = [
+            builder.node("TestSleep", value=picture, seconds=delay)
+            for picture, delay in pairs
+        ]
+
+        # Feed the sleep outputs to TestVariadicAverage as input1..input3.
+        blend_inputs = {
+            f"input{slot}": sleeper.out(0)
+            for slot, sleeper in enumerate(sleepers, 1)
+        }
+        blended = builder.node("TestVariadicAverage", **blend_inputs)
+
+        return {
+            "result": (blended.out(0),),
+            "expand": builder.finalize(),
+        }
+
 TEST_NODE_CLASS_MAPPINGS = {
    "TestLazyMixImages": TestLazyMixImages,
    "TestVariadicAverage": TestVariadicAverage,
@ -345,6 +475,9 @@ TEST_NODE_CLASS_MAPPINGS = {
    "TestCustomValidation5": TestCustomValidation5,
    "TestDynamicDependencyCycle": TestDynamicDependencyCycle,
    "TestMixedExpansionReturns": TestMixedExpansionReturns,
+    "TestSamplingInExpansion": TestSamplingInExpansion,
+    "TestSleep": TestSleep,
+    "TestParallelSleep": TestParallelSleep,
 }

 TEST_NODE_DISPLAY_NAME_MAPPINGS = {
@ -359,4 +492,7 @@ TEST_NODE_DISPLAY_NAME_MAPPINGS = {
    "TestCustomValidation5": "Custom Validation 5",
    "TestDynamicDependencyCycle": "Dynamic Dependency Cycle",
    "TestMixedExpansionReturns": "Mixed Expansion Returns",
+    "TestSamplingInExpansion": "Sampling In Expansion",
+    "TestSleep": "Test Sleep",
+    "TestParallelSleep": "Test Parallel Sleep",
 }