diff --git a/comfy/ldm/chroma_radiance/model.py b/comfy/ldm/chroma_radiance/model.py
index e643b4414..70d173889 100644
--- a/comfy/ldm/chroma_radiance/model.py
+++ b/comfy/ldm/chroma_radiance/model.py
@@ -37,7 +37,7 @@ class ChromaRadianceParams(ChromaParams):
     nerf_final_head_type: str
     # None means use the same dtype as the model.
     nerf_embedder_dtype: Optional[torch.dtype]
-
+    use_x0: bool
 
 class ChromaRadiance(Chroma):
     """
@@ -159,6 +159,9 @@ class ChromaRadiance(Chroma):
         self.skip_dit = []
         self.lite = False
 
+        if params.use_x0:
+            self.register_buffer("__x0__", torch.tensor([]))
+
     @property
     def _nerf_final_layer(self) -> nn.Module:
         if self.params.nerf_final_head_type == "linear":
@@ -276,6 +279,12 @@ class ChromaRadiance(Chroma):
         params_dict |= overrides
         return params.__class__(**params_dict)
 
+    def _apply_x0_residual(self, predicted, noisy, timesteps):
+
+        # eps stays 0 here; use a non-zero value during training to prevent division by zero
+        eps = 0.0
+        return (noisy - predicted) / (timesteps.view(-1,1,1,1) + eps)
+
     def _forward(
         self,
         x: Tensor,
@@ -316,4 +325,11 @@ class ChromaRadiance(Chroma):
             transformer_options,
             attn_mask=kwargs.get("attention_mask", None),
         )
-        return self.forward_nerf(img, img_out, params)[:, :, :h, :w]
+
+        out = self.forward_nerf(img, img_out, params)[:, :, :h, :w]
+
+        # x0 variant: convert the x0 prediction to a v-prediction before returning it
+        if hasattr(self, "__x0__"):
+            out = self._apply_x0_residual(out, img, timestep)
+        return out
+
diff --git a/comfy/ldm/kandinsky5/model.py b/comfy/ldm/kandinsky5/model.py
index a653e02fc..1509de2f8 100644
--- a/comfy/ldm/kandinsky5/model.py
+++ b/comfy/ldm/kandinsky5/model.py
@@ -387,6 +387,9 @@ class Kandinsky5(nn.Module):
         return self.out_layer(visual_embed, time_embed)
 
     def _forward(self, x, timestep, context, y, time_dim_replace=None, transformer_options={}, **kwargs):
+        original_dims = x.ndim
+        if original_dims == 4:
+            x = x.unsqueeze(2)
         bs, c, t_len, h, w = x.shape
         x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size)
 
@@ -397,7 +400,10 @@ class Kandinsky5(nn.Module):
         freqs = self.rope_encode_3d(t_len, h, w, device=x.device, dtype=x.dtype, transformer_options=transformer_options)
         freqs_text = self.rope_encode_1d(context.shape[1], device=x.device, dtype=x.dtype, transformer_options=transformer_options)
 
-        return self.forward_orig(x, timestep, context, y, freqs, freqs_text, transformer_options=transformer_options, **kwargs)
+        out = self.forward_orig(x, timestep, context, y, freqs, freqs_text, transformer_options=transformer_options, **kwargs)
+        if original_dims == 4:
+            out = out.squeeze(2)
+        return out
 
     def forward(self, x, timestep, context, y, time_dim_replace=None, transformer_options={}, **kwargs):
         return comfy.patcher_extension.WrapperExecutor.new_class_executor(
diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py
index 6c24fed9b..c47df49ca 100644
--- a/comfy/ldm/lumina/model.py
+++ b/comfy/ldm/lumina/model.py
@@ -377,6 +377,7 @@ class NextDiT(nn.Module):
         z_image_modulation=False,
         time_scale=1.0,
         pad_tokens_multiple=None,
+        clip_text_dim=None,
         image_model=None,
         device=None,
         dtype=None,
@@ -447,6 +448,31 @@ class NextDiT(nn.Module):
             ),
         )
 
+        self.clip_text_pooled_proj = None
+
+        if clip_text_dim is not None:
+            self.clip_text_dim = clip_text_dim
+            self.clip_text_pooled_proj = nn.Sequential(
+                operation_settings.get("operations").RMSNorm(clip_text_dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")),
+                operation_settings.get("operations").Linear(
+                    clip_text_dim,
+                    clip_text_dim,
+                    bias=True,
+                    device=operation_settings.get("device"),
+                    dtype=operation_settings.get("dtype"),
+                ),
+            )
+            self.time_text_embed = nn.Sequential(
+                nn.SiLU(),
+                operation_settings.get("operations").Linear(
+                    min(dim, 1024) + clip_text_dim,
+                    min(dim, 1024),
+                    bias=True,
+                    device=operation_settings.get("device"),
+                    dtype=operation_settings.get("dtype"),
+                ),
+            )
+
         self.layers = nn.ModuleList(
             [
                 JointTransformerBlock(
@@ -585,6 +611,15 @@ class NextDiT(nn.Module):
 
         cap_feats = self.cap_embedder(cap_feats)  # (N, L, D)  # todo check if able to batchify w.o. redundant compute
 
+        if self.clip_text_pooled_proj is not None:
+            pooled = kwargs.get("clip_text_pooled", None)
+            if pooled is not None:
+                pooled = self.clip_text_pooled_proj(pooled)
+            else:
+                pooled = torch.zeros((1, self.clip_text_dim), device=x.device, dtype=x.dtype)
+
+            adaln_input = self.time_text_embed(torch.cat((t, pooled), dim=-1))
+
         patches = transformer_options.get("patches", {})
         x_is_tensor = isinstance(x, torch.Tensor)
         img, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, t, num_tokens, transformer_options=transformer_options)
diff --git a/comfy/lora.py b/comfy/lora.py
index e7202ce97..2ed0acb9d 100644
--- a/comfy/lora.py
+++ b/comfy/lora.py
@@ -320,6 +320,7 @@ def model_lora_keys_unet(model, key_map={}):
                 to = diffusers_keys[k]
                 key_lora = k[:-len(".weight")]
                 key_map["diffusion_model.{}".format(key_lora)] = to
+                key_map["transformer.{}".format(key_lora)] = to
                 key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = to
 
     if isinstance(model, comfy.model_base.Kandinsky5):
diff --git a/comfy/model_base.py b/comfy/model_base.py
index 0be006cc2..6b8a8454d 100644
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -1110,6 +1110,10 @@ class Lumina2(BaseModel):
             if 'num_tokens' not in out:
                 out['num_tokens'] = comfy.conds.CONDConstant(cross_attn.shape[1])
 
+        clip_text_pooled = kwargs["pooled_output"]  # Newbie
+        if clip_text_pooled is not None:
+            out['clip_text_pooled'] = comfy.conds.CONDRegular(clip_text_pooled)
+
         return out
 
 class WAN21(BaseModel):
diff --git a/comfy/model_detection.py b/comfy/model_detection.py
index 30b33a486..19e6aa954 100644
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@@ -257,6 +257,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
                 dit_config["nerf_tile_size"] = 512
                 dit_config["nerf_final_head_type"] = "conv" if f"{key_prefix}nerf_final_layer_conv.norm.scale" in state_dict_keys else "linear"
                 dit_config["nerf_embedder_dtype"] = torch.float32
+            if "__x0__" in state_dict_keys: # x0 pred
+                dit_config["use_x0"] = True
         else:
             dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys
             dit_config["yak_mlp"] = '{}double_blocks.0.img_mlp.gate_proj.weight'.format(key_prefix) in state_dict_keys
@@ -423,6 +425,9 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
             dit_config["axes_lens"] = [300, 512, 512]
             dit_config["rope_theta"] = 10000.0
             dit_config["ffn_dim_multiplier"] = 4.0
+            ctd_weight = state_dict.get('{}clip_text_pooled_proj.0.weight'.format(key_prefix), None)
+            if ctd_weight is not None:
+                dit_config["clip_text_dim"] = ctd_weight.shape[0]
         elif dit_config["dim"] == 3840:  # Z image
             dit_config["n_heads"] = 30
             dit_config["n_kv_heads"] = 30
diff --git a/comfy/model_management.py b/comfy/model_management.py
index aeddbaefe..40717b1e4 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -1492,6 +1492,20 @@ def extended_fp16_support():
 
     return True
 
+LORA_COMPUTE_DTYPES = {}
+def lora_compute_dtype(device):
+    dtype = LORA_COMPUTE_DTYPES.get(device, None)
+    if dtype is not None:
+        return dtype
+
+    if should_use_fp16(device):
+        dtype = torch.float16
+    else:
+        dtype = torch.float32
+
+    LORA_COMPUTE_DTYPES[device] = dtype
+    return dtype
+
 def soft_empty_cache(force=False):
     global cpu_state
     if cpu_state == CPUState.MPS:
diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py
index 215784874..a486c2723 100644
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -35,6 +35,7 @@ import comfy.model_management
 import comfy.patcher_extension
 import comfy.utils
 from comfy.comfy_types import UnetWrapperFunction
+from comfy.quant_ops import QuantizedTensor
 from comfy.patcher_extension import CallbacksMP, PatcherInjection, WrappersMP
 
 
@@ -132,14 +133,17 @@ class LowVramPatch:
     def __call__(self, weight):
         return comfy.lora.calculate_weight(self.patches[self.key], weight, self.key, intermediate_dtype=weight.dtype)
 
-#The above patch logic may cast up the weight to fp32, and do math. Go with fp32 x 3
-LOWVRAM_PATCH_ESTIMATE_MATH_FACTOR = 3
+LOWVRAM_PATCH_ESTIMATE_MATH_FACTOR = 2
 
 def low_vram_patch_estimate_vram(model, key):
     weight, set_func, convert_func = get_key_weight(model, key)
     if weight is None:
         return 0
-    return weight.numel() * torch.float32.itemsize * LOWVRAM_PATCH_ESTIMATE_MATH_FACTOR
+    model_dtype = getattr(model, "manual_cast_dtype", torch.float32)
+    if model_dtype is None:
+        model_dtype = weight.dtype
+
+    return weight.numel() * model_dtype.itemsize * LOWVRAM_PATCH_ESTIMATE_MATH_FACTOR
 
 def get_key_weight(model, key):
     set_func = None
@@ -614,10 +618,11 @@ class ModelPatcher:
         if key not in self.backup:
             self.backup[key] = collections.namedtuple('Dimension', ['weight', 'inplace_update'])(weight.to(device=self.offload_device, copy=inplace_update), inplace_update)
 
+        temp_dtype = comfy.model_management.lora_compute_dtype(device_to)
         if device_to is not None:
-            temp_weight = comfy.model_management.cast_to_device(weight, device_to, torch.float32, copy=True)
+            temp_weight = comfy.model_management.cast_to_device(weight, device_to, temp_dtype, copy=True)
         else:
-            temp_weight = weight.to(torch.float32, copy=True)
+            temp_weight = weight.to(temp_dtype, copy=True)
         if convert_func is not None:
             temp_weight = convert_func(temp_weight, inplace=True)
 
@@ -661,12 +666,18 @@ class ModelPatcher:
                 module_mem = comfy.model_management.module_size(m)
                 module_offload_mem = module_mem
                 if hasattr(m, "comfy_cast_weights"):
-                    weight_key = "{}.weight".format(n)
-                    bias_key = "{}.bias".format(n)
-                    if weight_key in self.patches:
-                        module_offload_mem += low_vram_patch_estimate_vram(self.model, weight_key)
-                    if bias_key in self.patches:
-                        module_offload_mem += low_vram_patch_estimate_vram(self.model, bias_key)
+                    def check_module_offload_mem(key):
+                        if key in self.patches:
+                            return low_vram_patch_estimate_vram(self.model, key)
+                        model_dtype = getattr(self.model, "manual_cast_dtype", None)
+                        weight, _, _ = get_key_weight(self.model, key)
+                        if model_dtype is None or weight is None:
+                            return 0
+                        if (weight.dtype != model_dtype or isinstance(weight, QuantizedTensor)):
+                            return weight.numel() * model_dtype.itemsize
+                        return 0
+                    module_offload_mem += check_module_offload_mem("{}.weight".format(n))
+                    module_offload_mem += check_module_offload_mem("{}.bias".format(n))
                 loading.append((module_offload_mem, module_mem, n, m, params))
         return loading
 
@@ -761,6 +772,8 @@ class ModelPatcher:
                     key = "{}.{}".format(n, param)
                     self.unpin_weight(key)
                     self.patch_weight_to_device(key, device_to=device_to)
+                if comfy.model_management.is_device_cuda(device_to):
+                    torch.cuda.synchronize()
 
                 logging.debug("lowvram: loaded module regularly {} {}".format(n, m))
                 m.comfy_patched_weights = True
@@ -917,7 +930,7 @@ class ModelPatcher:
                                     patch_counter += 1
                             cast_weight = True
 
-                        if cast_weight:
+                        if cast_weight and hasattr(m, "comfy_cast_weights"):
                             m.prev_comfy_cast_weights = m.comfy_cast_weights
                             m.comfy_cast_weights = True
                         m.comfy_patched_weights = False
diff --git a/comfy/ops.py b/comfy/ops.py
index 35237c9f7..6f34d50fc 100644
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -22,7 +22,6 @@ import comfy.model_management
 from comfy.cli_args import args, PerformanceFeature
 import comfy.float
 import comfy.rmsnorm
-import contextlib
 import json
 
 def run_every_op():
@@ -94,13 +93,6 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of
     else:
         offload_stream = None
 
-    if offload_stream is not None:
-        wf_context = offload_stream
-        if hasattr(wf_context, "as_context"):
-            wf_context = wf_context.as_context(offload_stream)
-    else:
-        wf_context = contextlib.nullcontext()
-
     non_blocking = comfy.model_management.device_supports_non_blocking(device)
 
     weight_has_function = len(s.weight_function) > 0
diff --git a/comfy/sd.py b/comfy/sd.py
index 754b1703d..a16f2d14f 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -127,6 +127,8 @@ class CLIP:
 
         self.tokenizer = tokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
         self.patcher = comfy.model_patcher.ModelPatcher(self.cond_stage_model, load_device=load_device, offload_device=offload_device)
+        # Match the hardcoded torch.float32 upcast in the TE implementation
+        self.patcher.set_model_compute_dtype(torch.float32)
         self.patcher.hook_mode = comfy.hooks.EnumHookMode.MinVram
         self.patcher.is_clip = True
         self.apply_hooks_to_conds = None
diff --git a/comfy/text_encoders/kandinsky5.py b/comfy/text_encoders/kandinsky5.py
index 22f991c36..be086458c 100644
--- a/comfy/text_encoders/kandinsky5.py
+++ b/comfy/text_encoders/kandinsky5.py
@@ -24,10 +24,10 @@ class Kandinsky5TokenizerImage(Kandinsky5Tokenizer):
 
 class Qwen25_7BVLIModel(sd1_clip.SDClipModel):
     def __init__(self, device="cpu", layer="hidden", layer_idx=-1, dtype=None, attention_mask=True, model_options={}):
-        llama_scaled_fp8 = model_options.get("qwen_scaled_fp8", None)
-        if llama_scaled_fp8 is not None:
+        llama_quantization_metadata = model_options.get("llama_quantization_metadata", None)
+        if llama_quantization_metadata is not None:
             model_options = model_options.copy()
-            model_options["scaled_fp8"] = llama_scaled_fp8
+            model_options["quantization_metadata"] = llama_quantization_metadata
         super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=Qwen25_7BVLI, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
 
 
@@ -56,12 +56,12 @@ class Kandinsky5TEModel(QwenImageTEModel):
         else:
             return super().load_sd(sd)
 
-def te(dtype_llama=None, llama_scaled_fp8=None):
+def te(dtype_llama=None, llama_quantization_metadata=None):
     class Kandinsky5TEModel_(Kandinsky5TEModel):
         def __init__(self, device="cpu", dtype=None, model_options={}):
-            if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options:
+            if llama_quantization_metadata is not None:
                 model_options = model_options.copy()
-                model_options["qwen_scaled_fp8"] = llama_scaled_fp8
+                model_options["llama_quantization_metadata"] = llama_quantization_metadata
             if dtype_llama is not None:
                 dtype = dtype_llama
             super().__init__(device=device, dtype=dtype, model_options=model_options)
diff --git a/comfy/utils.py b/comfy/utils.py
index 89846bc95..9dc0d76ac 100644
--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -803,12 +803,17 @@ def safetensors_header(safetensors_path, max_size=100*1024*1024):
             return None
         return f.read(length_of_header)
 
+ATTR_UNSET={}
+
 def set_attr(obj, attr, value):
     attrs = attr.split(".")
     for name in attrs[:-1]:
         obj = getattr(obj, name)
-    prev = getattr(obj, attrs[-1])
-    setattr(obj, attrs[-1], value)
+    prev = getattr(obj, attrs[-1], ATTR_UNSET)
+    if value is ATTR_UNSET:
+        delattr(obj, attrs[-1])
+    else:
+        setattr(obj, attrs[-1], value)
     return prev
 
 def set_attr_param(obj, attr, value):
diff --git a/comfy_api/latest/__init__.py b/comfy_api/latest/__init__.py
index 0fa01d1e7..35e1ac853 100644
--- a/comfy_api/latest/__init__.py
+++ b/comfy_api/latest/__init__.py
@@ -5,9 +5,9 @@ from typing import Type, TYPE_CHECKING
 from comfy_api.internal import ComfyAPIBase
 from comfy_api.internal.singleton import ProxiedSingleton
 from comfy_api.internal.async_to_sync import create_sync_class
-from comfy_api.latest._input import ImageInput, AudioInput, MaskInput, LatentInput, VideoInput
-from comfy_api.latest._input_impl import VideoFromFile, VideoFromComponents
-from comfy_api.latest._util import VideoCodec, VideoContainer, VideoComponents, MESH, VOXEL
+from ._input import ImageInput, AudioInput, MaskInput, LatentInput, VideoInput
+from ._input_impl import VideoFromFile, VideoFromComponents
+from ._util import VideoCodec, VideoContainer, VideoComponents, MESH, VOXEL
 from . import _io_public as io
 from . import _ui_public as ui
 # from comfy_api.latest._resources import _RESOURCES as resources  #noqa: F401
@@ -80,7 +80,7 @@ class ComfyExtension(ABC):
     async def on_load(self) -> None:
         """
         Called when an extension is loaded.
-        This should be used to initialize any global resources neeeded by the extension.
+        This should be used to initialize any global resources needed by the extension.
         """
 
     @abstractmethod
diff --git a/comfy_api/latest/_input/video_types.py b/comfy_api/latest/_input/video_types.py
index 87c81d73a..e634a0311 100644
--- a/comfy_api/latest/_input/video_types.py
+++ b/comfy_api/latest/_input/video_types.py
@@ -4,7 +4,7 @@ from fractions import Fraction
 from typing import Optional, Union, IO
 import io
 import av
-from comfy_api.util import VideoContainer, VideoCodec, VideoComponents
+from .._util import VideoContainer, VideoCodec, VideoComponents
 
 class VideoInput(ABC):
     """
diff --git a/comfy_api/latest/_input_impl/video_types.py b/comfy_api/latest/_input_impl/video_types.py
index a4cd3737d..ea35c6062 100644
--- a/comfy_api/latest/_input_impl/video_types.py
+++ b/comfy_api/latest/_input_impl/video_types.py
@@ -3,14 +3,14 @@ from av.container import InputContainer
 from av.subtitles.stream import SubtitleStream
 from fractions import Fraction
 from typing import Optional
-from comfy_api.latest._input import AudioInput, VideoInput
+from .._input import AudioInput, VideoInput
 import av
 import io
 import json
 import numpy as np
 import math
 import torch
-from comfy_api.latest._util import VideoContainer, VideoCodec, VideoComponents
+from .._util import VideoContainer, VideoCodec, VideoComponents
 
 
 def container_to_output_format(container_format: str | None) -> str | None:
diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py
index ec6abd832..513dbc5db 100644
--- a/comfy_api/latest/_io.py
+++ b/comfy_api/latest/_io.py
@@ -26,7 +26,7 @@ if TYPE_CHECKING:
     from comfy_api.input import VideoInput
 from comfy_api.internal import (_ComfyNodeInternal, _NodeOutputInternal, classproperty, copy_class, first_real_override, is_class,
     prune_dict, shallow_clone_class)
-from comfy_api.latest._resources import Resources, ResourcesLocal
+from ._resources import Resources, ResourcesLocal
 from comfy_execution.graph_utils import ExecutionBlocker
 from ._util import MESH, VOXEL
 
diff --git a/comfy_api/latest/_ui.py b/comfy_api/latest/_ui.py
index 5a75a3aae..2babe209a 100644
--- a/comfy_api/latest/_ui.py
+++ b/comfy_api/latest/_ui.py
@@ -22,7 +22,7 @@ import folder_paths
 
 # used for image preview
 from comfy.cli_args import args
-from comfy_api.latest._io import ComfyNode, FolderType, Image, _UIOutput
+from ._io import ComfyNode, FolderType, Image, _UIOutput
 
 
 class SavedResult(dict):
diff --git a/comfy_api/latest/_util/video_types.py b/comfy_api/latest/_util/video_types.py
index c3e3d8e3a..fd3b5a510 100644
--- a/comfy_api/latest/_util/video_types.py
+++ b/comfy_api/latest/_util/video_types.py
@@ -3,7 +3,7 @@ from dataclasses import dataclass
 from enum import Enum
 from fractions import Fraction
 from typing import Optional
-from comfy_api.latest._input import ImageInput, AudioInput
+from .._input import ImageInput, AudioInput
 
 class VideoCodec(str, Enum):
     AUTO = "auto"
diff --git a/comfy_api_nodes/apis/bytedance_api.py b/comfy_api_nodes/apis/bytedance_api.py
new file mode 100644
index 000000000..77cd76f9b
--- /dev/null
+++ b/comfy_api_nodes/apis/bytedance_api.py
@@ -0,0 +1,144 @@
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+
+class Text2ImageTaskCreationRequest(BaseModel):
+    model: str = Field(...)
+    prompt: str = Field(...)
+    response_format: str | None = Field("url")
+    size: str | None = Field(None)
+    seed: int | None = Field(0, ge=0, le=2147483647)
+    guidance_scale: float | None = Field(..., ge=1.0, le=10.0)
+    watermark: bool | None = Field(True)
+
+
+class Image2ImageTaskCreationRequest(BaseModel):
+    model: str = Field(...)
+    prompt: str = Field(...)
+    response_format: str | None = Field("url")
+    image: str = Field(..., description="Base64 encoded string or image URL")
+    size: str | None = Field("adaptive")
+    seed: int | None = Field(..., ge=0, le=2147483647)
+    guidance_scale: float | None = Field(..., ge=1.0, le=10.0)
+    watermark: bool | None = Field(True)
+
+
+class Seedream4Options(BaseModel):
+    max_images: int = Field(15)
+
+
+class Seedream4TaskCreationRequest(BaseModel):
+    model: str = Field(...)
+    prompt: str = Field(...)
+    response_format: str = Field("url")
+    image: list[str] | None = Field(None, description="Image URLs")
+    size: str = Field(...)
+    seed: int = Field(..., ge=0, le=2147483647)
+    sequential_image_generation: str = Field("disabled")
+    sequential_image_generation_options: Seedream4Options = Field(Seedream4Options(max_images=15))
+    watermark: bool = Field(True)
+
+
+class ImageTaskCreationResponse(BaseModel):
+    model: str = Field(...)
+    created: int = Field(..., description="Unix timestamp (in seconds) indicating time when the request was created.")
+    data: list = Field([], description="Contains information about the generated image(s).")
+    error: dict = Field({}, description="Contains `code` and `message` fields in case of error.")
+
+
+class TaskTextContent(BaseModel):
+    type: str = Field("text")
+    text: str = Field(...)
+
+
+class TaskImageContentUrl(BaseModel):
+    url: str = Field(...)
+
+
+class TaskImageContent(BaseModel):
+    type: str = Field("image_url")
+    image_url: TaskImageContentUrl = Field(...)
+    role: Literal["first_frame", "last_frame", "reference_image"] | None = Field(None)
+
+
+class Text2VideoTaskCreationRequest(BaseModel):
+    model: str = Field(...)
+    content: list[TaskTextContent] = Field(..., min_length=1)
+
+
+class Image2VideoTaskCreationRequest(BaseModel):
+    model: str = Field(...)
+    content: list[TaskTextContent | TaskImageContent] = Field(..., min_length=2)
+
+
+class TaskCreationResponse(BaseModel):
+    id: str = Field(...)
+
+
+class TaskStatusError(BaseModel):
+    code: str = Field(...)
+    message: str = Field(...)
+
+
+class TaskStatusResult(BaseModel):
+    video_url: str = Field(...)
+
+
+class TaskStatusResponse(BaseModel):
+    id: str = Field(...)
+    model: str = Field(...)
+    status: Literal["queued", "running", "cancelled", "succeeded", "failed"] = Field(...)
+    error: TaskStatusError | None = Field(None)
+    content: TaskStatusResult | None = Field(None)
+
+
+RECOMMENDED_PRESETS = [
+    ("1024x1024 (1:1)", 1024, 1024),
+    ("864x1152 (3:4)", 864, 1152),
+    ("1152x864 (4:3)", 1152, 864),
+    ("1280x720 (16:9)", 1280, 720),
+    ("720x1280 (9:16)", 720, 1280),
+    ("832x1248 (2:3)", 832, 1248),
+    ("1248x832 (3:2)", 1248, 832),
+    ("1512x648 (21:9)", 1512, 648),
+    ("2048x2048 (1:1)", 2048, 2048),
+    ("Custom", None, None),
+]
+
+RECOMMENDED_PRESETS_SEEDREAM_4 = [
+    ("2048x2048 (1:1)", 2048, 2048),
+    ("2304x1728 (4:3)", 2304, 1728),
+    ("1728x2304 (3:4)", 1728, 2304),
+    ("2560x1440 (16:9)", 2560, 1440),
+    ("1440x2560 (9:16)", 1440, 2560),
+    ("2496x1664 (3:2)", 2496, 1664),
+    ("1664x2496 (2:3)", 1664, 2496),
+    ("3024x1296 (21:9)", 3024, 1296),
+    ("4096x4096 (1:1)", 4096, 4096),
+    ("Custom", None, None),
+]
+
+# The times in this dictionary are given for a 10-second duration.
+VIDEO_TASKS_EXECUTION_TIME = {
+    "seedance-1-0-lite-t2v-250428": {
+        "480p": 40,
+        "720p": 60,
+        "1080p": 90,
+    },
+    "seedance-1-0-lite-i2v-250428": {
+        "480p": 40,
+        "720p": 60,
+        "1080p": 90,
+    },
+    "seedance-1-0-pro-250528": {
+        "480p": 70,
+        "720p": 85,
+        "1080p": 115,
+    },
+    "seedance-1-0-pro-fast-251015": {
+        "480p": 50,
+        "720p": 65,
+        "1080p": 100,
+    },
+}
diff --git a/comfy_api_nodes/apis/gemini_api.py b/comfy_api_nodes/apis/gemini_api.py
index a380ecc86..f8edc38c9 100644
--- a/comfy_api_nodes/apis/gemini_api.py
+++ b/comfy_api_nodes/apis/gemini_api.py
@@ -84,15 +84,7 @@ class GeminiSystemInstructionContent(BaseModel):
         description="A list of ordered parts that make up a single message. "
         "Different parts may have different IANA MIME types.",
     )
-    role: GeminiRole = Field(
-        ...,
-        description="The identity of the entity that creates the message. "
-        "The following values are supported: "
-        "user: This indicates that the message is sent by a real person, typically a user-generated message. "
-        "model: This indicates that the message is generated by the model. "
-        "The model value is used to insert messages from model into the conversation during multi-turn conversations. "
-        "For non-multi-turn conversations, this field can be left blank or unset.",
-    )
+    role: GeminiRole | None = Field(..., description="The role field of systemInstruction may be ignored.")
 
 
 class GeminiFunctionDeclaration(BaseModel):
diff --git a/comfy_api_nodes/apis/veo_api.py b/comfy_api_nodes/apis/veo_api.py
index 8328d1aa4..23ca725b7 100644
--- a/comfy_api_nodes/apis/veo_api.py
+++ b/comfy_api_nodes/apis/veo_api.py
@@ -85,7 +85,7 @@ class Response1(BaseModel):
     raiMediaFilteredReasons: Optional[list[str]] = Field(
         None, description='Reasons why media was filtered by responsible AI policies'
     )
-    videos: Optional[list[Video]] = None
+    videos: Optional[list[Video]] = Field(None)
 
 
 class VeoGenVidPollResponse(BaseModel):
diff --git a/comfy_api_nodes/nodes_bytedance.py b/comfy_api_nodes/nodes_bytedance.py
index caced471e..57c0218d0 100644
--- a/comfy_api_nodes/nodes_bytedance.py
+++ b/comfy_api_nodes/nodes_bytedance.py
@@ -1,13 +1,27 @@
 import logging
 import math
-from enum import Enum
-from typing import Literal, Optional, Union
 
 import torch
-from pydantic import BaseModel, Field
 from typing_extensions import override
 
-from comfy_api.latest import IO, ComfyExtension
+from comfy_api.latest import IO, ComfyExtension, Input
+from comfy_api_nodes.apis.bytedance_api import (
+    RECOMMENDED_PRESETS,
+    RECOMMENDED_PRESETS_SEEDREAM_4,
+    VIDEO_TASKS_EXECUTION_TIME,
+    Image2ImageTaskCreationRequest,
+    Image2VideoTaskCreationRequest,
+    ImageTaskCreationResponse,
+    Seedream4Options,
+    Seedream4TaskCreationRequest,
+    TaskCreationResponse,
+    TaskImageContent,
+    TaskImageContentUrl,
+    TaskStatusResponse,
+    TaskTextContent,
+    Text2ImageTaskCreationRequest,
+    Text2VideoTaskCreationRequest,
+)
 from comfy_api_nodes.util import (
     ApiEndpoint,
     download_url_to_image_tensor,
@@ -29,162 +43,6 @@ BYTEPLUS_TASK_ENDPOINT = "/proxy/byteplus/api/v3/contents/generations/tasks"
 BYTEPLUS_TASK_STATUS_ENDPOINT = "/proxy/byteplus/api/v3/contents/generations/tasks"  # + /{task_id}
 
 
-class Text2ImageModelName(str, Enum):
-    seedream_3 = "seedream-3-0-t2i-250415"
-
-
-class Image2ImageModelName(str, Enum):
-    seededit_3 = "seededit-3-0-i2i-250628"
-
-
-class Text2VideoModelName(str, Enum):
-    seedance_1_pro = "seedance-1-0-pro-250528"
-    seedance_1_lite = "seedance-1-0-lite-t2v-250428"
-
-
-class Image2VideoModelName(str, Enum):
-    """note(August 31): Pro model only supports FirstFrame: https://docs.byteplus.com/en/docs/ModelArk/1520757"""
-
-    seedance_1_pro = "seedance-1-0-pro-250528"
-    seedance_1_lite = "seedance-1-0-lite-i2v-250428"
-
-
-class Text2ImageTaskCreationRequest(BaseModel):
-    model: Text2ImageModelName = Text2ImageModelName.seedream_3
-    prompt: str = Field(...)
-    response_format: Optional[str] = Field("url")
-    size: Optional[str] = Field(None)
-    seed: Optional[int] = Field(0, ge=0, le=2147483647)
-    guidance_scale: Optional[float] = Field(..., ge=1.0, le=10.0)
-    watermark: Optional[bool] = Field(True)
-
-
-class Image2ImageTaskCreationRequest(BaseModel):
-    model: Image2ImageModelName = Image2ImageModelName.seededit_3
-    prompt: str = Field(...)
-    response_format: Optional[str] = Field("url")
-    image: str = Field(..., description="Base64 encoded string or image URL")
-    size: Optional[str] = Field("adaptive")
-    seed: Optional[int] = Field(..., ge=0, le=2147483647)
-    guidance_scale: Optional[float] = Field(..., ge=1.0, le=10.0)
-    watermark: Optional[bool] = Field(True)
-
-
-class Seedream4Options(BaseModel):
-    max_images: int = Field(15)
-
-
-class Seedream4TaskCreationRequest(BaseModel):
-    model: str = Field("seedream-4-0-250828")
-    prompt: str = Field(...)
-    response_format: str = Field("url")
-    image: Optional[list[str]] = Field(None, description="Image URLs")
-    size: str = Field(...)
-    seed: int = Field(..., ge=0, le=2147483647)
-    sequential_image_generation: str = Field("disabled")
-    sequential_image_generation_options: Seedream4Options = Field(Seedream4Options(max_images=15))
-    watermark: bool = Field(True)
-
-
-class ImageTaskCreationResponse(BaseModel):
-    model: str = Field(...)
-    created: int = Field(..., description="Unix timestamp (in seconds) indicating time when the request was created.")
-    data: list = Field([], description="Contains information about the generated image(s).")
-    error: dict = Field({}, description="Contains `code` and `message` fields in case of error.")
-
-
-class TaskTextContent(BaseModel):
-    type: str = Field("text")
-    text: str = Field(...)
-
-
-class TaskImageContentUrl(BaseModel):
-    url: str = Field(...)
-
-
-class TaskImageContent(BaseModel):
-    type: str = Field("image_url")
-    image_url: TaskImageContentUrl = Field(...)
-    role: Optional[Literal["first_frame", "last_frame", "reference_image"]] = Field(None)
-
-
-class Text2VideoTaskCreationRequest(BaseModel):
-    model: Text2VideoModelName = Text2VideoModelName.seedance_1_pro
-    content: list[TaskTextContent] = Field(..., min_length=1)
-
-
-class Image2VideoTaskCreationRequest(BaseModel):
-    model: Image2VideoModelName = Image2VideoModelName.seedance_1_pro
-    content: list[Union[TaskTextContent, TaskImageContent]] = Field(..., min_length=2)
-
-
-class TaskCreationResponse(BaseModel):
-    id: str = Field(...)
-
-
-class TaskStatusError(BaseModel):
-    code: str = Field(...)
-    message: str = Field(...)
-
-
-class TaskStatusResult(BaseModel):
-    video_url: str = Field(...)
-
-
-class TaskStatusResponse(BaseModel):
-    id: str = Field(...)
-    model: str = Field(...)
-    status: Literal["queued", "running", "cancelled", "succeeded", "failed"] = Field(...)
-    error: Optional[TaskStatusError] = Field(None)
-    content: Optional[TaskStatusResult] = Field(None)
-
-
-RECOMMENDED_PRESETS = [
-    ("1024x1024 (1:1)", 1024, 1024),
-    ("864x1152 (3:4)", 864, 1152),
-    ("1152x864 (4:3)", 1152, 864),
-    ("1280x720 (16:9)", 1280, 720),
-    ("720x1280 (9:16)", 720, 1280),
-    ("832x1248 (2:3)", 832, 1248),
-    ("1248x832 (3:2)", 1248, 832),
-    ("1512x648 (21:9)", 1512, 648),
-    ("2048x2048 (1:1)", 2048, 2048),
-    ("Custom", None, None),
-]
-
-RECOMMENDED_PRESETS_SEEDREAM_4 = [
-    ("2048x2048 (1:1)", 2048, 2048),
-    ("2304x1728 (4:3)", 2304, 1728),
-    ("1728x2304 (3:4)", 1728, 2304),
-    ("2560x1440 (16:9)", 2560, 1440),
-    ("1440x2560 (9:16)", 1440, 2560),
-    ("2496x1664 (3:2)", 2496, 1664),
-    ("1664x2496 (2:3)", 1664, 2496),
-    ("3024x1296 (21:9)", 3024, 1296),
-    ("4096x4096 (1:1)", 4096, 4096),
-    ("Custom", None, None),
-]
-
-# The time in this dictionary are given for 10 seconds duration.
-VIDEO_TASKS_EXECUTION_TIME = {
-    "seedance-1-0-lite-t2v-250428": {
-        "480p": 40,
-        "720p": 60,
-        "1080p": 90,
-    },
-    "seedance-1-0-lite-i2v-250428": {
-        "480p": 40,
-        "720p": 60,
-        "1080p": 90,
-    },
-    "seedance-1-0-pro-250528": {
-        "480p": 70,
-        "720p": 85,
-        "1080p": 115,
-    },
-}
-
-
 def get_image_url_from_response(response: ImageTaskCreationResponse) -> str:
     if response.error:
         error_msg = f"ByteDance request failed. Code: {response.error['code']}, message: {response.error['message']}"
@@ -194,13 +52,6 @@ def get_image_url_from_response(response: ImageTaskCreationResponse) -> str:
     return response.data[0]["url"]
 
 
-def get_video_url_from_task_status(response: TaskStatusResponse) -> Union[str, None]:
-    """Returns the video URL from the task status response if it exists."""
-    if hasattr(response, "content") and response.content:
-        return response.content.video_url
-    return None
-
-
 class ByteDanceImageNode(IO.ComfyNode):
 
     @classmethod
@@ -211,12 +62,7 @@ class ByteDanceImageNode(IO.ComfyNode):
             category="api node/image/ByteDance",
             description="Generate images using ByteDance models via api based on prompt",
             inputs=[
-                IO.Combo.Input(
-                    "model",
-                    options=Text2ImageModelName,
-                    default=Text2ImageModelName.seedream_3,
-                    tooltip="Model name",
-                ),
+                IO.Combo.Input("model", options=["seedream-3-0-t2i-250415"]),
                 IO.String.Input(
                     "prompt",
                     multiline=True,
@@ -335,12 +181,7 @@ class ByteDanceImageEditNode(IO.ComfyNode):
             category="api node/image/ByteDance",
             description="Edit images using ByteDance models via api based on prompt",
             inputs=[
-                IO.Combo.Input(
-                    "model",
-                    options=Image2ImageModelName,
-                    default=Image2ImageModelName.seededit_3,
-                    tooltip="Model name",
-                ),
+                IO.Combo.Input("model", options=["seededit-3-0-i2i-250628"]),
                 IO.Image.Input(
                     "image",
                     tooltip="The base image to edit",
@@ -394,7 +235,7 @@ class ByteDanceImageEditNode(IO.ComfyNode):
     async def execute(
         cls,
         model: str,
-        image: torch.Tensor,
+        image: Input.Image,
         prompt: str,
         seed: int,
         guidance_scale: float,
@@ -434,7 +275,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode):
             inputs=[
                 IO.Combo.Input(
                     "model",
-                    options=["seedream-4-0-250828"],
+                    options=["seedream-4-5-251128", "seedream-4-0-250828"],
                     tooltip="Model name",
                 ),
                 IO.String.Input(
@@ -459,7 +300,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode):
                     default=2048,
                     min=1024,
                     max=4096,
-                    step=64,
+                    step=8,
                     tooltip="Custom width for image. Value is working only if `size_preset` is set to `Custom`",
                     optional=True,
                 ),
@@ -468,7 +309,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode):
                     default=2048,
                     min=1024,
                     max=4096,
-                    step=64,
+                    step=8,
                     tooltip="Custom height for image. Value is working only if `size_preset` is set to `Custom`",
                     optional=True,
                 ),
@@ -532,7 +373,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode):
         cls,
         model: str,
         prompt: str,
-        image: torch.Tensor = None,
+        image: Input.Image | None = None,
         size_preset: str = RECOMMENDED_PRESETS_SEEDREAM_4[0][0],
         width: int = 2048,
         height: int = 2048,
@@ -555,6 +396,18 @@ class ByteDanceSeedreamNode(IO.ComfyNode):
                 raise ValueError(
                     f"Custom size out of range: {w}x{h}. " "Both width and height must be between 1024 and 4096 pixels."
                 )
+        out_num_pixels = w * h
+        mp_provided = out_num_pixels / 1_000_000.0
+        if "seedream-4-5" in model and out_num_pixels < 3686400:
+            raise ValueError(
+                f"Minimum image resolution that Seedream 4.5 can generate is 3.68MP, "
+                f"but {mp_provided:.2f}MP provided."
+            )
+        if "seedream-4-0" in model and out_num_pixels < 921600:
+            raise ValueError(
+                f"Minimum image resolution that the selected model can generate is 0.92MP, "
+                f"but {mp_provided:.2f}MP provided."
+            )
         n_input_images = get_number_of_images(image) if image is not None else 0
         if n_input_images > 10:
             raise ValueError(f"Maximum of 10 reference images are supported, but {n_input_images} received.")
@@ -607,9 +460,8 @@ class ByteDanceTextToVideoNode(IO.ComfyNode):
             inputs=[
                 IO.Combo.Input(
                     "model",
-                    options=Text2VideoModelName,
-                    default=Text2VideoModelName.seedance_1_pro,
-                    tooltip="Model name",
+                    options=["seedance-1-0-pro-250528", "seedance-1-0-lite-t2v-250428", "seedance-1-0-pro-fast-251015"],
+                    default="seedance-1-0-pro-fast-251015",
                 ),
                 IO.String.Input(
                     "prompt",
@@ -714,9 +566,8 @@ class ByteDanceImageToVideoNode(IO.ComfyNode):
             inputs=[
                 IO.Combo.Input(
                     "model",
-                    options=Image2VideoModelName,
-                    default=Image2VideoModelName.seedance_1_pro,
-                    tooltip="Model name",
+                    options=["seedance-1-0-pro-250528", "seedance-1-0-lite-i2v-250428", "seedance-1-0-pro-fast-251015"],
+                    default="seedance-1-0-pro-fast-251015",
                 ),
                 IO.String.Input(
                     "prompt",
@@ -787,7 +638,7 @@ class ByteDanceImageToVideoNode(IO.ComfyNode):
         cls,
         model: str,
         prompt: str,
-        image: torch.Tensor,
+        image: Input.Image,
         resolution: str,
         aspect_ratio: str,
         duration: int,
@@ -833,9 +684,8 @@ class ByteDanceFirstLastFrameNode(IO.ComfyNode):
             inputs=[
                 IO.Combo.Input(
                     "model",
-                    options=[model.value for model in Image2VideoModelName],
-                    default=Image2VideoModelName.seedance_1_lite.value,
-                    tooltip="Model name",
+                    options=["seedance-1-0-pro-250528", "seedance-1-0-lite-i2v-250428"],
+                    default="seedance-1-0-lite-i2v-250428",
                 ),
                 IO.String.Input(
                     "prompt",
@@ -910,8 +760,8 @@ class ByteDanceFirstLastFrameNode(IO.ComfyNode):
         cls,
         model: str,
         prompt: str,
-        first_frame: torch.Tensor,
-        last_frame: torch.Tensor,
+        first_frame: Input.Image,
+        last_frame: Input.Image,
         resolution: str,
         aspect_ratio: str,
         duration: int,
@@ -968,9 +818,8 @@ class ByteDanceImageReferenceNode(IO.ComfyNode):
             inputs=[
                 IO.Combo.Input(
                     "model",
-                    options=[Image2VideoModelName.seedance_1_lite.value],
-                    default=Image2VideoModelName.seedance_1_lite.value,
-                    tooltip="Model name",
+                    options=["seedance-1-0-pro-250528", "seedance-1-0-lite-i2v-250428"],
+                    default="seedance-1-0-lite-i2v-250428",
                 ),
                 IO.String.Input(
                     "prompt",
@@ -1034,7 +883,7 @@ class ByteDanceImageReferenceNode(IO.ComfyNode):
         cls,
         model: str,
         prompt: str,
-        images: torch.Tensor,
+        images: Input.Image,
         resolution: str,
         aspect_ratio: str,
         duration: int,
@@ -1069,8 +918,8 @@ class ByteDanceImageReferenceNode(IO.ComfyNode):
 
 async def process_video_task(
     cls: type[IO.ComfyNode],
-    payload: Union[Text2VideoTaskCreationRequest, Image2VideoTaskCreationRequest],
-    estimated_duration: Optional[int],
+    payload: Text2VideoTaskCreationRequest | Image2VideoTaskCreationRequest,
+    estimated_duration: int | None,
 ) -> IO.NodeOutput:
     initial_response = await sync_op(
         cls,
@@ -1085,7 +934,7 @@ async def process_video_task(
         estimated_duration=estimated_duration,
         response_model=TaskStatusResponse,
     )
-    return IO.NodeOutput(await download_url_to_video_output(get_video_url_from_task_status(response)))
+    return IO.NodeOutput(await download_url_to_video_output(response.content.video_url))
 
 
 def raise_if_text_params(prompt: str, text_params: list[str]) -> None:
diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py
index 08f7b0f64..ad0f4b4d1 100644
--- a/comfy_api_nodes/nodes_gemini.py
+++ b/comfy_api_nodes/nodes_gemini.py
@@ -13,8 +13,7 @@ import torch
 from typing_extensions import override
 
 import folder_paths
-from comfy_api.latest import IO, ComfyExtension, Input
-from comfy_api.util import VideoCodec, VideoContainer
+from comfy_api.latest import IO, ComfyExtension, Input, Types
 from comfy_api_nodes.apis.gemini_api import (
     GeminiContent,
     GeminiFileData,
@@ -27,6 +26,8 @@ from comfy_api_nodes.apis.gemini_api import (
     GeminiMimeType,
     GeminiPart,
     GeminiRole,
+    GeminiSystemInstructionContent,
+    GeminiTextPart,
     Modality,
 )
 from comfy_api_nodes.util import (
@@ -43,6 +44,14 @@ from comfy_api_nodes.util import (
 
 GEMINI_BASE_ENDPOINT = "/proxy/vertexai/gemini"
 GEMINI_MAX_INPUT_FILE_SIZE = 20 * 1024 * 1024  # 20 MB
+GEMINI_IMAGE_SYS_PROMPT = (
+    "You are an expert image-generation engine. You must ALWAYS produce an image.\n"
+    "Interpret all user input—regardless of "
+    "format, intent, or abstraction—as literal visual directives for image composition.\n"
+    "If a prompt is conversational or lacks specific visual details, "
+    "you must creatively invent a concrete visual scenario that depicts the concept.\n"
+    "Prioritize generating the visual representation above any text, formatting, or conversational requests."
+)
 
 
 class GeminiModel(str, Enum):
@@ -68,7 +77,7 @@ class GeminiImageModel(str, Enum):
 
 async def create_image_parts(
     cls: type[IO.ComfyNode],
-    images: torch.Tensor,
+    images: Input.Image,
     image_limit: int = 0,
 ) -> list[GeminiPart]:
     image_parts: list[GeminiPart] = []
@@ -154,8 +163,8 @@ def get_text_from_response(response: GeminiGenerateContentResponse) -> str:
     return "\n".join([part.text for part in parts])
 
 
-def get_image_from_response(response: GeminiGenerateContentResponse) -> torch.Tensor:
-    image_tensors: list[torch.Tensor] = []
+def get_image_from_response(response: GeminiGenerateContentResponse) -> Input.Image:
+    image_tensors: list[Input.Image] = []
     parts = get_parts_by_type(response, "image/png")
     for part in parts:
         image_data = base64.b64decode(part.inlineData.data)
@@ -277,6 +286,13 @@ class GeminiNode(IO.ComfyNode):
                     tooltip="Optional file(s) to use as context for the model. "
                     "Accepts inputs from the Gemini Generate Content Input Files node.",
                 ),
+                IO.String.Input(
+                    "system_prompt",
+                    multiline=True,
+                    default="",
+                    optional=True,
+                    tooltip="Foundational instructions that dictate an AI's behavior.",
+                ),
             ],
             outputs=[
                 IO.String.Output(),
@@ -293,7 +309,9 @@ class GeminiNode(IO.ComfyNode):
     def create_video_parts(cls, video_input: Input.Video) -> list[GeminiPart]:
         """Convert video input to Gemini API compatible parts."""
 
-        base_64_string = video_to_base64_string(video_input, container_format=VideoContainer.MP4, codec=VideoCodec.H264)
+        base_64_string = video_to_base64_string(
+            video_input, container_format=Types.VideoContainer.MP4, codec=Types.VideoCodec.H264
+        )
         return [
             GeminiPart(
                 inlineData=GeminiInlineData(
@@ -343,10 +361,11 @@ class GeminiNode(IO.ComfyNode):
         prompt: str,
         model: str,
         seed: int,
-        images: torch.Tensor | None = None,
+        images: Input.Image | None = None,
         audio: Input.Audio | None = None,
         video: Input.Video | None = None,
         files: list[GeminiPart] | None = None,
+        system_prompt: str = "",
     ) -> IO.NodeOutput:
         validate_string(prompt, strip_whitespace=False)
 
@@ -363,7 +382,10 @@ class GeminiNode(IO.ComfyNode):
         if files is not None:
             parts.extend(files)
 
-        # Create response
+        gemini_system_prompt = None
+        if system_prompt:
+            gemini_system_prompt = GeminiSystemInstructionContent(parts=[GeminiTextPart(text=system_prompt)], role=None)
+
         response = await sync_op(
             cls,
             endpoint=ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"),
@@ -373,7 +395,8 @@ class GeminiNode(IO.ComfyNode):
                         role=GeminiRole.user,
                         parts=parts,
                     )
-                ]
+                ],
+                systemInstruction=gemini_system_prompt,
             ),
             response_model=GeminiGenerateContentResponse,
             price_extractor=calculate_tokens_price,
@@ -523,6 +546,13 @@ class GeminiImage(IO.ComfyNode):
                     "'IMAGE+TEXT' to return both the generated image and a text response.",
                     optional=True,
                 ),
+                IO.String.Input(
+                    "system_prompt",
+                    multiline=True,
+                    default=GEMINI_IMAGE_SYS_PROMPT,
+                    optional=True,
+                    tooltip="Foundational instructions that dictate an AI's behavior.",
+                ),
             ],
             outputs=[
                 IO.Image.Output(),
@@ -542,10 +572,11 @@ class GeminiImage(IO.ComfyNode):
         prompt: str,
         model: str,
         seed: int,
-        images: torch.Tensor | None = None,
+        images: Input.Image | None = None,
         files: list[GeminiPart] | None = None,
         aspect_ratio: str = "auto",
         response_modalities: str = "IMAGE+TEXT",
+        system_prompt: str = "",
     ) -> IO.NodeOutput:
         validate_string(prompt, strip_whitespace=True, min_length=1)
         parts: list[GeminiPart] = [GeminiPart(text=prompt)]
@@ -559,6 +590,10 @@ class GeminiImage(IO.ComfyNode):
         if files is not None:
             parts.extend(files)
 
+        gemini_system_prompt = None
+        if system_prompt:
+            gemini_system_prompt = GeminiSystemInstructionContent(parts=[GeminiTextPart(text=system_prompt)], role=None)
+
         response = await sync_op(
             cls,
             endpoint=ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"),
@@ -570,6 +605,7 @@ class GeminiImage(IO.ComfyNode):
                     responseModalities=(["IMAGE"] if response_modalities == "IMAGE" else ["TEXT", "IMAGE"]),
                     imageConfig=None if aspect_ratio == "auto" else image_config,
                 ),
+                systemInstruction=gemini_system_prompt,
             ),
             response_model=GeminiGenerateContentResponse,
             price_extractor=calculate_tokens_price,
@@ -640,6 +676,13 @@ class GeminiImage2(IO.ComfyNode):
                     tooltip="Optional file(s) to use as context for the model. "
                     "Accepts inputs from the Gemini Generate Content Input Files node.",
                 ),
+                IO.String.Input(
+                    "system_prompt",
+                    multiline=True,
+                    default=GEMINI_IMAGE_SYS_PROMPT,
+                    optional=True,
+                    tooltip="Foundational instructions that dictate an AI's behavior.",
+                ),
             ],
             outputs=[
                 IO.Image.Output(),
@@ -662,8 +705,9 @@ class GeminiImage2(IO.ComfyNode):
         aspect_ratio: str,
         resolution: str,
         response_modalities: str,
-        images: torch.Tensor | None = None,
+        images: Input.Image | None = None,
         files: list[GeminiPart] | None = None,
+        system_prompt: str = "",
     ) -> IO.NodeOutput:
         validate_string(prompt, strip_whitespace=True, min_length=1)
 
@@ -679,6 +723,10 @@ class GeminiImage2(IO.ComfyNode):
         if aspect_ratio != "auto":
             image_config.aspectRatio = aspect_ratio
 
+        gemini_system_prompt = None
+        if system_prompt:
+            gemini_system_prompt = GeminiSystemInstructionContent(parts=[GeminiTextPart(text=system_prompt)], role=None)
+
         response = await sync_op(
             cls,
             ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"),
@@ -690,6 +738,7 @@ class GeminiImage2(IO.ComfyNode):
                     responseModalities=(["IMAGE"] if response_modalities == "IMAGE" else ["TEXT", "IMAGE"]),
                     imageConfig=image_config,
                 ),
+                systemInstruction=gemini_system_prompt,
             ),
             response_model=GeminiGenerateContentResponse,
             price_extractor=calculate_tokens_price,
diff --git a/comfy_api_nodes/nodes_ltxv.py b/comfy_api_nodes/nodes_ltxv.py
index 0b757a62b..7e61560dc 100644
--- a/comfy_api_nodes/nodes_ltxv.py
+++ b/comfy_api_nodes/nodes_ltxv.py
@@ -1,12 +1,9 @@
 from io import BytesIO
-from typing import Optional
 
-import torch
 from pydantic import BaseModel, Field
 from typing_extensions import override
 
-from comfy_api.input_impl import VideoFromFile
-from comfy_api.latest import IO, ComfyExtension
+from comfy_api.latest import IO, ComfyExtension, Input, InputImpl
 from comfy_api_nodes.util import (
     ApiEndpoint,
     get_number_of_images,
@@ -26,9 +23,9 @@ class ExecuteTaskRequest(BaseModel):
     model: str = Field(...)
     duration: int = Field(...)
     resolution: str = Field(...)
-    fps: Optional[int] = Field(25)
-    generate_audio: Optional[bool] = Field(True)
-    image_uri: Optional[str] = Field(None)
+    fps: int | None = Field(25)
+    generate_audio: bool | None = Field(True)
+    image_uri: str | None = Field(None)
 
 
 class TextToVideoNode(IO.ComfyNode):
@@ -103,7 +100,7 @@ class TextToVideoNode(IO.ComfyNode):
             as_binary=True,
             max_retries=1,
         )
-        return IO.NodeOutput(VideoFromFile(BytesIO(response)))
+        return IO.NodeOutput(InputImpl.VideoFromFile(BytesIO(response)))
 
 
 class ImageToVideoNode(IO.ComfyNode):
@@ -153,7 +150,7 @@ class ImageToVideoNode(IO.ComfyNode):
     @classmethod
     async def execute(
         cls,
-        image: torch.Tensor,
+        image: Input.Image,
         model: str,
         prompt: str,
         duration: int,
@@ -183,7 +180,7 @@ class ImageToVideoNode(IO.ComfyNode):
             as_binary=True,
             max_retries=1,
         )
-        return IO.NodeOutput(VideoFromFile(BytesIO(response)))
+        return IO.NodeOutput(InputImpl.VideoFromFile(BytesIO(response)))
 
 
 class LtxvApiExtension(ComfyExtension):
diff --git a/comfy_api_nodes/nodes_moonvalley.py b/comfy_api_nodes/nodes_moonvalley.py
index 7c31d95b3..2771e4790 100644
--- a/comfy_api_nodes/nodes_moonvalley.py
+++ b/comfy_api_nodes/nodes_moonvalley.py
@@ -1,11 +1,8 @@
 import logging
-from typing import Optional
 
-import torch
 from typing_extensions import override
 
-from comfy_api.input import VideoInput
-from comfy_api.latest import IO, ComfyExtension
+from comfy_api.latest import IO, ComfyExtension, Input
 from comfy_api_nodes.apis import (
     MoonvalleyPromptResponse,
     MoonvalleyTextToVideoInferenceParams,
@@ -61,7 +58,7 @@ def validate_task_creation_response(response) -> None:
         raise RuntimeError(error_msg)
 
 
-def validate_video_to_video_input(video: VideoInput) -> VideoInput:
+def validate_video_to_video_input(video: Input.Video) -> Input.Video:
     """
     Validates and processes video input for Moonvalley Video-to-Video generation.
 
@@ -82,7 +79,7 @@ def validate_video_to_video_input(video: VideoInput) -> VideoInput:
     return _validate_and_trim_duration(video)
 
 
-def _get_video_dimensions(video: VideoInput) -> tuple[int, int]:
+def _get_video_dimensions(video: Input.Video) -> tuple[int, int]:
     """Extracts video dimensions with error handling."""
     try:
         return video.get_dimensions()
@@ -106,7 +103,7 @@ def _validate_video_dimensions(width: int, height: int) -> None:
         raise ValueError(f"Resolution {width}x{height} not supported. Supported: {supported_list}")
 
 
-def _validate_and_trim_duration(video: VideoInput) -> VideoInput:
+def _validate_and_trim_duration(video: Input.Video) -> Input.Video:
     """Validates video duration and trims to 5 seconds if needed."""
     duration = video.get_duration()
     _validate_minimum_duration(duration)
@@ -119,7 +116,7 @@ def _validate_minimum_duration(duration: float) -> None:
         raise ValueError("Input video must be at least 5 seconds long.")
 
 
-def _trim_if_too_long(video: VideoInput, duration: float) -> VideoInput:
+def _trim_if_too_long(video: Input.Video, duration: float) -> Input.Video:
     """Trims video to 5 seconds if longer."""
     if duration > 5:
         return trim_video(video, 5)
@@ -241,7 +238,7 @@ class MoonvalleyImg2VideoNode(IO.ComfyNode):
     @classmethod
     async def execute(
         cls,
-        image: torch.Tensor,
+        image: Input.Image,
         prompt: str,
         negative_prompt: str,
         resolution: str,
@@ -362,9 +359,9 @@ class MoonvalleyVideo2VideoNode(IO.ComfyNode):
         prompt: str,
         negative_prompt: str,
         seed: int,
-        video: Optional[VideoInput] = None,
+        video: Input.Video | None = None,
         control_type: str = "Motion Transfer",
-        motion_intensity: Optional[int] = 100,
+        motion_intensity: int | None = 100,
         steps=33,
         prompt_adherence=4.5,
     ) -> IO.NodeOutput:
diff --git a/comfy_api_nodes/nodes_runway.py b/comfy_api_nodes/nodes_runway.py
index 2fdafbbfe..3c55039c9 100644
--- a/comfy_api_nodes/nodes_runway.py
+++ b/comfy_api_nodes/nodes_runway.py
@@ -11,12 +11,11 @@ User Guides:
 
 """
 
-from typing import Union, Optional
-from typing_extensions import override
 from enum import Enum
 
-import torch
+from typing_extensions import override
 
+from comfy_api.latest import IO, ComfyExtension, Input, InputImpl
 from comfy_api_nodes.apis import (
     RunwayImageToVideoRequest,
     RunwayImageToVideoResponse,
@@ -44,8 +43,6 @@ from comfy_api_nodes.util import (
     sync_op,
     poll_op,
 )
-from comfy_api.input_impl import VideoFromFile
-from comfy_api.latest import ComfyExtension, IO
 
 PATH_IMAGE_TO_VIDEO = "/proxy/runway/image_to_video"
 PATH_TEXT_TO_IMAGE = "/proxy/runway/text_to_image"
@@ -80,7 +77,7 @@ class RunwayGen3aAspectRatio(str, Enum):
     field_1280_768 = "1280:768"
 
 
-def get_video_url_from_task_status(response: TaskStatusResponse) -> Union[str, None]:
+def get_video_url_from_task_status(response: TaskStatusResponse) -> str | None:
     """Returns the video URL from the task status response if it exists."""
     if hasattr(response, "output") and len(response.output) > 0:
         return response.output[0]
@@ -89,13 +86,13 @@ def get_video_url_from_task_status(response: TaskStatusResponse) -> Union[str, N
 
 def extract_progress_from_task_status(
     response: TaskStatusResponse,
-) -> Union[float, None]:
+) -> float | None:
     if hasattr(response, "progress") and response.progress is not None:
         return response.progress * 100
     return None
 
 
-def get_image_url_from_task_status(response: TaskStatusResponse) -> Union[str, None]:
+def get_image_url_from_task_status(response: TaskStatusResponse) -> str | None:
     """Returns the image URL from the task status response if it exists."""
     if hasattr(response, "output") and len(response.output) > 0:
         return response.output[0]
@@ -103,7 +100,7 @@ def get_image_url_from_task_status(response: TaskStatusResponse) -> Union[str, N
 
 
 async def get_response(
-    cls: type[IO.ComfyNode], task_id: str, estimated_duration: Optional[int] = None
+    cls: type[IO.ComfyNode], task_id: str, estimated_duration: int | None = None
 ) -> TaskStatusResponse:
     """Poll the task status until it is finished then get the response."""
     return await poll_op(
@@ -119,8 +116,8 @@ async def get_response(
 async def generate_video(
     cls: type[IO.ComfyNode],
     request: RunwayImageToVideoRequest,
-    estimated_duration: Optional[int] = None,
-) -> VideoFromFile:
+    estimated_duration: int | None = None,
+) -> InputImpl.VideoFromFile:
     initial_response = await sync_op(
         cls,
         endpoint=ApiEndpoint(path=PATH_IMAGE_TO_VIDEO, method="POST"),
@@ -193,7 +190,7 @@ class RunwayImageToVideoNodeGen3a(IO.ComfyNode):
     async def execute(
         cls,
         prompt: str,
-        start_frame: torch.Tensor,
+        start_frame: Input.Image,
         duration: str,
         ratio: str,
         seed: int,
@@ -283,7 +280,7 @@ class RunwayImageToVideoNodeGen4(IO.ComfyNode):
     async def execute(
         cls,
         prompt: str,
-        start_frame: torch.Tensor,
+        start_frame: Input.Image,
         duration: str,
         ratio: str,
         seed: int,
@@ -381,8 +378,8 @@ class RunwayFirstLastFrameNode(IO.ComfyNode):
     async def execute(
         cls,
         prompt: str,
-        start_frame: torch.Tensor,
-        end_frame: torch.Tensor,
+        start_frame: Input.Image,
+        end_frame: Input.Image,
         duration: str,
         ratio: str,
         seed: int,
@@ -467,7 +464,7 @@ class RunwayTextToImageNode(IO.ComfyNode):
         cls,
         prompt: str,
         ratio: str,
-        reference_image: Optional[torch.Tensor] = None,
+        reference_image: Input.Image | None = None,
     ) -> IO.NodeOutput:
         validate_string(prompt, min_length=1)
 
diff --git a/comfy_api_nodes/nodes_veo2.py b/comfy_api_nodes/nodes_veo2.py
index a54dc13ab..e165b8380 100644
--- a/comfy_api_nodes/nodes_veo2.py
+++ b/comfy_api_nodes/nodes_veo2.py
@@ -1,11 +1,9 @@
 import base64
 from io import BytesIO
 
-import torch
 from typing_extensions import override
 
-from comfy_api.input_impl.video_types import VideoFromFile
-from comfy_api.latest import IO, ComfyExtension
+from comfy_api.latest import IO, ComfyExtension, Input, InputImpl
 from comfy_api_nodes.apis.veo_api import (
     VeoGenVidPollRequest,
     VeoGenVidPollResponse,
@@ -232,7 +230,7 @@ class VeoVideoGenerationNode(IO.ComfyNode):
 
             # Check if video is provided as base64 or URL
             if hasattr(video, "bytesBase64Encoded") and video.bytesBase64Encoded:
-                return IO.NodeOutput(VideoFromFile(BytesIO(base64.b64decode(video.bytesBase64Encoded))))
+                return IO.NodeOutput(InputImpl.VideoFromFile(BytesIO(base64.b64decode(video.bytesBase64Encoded))))
 
             if hasattr(video, "gcsUri") and video.gcsUri:
                 return IO.NodeOutput(await download_url_to_video_output(video.gcsUri))
@@ -431,8 +429,8 @@ class Veo3FirstLastFrameNode(IO.ComfyNode):
         aspect_ratio: str,
         duration: int,
         seed: int,
-        first_frame: torch.Tensor,
-        last_frame: torch.Tensor,
+        first_frame: Input.Image,
+        last_frame: Input.Image,
         model: str,
         generate_audio: bool,
     ):
@@ -493,7 +491,7 @@ class Veo3FirstLastFrameNode(IO.ComfyNode):
         if response.videos:
             video = response.videos[0]
             if video.bytesBase64Encoded:
-                return IO.NodeOutput(VideoFromFile(BytesIO(base64.b64decode(video.bytesBase64Encoded))))
+                return IO.NodeOutput(InputImpl.VideoFromFile(BytesIO(base64.b64decode(video.bytesBase64Encoded))))
             if video.gcsUri:
                 return IO.NodeOutput(await download_url_to_video_output(video.gcsUri))
             raise Exception("Video returned but no data or URL was provided")
diff --git a/comfy_extras/nodes_video.py b/comfy_extras/nodes_video.py
index 6cf6e39bf..c609e03da 100644
--- a/comfy_extras/nodes_video.py
+++ b/comfy_extras/nodes_video.py
@@ -8,10 +8,7 @@ import json
 from typing import Optional
 from typing_extensions import override
 from fractions import Fraction
-from comfy_api.input import AudioInput, ImageInput, VideoInput
-from comfy_api.input_impl import VideoFromComponents, VideoFromFile
-from comfy_api.util import VideoCodec, VideoComponents, VideoContainer
-from comfy_api.latest import ComfyExtension, io, ui
+from comfy_api.latest import ComfyExtension, io, ui, Input, InputImpl, Types
 from comfy.cli_args import args
 
 class SaveWEBM(io.ComfyNode):
@@ -28,7 +25,6 @@ class SaveWEBM(io.ComfyNode):
                 io.Float.Input("fps", default=24.0, min=0.01, max=1000.0, step=0.01),
                 io.Float.Input("crf", default=32.0, min=0, max=63.0, step=1, tooltip="Higher crf means lower quality with a smaller file size, lower crf means higher quality higher filesize."),
             ],
-            outputs=[],
             hidden=[io.Hidden.prompt, io.Hidden.extra_pnginfo],
             is_output_node=True,
         )
@@ -79,16 +75,15 @@ class SaveVideo(io.ComfyNode):
             inputs=[
                 io.Video.Input("video", tooltip="The video to save."),
                 io.String.Input("filename_prefix", default="video/ComfyUI", tooltip="The prefix for the file to save. This may include formatting information such as %date:yyyy-MM-dd% or %Empty Latent Image.width% to include values from nodes."),
-                io.Combo.Input("format", options=VideoContainer.as_input(), default="auto", tooltip="The format to save the video as."),
-                io.Combo.Input("codec", options=VideoCodec.as_input(), default="auto", tooltip="The codec to use for the video."),
+                io.Combo.Input("format", options=Types.VideoContainer.as_input(), default="auto", tooltip="The format to save the video as."),
+                io.Combo.Input("codec", options=Types.VideoCodec.as_input(), default="auto", tooltip="The codec to use for the video."),
             ],
-            outputs=[],
             hidden=[io.Hidden.prompt, io.Hidden.extra_pnginfo],
             is_output_node=True,
         )
 
     @classmethod
-    def execute(cls, video: VideoInput, filename_prefix, format: str, codec) -> io.NodeOutput:
+    def execute(cls, video: Input.Video, filename_prefix, format: str, codec) -> io.NodeOutput:
         width, height = video.get_dimensions()
         full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(
             filename_prefix,
@@ -105,10 +100,10 @@ class SaveVideo(io.ComfyNode):
                 metadata["prompt"] = cls.hidden.prompt
             if len(metadata) > 0:
                 saved_metadata = metadata
-        file = f"{filename}_{counter:05}_.{VideoContainer.get_extension(format)}"
+        file = f"{filename}_{counter:05}_.{Types.VideoContainer.get_extension(format)}"
         video.save_to(
             os.path.join(full_output_folder, file),
-            format=VideoContainer(format),
+            format=Types.VideoContainer(format),
             codec=codec,
             metadata=saved_metadata
         )
@@ -135,9 +130,9 @@ class CreateVideo(io.ComfyNode):
         )
 
     @classmethod
-    def execute(cls, images: ImageInput, fps: float, audio: Optional[AudioInput] = None) -> io.NodeOutput:
+    def execute(cls, images: Input.Image, fps: float, audio: Optional[Input.Audio] = None) -> io.NodeOutput:
         return io.NodeOutput(
-            VideoFromComponents(VideoComponents(images=images, audio=audio, frame_rate=Fraction(fps)))
+            InputImpl.VideoFromComponents(Types.VideoComponents(images=images, audio=audio, frame_rate=Fraction(fps)))
         )
 
 class GetVideoComponents(io.ComfyNode):
@@ -159,11 +154,11 @@ class GetVideoComponents(io.ComfyNode):
         )
 
     @classmethod
-    def execute(cls, video: VideoInput) -> io.NodeOutput:
+    def execute(cls, video: Input.Video) -> io.NodeOutput:
         components = video.get_components()
-
         return io.NodeOutput(components.images, components.audio, float(components.frame_rate))
 
+
 class LoadVideo(io.ComfyNode):
     @classmethod
     def define_schema(cls):
@@ -185,7 +180,7 @@ class LoadVideo(io.ComfyNode):
     @classmethod
     def execute(cls, file) -> io.NodeOutput:
         video_path = folder_paths.get_annotated_filepath(file)
-        return io.NodeOutput(VideoFromFile(video_path))
+        return io.NodeOutput(InputImpl.VideoFromFile(video_path))
 
     @classmethod
     def fingerprint_inputs(s, file):
diff --git a/requirements.txt b/requirements.txt
index f98848e20..11a7ac245 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-comfyui-frontend-package==1.33.10
-comfyui-workflow-templates==0.7.25
+comfyui-frontend-package==1.33.13
+comfyui-workflow-templates==0.7.54
 comfyui-embedded-docs==0.3.1
 torch
 torchsde