diff --git a/.ci/update_windows/update.py b/.ci/update_windows/update.py
index 51a263203..59ece5130 100755
--- a/.ci/update_windows/update.py
+++ b/.ci/update_windows/update.py
@@ -66,8 +66,10 @@ if branch is None:
     try:
         ref = repo.lookup_reference('refs/remotes/origin/master')
     except:
-        print("pulling.")  # noqa: T201
-        pull(repo)
+        print("fetching.")  # noqa: T201
+        for remote in repo.remotes:
+            if remote.name == "origin":
+                remote.fetch()
         ref = repo.lookup_reference('refs/remotes/origin/master')
     repo.checkout(ref)
     branch = repo.lookup_branch('master')
@@ -149,3 +151,4 @@ try:
         shutil.copy(stable_update_script, stable_update_script_to)
 except:
     pass
+
diff --git a/.ci/windows_amd_base_files/README_VERY_IMPORTANT.txt b/.ci/windows_amd_base_files/README_VERY_IMPORTANT.txt
index 96a500be2..2cbb00d99 100755
--- a/.ci/windows_amd_base_files/README_VERY_IMPORTANT.txt
+++ b/.ci/windows_amd_base_files/README_VERY_IMPORTANT.txt
@@ -1,5 +1,5 @@
-As of the time of writing this you need this preview driver for best results:
-https://www.amd.com/en/resources/support-articles/release-notes/RN-AMDGPU-WINDOWS-PYTORCH-PREVIEW.html
+As of the time of writing this you need this driver for best results:
+https://www.amd.com/en/resources/support-articles/release-notes/RN-AMDGPU-WINDOWS-PYTORCH-7-1-1.html
 
 HOW TO RUN:
 
@@ -25,3 +25,4 @@ In the ComfyUI directory you will find a file: extra_model_paths.yaml.example
 Rename this file to: extra_model_paths.yaml and edit it with your favorite text editor.
 
 
+
diff --git a/.github/PULL_REQUEST_TEMPLATE/api-node.md b/.github/PULL_REQUEST_TEMPLATE/api-node.md
index f62744878..c1f1bafb1 100644
--- a/.github/PULL_REQUEST_TEMPLATE/api-node.md
+++ b/.github/PULL_REQUEST_TEMPLATE/api-node.md
@@ -18,4 +18,4 @@ If **Need pricing update**:
 - [ ] **QA not required**
 
 ### Comms
-- [ ] Informed **@Kosinkadink**
+- [ ] Informed **Kosinkadink**
diff --git a/.github/workflows/api-node-template.yml b/.github/workflows/api-node-template.yml
index 0775f9979..fdb81c0c5 100644
--- a/.github/workflows/api-node-template.yml
+++ b/.github/workflows/api-node-template.yml
@@ -2,7 +2,7 @@ name: Append API Node PR template
 
 on:
   pull_request_target:
-    types: [opened, reopened, synchronize, edited, ready_for_review]
+    types: [opened, reopened, synchronize, ready_for_review]
     paths:
       - 'comfy_api_nodes/**'   # only run if these files changed
 
diff --git a/.github/workflows/release-stable-all.yml b/.github/workflows/release-stable-all.yml
index 7dca7277b..d72ece2ce 100644
--- a/.github/workflows/release-stable-all.yml
+++ b/.github/workflows/release-stable-all.yml
@@ -14,7 +14,7 @@ jobs:
       contents: "write"
       packages: "write"
       pull-requests: "read"
-    name: "Release NVIDIA Default (cu129)"
+    name: "Release NVIDIA Default (cu130)"
     uses: ./.github/workflows/stable-release.yml
     with:
       git_tag: ${{ inputs.git_tag }}
@@ -43,16 +43,33 @@ jobs:
       test_release: true
     secrets: inherit
 
+  release_nvidia_cu126:
+    permissions:
+      contents: "write"
+      packages: "write"
+      pull-requests: "read"
+    name: "Release NVIDIA cu126"
+    uses: ./.github/workflows/stable-release.yml
+    with:
+      git_tag: ${{ inputs.git_tag }}
+      cache_tag: "cu126"
+      python_minor: "12"
+      python_patch: "10"
+      rel_name: "nvidia"
+      rel_extra_name: "_cu126"
+      test_release: true
+    secrets: inherit
+
   release_amd_rocm:
     permissions:
       contents: "write"
       packages: "write"
       pull-requests: "read"
-    name: "Release AMD ROCm 6.4.4"
+    name: "Release AMD ROCm 7.1.1"
     uses: ./.github/workflows/stable-release.yml
     with:
       git_tag: ${{ inputs.git_tag }}
-      cache_tag: "rocm644"
+      cache_tag: "rocm711"
       python_minor: "12"
       python_patch: "10"
       rel_name: "amd"
diff --git a/CODEOWNERS b/CODEOWNERS
index b7aca9b26..4d5448636 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,3 +1,2 @@
 # Admins
-* @comfyanonymous
-* @kosinkadink
+* @comfyanonymous @kosinkadink @guill
diff --git a/README.md b/README.md
index cd8273b0d..91fb510e1 100644
--- a/README.md
+++ b/README.md
@@ -67,6 +67,8 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
    - [HiDream](https://comfyanonymous.github.io/ComfyUI_examples/hidream/)
    - [Qwen Image](https://comfyanonymous.github.io/ComfyUI_examples/qwen_image/)
    - [Hunyuan Image 2.1](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_image/)
+   - [Flux 2](https://comfyanonymous.github.io/ComfyUI_examples/flux2/)
+   - [Z Image](https://comfyanonymous.github.io/ComfyUI_examples/z_image/)
 - Image Editing Models
    - [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/)
    - [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model)
@@ -183,7 +185,9 @@ Update your Nvidia drivers if it doesn't start.
 
 [Experimental portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z)
 
-[Portable with pytorch cuda 12.8 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu128.7z) (Supports Nvidia 10 series and older GPUs).
+[Portable with pytorch cuda 12.8 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu128.7z).
+
+[Portable with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs).
 
 #### How do I share models between another UI and ComfyUI?
 
@@ -221,7 +225,7 @@ AMD users can install rocm and pytorch with pip if you don't have it already ins
 
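-This is the command to install the nightly with ROCm 7.0 which might have some performance improvements:
+This is the command to install the nightly with ROCm 7.1 which might have some performance improvements: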
 
-```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm7.0```
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm7.1```
 
 
 ### AMD GPUs (Experimental: Windows and Linux), RDNA 3, 3.5 and 4 only.
diff --git a/app/frontend_management.py b/app/frontend_management.py
index cce0c117d..bdaa85812 100644
--- a/app/frontend_management.py
+++ b/app/frontend_management.py
@@ -10,7 +10,8 @@ import importlib
 from dataclasses import dataclass
 from functools import cached_property
 from pathlib import Path
-from typing import TypedDict, Optional
+from typing import Dict, TypedDict, Optional
+from aiohttp import web
 from importlib.metadata import version
 
 import requests
@@ -257,7 +258,54 @@ comfyui-frontend-package is not installed.
             sys.exit(-1)
 
     @classmethod
-    def templates_path(cls) -> str:
+    def template_asset_map(cls) -> Optional[Dict[str, str]]:
+        """Map every workflow template asset filename to the path get_asset_path reports.
+
+        Returns None, after logging an error, if comfyui-workflow-templates cannot be
+        imported, if listing templates or resolving a path raises, or if no asset exists.
+        """
+        try:
+            from comfyui_workflow_templates import get_asset_path, iter_templates
+        except ImportError:
+            logging.error(
+                f"""
+********** ERROR ***********
+
+comfyui-workflow-templates is not installed.
+
+{frontend_install_warning_message()}
+
+********** ERROR ***********
+""".strip()
+            )
+            return None
+
+        try:
+            templates = list(iter_templates())
+        except Exception as exc:
+            logging.error(f"Failed to enumerate workflow templates: {exc}")
+            return None
+
+        try:
+            # Later templates win if two of them ship an asset with the same filename.
+            resolved: Dict[str, str] = {
+                asset.filename: get_asset_path(template.template_id, asset.filename)
+                for template in templates
+                for asset in template.assets
+            }
+        except Exception as exc:
+            logging.error(f"Failed to resolve template asset paths: {exc}")
+            return None
+
+        if not resolved:
+            logging.error("No workflow template assets found. Did the packages install correctly?")
+            return None
+        return resolved
+
+
+    @classmethod
+    def legacy_templates_path(cls) -> Optional[str]:
+        """Return the legacy templates directory shipped inside the meta package."""
         try:
             import comfyui_workflow_templates
 
@@ -276,6 +324,7 @@ comfyui-workflow-templates is not installed.
 ********** ERROR ***********
 """.strip()
             )
+            return None
 
     @classmethod
     def embedded_docs_path(cls) -> str:
@@ -392,3 +441,17 @@ comfyui-workflow-templates is not installed.
             logging.info("Falling back to the default frontend.")
             check_frontend_version()
             return cls.default_frontend_path()
+    @classmethod
+    def template_asset_handler(cls):
+        """Build an aiohttp handler serving mapped template assets (404 for unknown names); None without an asset map."""
+        asset_paths = cls.template_asset_map()
+        if not asset_paths:
+            return None
+
+        async def serve_template(request: web.Request) -> web.StreamResponse:
+            asset_file = asset_paths.get(request.match_info.get("path", ""))
+            if asset_file is None:
+                raise web.HTTPNotFound()
+            return web.FileResponse(asset_file)
+
+        return serve_template
diff --git a/app/user_manager.py b/app/user_manager.py
index a2d376c0c..e2c00dab2 100644
--- a/app/user_manager.py
+++ b/app/user_manager.py
@@ -59,6 +59,9 @@ class UserManager():
         user = "default"
         if args.multi_user and "comfy-user" in request.headers:
             user = request.headers["comfy-user"]
+            # Block System Users (use same error message to prevent probing)
+            if user.startswith(folder_paths.SYSTEM_USER_PREFIX):
+                raise KeyError("Unknown user: " + user)
 
         if user not in self.users:
             raise KeyError("Unknown user: " + user)
@@ -66,15 +69,16 @@ class UserManager():
         return user
 
     def get_request_user_filepath(self, request, file, type="userdata", create_dir=True):
-        user_directory = folder_paths.get_user_directory()
-
         if type == "userdata":
-            root_dir = user_directory
+            root_dir = folder_paths.get_user_directory()
         else:
             raise KeyError("Unknown filepath type:" + type)
 
         user = self.get_request_user_id(request)
-        path = user_root = os.path.abspath(os.path.join(root_dir, user))
+        user_root = folder_paths.get_public_user_directory(user)
+        if user_root is None:
+            return None
+        path = user_root
 
         # prevent leaving /{type}
         if os.path.commonpath((root_dir, user_root)) != root_dir:
@@ -101,7 +105,11 @@ class UserManager():
         name = name.strip()
         if not name:
             raise ValueError("username not provided")
+        if name.startswith(folder_paths.SYSTEM_USER_PREFIX):
+            raise ValueError("System User prefix not allowed")
         user_id = re.sub("[^a-zA-Z0-9-_]+", '-', name)
+        if user_id.startswith(folder_paths.SYSTEM_USER_PREFIX):
+            raise ValueError("System User prefix not allowed")
         user_id = user_id + "_" + str(uuid.uuid4())
 
         self.users[user_id] = name
@@ -132,7 +140,10 @@ class UserManager():
             if username in self.users.values():
                 return web.json_response({"error": "Duplicate username."}, status=400)
 
-            user_id = self.add_user(username)
+            try:
+                user_id = self.add_user(username)
+            except ValueError as e:
+                return web.json_response({"error": str(e)}, status=400)
             return web.json_response(user_id)
 
         @routes.get("/userdata")
@@ -424,7 +435,7 @@ class UserManager():
                 return source
 
             dest = get_user_data_path(request, check_exists=False, param="dest")
-            if not isinstance(source, str):
+            if not isinstance(dest, str):
                 return dest
 
             overwrite = request.query.get("overwrite", 'true') != "false"
diff --git a/comfy/cldm/cldm.py b/comfy/cldm/cldm.py
index ec01665e2..c93c2e909 100644
--- a/comfy/cldm/cldm.py
+++ b/comfy/cldm/cldm.py
@@ -413,7 +413,8 @@ class ControlNet(nn.Module):
         out_middle = []
 
         if self.num_classes is not None:
-            assert y.shape[0] == x.shape[0]
+            if y is None:
+                raise ValueError("y is None, did you try using a controlnet for SDXL on SD1?")
             emb = emb + self.label_emb(y)
 
         h = x
diff --git a/comfy/cli_args.py b/comfy/cli_args.py
index 2f30b72d2..209fc185b 100644
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@@ -121,6 +121,12 @@ upcast.add_argument("--force-upcast-attention", action="store_true", help="Force
 upcast.add_argument("--dont-upcast-attention", action="store_true", help="Disable all upcasting of attention. Should be unnecessary except for debugging.")
 
 
+parser.add_argument("--enable-manager", action="store_true", help="Enable the ComfyUI-Manager feature.")
+manager_group = parser.add_mutually_exclusive_group()
+manager_group.add_argument("--disable-manager-ui", action="store_true", help="Disables only the ComfyUI-Manager UI and endpoints. Scheduled installations and similar background tasks will still operate.")
+manager_group.add_argument("--enable-manager-legacy-ui", action="store_true", help="Enables the legacy UI of ComfyUI-Manager")
+
+
 vram_group = parser.add_mutually_exclusive_group()
 vram_group.add_argument("--gpu-only", action="store_true", help="Store and run everything (text encoders/CLIP models, etc... on the GPU).")
 vram_group.add_argument("--highvram", action="store_true", help="By default models will be unloaded to CPU memory after being used. This option keeps them in GPU memory.")
@@ -131,7 +137,8 @@ vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for e
 
 parser.add_argument("--reserve-vram", type=float, default=None, help="Set the amount of vram in GB you want to reserve for use by your OS/other software. By default some amount is reserved depending on your OS.")
 
-parser.add_argument("--async-offload", action="store_true", help="Use async weight offloading.")
+parser.add_argument("--async-offload", nargs='?', const=2, type=int, default=None, metavar="NUM_STREAMS", help="Use async weight offloading. An optional argument controls the amount of offload streams. Default is 2. Enabled by default on Nvidia.")
+parser.add_argument("--disable-async-offload", action="store_true", help="Disable async weight offloading.")
 
 parser.add_argument("--force-non-blocking", action="store_true", help="Force ComfyUI to use non-blocking operations for all applicable tensors. This may improve performance on some non-Nvidia systems but can cause issues with some workflows.")
 
@@ -160,13 +167,14 @@ parser.add_argument("--windows-standalone-build", action="store_true", help="Win
 parser.add_argument("--disable-metadata", action="store_true", help="Disable saving prompt metadata in files.")
 parser.add_argument("--disable-all-custom-nodes", action="store_true", help="Disable loading all custom nodes.")
 parser.add_argument("--whitelist-custom-nodes", type=str, nargs='+', default=[], help="Specify custom node folders to load even when --disable-all-custom-nodes is enabled.")
-parser.add_argument("--disable-api-nodes", action="store_true", help="Disable loading all api nodes.")
+parser.add_argument("--disable-api-nodes", action="store_true", help="Disable loading all api nodes. Also prevents the frontend from communicating with the internet.")
 
 parser.add_argument("--multi-user", action="store_true", help="Enables per-user storage.")
 
 parser.add_argument("--verbose", default='INFO', const='DEBUG', nargs="?", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='Set the logging level')
 parser.add_argument("--log-stdout", action="store_true", help="Send normal process output to stdout instead of stderr (default).")
 
+
 # The default built-in provider hosted under web/
 DEFAULT_VERSION_STRING = "comfyanonymous/ComfyUI@latest"
 
diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py
index 77e642a94..f1ca0151e 100644
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@@ -6,6 +6,7 @@ class LatentFormat:
     latent_dimensions = 2
     latent_rgb_factors = None
     latent_rgb_factors_bias = None
+    latent_rgb_factors_reshape = None
     taesd_decoder_name = None
 
     def process_in(self, latent):
@@ -178,6 +179,54 @@ class Flux(SD3):
     def process_out(self, latent):
         return (latent / self.scale_factor) + self.shift_factor
 
+class Flux2(LatentFormat):  # latent format with 128 channels and a 32-row RGB preview table
+    latent_channels = 128  # = 32 * 2 * 2, the split used by latent_rgb_factors_reshape below
+
+    def __init__(self):
+        self.latent_rgb_factors =[  # 32 rows of RGB factors, one per channel left after the reshape below
+            [0.0058, 0.0113, 0.0073],
+            [0.0495, 0.0443, 0.0836],
+            [-0.0099, 0.0096, 0.0644],
+            [0.2144, 0.3009, 0.3652],
+            [0.0166, -0.0039, -0.0054],
+            [0.0157, 0.0103, -0.0160],
+            [-0.0398, 0.0902, -0.0235],
+            [-0.0052, 0.0095, 0.0109],
+            [-0.3527, -0.2712, -0.1666],
+            [-0.0301, -0.0356, -0.0180],
+            [-0.0107, 0.0078, 0.0013],
+            [0.0746, 0.0090, -0.0941],
+            [0.0156, 0.0169, 0.0070],
+            [-0.0034, -0.0040, -0.0114],
+            [0.0032, 0.0181, 0.0080],
+            [-0.0939, -0.0008, 0.0186],
+            [0.0018, 0.0043, 0.0104],
+            [0.0284, 0.0056, -0.0127],
+            [-0.0024, -0.0022, -0.0030],
+            [0.1207, -0.0026, 0.0065],
+            [0.0128, 0.0101, 0.0142],
+            [0.0137, -0.0072, -0.0007],
+            [0.0095, 0.0092, -0.0059],
+            [0.0000, -0.0077, -0.0049],
+            [-0.0465, -0.0204, -0.0312],
+            [0.0095, 0.0012, -0.0066],
+            [0.0290, -0.0034, 0.0025],
+            [0.0220, 0.0169, -0.0048],
+            [-0.0332, -0.0457, -0.0468],
+            [-0.0085, 0.0389, 0.0609],
+            [-0.0076, 0.0003, -0.0043],
+            [-0.0111, -0.0460, -0.0614],
+        ]
+
+        self.latent_rgb_factors_bias = [-0.0329, -0.0718, -0.0851]
+        self.latent_rgb_factors_reshape = lambda t: t.reshape(t.shape[0], 32, 2, 2, t.shape[-2], t.shape[-1]).permute(0, 1, 4, 2, 5, 3).reshape(t.shape[0], 32, t.shape[-2] * 2, t.shape[-1] * 2)  # 128 channels at HxW -> 32 channels at 2Hx2W: each group of 4 channels becomes a 2x2 pixel block
+
+    def process_in(self, latent):
+        return latent  # identity: no scaling or shifting is applied
+
+    def process_out(self, latent):
+        return latent  # identity, mirroring process_in
+
 class Mochi(LatentFormat):
     latent_channels = 12
     latent_dimensions = 3
@@ -382,6 +431,7 @@ class HunyuanVideo(LatentFormat):
     ]
 
     latent_rgb_factors_bias = [ 0.0259, -0.0192, -0.0761]
+    taesd_decoder_name = "taehv"
 
 class Cosmos1CV8x8x8(LatentFormat):
     latent_channels = 16
@@ -445,7 +495,7 @@ class Wan21(LatentFormat):
         ]).view(1, self.latent_channels, 1, 1, 1)
 
 
-        self.taesd_decoder_name = None #TODO
+        self.taesd_decoder_name = "lighttaew2_1"
 
     def process_in(self, latent):
         latents_mean = self.latents_mean.to(latent.device, latent.dtype)
@@ -516,6 +566,7 @@ class Wan22(Wan21):
 
     def __init__(self):
         self.scale_factor = 1.0
+        self.taesd_decoder_name = "lighttaew2_2"
         self.latents_mean = torch.tensor([
                 -0.2289, -0.0052, -0.1323, -0.2339, -0.2799, 0.0174, 0.1838, 0.1557,
                 -0.1382, 0.0542, 0.2813, 0.0891, 0.1570, -0.0098, 0.0375, -0.1825,
@@ -611,6 +662,67 @@ class HunyuanImage21Refiner(LatentFormat):
     latent_dimensions = 3
     scale_factor = 1.03682
 
+    def process_in(self, latent):
+        """Scale the latent, repeat its first dim-2 slice (presumably a frame) and fold every two slices into the channels."""
+        scaled = latent * self.scale_factor
+        # Duplicate slice 0 at the front of dim 2, then move that axis next to the batch axis.
+        frames_first = torch.cat((scaled[:, :, :1], scaled), dim=2).permute(0, 2, 1, 3, 4)
+        b, f_times_2, c, h, w = frames_first.shape
+        folded = frames_first.reshape(b, f_times_2 // 2, 2 * c, h, w)
+        return folded.permute(0, 2, 1, 3, 4).contiguous()
+
+    def process_out(self, latent):
+        """Invert process_in: unscale, unfold the slice pairs out of the channels, drop the prepended slice."""
+        z = (latent / self.scale_factor).permute(0, 2, 1, 3, 4)
+        b, f, c, h, w = z.shape
+        # Each channel vector packs two slices of c // 2 channels, so one reshape unfolds
+        # them onto axis 1 (the former identity permute in between did nothing).
+        z = z.reshape(b, f * 2, c // 2, h, w)
+        # Restore the channel-first layout and skip the slice process_in duplicated.
+        return z.permute(0, 2, 1, 3, 4)[:, :, 1:]
+
+class HunyuanVideo15(LatentFormat):  # presumably the HunyuanVideo 1.5 latent format (from the name) - confirm
+    latent_rgb_factors = [  # 32 rows of RGB factors, matching latent_channels below
+        [ 0.0568, -0.0521, -0.0131],
+        [ 0.0014,  0.0735,  0.0326],
+        [ 0.0186,  0.0531, -0.0138],
+        [-0.0031,  0.0051,  0.0288],
+        [ 0.0110,  0.0556,  0.0432],
+        [-0.0041, -0.0023, -0.0485],
+        [ 0.0530,  0.0413,  0.0253],
+        [ 0.0283,  0.0251,  0.0339],
+        [ 0.0277, -0.0372, -0.0093],
+        [ 0.0393,  0.0944,  0.1131],
+        [ 0.0020,  0.0251,  0.0037],
+        [-0.0017,  0.0012,  0.0234],
+        [ 0.0468,  0.0436,  0.0203],
+        [ 0.0354,  0.0439, -0.0233],
+        [ 0.0090,  0.0123,  0.0346],
+        [ 0.0382,  0.0029,  0.0217],
+        [ 0.0261, -0.0300,  0.0030],
+        [-0.0088, -0.0220, -0.0283],
+        [-0.0272, -0.0121, -0.0363],
+        [-0.0664, -0.0622,  0.0144],
+        [ 0.0414,  0.0479,  0.0529],
+        [ 0.0355,  0.0612, -0.0247],
+        [ 0.0147,  0.0264,  0.0174],
+        [ 0.0438,  0.0038,  0.0542],
+        [ 0.0431, -0.0573, -0.0033],
+        [-0.0162, -0.0211, -0.0406],
+        [-0.0487, -0.0295, -0.0393],
+        [ 0.0005, -0.0109,  0.0253],
+        [ 0.0296,  0.0591,  0.0353],
+        [ 0.0119,  0.0181, -0.0306],
+        [-0.0085, -0.0362,  0.0229],
+        [ 0.0005, -0.0106,  0.0242]
+    ]
+
+    latent_rgb_factors_bias = [ 0.0456, -0.0202, -0.0644]  # one bias per RGB output
+    latent_channels = 32
+    latent_dimensions = 3
+    scale_factor = 1.03682  # same value as HunyuanImage21Refiner.scale_factor
+    taesd_decoder_name = "lighttaehy1_5"  # presumably selects the preview decoder - confirm against the loader
+
 class Hunyuan3Dv2(LatentFormat):
     latent_channels = 64
     latent_dimensions = 1
diff --git a/comfy/ldm/chroma/model.py b/comfy/ldm/chroma/model.py
index 67bf70eb1..2e8ef0687 100644
--- a/comfy/ldm/chroma/model.py
+++ b/comfy/ldm/chroma/model.py
@@ -40,7 +40,8 @@ class ChromaParams:
     out_dim: int
     hidden_dim: int
     n_layers: int
-
+    txt_ids_dims: list
+    vec_in_dim: int
 
 
 
@@ -179,7 +180,10 @@ class Chroma(nn.Module):
         pe = self.pe_embedder(ids)
 
         blocks_replace = patches_replace.get("dit", {})
+        transformer_options["total_blocks"] = len(self.double_blocks)
+        transformer_options["block_type"] = "double"
         for i, block in enumerate(self.double_blocks):
+            transformer_options["block_index"] = i
             if i not in self.skip_mmdit:
                 double_mod = (
                     self.get_modulations(mod_vectors, "double_img", idx=i),
@@ -222,7 +226,10 @@ class Chroma(nn.Module):
 
         img = torch.cat((txt, img), 1)
 
+        transformer_options["total_blocks"] = len(self.single_blocks)
+        transformer_options["block_type"] = "single"
         for i, block in enumerate(self.single_blocks):
+            transformer_options["block_index"] = i
             if i not in self.skip_dit:
                 single_mod = self.get_modulations(mod_vectors, "single", idx=i)
                 if ("single_block", i) in blocks_replace:
diff --git a/comfy/ldm/flux/layers.py b/comfy/ldm/flux/layers.py
index 23150a712..60f2bdae2 100644
--- a/comfy/ldm/flux/layers.py
+++ b/comfy/ldm/flux/layers.py
@@ -48,15 +48,44 @@ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 10
     return embedding
 
 class MLPEmbedder(nn.Module):
-    def __init__(self, in_dim: int, hidden_dim: int, dtype=None, device=None, operations=None):
+    def __init__(self, in_dim: int, hidden_dim: int, bias=True, dtype=None, device=None, operations=None):
         super().__init__()
-        self.in_layer = operations.Linear(in_dim, hidden_dim, bias=True, dtype=dtype, device=device)
+        self.in_layer = operations.Linear(in_dim, hidden_dim, bias=bias, dtype=dtype, device=device)
         self.silu = nn.SiLU()
-        self.out_layer = operations.Linear(hidden_dim, hidden_dim, bias=True, dtype=dtype, device=device)
+        self.out_layer = operations.Linear(hidden_dim, hidden_dim, bias=bias, dtype=dtype, device=device)
 
     def forward(self, x: Tensor) -> Tensor:
         return self.out_layer(self.silu(self.in_layer(x)))
 
+class YakMLP(nn.Module):
+    """Gated MLP: down_proj(silu(gate_proj(x)) * up_proj(x)), all three projections with bias."""
+    def __init__(self, hidden_size: int, intermediate_size: int, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.hidden_size, self.intermediate_size = hidden_size, intermediate_size
+        self.gate_proj = operations.Linear(hidden_size, intermediate_size, bias=True, dtype=dtype, device=device)
+        self.up_proj = operations.Linear(hidden_size, intermediate_size, bias=True, dtype=dtype, device=device)
+        self.down_proj = operations.Linear(intermediate_size, hidden_size, bias=True, dtype=dtype, device=device)
+        self.act_fn = nn.SiLU()
+
+    def forward(self, x: Tensor) -> Tensor:
+        gated = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
+        return self.down_proj(gated)
+
+def build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=False, yak_mlp=False, dtype=None, device=None, operations=None):
+    """Build a block MLP: YakMLP if yak_mlp, else a bias-free gated-SiLU MLP if mlp_silu_act, else a biased tanh-GELU MLP."""
+    tensor_kwargs = dict(dtype=dtype, device=device)
+    if yak_mlp:
+        return YakMLP(hidden_size, mlp_hidden_dim, operations=operations, **tensor_kwargs)
+    if mlp_silu_act:
+        # The first projection is twice as wide because SiLUActivation halves the last dim.
+        width_in, act, bias = mlp_hidden_dim * 2, SiLUActivation(), False
+    else:
+        width_in, act, bias = mlp_hidden_dim, nn.GELU(approximate="tanh"), True
+    return nn.Sequential(
+        operations.Linear(hidden_size, width_in, bias=bias, **tensor_kwargs),
+        act,
+        operations.Linear(mlp_hidden_dim, hidden_size, bias=bias, **tensor_kwargs),
+    )
 
 class RMSNorm(torch.nn.Module):
     def __init__(self, dim: int, dtype=None, device=None, operations=None):
@@ -80,14 +109,14 @@ class QKNorm(torch.nn.Module):
 
 
 class SelfAttention(nn.Module):
-    def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False, dtype=None, device=None, operations=None):
+    def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False, proj_bias: bool = True, dtype=None, device=None, operations=None):
         super().__init__()
         self.num_heads = num_heads
         head_dim = dim // num_heads
 
         self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device)
         self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
-        self.proj = operations.Linear(dim, dim, dtype=dtype, device=device)
+        self.proj = operations.Linear(dim, dim, bias=proj_bias, dtype=dtype, device=device)
 
 
 @dataclass
@@ -98,11 +127,11 @@ class ModulationOut:
 
 
 class Modulation(nn.Module):
-    def __init__(self, dim: int, double: bool, dtype=None, device=None, operations=None):
+    def __init__(self, dim: int, double: bool, bias=True, dtype=None, device=None, operations=None):
         super().__init__()
         self.is_double = double
         self.multiplier = 6 if double else 3
-        self.lin = operations.Linear(dim, self.multiplier * dim, bias=True, dtype=dtype, device=device)
+        self.lin = operations.Linear(dim, self.multiplier * dim, bias=bias, dtype=dtype, device=device)
 
     def forward(self, vec: Tensor) -> tuple:
         if vec.ndim == 2:
@@ -129,8 +158,18 @@ def apply_mod(tensor, m_mult, m_add=None, modulation_dims=None):
         return tensor
 
 
+class SiLUActivation(nn.Module):  # gated SiLU: silu(first chunk) * second chunk of the last dim
+    def __init__(self):
+        super().__init__()
+        self.gate_fn = nn.SiLU()
+
+    def forward(self, x: Tensor) -> Tensor:
+        gate, value = x.chunk(2, dim=-1)  # two chunks along the last dimension
+        return self.gate_fn(gate) * value  # the activated first chunk gates the second
+
+
 class DoubleStreamBlock(nn.Module):
-    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, modulation=True, dtype=None, device=None, operations=None):
+    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, modulation=True, mlp_silu_act=False, proj_bias=True, yak_mlp=False, dtype=None, device=None, operations=None):
         super().__init__()
 
         mlp_hidden_dim = int(hidden_size * mlp_ratio)
@@ -142,27 +181,22 @@ class DoubleStreamBlock(nn.Module):
             self.img_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
 
         self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
+        self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, proj_bias=proj_bias, dtype=dtype, device=device, operations=operations)
 
         self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.img_mlp = nn.Sequential(
-            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
-            nn.GELU(approximate="tanh"),
-            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
-        )
+
+        self.img_mlp = build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=mlp_silu_act, yak_mlp=yak_mlp, dtype=dtype, device=device, operations=operations)
 
         if self.modulation:
             self.txt_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
 
         self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
+        self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, proj_bias=proj_bias, dtype=dtype, device=device, operations=operations)
 
         self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.txt_mlp = nn.Sequential(
-            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
-            nn.GELU(approximate="tanh"),
-            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
-        )
+
+        self.txt_mlp = build_mlp(hidden_size, mlp_hidden_dim, mlp_silu_act=mlp_silu_act, yak_mlp=yak_mlp, dtype=dtype, device=device, operations=operations)
+
         self.flipped_img_txt = flipped_img_txt
 
     def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims_img=None, modulation_dims_txt=None, transformer_options={}):
@@ -246,6 +280,9 @@ class SingleStreamBlock(nn.Module):
         mlp_ratio: float = 4.0,
         qk_scale: float = None,
         modulation=True,
+        mlp_silu_act=False,
+        bias=True,
+        yak_mlp=False,
         dtype=None,
         device=None,
         operations=None
@@ -257,17 +294,29 @@ class SingleStreamBlock(nn.Module):
         self.scale = qk_scale or head_dim**-0.5
 
         self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
+
+        self.mlp_hidden_dim_first = self.mlp_hidden_dim
+        self.yak_mlp = yak_mlp
+        if mlp_silu_act:
+            self.mlp_hidden_dim_first = int(hidden_size * mlp_ratio * 2)
+            self.mlp_act = SiLUActivation()
+        else:
+            self.mlp_act = nn.GELU(approximate="tanh")
+
+        if self.yak_mlp:
+            self.mlp_hidden_dim_first *= 2
+            self.mlp_act = nn.SiLU()
+
         # qkv and mlp_in
-        self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim, dtype=dtype, device=device)
+        self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim_first, bias=bias, dtype=dtype, device=device)
         # proj and mlp_out
-        self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, dtype=dtype, device=device)
+        self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, bias=bias, dtype=dtype, device=device)
 
         self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
 
         self.hidden_size = hidden_size
         self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
 
-        self.mlp_act = nn.GELU(approximate="tanh")
         if modulation:
             self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations)
         else:
@@ -279,7 +328,7 @@ class SingleStreamBlock(nn.Module):
         else:
             mod = vec
 
-        qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
+        qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim_first], dim=-1)
 
         q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
         del qkv
@@ -289,7 +338,10 @@ class SingleStreamBlock(nn.Module):
         attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options)
         del q, k, v
         # compute activation in mlp stream, cat again and run second linear layer
-        mlp = self.mlp_act(mlp)
+        if self.yak_mlp:
+            mlp = self.mlp_act(mlp[..., self.mlp_hidden_dim_first // 2:]) * mlp[..., :self.mlp_hidden_dim_first // 2]
+        else:
+            mlp = self.mlp_act(mlp)
         output = self.linear2(torch.cat((attn, mlp), 2))
         x += apply_mod(output, mod.gate, None, modulation_dims)
         if x.dtype == torch.float16:
@@ -298,11 +350,11 @@ class SingleStreamBlock(nn.Module):
 
 
 class LastLayer(nn.Module):
-    def __init__(self, hidden_size: int, patch_size: int, out_channels: int, dtype=None, device=None, operations=None):
+    def __init__(self, hidden_size: int, patch_size: int, out_channels: int, bias=True, dtype=None, device=None, operations=None):  # bias: passed as the bias flag of both Linear layers created here
         super().__init__()
         self.norm_final = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device)
-        self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(hidden_size, 2 * hidden_size, bias=True, dtype=dtype, device=device))
+        self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=bias, dtype=dtype, device=device)
+        self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(hidden_size, 2 * hidden_size, bias=bias, dtype=dtype, device=device))
 
     def forward(self, x: Tensor, vec: Tensor, modulation_dims=None) -> Tensor:
         if vec.ndim == 2:
diff --git a/comfy/ldm/flux/math.py b/comfy/ldm/flux/math.py
index 158420290..6a22df8bc 100644
--- a/comfy/ldm/flux/math.py
+++ b/comfy/ldm/flux/math.py
@@ -7,7 +7,8 @@ import comfy.model_management
 
 
 def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None, transformer_options={}) -> Tensor:
-    q, k = apply_rope(q, k, pe)
+    if pe is not None:  # when pe is None, apply_rope is skipped and q/k are used as-is
+        q, k = apply_rope(q, k, pe)
     heads = q.shape[1]
     x = optimized_attention(q, k, v, heads, skip_reshape=True, mask=mask, transformer_options=transformer_options)
     return x
diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py
index b9d36f202..f40c2a7a9 100644
--- a/comfy/ldm/flux/model.py
+++ b/comfy/ldm/flux/model.py
@@ -15,6 +15,8 @@ from .layers import (
     MLPEmbedder,
     SingleStreamBlock,
     timestep_embedding,
+    Modulation,
+    RMSNorm
 )
 
 @dataclass
@@ -33,6 +35,14 @@ class FluxParams:
     patch_size: int
     qkv_bias: bool
     guidance_embed: bool
+    txt_ids_dims: list
+    global_modulation: bool = False
+    mlp_silu_act: bool = False
+    ops_bias: bool = True
+    default_ref_method: str = "offset"
+    ref_index_scale: float = 1.0
+    yak_mlp: bool = False
+    txt_norm: bool = False
 
 
 class Flux(nn.Module):
@@ -58,13 +68,22 @@ class Flux(nn.Module):
         self.hidden_size = params.hidden_size
         self.num_heads = params.num_heads
         self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
-        self.img_in = operations.Linear(self.in_channels, self.hidden_size, bias=True, dtype=dtype, device=device)
-        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations)
-        self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size, dtype=dtype, device=device, operations=operations)
+        self.img_in = operations.Linear(self.in_channels, self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device)
+        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device, operations=operations)
+        if params.vec_in_dim is not None:
+            self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size, dtype=dtype, device=device, operations=operations)
+        else:
+            self.vector_in = None
+
         self.guidance_in = (
-            MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations) if params.guidance_embed else nn.Identity()
+            MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device, operations=operations) if params.guidance_embed else nn.Identity()
         )
-        self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, dtype=dtype, device=device)
+        self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device)
+
+        if params.txt_norm:
+            self.txt_norm = RMSNorm(params.context_in_dim, dtype=dtype, device=device, operations=operations)
+        else:
+            self.txt_norm = None
 
         self.double_blocks = nn.ModuleList(
             [
@@ -73,6 +92,10 @@ class Flux(nn.Module):
                     self.num_heads,
                     mlp_ratio=params.mlp_ratio,
                     qkv_bias=params.qkv_bias,
+                    modulation=params.global_modulation is False,
+                    mlp_silu_act=params.mlp_silu_act,
+                    proj_bias=params.ops_bias,
+                    yak_mlp=params.yak_mlp,
                     dtype=dtype, device=device, operations=operations
                 )
                 for _ in range(params.depth)
@@ -81,13 +104,30 @@ class Flux(nn.Module):
 
         self.single_blocks = nn.ModuleList(
             [
-                SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, dtype=dtype, device=device, operations=operations)
+                SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, modulation=params.global_modulation is False, mlp_silu_act=params.mlp_silu_act, bias=params.ops_bias, yak_mlp=params.yak_mlp, dtype=dtype, device=device, operations=operations)
                 for _ in range(params.depth_single_blocks)
             ]
         )
 
         if final_layer:
-            self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, dtype=dtype, device=device, operations=operations)
+            self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, bias=params.ops_bias, dtype=dtype, device=device, operations=operations)
+
+        if params.global_modulation:
+            self.double_stream_modulation_img = Modulation(
+                self.hidden_size,
+                double=True,
+                bias=False,
+                dtype=dtype, device=device, operations=operations
+            )
+            self.double_stream_modulation_txt = Modulation(
+                self.hidden_size,
+                double=True,
+                bias=False,
+                dtype=dtype, device=device, operations=operations
+            )
+            self.single_stream_modulation = Modulation(
+                self.hidden_size, double=False, bias=False, dtype=dtype, device=device, operations=operations
+            )
 
     def forward_orig(
         self,
@@ -103,9 +143,6 @@ class Flux(nn.Module):
         attn_mask: Tensor = None,
     ) -> Tensor:
 
-        if y is None:
-            y = torch.zeros((img.shape[0], self.params.vec_in_dim), device=img.device, dtype=img.dtype)
-
         patches = transformer_options.get("patches", {})
         patches_replace = transformer_options.get("patches_replace", {})
         if img.ndim != 3 or txt.ndim != 3:
@@ -118,9 +155,19 @@ class Flux(nn.Module):
             if guidance is not None:
                 vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype))
 
-        vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
+        if self.vector_in is not None:
+            if y is None:
+                y = torch.zeros((img.shape[0], self.params.vec_in_dim), device=img.device, dtype=img.dtype)
+            vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
+
+        if self.txt_norm is not None:
+            txt = self.txt_norm(txt)
         txt = self.txt_in(txt)
 
+        vec_orig = vec
+        if self.params.global_modulation:
+            vec = (self.double_stream_modulation_img(vec_orig), self.double_stream_modulation_txt(vec_orig))
+
         if "post_input" in patches:
             for p in patches["post_input"]:
                 out = p({"img": img, "txt": txt, "img_ids": img_ids, "txt_ids": txt_ids})
@@ -136,7 +183,10 @@ class Flux(nn.Module):
             pe = None
 
         blocks_replace = patches_replace.get("dit", {})
+        transformer_options["total_blocks"] = len(self.double_blocks)
+        transformer_options["block_type"] = "double"
         for i, block in enumerate(self.double_blocks):
+            transformer_options["block_index"] = i
             if ("double_block", i) in blocks_replace:
                 def block_wrap(args):
                     out = {}
@@ -177,7 +227,13 @@ class Flux(nn.Module):
 
         img = torch.cat((txt, img), 1)
 
+        if self.params.global_modulation:
+            vec, _ = self.single_stream_modulation(vec_orig)
+
+        transformer_options["total_blocks"] = len(self.single_blocks)
+        transformer_options["block_type"] = "single"
         for i, block in enumerate(self.single_blocks):
+            transformer_options["block_index"] = i
             if ("single_block", i) in blocks_replace:
                 def block_wrap(args):
                     out = {}
@@ -207,7 +263,7 @@ class Flux(nn.Module):
 
         img = img[:, txt.shape[1] :, ...]
 
-        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
+        img = self.final_layer(img, vec_orig)  # (N, T, patch_size ** 2 * out_channels)
         return img
 
     def process_img(self, x, index=0, h_offset=0, w_offset=0, transformer_options={}):
@@ -234,10 +290,10 @@ class Flux(nn.Module):
             h_offset += rope_options.get("shift_y", 0.0)
             w_offset += rope_options.get("shift_x", 0.0)
 
-        img_ids = torch.zeros((steps_h, steps_w, 3), device=x.device, dtype=x.dtype)
+        img_ids = torch.zeros((steps_h, steps_w, len(self.params.axes_dim)), device=x.device, dtype=torch.float32)
         img_ids[:, :, 0] = img_ids[:, :, 1] + index
-        img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=steps_h, device=x.device, dtype=x.dtype).unsqueeze(1)
-        img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=steps_w, device=x.device, dtype=x.dtype).unsqueeze(0)
+        img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=steps_h, device=x.device, dtype=torch.float32).unsqueeze(1)
+        img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=steps_w, device=x.device, dtype=torch.float32).unsqueeze(0)
         return img, repeat(img_ids, "h w c -> b (h w) c", b=bs)
 
     def forward(self, x, timestep, context, y=None, guidance=None, ref_latents=None, control=None, transformer_options={}, **kwargs):
@@ -259,10 +315,10 @@ class Flux(nn.Module):
             h = 0
             w = 0
             index = 0
-            ref_latents_method = kwargs.get("ref_latents_method", "offset")
+            ref_latents_method = kwargs.get("ref_latents_method", self.params.default_ref_method)
             for ref in ref_latents:
                 if ref_latents_method == "index":
-                    index += 1
+                    index += self.params.ref_index_scale
                     h_offset = 0
                     w_offset = 0
                 elif ref_latents_method == "uxo":
@@ -286,7 +342,12 @@ class Flux(nn.Module):
                 img = torch.cat([img, kontext], dim=1)
                 img_ids = torch.cat([img_ids, kontext_ids], dim=1)
 
-        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
+        txt_ids = torch.zeros((bs, context.shape[1], len(self.params.axes_dim)), device=x.device, dtype=torch.float32)
+
+        if len(self.params.txt_ids_dims) > 0:
+            for i in self.params.txt_ids_dims:
+                txt_ids[:, :, i] = torch.linspace(0, context.shape[1] - 1, steps=context.shape[1], device=x.device, dtype=torch.float32)
+
         out = self.forward_orig(img, img_ids, context, txt_ids, timestep, y, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None))
         out = out[:, :img_tokens]
-        return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)[:,:,:h_orig,:w_orig]
+        return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=self.patch_size, pw=self.patch_size)[:,:,:h_orig,:w_orig]
diff --git a/comfy/ldm/hunyuan_video/model.py b/comfy/ldm/hunyuan_video/model.py
index 5132e6c07..2749c53f5 100644
--- a/comfy/ldm/hunyuan_video/model.py
+++ b/comfy/ldm/hunyuan_video/model.py
@@ -6,7 +6,6 @@ import comfy.ldm.flux.layers
 import comfy.ldm.modules.diffusionmodules.mmdit
 from comfy.ldm.modules.attention import optimized_attention
 
-
 from dataclasses import dataclass
 from einops import repeat
 
@@ -42,6 +41,8 @@ class HunyuanVideoParams:
     guidance_embed: bool
     byt5: bool
     meanflow: bool
+    use_cond_type_embedding: bool
+    vision_in_dim: int
 
 
 class SelfAttentionRef(nn.Module):
@@ -157,7 +158,10 @@ class TokenRefiner(nn.Module):
         t = self.t_embedder(timestep_embedding(timesteps, 256, time_factor=1.0).to(x.dtype))
         # m = mask.float().unsqueeze(-1)
         # c = (x.float() * m).sum(dim=1) / m.sum(dim=1) #TODO: the following works when the x.shape is the same length as the tokens but might break otherwise
-        c = x.sum(dim=1) / x.shape[1]
+        if x.dtype == torch.float16:
+            c = x.float().sum(dim=1) / x.shape[1]
+        else:
+            c = x.sum(dim=1) / x.shape[1]
 
         c = t + self.c_embedder(c.to(x.dtype))
         x = self.input_embedder(x)
@@ -196,11 +200,15 @@ class HunyuanVideo(nn.Module):
     def __init__(self, image_model=None, final_layer=True, dtype=None, device=None, operations=None, **kwargs):
         super().__init__()
         self.dtype = dtype
+        operation_settings = {"operations": operations, "device": device, "dtype": dtype}
+
         params = HunyuanVideoParams(**kwargs)
         self.params = params
         self.patch_size = params.patch_size
         self.in_channels = params.in_channels
         self.out_channels = params.out_channels
+        self.use_cond_type_embedding = params.use_cond_type_embedding
+        self.vision_in_dim = params.vision_in_dim
         if params.hidden_size % params.num_heads != 0:
             raise ValueError(
                 f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
@@ -266,6 +274,18 @@ class HunyuanVideo(nn.Module):
         if final_layer:
             self.final_layer = LastLayer(self.hidden_size, self.patch_size[-1], self.out_channels, dtype=dtype, device=device, operations=operations)
 
+        # HunyuanVideo 1.5 specific modules
+        if self.vision_in_dim is not None:
+            from comfy.ldm.wan.model import MLPProj
+            self.vision_in = MLPProj(in_dim=self.vision_in_dim, out_dim=self.hidden_size, operation_settings=operation_settings)
+        else:
+            self.vision_in = None
+        if self.use_cond_type_embedding:
+            # 0: text_encoder feature 1: byt5 feature 2: vision_encoder feature
+            self.cond_type_embedding = nn.Embedding(3, self.hidden_size)
+        else:
+            self.cond_type_embedding = None
+
     def forward_orig(
         self,
         img: Tensor,
@@ -276,6 +296,7 @@ class HunyuanVideo(nn.Module):
         timesteps: Tensor,
         y: Tensor = None,
         txt_byt5=None,
+        clip_fea=None,
         guidance: Tensor = None,
         guiding_frame_index=None,
         ref_latent=None,
@@ -331,12 +352,31 @@ class HunyuanVideo(nn.Module):
 
         txt = self.txt_in(txt, timesteps, txt_mask, transformer_options=transformer_options)
 
+        if self.cond_type_embedding is not None:
+            self.cond_type_embedding.to(txt.device)
+            cond_emb = self.cond_type_embedding(torch.zeros_like(txt[:, :, 0], device=txt.device, dtype=torch.long))
+            txt = txt + cond_emb.to(txt.dtype)
+
         if self.byt5_in is not None and txt_byt5 is not None:
             txt_byt5 = self.byt5_in(txt_byt5)
+            if self.cond_type_embedding is not None:
+                cond_emb = self.cond_type_embedding(torch.ones_like(txt_byt5[:, :, 0], device=txt_byt5.device, dtype=torch.long))
+                txt_byt5 = txt_byt5 + cond_emb.to(txt_byt5.dtype)
+                txt = torch.cat((txt_byt5, txt), dim=1) # byt5 first for HunyuanVideo1.5
+            else:
+                txt = torch.cat((txt, txt_byt5), dim=1)
             txt_byt5_ids = torch.zeros((txt_ids.shape[0], txt_byt5.shape[1], txt_ids.shape[-1]), device=txt_ids.device, dtype=txt_ids.dtype)
-            txt = torch.cat((txt, txt_byt5), dim=1)
             txt_ids = torch.cat((txt_ids, txt_byt5_ids), dim=1)
 
+        if clip_fea is not None:
+            txt_vision_states = self.vision_in(clip_fea)
+            if self.cond_type_embedding is not None:
+                cond_emb = self.cond_type_embedding(2 * torch.ones_like(txt_vision_states[:, :, 0], dtype=torch.long, device=txt_vision_states.device))
+                txt_vision_states = txt_vision_states + cond_emb
+            txt = torch.cat((txt_vision_states.to(txt.dtype), txt), dim=1)
+            extra_txt_ids = torch.zeros((txt_ids.shape[0], txt_vision_states.shape[1], txt_ids.shape[-1]), device=txt_ids.device, dtype=txt_ids.dtype)
+            txt_ids = torch.cat((txt_ids, extra_txt_ids), dim=1)
+
         ids = torch.cat((img_ids, txt_ids), dim=1)
         pe = self.pe_embedder(ids)
 
@@ -349,7 +389,10 @@ class HunyuanVideo(nn.Module):
             attn_mask = None
 
         blocks_replace = patches_replace.get("dit", {})
+        transformer_options["total_blocks"] = len(self.double_blocks)
+        transformer_options["block_type"] = "double"
         for i, block in enumerate(self.double_blocks):
+            transformer_options["block_index"] = i
             if ("double_block", i) in blocks_replace:
                 def block_wrap(args):
                     out = {}
@@ -371,7 +414,10 @@ class HunyuanVideo(nn.Module):
 
         img = torch.cat((img, txt), 1)
 
+        transformer_options["total_blocks"] = len(self.single_blocks)
+        transformer_options["block_type"] = "single"
         for i, block in enumerate(self.single_blocks):
+            transformer_options["block_index"] = i
             if ("single_block", i) in blocks_replace:
                 def block_wrap(args):
                     out = {}
@@ -430,14 +476,14 @@ class HunyuanVideo(nn.Module):
         img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
         return repeat(img_ids, "h w c -> b (h w) c", b=bs)
 
-    def forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs):
+    def forward(self, x, timestep, context, y=None, txt_byt5=None, clip_fea=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs):
         return comfy.patcher_extension.WrapperExecutor.new_class_executor(
             self._forward,
             self,
             comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
-        ).execute(x, timestep, context, y, txt_byt5, guidance, attention_mask, guiding_frame_index, ref_latent, disable_time_r, control, transformer_options, **kwargs)
+        ).execute(x, timestep, context, y, txt_byt5, clip_fea, guidance, attention_mask, guiding_frame_index, ref_latent, disable_time_r, control, transformer_options, **kwargs)
 
-    def _forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs):
+    def _forward(self, x, timestep, context, y=None, txt_byt5=None, clip_fea=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs):
         bs = x.shape[0]
         if len(self.patch_size) == 3:
             img_ids = self.img_ids(x)
@@ -445,5 +491,5 @@ class HunyuanVideo(nn.Module):
         else:
             img_ids = self.img_ids_2d(x)
             txt_ids = torch.zeros((bs, context.shape[1], 2), device=x.device, dtype=x.dtype)
-        out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, txt_byt5, guidance, guiding_frame_index, ref_latent, disable_time_r=disable_time_r, control=control, transformer_options=transformer_options)
+        out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, txt_byt5, clip_fea, guidance, guiding_frame_index, ref_latent, disable_time_r=disable_time_r, control=control, transformer_options=transformer_options)
         return out
diff --git a/comfy/ldm/hunyuan_video/upsampler.py b/comfy/ldm/hunyuan_video/upsampler.py
new file mode 100644
index 000000000..85f515f67
--- /dev/null
+++ b/comfy/ldm/hunyuan_video/upsampler.py
@@ -0,0 +1,167 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, VideoConv3d
+from comfy.ldm.hunyuan_video.vae_refiner import RMS_norm
+import comfy.model_management
+import comfy.model_patcher
+
+
+class SRResidualCausalBlock3D(nn.Module):
+    """Residual block: three VideoConv3d layers with SiLU between them, added to the input."""
+
+    def __init__(self, channels: int):
+        super().__init__()
+        self.block = nn.Sequential(
+            VideoConv3d(channels, channels, kernel_size=3),
+            nn.SiLU(inplace=True),
+            VideoConv3d(channels, channels, kernel_size=3),
+            nn.SiLU(inplace=True),
+            VideoConv3d(channels, channels, kernel_size=3),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x + self.block(x)
+
+
+class SRModel3DV2(nn.Module):
+    """Stack of SRResidualCausalBlock3D between an input and an output VideoConv3d.
+
+    Args:
+        in_channels: channels of the input tensor.
+        out_channels: channels produced by the final convolution.
+        hidden_channels: width used by the input conv and every residual block.
+        num_blocks: number of residual blocks.
+        global_residual: when true, add the input back onto the output, but
+            only if both have exactly the same shape.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        hidden_channels: int = 64,
+        num_blocks: int = 6,
+        global_residual: bool = False,
+    ):
+        super().__init__()
+        self.in_conv = VideoConv3d(in_channels, hidden_channels, kernel_size=3)
+        self.blocks = nn.ModuleList([SRResidualCausalBlock3D(hidden_channels) for _ in range(num_blocks)])
+        self.out_conv = VideoConv3d(hidden_channels, out_channels, kernel_size=3)
+        self.global_residual = bool(global_residual)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        residual = x
+        y = self.in_conv(x)
+        for blk in self.blocks:
+            y = blk(y)
+        y = self.out_conv(y)
+        # The skip connection is silently dropped when the shapes differ.
+        if self.global_residual and (y.shape == residual.shape):
+            y = y + residual
+        return y
+
+
+class Upsampler(nn.Module):
+    """Sequence of ResnetBlock stages between an input and an output VideoConv3d.
+
+    Args:
+        z_channels: channels of the input latent.
+        out_channels: channels produced by the final convolution.
+        block_out_channels: output width of each stage, in order.
+        num_res_blocks: each stage holds ``num_res_blocks + 1`` ResnetBlocks.
+    """
+
+    def __init__(
+        self,
+        z_channels: int,
+        out_channels: int,
+        block_out_channels: tuple[int, ...],
+        num_res_blocks: int = 2,
+    ):
+        super().__init__()
+        self.num_res_blocks = num_res_blocks
+        self.block_out_channels = block_out_channels
+        self.z_channels = z_channels
+
+        ch = block_out_channels[0]
+        self.conv_in = VideoConv3d(z_channels, ch, kernel_size=3)
+
+        self.up = nn.ModuleList()
+
+        for tgt in block_out_channels:
+            # Bare nn.Module container keeps the "up.<i>.block.<j>" parameter names.
+            stage = nn.Module()
+            stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt,
+                                                    out_channels=tgt,
+                                                    temb_channels=0,
+                                                    conv_shortcut=False,
+                                                    conv_op=VideoConv3d, norm_op=RMS_norm)
+                                        for j in range(num_res_blocks + 1)])
+            ch = tgt
+            self.up.append(stage)
+
+        self.norm_out = RMS_norm(ch)
+        self.conv_out = VideoConv3d(ch, out_channels, kernel_size=3)
+
+    def forward(self, z):
+        """Run the latent through conv_in, every stage and conv_out.
+
+        Args:
+            z: (B, C, T, H, W)
+        """
+        # z to block_in: tile z's channels so it can be added to conv_in's output;
+        # this expects block_out_channels[0] to be a multiple of z_channels.
+        repeats = self.block_out_channels[0] // (self.z_channels)
+        x = self.conv_in(z) + z.repeat_interleave(repeats=repeats, dim=1)
+
+        # run every ResnetBlock of every stage in order
+        for stage in self.up:
+            for blk in stage.block:
+                x = blk(x)
+
+        out = self.conv_out(F.silu(self.norm_out(x)))
+        return out
+
+
+UPSAMPLERS = {
+    "720p": SRModel3DV2,
+    "1080p": Upsampler,
+}
+
+
+class HunyuanVideo15SRModel():
+    """Builds the upsampler registered for ``model_type`` and wraps it in a ModelPatcher."""
+
+    def __init__(self, model_type, config):
+        """
+        Args:
+            model_type: key into UPSAMPLERS ("720p" or "1080p").
+            config: keyword arguments forwarded to the upsampler constructor.
+
+        Raises:
+            ValueError: if model_type is not a key of UPSAMPLERS.
+        """
+        self.load_device = comfy.model_management.vae_device()
+        offload_device = comfy.model_management.vae_offload_device()
+        self.dtype = comfy.model_management.vae_dtype(self.load_device)
+        self.model_class = UPSAMPLERS.get(model_type)
+        if self.model_class is None:
+            # Fail with a readable message instead of "'NoneType' object is not callable".
+            raise ValueError(f"Unknown upsampler model_type {model_type!r}; expected one of {list(UPSAMPLERS)}")
+        self.model = self.model_class(**config).eval()
+
+        self.patcher = comfy.model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
+
+    def load_sd(self, sd):
+        """Load ``sd`` into the wrapped model with strict key matching."""
+        return self.model.load_state_dict(sd, strict=True)
+
+    def get_sd(self):
+        """Return the wrapped model's state dict."""
+        return self.model.state_dict()
+
+    def resample_latent(self, latent):
+        """Load the model via load_model_gpu, then run it on ``latent`` moved to load_device."""
+        comfy.model_management.load_model_gpu(self.patcher)
+        return self.model(latent.to(self.load_device))
diff --git a/comfy/ldm/hunyuan_video/vae_refiner.py b/comfy/ldm/hunyuan_video/vae_refiner.py
index c2a0b507d..ddf77cd0e 100644
--- a/comfy/ldm/hunyuan_video/vae_refiner.py
+++ b/comfy/ldm/hunyuan_video/vae_refiner.py
@@ -1,11 +1,13 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock, VideoConv3d, Normalize
+from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock, CarriedConv3d, Normalize, conv_carry_causal_3d, torch_cat_if_needed
 import comfy.ops
 import comfy.ldm.models.autoencoder
+import comfy.model_management
 ops = comfy.ops.disable_weight_init
 
+
 class RMS_norm(nn.Module):
     def __init__(self, dim):
         super().__init__()
@@ -14,10 +16,10 @@ class RMS_norm(nn.Module):
         self.gamma = nn.Parameter(torch.empty(shape))
 
     def forward(self, x):
-        return F.normalize(x, dim=1) * self.scale * self.gamma
+        return F.normalize(x, dim=1) * self.scale * comfy.model_management.cast_to(self.gamma, dtype=x.dtype, device=x.device)  # gamma goes through cast_to with x's dtype/device before the multiply
 
 class DnSmpl(nn.Module):
-    def __init__(self, ic, oc, tds=True, refiner_vae=True, op=VideoConv3d):
+    def __init__(self, ic, oc, tds, refiner_vae, op):
         super().__init__()
         fct = 2 * 2 * 2 if tds else 1 * 2 * 2
         assert oc % fct == 0
@@ -27,11 +29,12 @@ class DnSmpl(nn.Module):
         self.tds = tds
         self.gs = fct * ic // oc
 
-    def forward(self, x):
+    def forward(self, x, conv_carry_in=None, conv_carry_out=None):
         r1 = 2 if self.tds else 1
-        h = self.conv(x)
+        h = conv_carry_causal_3d([x], self.conv, conv_carry_in, conv_carry_out)
+
+        if self.tds and self.refiner_vae and conv_carry_in is None:
 
-        if self.tds and self.refiner_vae:
             hf = h[:, :, :1, :, :]
             b, c, f, ht, wd = hf.shape
             hf = hf.reshape(b, c, f, ht // 2, 2, wd // 2, 2)
@@ -39,14 +42,7 @@ class DnSmpl(nn.Module):
             hf = hf.reshape(b, 2 * 2 * c, f, ht // 2, wd // 2)
             hf = torch.cat([hf, hf], dim=1)
 
-            hn = h[:, :, 1:, :, :]
-            b, c, frms, ht, wd = hn.shape
-            nf = frms // r1
-            hn = hn.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2)
-            hn = hn.permute(0, 3, 5, 7, 1, 2, 4, 6)
-            hn = hn.reshape(b, r1 * 2 * 2 * c, nf, ht // 2, wd // 2)
-
-            h = torch.cat([hf, hn], dim=2)
+            h = h[:, :, 1:, :, :]
 
             xf = x[:, :, :1, :, :]
             b, ci, f, ht, wd = xf.shape
@@ -54,38 +50,36 @@ class DnSmpl(nn.Module):
             xf = xf.permute(0, 4, 6, 1, 2, 3, 5)
             xf = xf.reshape(b, 2 * 2 * ci, f, ht // 2, wd // 2)
             B, C, T, H, W = xf.shape
-            xf = xf.view(B, h.shape[1], self.gs // 2, T, H, W).mean(dim=2)
+            xf = xf.view(B, hf.shape[1], self.gs // 2, T, H, W).mean(dim=2)
 
-            xn = x[:, :, 1:, :, :]
-            b, ci, frms, ht, wd = xn.shape
-            nf = frms // r1
-            xn = xn.reshape(b, ci, nf, r1, ht // 2, 2, wd // 2, 2)
-            xn = xn.permute(0, 3, 5, 7, 1, 2, 4, 6)
-            xn = xn.reshape(b, r1 * 2 * 2 * ci, nf, ht // 2, wd // 2)
-            B, C, T, H, W = xn.shape
-            xn = xn.view(B, h.shape[1], self.gs, T, H, W).mean(dim=2)
-            sc = torch.cat([xf, xn], dim=2)
-        else:
-            b, c, frms, ht, wd = h.shape
+            x = x[:, :, 1:, :, :]
 
-            nf = frms // r1
-            h = h.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2)
-            h = h.permute(0, 3, 5, 7, 1, 2, 4, 6)
-            h = h.reshape(b, r1 * 2 * 2 * c, nf, ht // 2, wd // 2)
+        if h.shape[2] == 0:
+            return hf + xf
 
-            b, ci, frms, ht, wd = x.shape
-            nf = frms // r1
-            sc = x.reshape(b, ci, nf, r1, ht // 2, 2, wd // 2, 2)
-            sc = sc.permute(0, 3, 5, 7, 1, 2, 4, 6)
-            sc = sc.reshape(b, r1 * 2 * 2 * ci, nf, ht // 2, wd // 2)
-            B, C, T, H, W = sc.shape
-            sc = sc.view(B, h.shape[1], self.gs, T, H, W).mean(dim=2)
+        b, c, frms, ht, wd = h.shape
+        nf = frms // r1
+        h = h.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2)
+        h = h.permute(0, 3, 5, 7, 1, 2, 4, 6)
+        h = h.reshape(b, r1 * 2 * 2 * c, nf, ht // 2, wd // 2)
 
-        return h + sc
+        b, ci, frms, ht, wd = x.shape
+        nf = frms // r1
+        x = x.reshape(b, ci, nf, r1, ht // 2, 2, wd // 2, 2)
+        x = x.permute(0, 3, 5, 7, 1, 2, 4, 6)
+        x = x.reshape(b, r1 * 2 * 2 * ci, nf, ht // 2, wd // 2)
+        B, C, T, H, W = x.shape
+        x = x.view(B, h.shape[1], self.gs, T, H, W).mean(dim=2)
+
+        if self.tds and self.refiner_vae and conv_carry_in is None:
+            h = torch.cat([hf, h], dim=2)
+            x = torch.cat([xf, x], dim=2)
+
+        return h + x
 
 
 class UpSmpl(nn.Module):
-    def __init__(self, ic, oc, tus=True, refiner_vae=True, op=VideoConv3d):
+    def __init__(self, ic, oc, tus, refiner_vae, op):
         super().__init__()
         fct = 2 * 2 * 2 if tus else 1 * 2 * 2
         self.conv = op(ic, oc * fct, kernel_size=3, stride=1, padding=1)
@@ -94,11 +88,11 @@ class UpSmpl(nn.Module):
         self.tus = tus
         self.rp = fct * oc // ic
 
-    def forward(self, x):
+    def forward(self, x, conv_carry_in=None, conv_carry_out=None):
         r1 = 2 if self.tus else 1
-        h = self.conv(x)
+        h = conv_carry_causal_3d([x], self.conv, conv_carry_in, conv_carry_out)
 
-        if self.tus and self.refiner_vae:
+        if self.tus and self.refiner_vae and conv_carry_in is None:
             hf = h[:, :, :1, :, :]
             b, c, f, ht, wd = hf.shape
             nc = c // (2 * 2)
@@ -107,14 +101,7 @@ class UpSmpl(nn.Module):
             hf = hf.reshape(b, nc, f, ht * 2, wd * 2)
             hf = hf[:, : hf.shape[1] // 2]
 
-            hn = h[:, :, 1:, :, :]
-            b, c, frms, ht, wd = hn.shape
-            nc = c // (r1 * 2 * 2)
-            hn = hn.reshape(b, r1, 2, 2, nc, frms, ht, wd)
-            hn = hn.permute(0, 4, 5, 1, 6, 2, 7, 3)
-            hn = hn.reshape(b, nc, frms * r1, ht * 2, wd * 2)
-
-            h = torch.cat([hf, hn], dim=2)
+            h = h[:, :, 1:, :, :]
 
             xf = x[:, :, :1, :, :]
             b, ci, f, ht, wd = xf.shape
@@ -125,29 +112,26 @@ class UpSmpl(nn.Module):
             xf = xf.permute(0, 3, 4, 5, 1, 6, 2)
             xf = xf.reshape(b, nc, f, ht * 2, wd * 2)
 
-            xn = x[:, :, 1:, :, :]
-            xn = xn.repeat_interleave(repeats=self.rp, dim=1)
-            b, c, frms, ht, wd = xn.shape
-            nc = c // (r1 * 2 * 2)
-            xn = xn.reshape(b, r1, 2, 2, nc, frms, ht, wd)
-            xn = xn.permute(0, 4, 5, 1, 6, 2, 7, 3)
-            xn = xn.reshape(b, nc, frms * r1, ht * 2, wd * 2)
-            sc = torch.cat([xf, xn], dim=2)
-        else:
-            b, c, frms, ht, wd = h.shape
-            nc = c // (r1 * 2 * 2)
-            h = h.reshape(b, r1, 2, 2, nc, frms, ht, wd)
-            h = h.permute(0, 4, 5, 1, 6, 2, 7, 3)
-            h = h.reshape(b, nc, frms * r1, ht * 2, wd * 2)
+            x = x[:, :, 1:, :, :]
 
-            sc = x.repeat_interleave(repeats=self.rp, dim=1)
-            b, c, frms, ht, wd = sc.shape
-            nc = c // (r1 * 2 * 2)
-            sc = sc.reshape(b, r1, 2, 2, nc, frms, ht, wd)
-            sc = sc.permute(0, 4, 5, 1, 6, 2, 7, 3)
-            sc = sc.reshape(b, nc, frms * r1, ht * 2, wd * 2)
+        b, c, frms, ht, wd = h.shape
+        nc = c // (r1 * 2 * 2)
+        h = h.reshape(b, r1, 2, 2, nc, frms, ht, wd)
+        h = h.permute(0, 4, 5, 1, 6, 2, 7, 3)
+        h = h.reshape(b, nc, frms * r1, ht * 2, wd * 2)
 
-        return h + sc
+        x = x.repeat_interleave(repeats=self.rp, dim=1)
+        b, c, frms, ht, wd = x.shape
+        nc = c // (r1 * 2 * 2)
+        x = x.reshape(b, r1, 2, 2, nc, frms, ht, wd)
+        x = x.permute(0, 4, 5, 1, 6, 2, 7, 3)
+        x = x.reshape(b, nc, frms * r1, ht * 2, wd * 2)
+
+        if self.tus and self.refiner_vae and conv_carry_in is None:
+            h = torch.cat([hf, h], dim=2)
+            x = torch.cat([xf, x], dim=2)
+
+        return h + x
 
 class Encoder(nn.Module):
     def __init__(self, in_channels, z_channels, block_out_channels, num_res_blocks,
@@ -160,7 +144,7 @@ class Encoder(nn.Module):
 
         self.refiner_vae = refiner_vae
         if self.refiner_vae:
-            conv_op = VideoConv3d
+            conv_op = CarriedConv3d
             norm_op = RMS_norm
         else:
             conv_op = ops.Conv3d
@@ -188,9 +172,9 @@ class Encoder(nn.Module):
             self.down.append(stage)
 
         self.mid = nn.Module()
-        self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
+        self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
         self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op)
-        self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
+        self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
 
         self.norm_out = norm_op(ch)
         self.conv_out = conv_op(ch, z_channels << 1, 3, 1, 1)
@@ -201,31 +185,48 @@ class Encoder(nn.Module):
         if not self.refiner_vae and x.shape[2] == 1:
             x = x.expand(-1, -1, self.ffactor_temporal, -1, -1)
 
-        x = self.conv_in(x)
+        if self.refiner_vae:
+            xl = [x[:, :, :1, :, :]]
+            if x.shape[2] > self.ffactor_temporal:
+                xl += torch.split(x[:, :, 1: 1 + ((x.shape[2] - 1) // self.ffactor_temporal) * self.ffactor_temporal, :, :], self.ffactor_temporal * 2, dim=2)
+            x = xl
+        else:
+            x = [x]
+        out = []
 
-        for stage in self.down:
-            for blk in stage.block:
-                x = blk(x)
-            if hasattr(stage, 'downsample'):
-                x = stage.downsample(x)
+        conv_carry_in = None
 
-        x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(x)))
+        for i, x1 in enumerate(x):
+            conv_carry_out = []
+            if i == len(x) - 1:
+                conv_carry_out = None
+
+            x1 = [ x1 ]
+            x1 = conv_carry_causal_3d(x1, self.conv_in, conv_carry_in, conv_carry_out)
+
+            for stage in self.down:
+                for blk in stage.block:
+                    x1 = blk(x1, None, conv_carry_in, conv_carry_out)
+                if hasattr(stage, 'downsample'):
+                    x1 = stage.downsample(x1, conv_carry_in, conv_carry_out)
+
+            out.append(x1)
+            conv_carry_in = conv_carry_out
+
+        out = torch_cat_if_needed(out, dim=2)
+
+        x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(out)))
+        del out
 
         b, c, t, h, w = x.shape
         grp = c // (self.z_channels << 1)
         skip = x.view(b, c // grp, grp, t, h, w).mean(2)
 
-        out = self.conv_out(F.silu(self.norm_out(x))) + skip
+        out = conv_carry_causal_3d([F.silu(self.norm_out(x))], self.conv_out) + skip
 
         if self.refiner_vae:
             out = self.regul(out)[0]
 
-            out = torch.cat((out[:, :, :1], out), dim=2)
-            out = out.permute(0, 2, 1, 3, 4)
-            b, f_times_2, c, h, w = out.shape
-            out = out.reshape(b, f_times_2 // 2, 2 * c, h, w)
-            out = out.permute(0, 2, 1, 3, 4).contiguous()
-
         return out
 
 class Decoder(nn.Module):
@@ -239,7 +240,7 @@ class Decoder(nn.Module):
 
         self.refiner_vae = refiner_vae
         if self.refiner_vae:
-            conv_op = VideoConv3d
+            conv_op = CarriedConv3d
             norm_op = RMS_norm
         else:
             conv_op = ops.Conv3d
@@ -249,9 +250,9 @@ class Decoder(nn.Module):
         self.conv_in = conv_op(z_channels, ch, kernel_size=3, stride=1, padding=1)
 
         self.mid = nn.Module()
-        self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
+        self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
         self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op)
-        self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
+        self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch,  conv_op=conv_op, norm_op=norm_op)
 
         self.up = nn.ModuleList()
         depth = (ffactor_spatial >> 1).bit_length()
@@ -275,27 +276,38 @@ class Decoder(nn.Module):
         self.conv_out = conv_op(ch, out_channels, 3, stride=1, padding=1)
 
     def forward(self, z):
-        if self.refiner_vae:
-            z = z.permute(0, 2, 1, 3, 4)
-            b, f, c, h, w = z.shape
-            z = z.reshape(b, f, 2, c // 2, h, w)
-            z = z.permute(0, 1, 2, 3, 4, 5).reshape(b, f * 2, c // 2, h, w)
-            z = z.permute(0, 2, 1, 3, 4)
-            z = z[:, :, 1:]
-
-        x = self.conv_in(z) + z.repeat_interleave(self.block_out_channels[0] // self.z_channels, 1)
+        x = conv_carry_causal_3d([z], self.conv_in) + z.repeat_interleave(self.block_out_channels[0] // self.z_channels, 1)
         x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(x)))
 
-        for stage in self.up:
-            for blk in stage.block:
-                x = blk(x)
-            if hasattr(stage, 'upsample'):
-                x = stage.upsample(x)
+        if self.refiner_vae:
+            x = torch.split(x, 2, dim=2)
+        else:
+            x = [ x ]
+        out = []
 
-        out = self.conv_out(F.silu(self.norm_out(x)))
+        conv_carry_in = None
+
+        for i, x1 in enumerate(x):
+            conv_carry_out = []
+            if i == len(x) - 1:
+                conv_carry_out = None
+            for stage in self.up:
+                for blk in stage.block:
+                    x1 = blk(x1, None, conv_carry_in, conv_carry_out)
+                if hasattr(stage, 'upsample'):
+                    x1 = stage.upsample(x1, conv_carry_in, conv_carry_out)
+
+            x1 = [ F.silu(self.norm_out(x1)) ]
+            x1 = conv_carry_causal_3d(x1, self.conv_out, conv_carry_in, conv_carry_out)
+            out.append(x1)
+            conv_carry_in = conv_carry_out
+        del x
+
+        out = torch_cat_if_needed(out, dim=2)
 
         if not self.refiner_vae:
             if z.shape[-3] == 1:
                 out = out[:, :, -1:]
 
         return out
+
diff --git a/comfy/ldm/lumina/controlnet.py b/comfy/ldm/lumina/controlnet.py
new file mode 100644
index 000000000..fd7ce3b5c
--- /dev/null
+++ b/comfy/ldm/lumina/controlnet.py
@@ -0,0 +1,155 @@
+import torch
+from torch import nn
+
+from .model import JointTransformerBlock
+
+class ZImageControlTransformerBlock(JointTransformerBlock):
+    """JointTransformerBlock variant that also returns a projected skip output.
+
+    Only the block created with ``block_id == 0`` owns ``before_proj``, which is
+    applied to the control stream before it is summed with ``x``. Every block
+    owns ``after_proj``, which produces the skip output.
+    """
+
+    def __init__(
+        self,
+        layer_id: int,
+        dim: int,
+        n_heads: int,
+        n_kv_heads: int,
+        multiple_of: int,
+        ffn_dim_multiplier: float,
+        norm_eps: float,
+        qk_norm: bool,
+        modulation=True,
+        block_id=0,
+        operation_settings=None,
+    ):
+        # NOTE(review): operation_settings defaults to None but is dereferenced
+        # with .get() below, so callers always have to pass a dict.
+        super().__init__(layer_id, dim, n_heads, n_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, qk_norm, modulation, z_image_modulation=True, operation_settings=operation_settings)
+        self.block_id = block_id
+        if block_id == 0:
+            self.before_proj = operation_settings.get("operations").Linear(self.dim, self.dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
+        self.after_proj = operation_settings.get("operations").Linear(self.dim, self.dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
+
+    def forward(self, c, x, **kwargs):
+        """Advance the control stream by one block.
+
+        Args:
+            c: Control stream.
+            x: Tensor added to the projected control stream when
+                ``block_id == 0``; unused by the other blocks.
+            **kwargs: Passed through to the parent ``forward``.
+
+        Returns:
+            Tuple ``(c_skip, c)``: the ``after_proj`` projection of the updated
+            control stream, followed by the updated control stream itself.
+        """
+        if self.block_id == 0:
+            c = self.before_proj(c) + x
+        c = super().forward(c, **kwargs)
+        c_skip = self.after_proj(c)
+        return c_skip, c
+
+class ZImage_Control(torch.nn.Module):
+    """Control branch built from a patch embedder, refiner blocks and control blocks.
+
+    ``forward`` embeds the control input and runs it through
+    ``control_noise_refiner``; ``forward_control_block`` runs one entry of
+    ``control_layers``.
+    """
+
+    # Patch geometry of the control embedder. __init__ registers the embedder
+    # under the key "{PATCH_SIZE}-{F_PATCH_SIZE}" and forward looks it up by the
+    # same key, so both read the values from here instead of repeating them.
+    PATCH_SIZE = 2
+    F_PATCH_SIZE = 1
+
+    def __init__(
+        self,
+        dim: int = 3840,
+        n_heads: int = 30,
+        n_kv_heads: int = 30,
+        multiple_of: int = 256,
+        ffn_dim_multiplier: float = (8.0 / 3.0),
+        norm_eps: float = 1e-5,
+        qk_norm: bool = True,
+        dtype=None,
+        device=None,
+        operations=None,
+        **kwargs
+    ):
+        super().__init__()
+        operation_settings = {"operations": operations, "device": device, "dtype": dtype}
+
+        self.additional_in_dim = 0
+        self.control_in_dim = 16
+        n_refiner_layers = 2
+        self.n_control_layers = 6
+        self.control_layers = nn.ModuleList(
+            [
+                ZImageControlTransformerBlock(
+                    i,
+                    dim,
+                    n_heads,
+                    n_kv_heads,
+                    multiple_of,
+                    ffn_dim_multiplier,
+                    norm_eps,
+                    qk_norm,
+                    block_id=i,
+                    operation_settings=operation_settings,
+                )
+                for i in range(self.n_control_layers)
+            ]
+        )
+
+        all_x_embedder = {}
+        patch_size = self.PATCH_SIZE
+        f_patch_size = self.F_PATCH_SIZE
+        x_embedder = operations.Linear(f_patch_size * patch_size * patch_size * self.control_in_dim, dim, bias=True, device=device, dtype=dtype)
+        all_x_embedder[f"{patch_size}-{f_patch_size}"] = x_embedder
+
+        self.control_all_x_embedder = nn.ModuleDict(all_x_embedder)
+        self.control_noise_refiner = nn.ModuleList(
+            [
+                JointTransformerBlock(
+                    layer_id,
+                    dim,
+                    n_heads,
+                    n_kv_heads,
+                    multiple_of,
+                    ffn_dim_multiplier,
+                    norm_eps,
+                    qk_norm,
+                    modulation=True,
+                    z_image_modulation=True,
+                    operation_settings=operation_settings,
+                )
+                for layer_id in range(n_refiner_layers)
+            ]
+        )
+
+    def forward(self, cap_feats, control_context, x_freqs_cis, adaln_input):
+        """Embed the control input and run it through the refiner blocks.
+
+        ``control_context`` is unpacked as ``(B, C, H, W)`` and cut into
+        non-overlapping ``PATCH_SIZE x PATCH_SIZE`` patches, giving one token per
+        patch, so ``H`` and ``W`` must be divisible by ``PATCH_SIZE``.
+        ``cap_feats`` is accepted but not used here.
+        """
+        patch_size = self.PATCH_SIZE
+        f_patch_size = self.F_PATCH_SIZE
+        pH = pW = patch_size
+        B, C, H, W = control_context.shape
+        control_context = self.control_all_x_embedder[f"{patch_size}-{f_patch_size}"](control_context.view(B, C, H // pH, pH, W // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2))
+
+        x_attn_mask = None
+        for layer in self.control_noise_refiner:
+            control_context = layer(control_context, x_attn_mask, x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input)
+        return control_context
+
+    def forward_control_block(self, layer_id, control_context, x, x_attn_mask, x_freqs_cis, adaln_input):
+        """Run ``control_layers[layer_id]`` and return its ``(skip, control_context)`` pair."""
+        return self.control_layers[layer_id](control_context, x, x_mask=x_attn_mask, freqs_cis=x_freqs_cis[:control_context.shape[0], :control_context.shape[1]], adaln_input=adaln_input)
diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py
index b4494a51d..f1c1a0ec3 100644
--- a/comfy/ldm/lumina/model.py
+++ b/comfy/ldm/lumina/model.py
@@ -11,6 +11,7 @@ import comfy.ldm.common_dit
 from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder
 from comfy.ldm.modules.attention import optimized_attention_masked
 from comfy.ldm.flux.layers import EmbedND
+from comfy.ldm.flux.math import apply_rope
 import comfy.patcher_extension
 
 
@@ -21,6 +22,10 @@ def modulate(x, scale):
 #                               Core NextDiT Model                              #
 #############################################################################
 
+def clamp_fp16(x):
+    if x.dtype == torch.float16:
+        return torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
+    return x
 
 class JointAttention(nn.Module):
     """Multi-head attention module."""
@@ -31,6 +36,7 @@ class JointAttention(nn.Module):
         n_heads: int,
         n_kv_heads: Optional[int],
         qk_norm: bool,
+        out_bias: bool = False,
         operation_settings={},
     ):
         """
@@ -59,7 +65,7 @@ class JointAttention(nn.Module):
         self.out = operation_settings.get("operations").Linear(
             n_heads * self.head_dim,
             dim,
-            bias=False,
+            bias=out_bias,
             device=operation_settings.get("device"),
             dtype=operation_settings.get("dtype"),
         )
@@ -70,35 +76,6 @@ class JointAttention(nn.Module):
         else:
             self.q_norm = self.k_norm = nn.Identity()
 
-    @staticmethod
-    def apply_rotary_emb(
-        x_in: torch.Tensor,
-        freqs_cis: torch.Tensor,
-    ) -> torch.Tensor:
-        """
-        Apply rotary embeddings to input tensors using the given frequency
-        tensor.
-
-        This function applies rotary embeddings to the given query 'xq' and
-        key 'xk' tensors using the provided frequency tensor 'freqs_cis'. The
-        input tensors are reshaped as complex numbers, and the frequency tensor
-        is reshaped for broadcasting compatibility. The resulting tensors
-        contain rotary embeddings and are returned as real tensors.
-
-        Args:
-            x_in (torch.Tensor): Query or Key tensor to apply rotary embeddings.
-            freqs_cis (torch.Tensor): Precomputed frequency tensor for complex
-                exponentials.
-
-        Returns:
-            Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor
-                and key tensor with rotary embeddings.
-        """
-
-        t_ = x_in.reshape(*x_in.shape[:-1], -1, 1, 2)
-        t_out = freqs_cis[..., 0] * t_[..., 0] + freqs_cis[..., 1] * t_[..., 1]
-        return t_out.reshape(*x_in.shape)
-
     def forward(
         self,
         x: torch.Tensor,
@@ -134,8 +111,7 @@ class JointAttention(nn.Module):
         xq = self.q_norm(xq)
         xk = self.k_norm(xk)
 
-        xq = JointAttention.apply_rotary_emb(xq, freqs_cis=freqs_cis)
-        xk = JointAttention.apply_rotary_emb(xk, freqs_cis=freqs_cis)
+        xq, xk = apply_rope(xq, xk, freqs_cis)
 
         n_rep = self.n_local_heads // self.n_local_kv_heads
         if n_rep >= 1:
@@ -197,7 +173,7 @@ class FeedForward(nn.Module):
 
     # @torch.compile
     def _forward_silu_gating(self, x1, x3):
-        return F.silu(x1) * x3
+        return clamp_fp16(F.silu(x1) * x3)
 
     def forward(self, x):
         return self.w2(self._forward_silu_gating(self.w1(x), self.w3(x)))
@@ -215,6 +191,8 @@ class JointTransformerBlock(nn.Module):
         norm_eps: float,
         qk_norm: bool,
         modulation=True,
+        z_image_modulation=False,
+        attn_out_bias=False,
         operation_settings={},
     ) -> None:
         """
@@ -235,10 +213,10 @@ class JointTransformerBlock(nn.Module):
         super().__init__()
         self.dim = dim
         self.head_dim = dim // n_heads
-        self.attention = JointAttention(dim, n_heads, n_kv_heads, qk_norm, operation_settings=operation_settings)
+        self.attention = JointAttention(dim, n_heads, n_kv_heads, qk_norm, out_bias=attn_out_bias, operation_settings=operation_settings)
         self.feed_forward = FeedForward(
             dim=dim,
-            hidden_dim=4 * dim,
+            hidden_dim=dim,
             multiple_of=multiple_of,
             ffn_dim_multiplier=ffn_dim_multiplier,
             operation_settings=operation_settings,
@@ -252,16 +230,27 @@ class JointTransformerBlock(nn.Module):
 
         self.modulation = modulation
         if modulation:
-            self.adaLN_modulation = nn.Sequential(
-                nn.SiLU(),
-                operation_settings.get("operations").Linear(
-                    min(dim, 1024),
-                    4 * dim,
-                    bias=True,
-                    device=operation_settings.get("device"),
-                    dtype=operation_settings.get("dtype"),
-                ),
-            )
+            if z_image_modulation:
+                self.adaLN_modulation = nn.Sequential(
+                    operation_settings.get("operations").Linear(
+                        min(dim, 256),
+                        4 * dim,
+                        bias=True,
+                        device=operation_settings.get("device"),
+                        dtype=operation_settings.get("dtype"),
+                    ),
+                )
+            else:
+                self.adaLN_modulation = nn.Sequential(
+                    nn.SiLU(),
+                    operation_settings.get("operations").Linear(
+                        min(dim, 1024),
+                        4 * dim,
+                        bias=True,
+                        device=operation_settings.get("device"),
+                        dtype=operation_settings.get("dtype"),
+                    ),
+                )
 
     def forward(
         self,
@@ -288,27 +277,27 @@ class JointTransformerBlock(nn.Module):
             scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).chunk(4, dim=1)
 
             x = x + gate_msa.unsqueeze(1).tanh() * self.attention_norm2(
-                self.attention(
+                clamp_fp16(self.attention(
                     modulate(self.attention_norm1(x), scale_msa),
                     x_mask,
                     freqs_cis,
                     transformer_options=transformer_options,
-                )
+                ))
             )
             x = x + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(
-                self.feed_forward(
+                clamp_fp16(self.feed_forward(
                     modulate(self.ffn_norm1(x), scale_mlp),
-                )
+                ))
             )
         else:
             assert adaln_input is None
             x = x + self.attention_norm2(
-                self.attention(
+                clamp_fp16(self.attention(
                     self.attention_norm1(x),
                     x_mask,
                     freqs_cis,
                     transformer_options=transformer_options,
-                )
+                ))
             )
             x = x + self.ffn_norm2(
                 self.feed_forward(
@@ -323,7 +312,7 @@ class FinalLayer(nn.Module):
     The final layer of NextDiT.
     """
 
-    def __init__(self, hidden_size, patch_size, out_channels, operation_settings={}):
+    def __init__(self, hidden_size, patch_size, out_channels, z_image_modulation=False, operation_settings={}):
         super().__init__()
         self.norm_final = operation_settings.get("operations").LayerNorm(
             hidden_size,
@@ -340,10 +329,15 @@ class FinalLayer(nn.Module):
             dtype=operation_settings.get("dtype"),
         )
 
+        if z_image_modulation:
+            min_mod = 256
+        else:
+            min_mod = 1024
+
         self.adaLN_modulation = nn.Sequential(
             nn.SiLU(),
             operation_settings.get("operations").Linear(
-                min(hidden_size, 1024),
+                min(hidden_size, min_mod),
                 hidden_size,
                 bias=True,
                 device=operation_settings.get("device"),
@@ -373,12 +367,16 @@ class NextDiT(nn.Module):
         n_heads: int = 32,
         n_kv_heads: Optional[int] = None,
         multiple_of: int = 256,
-        ffn_dim_multiplier: Optional[float] = None,
+        ffn_dim_multiplier: float = 4.0,
         norm_eps: float = 1e-5,
         qk_norm: bool = False,
         cap_feat_dim: int = 5120,
         axes_dims: List[int] = (16, 56, 56),
         axes_lens: List[int] = (1, 512, 512),
+        rope_theta=10000.0,
+        z_image_modulation=False,
+        time_scale=1.0,
+        pad_tokens_multiple=None,
         image_model=None,
         device=None,
         dtype=None,
@@ -390,6 +388,8 @@ class NextDiT(nn.Module):
         self.in_channels = in_channels
         self.out_channels = in_channels
         self.patch_size = patch_size
+        self.time_scale = time_scale
+        self.pad_tokens_multiple = pad_tokens_multiple
 
         self.x_embedder = operation_settings.get("operations").Linear(
             in_features=patch_size * patch_size * in_channels,
@@ -411,6 +411,7 @@ class NextDiT(nn.Module):
                     norm_eps,
                     qk_norm,
                     modulation=True,
+                    z_image_modulation=z_image_modulation,
                     operation_settings=operation_settings,
                 )
                 for layer_id in range(n_refiner_layers)
@@ -434,7 +435,7 @@ class NextDiT(nn.Module):
             ]
         )
 
-        self.t_embedder = TimestepEmbedder(min(dim, 1024), **operation_settings)
+        self.t_embedder = TimestepEmbedder(min(dim, 1024), output_size=256 if z_image_modulation else None, **operation_settings)
         self.cap_embedder = nn.Sequential(
             operation_settings.get("operations").RMSNorm(cap_feat_dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")),
             operation_settings.get("operations").Linear(
@@ -457,18 +458,24 @@ class NextDiT(nn.Module):
                     ffn_dim_multiplier,
                     norm_eps,
                     qk_norm,
+                    z_image_modulation=z_image_modulation,
+                    attn_out_bias=False,
                     operation_settings=operation_settings,
                 )
                 for layer_id in range(n_layers)
             ]
         )
         self.norm_final = operation_settings.get("operations").RMSNorm(dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
-        self.final_layer = FinalLayer(dim, patch_size, self.out_channels, operation_settings=operation_settings)
+        self.final_layer = FinalLayer(dim, patch_size, self.out_channels, z_image_modulation=z_image_modulation, operation_settings=operation_settings)
+
+        if self.pad_tokens_multiple is not None:
+            self.x_pad_token = nn.Parameter(torch.empty((1, dim), device=device, dtype=dtype))
+            self.cap_pad_token = nn.Parameter(torch.empty((1, dim), device=device, dtype=dtype))
 
         assert (dim // n_heads) == sum(axes_dims)
         self.axes_dims = axes_dims
         self.axes_lens = axes_lens
-        self.rope_embedder = EmbedND(dim=dim // n_heads, theta=10000.0, axes_dim=axes_dims)
+        self.rope_embedder = EmbedND(dim=dim // n_heads, theta=rope_theta, axes_dim=axes_dims)
         self.dim = dim
         self.n_heads = n_heads
 
@@ -503,108 +510,54 @@ class NextDiT(nn.Module):
         bsz = len(x)
         pH = pW = self.patch_size
         device = x[0].device
-        dtype = x[0].dtype
 
-        if cap_mask is not None:
-            l_effective_cap_len = cap_mask.sum(dim=1).tolist()
-        else:
-            l_effective_cap_len = [num_tokens] * bsz
+        if self.pad_tokens_multiple is not None:
+            pad_extra = (-cap_feats.shape[1]) % self.pad_tokens_multiple
+            cap_feats = torch.cat((cap_feats, self.cap_pad_token.to(device=cap_feats.device, dtype=cap_feats.dtype, copy=True).unsqueeze(0).repeat(cap_feats.shape[0], pad_extra, 1)), dim=1)
 
-        if cap_mask is not None and not torch.is_floating_point(cap_mask):
-            cap_mask = (cap_mask - 1).to(dtype) * torch.finfo(dtype).max
+        cap_pos_ids = torch.zeros(bsz, cap_feats.shape[1], 3, dtype=torch.float32, device=device)
+        cap_pos_ids[:, :, 0] = torch.arange(cap_feats.shape[1], dtype=torch.float32, device=device) + 1.0
 
-        img_sizes = [(img.size(1), img.size(2)) for img in x]
-        l_effective_img_len = [(H // pH) * (W // pW) for (H, W) in img_sizes]
+        B, C, H, W = x.shape
+        x = self.x_embedder(x.view(B, C, H // pH, pH, W // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2))
 
-        max_seq_len = max(
-            (cap_len+img_len for cap_len, img_len in zip(l_effective_cap_len, l_effective_img_len))
-        )
-        max_cap_len = max(l_effective_cap_len)
-        max_img_len = max(l_effective_img_len)
+        rope_options = transformer_options.get("rope_options", None)
+        h_scale = 1.0
+        w_scale = 1.0
+        h_start = 0
+        w_start = 0
+        if rope_options is not None:
+            h_scale = rope_options.get("scale_y", 1.0)
+            w_scale = rope_options.get("scale_x", 1.0)
 
-        position_ids = torch.zeros(bsz, max_seq_len, 3, dtype=torch.float32, device=device)
+            h_start = rope_options.get("shift_y", 0.0)
+            w_start = rope_options.get("shift_x", 0.0)
 
-        for i in range(bsz):
-            cap_len = l_effective_cap_len[i]
-            img_len = l_effective_img_len[i]
-            H, W = img_sizes[i]
-            H_tokens, W_tokens = H // pH, W // pW
-            assert H_tokens * W_tokens == img_len
+        H_tokens, W_tokens = H // pH, W // pW
+        x_pos_ids = torch.zeros((bsz, x.shape[1], 3), dtype=torch.float32, device=device)
+        x_pos_ids[:, :, 0] = cap_feats.shape[1] + 1
+        x_pos_ids[:, :, 1] = (torch.arange(H_tokens, dtype=torch.float32, device=device) * h_scale + h_start).view(-1, 1).repeat(1, W_tokens).flatten()
+        x_pos_ids[:, :, 2] = (torch.arange(W_tokens, dtype=torch.float32, device=device) * w_scale + w_start).view(1, -1).repeat(H_tokens, 1).flatten()
 
-            rope_options = transformer_options.get("rope_options", None)
-            h_scale = 1.0
-            w_scale = 1.0
-            h_start = 0
-            w_start = 0
-            if rope_options is not None:
-                h_scale = rope_options.get("scale_y", 1.0)
-                w_scale = rope_options.get("scale_x", 1.0)
+        if self.pad_tokens_multiple is not None:
+            pad_extra = (-x.shape[1]) % self.pad_tokens_multiple
+            x = torch.cat((x, self.x_pad_token.to(device=x.device, dtype=x.dtype, copy=True).unsqueeze(0).repeat(x.shape[0], pad_extra, 1)), dim=1)
+            x_pos_ids = torch.nn.functional.pad(x_pos_ids, (0, 0, 0, pad_extra))
 
-                h_start = rope_options.get("shift_y", 0.0)
-                w_start = rope_options.get("shift_x", 0.0)
-
-            position_ids[i, :cap_len, 0] = torch.arange(cap_len, dtype=torch.float32, device=device)
-            position_ids[i, cap_len:cap_len+img_len, 0] = cap_len
-            row_ids = (torch.arange(H_tokens, dtype=torch.float32, device=device) * h_scale + h_start).view(-1, 1).repeat(1, W_tokens).flatten()
-            col_ids = (torch.arange(W_tokens, dtype=torch.float32, device=device) * w_scale + w_start).view(1, -1).repeat(H_tokens, 1).flatten()
-            position_ids[i, cap_len:cap_len+img_len, 1] = row_ids
-            position_ids[i, cap_len:cap_len+img_len, 2] = col_ids
-
-        freqs_cis = self.rope_embedder(position_ids).movedim(1, 2).to(dtype)
-
-        # build freqs_cis for cap and image individually
-        cap_freqs_cis_shape = list(freqs_cis.shape)
-        # cap_freqs_cis_shape[1] = max_cap_len
-        cap_freqs_cis_shape[1] = cap_feats.shape[1]
-        cap_freqs_cis = torch.zeros(*cap_freqs_cis_shape, device=device, dtype=freqs_cis.dtype)
-
-        img_freqs_cis_shape = list(freqs_cis.shape)
-        img_freqs_cis_shape[1] = max_img_len
-        img_freqs_cis = torch.zeros(*img_freqs_cis_shape, device=device, dtype=freqs_cis.dtype)
-
-        for i in range(bsz):
-            cap_len = l_effective_cap_len[i]
-            img_len = l_effective_img_len[i]
-            cap_freqs_cis[i, :cap_len] = freqs_cis[i, :cap_len]
-            img_freqs_cis[i, :img_len] = freqs_cis[i, cap_len:cap_len+img_len]
+        freqs_cis = self.rope_embedder(torch.cat((cap_pos_ids, x_pos_ids), dim=1)).movedim(1, 2)
 
         # refine context
         for layer in self.context_refiner:
-            cap_feats = layer(cap_feats, cap_mask, cap_freqs_cis, transformer_options=transformer_options)
+            cap_feats = layer(cap_feats, cap_mask, freqs_cis[:, :cap_pos_ids.shape[1]], transformer_options=transformer_options)
 
-        # refine image
-        flat_x = []
-        for i in range(bsz):
-            img = x[i]
-            C, H, W = img.size()
-            img = img.view(C, H // pH, pH, W // pW, pW).permute(1, 3, 2, 4, 0).flatten(2).flatten(0, 1)
-            flat_x.append(img)
-        x = flat_x
-        padded_img_embed = torch.zeros(bsz, max_img_len, x[0].shape[-1], device=device, dtype=x[0].dtype)
-        padded_img_mask = torch.zeros(bsz, max_img_len, dtype=dtype, device=device)
-        for i in range(bsz):
-            padded_img_embed[i, :l_effective_img_len[i]] = x[i]
-            padded_img_mask[i, l_effective_img_len[i]:] = -torch.finfo(dtype).max
-
-        padded_img_embed = self.x_embedder(padded_img_embed)
-        padded_img_mask = padded_img_mask.unsqueeze(1)
+        padded_img_mask = None
         for layer in self.noise_refiner:
-            padded_img_embed = layer(padded_img_embed, padded_img_mask, img_freqs_cis, t, transformer_options=transformer_options)
-
-        if cap_mask is not None:
-            mask = torch.zeros(bsz, max_seq_len, dtype=dtype, device=device)
-            mask[:, :max_cap_len] = cap_mask[:, :max_cap_len]
-        else:
-            mask = None
-
-        padded_full_embed = torch.zeros(bsz, max_seq_len, self.dim, device=device, dtype=x[0].dtype)
-        for i in range(bsz):
-            cap_len = l_effective_cap_len[i]
-            img_len = l_effective_img_len[i]
-
-            padded_full_embed[i, :cap_len] = cap_feats[i, :cap_len]
-            padded_full_embed[i, cap_len:cap_len+img_len] = padded_img_embed[i, :img_len]
+            x = layer(x, padded_img_mask, freqs_cis[:, cap_pos_ids.shape[1]:], t, transformer_options=transformer_options)
 
+        padded_full_embed = torch.cat((cap_feats, x), dim=1)
+        mask = None
+        img_sizes = [(H, W)] * bsz
+        l_effective_cap_len = [cap_feats.shape[1]] * bsz
         return padded_full_embed, mask, img_sizes, l_effective_cap_len, freqs_cis
 
     def forward(self, x, timesteps, context, num_tokens, attention_mask=None, **kwargs):
@@ -615,7 +568,7 @@ class NextDiT(nn.Module):
         ).execute(x, timesteps, context, num_tokens, attention_mask, **kwargs)
 
     # def forward(self, x, t, cap_feats, cap_mask):
-    def _forward(self, x, timesteps, context, num_tokens, attention_mask=None, **kwargs):
+    def _forward(self, x, timesteps, context, num_tokens, attention_mask=None, transformer_options={}, **kwargs):
         t = 1.0 - timesteps
         cap_feats = context
         cap_mask = attention_mask
@@ -627,21 +580,29 @@ class NextDiT(nn.Module):
         y: (N,) tensor of text tokens/features
         """
 
-        t = self.t_embedder(t, dtype=x.dtype)  # (N, D)
+        t = self.t_embedder(t * self.time_scale, dtype=x.dtype)  # (N, D)
         adaln_input = t
 
         cap_feats = self.cap_embedder(cap_feats)  # (N, L, D)  # todo check if able to batchify w.o. redundant compute
 
-        transformer_options = kwargs.get("transformer_options", {})
+        # transformer_options is a named parameter of _forward, so it is never present in kwargs; use it as passed.
+        patches = transformer_options.get("patches", {})
         x_is_tensor = isinstance(x, torch.Tensor)
-        x, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, t, num_tokens, transformer_options=transformer_options)
-        freqs_cis = freqs_cis.to(x.device)
+        img, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, t, num_tokens, transformer_options=transformer_options)
+        freqs_cis = freqs_cis.to(img.device)
 
-        for layer in self.layers:
-            x = layer(x, mask, freqs_cis, adaln_input, transformer_options=transformer_options)
+        for i, layer in enumerate(self.layers):
+            img = layer(img, mask, freqs_cis, adaln_input, transformer_options=transformer_options)
+            if "double_block" in patches:
+                for p in patches["double_block"]:
+                    out = p({"img": img[:, cap_size[0]:], "txt": img[:, :cap_size[0]], "pe": freqs_cis[:, cap_size[0]:], "vec": adaln_input, "x": x, "block_index": i, "transformer_options": transformer_options})
+                    if "img" in out:
+                        img[:, cap_size[0]:] = out["img"]
+                    if "txt" in out:
+                        img[:, :cap_size[0]] = out["txt"]
 
-        x = self.final_layer(x, adaln_input)
-        x = self.unpatchify(x, img_size, cap_size, return_tensor=x_is_tensor)[:,:,:h,:w]
+        img = self.final_layer(img, adaln_input)
+        img = self.unpatchify(img, img_size, cap_size, return_tensor=x_is_tensor)[:, :, :h, :w]
 
-        return -x
+        return -img
 
diff --git a/comfy/ldm/models/autoencoder.py b/comfy/ldm/models/autoencoder.py
index 611d36a1b..4f50810dc 100644
--- a/comfy/ldm/models/autoencoder.py
+++ b/comfy/ldm/models/autoencoder.py
@@ -9,6 +9,8 @@ from comfy.ldm.modules.distributions.distributions import DiagonalGaussianDistri
 from comfy.ldm.util import get_obj_from_str, instantiate_from_config
 from comfy.ldm.modules.ema import LitEma
 import comfy.ops
+from einops import rearrange
+import comfy.model_management
 
 class DiagonalGaussianRegularizer(torch.nn.Module):
     def __init__(self, sample: bool = False):
@@ -179,6 +181,21 @@ class AutoencodingEngineLegacy(AutoencodingEngine):
         self.post_quant_conv = conv_op(embed_dim, ddconfig["z_channels"], 1)
         self.embed_dim = embed_dim
 
+        if ddconfig.get("batch_norm_latent", False):
+            self.bn_eps = 1e-4
+            self.bn_momentum = 0.1
+            self.ps = [2, 2]
+            self.bn = torch.nn.BatchNorm2d(math.prod(self.ps) * ddconfig["z_channels"],
+                                           eps=self.bn_eps,
+                                           momentum=self.bn_momentum,
+                                           affine=False,
+                                           track_running_stats=True,
+                                           )
+            self.bn.eval()
+        else:
+            self.bn = None
+
+
     def get_autoencoder_params(self) -> list:
         params = super().get_autoencoder_params()
         return params
@@ -201,11 +218,36 @@ class AutoencodingEngineLegacy(AutoencodingEngine):
             z = torch.cat(z, 0)
 
         z, reg_log = self.regularization(z)
+
+        if self.bn is not None:
+            z = rearrange(z,
+                          "... c (i pi) (j pj)  -> ... (c pi pj) i j",
+                          pi=self.ps[0],
+                          pj=self.ps[1],
+                          )
+
+            z = torch.nn.functional.batch_norm(z,
+                                               comfy.model_management.cast_to(self.bn.running_mean, dtype=z.dtype, device=z.device),
+                                               comfy.model_management.cast_to(self.bn.running_var, dtype=z.dtype, device=z.device),
+                                               momentum=self.bn_momentum,
+                                               eps=self.bn_eps)
+
         if return_reg_log:
             return z, reg_log
         return z
 
     def decode(self, z: torch.Tensor, **decoder_kwargs) -> torch.Tensor:
+        if self.bn is not None:
+            s = torch.sqrt(comfy.model_management.cast_to(self.bn.running_var.view(1, -1, 1, 1), dtype=z.dtype, device=z.device) + self.bn_eps)
+            m = comfy.model_management.cast_to(self.bn.running_mean.view(1, -1, 1, 1), dtype=z.dtype, device=z.device)
+            z = z * s + m
+            z = rearrange(
+                z,
+                "... (c pi pj) i j -> ... c (i pi) (j pj)",
+                pi=self.ps[0],
+                pj=self.ps[1],
+            )
+
         if self.max_batch_size is None:
             dec = self.post_quant_conv(z)
             dec = self.decoder(dec, **decoder_kwargs)
diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py
index 7437e0567..a8800ded0 100644
--- a/comfy/ldm/modules/attention.py
+++ b/comfy/ldm/modules/attention.py
@@ -517,6 +517,7 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha
 
 @wrap_attn
 def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False, **kwargs):
+    exception_fallback = False
     if skip_reshape:
         b, _, _, dim_head = q.shape
         tensor_layout = "HND"
@@ -541,6 +542,8 @@ def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=
         out = sageattn(q, k, v, attn_mask=mask, is_causal=False, tensor_layout=tensor_layout)
     except Exception as e:
         logging.error("Error running sage attention: {}, using pytorch attention instead.".format(e))
+        exception_fallback = True
+    if exception_fallback:
         if tensor_layout == "NHD":
             q, k, v = map(
                 lambda t: t.transpose(1, 2),
diff --git a/comfy/ldm/modules/diffusionmodules/mmdit.py b/comfy/ldm/modules/diffusionmodules/mmdit.py
index 42f406f1a..0dc8fe789 100644
--- a/comfy/ldm/modules/diffusionmodules/mmdit.py
+++ b/comfy/ldm/modules/diffusionmodules/mmdit.py
@@ -211,12 +211,14 @@ class TimestepEmbedder(nn.Module):
     Embeds scalar timesteps into vector representations.
     """
 
-    def __init__(self, hidden_size, frequency_embedding_size=256, dtype=None, device=None, operations=None):
+    def __init__(self, hidden_size, frequency_embedding_size=256, output_size=None, dtype=None, device=None, operations=None):
         super().__init__()
+        if output_size is None:
+            output_size = hidden_size
         self.mlp = nn.Sequential(
             operations.Linear(frequency_embedding_size, hidden_size, bias=True, dtype=dtype, device=device),
             nn.SiLU(),
-            operations.Linear(hidden_size, hidden_size, bias=True, dtype=dtype, device=device),
+            operations.Linear(hidden_size, output_size, bias=True, dtype=dtype, device=device),
         )
         self.frequency_embedding_size = frequency_embedding_size
 
diff --git a/comfy/ldm/modules/diffusionmodules/model.py b/comfy/ldm/modules/diffusionmodules/model.py
index 4245eedca..681a55db5 100644
--- a/comfy/ldm/modules/diffusionmodules/model.py
+++ b/comfy/ldm/modules/diffusionmodules/model.py
@@ -13,6 +13,12 @@ if model_management.xformers_enabled_vae():
     import xformers
     import xformers.ops
 
+def torch_cat_if_needed(xl, dim):
+    """Concatenate the tensors in xl along dim; a single-entry list returns that entry directly."""
+    multiple = len(xl) > 1
+    # with one (or zero) entries torch.cat is not called and xl[0] is handed back as-is
+    return torch.cat(xl, dim) if multiple else xl[0]
+
 def get_timestep_embedding(timesteps, embedding_dim):
     """
     This matches the implementation in Denoising Diffusion Probabilistic Models:
@@ -43,6 +49,37 @@ def Normalize(in_channels, num_groups=32):
     return ops.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
 
 
+class CarriedConv3d(nn.Module):  # ops.Conv3d wrapper; conv_carry_causal_3d applies its padding/carry logic only to this type
+    def __init__(self, n_channels, out_channels, kernel_size, stride=1, dilation=1, padding=0, **kwargs):
+        super().__init__()
+        self.conv = ops.Conv3d(n_channels, out_channels, kernel_size, stride=stride, dilation=dilation, **kwargs)  # NOTE: `padding` is accepted but not forwarded
+
+    def forward(self, x):
+        return self.conv(x)
+
+
+def conv_carry_causal_3d(xl, op, conv_carry_in=None, conv_carry_out=None):
+    """Run op on xl[0]; when op is a CarriedConv3d, replicate-pad the input and prepend/save carried slices."""
+    x = xl[0]
+    xl.clear()  # empties the caller's list so it no longer holds a reference to the input
+
+    if isinstance(op, CarriedConv3d):
+        if conv_carry_in is None:
+            x = torch.nn.functional.pad(x, (1, 1, 1, 1, 2, 0), mode = 'replicate')  # no carry: pad 1 on each side of the last two dims, 2 at the front of the third-to-last
+        else:
+            carry_len = conv_carry_in[0].shape[2]  # front padding below shrinks by the number of carried slices
+            x = torch.nn.functional.pad(x, (1, 1, 1, 1, 2 - carry_len, 0), mode = 'replicate')
+            x = torch.cat([conv_carry_in.pop(0), x], dim=2)  # consumes the oldest carry entry
+
+        if conv_carry_out is not None:
+            to_push = x[:, :, -2:, :, :].clone()  # last 2 slices along dim 2 of the padded input
+            conv_carry_out.append(to_push)
+
+    out = op(x)
+
+    return out
+
+
 class VideoConv3d(nn.Module):
     def __init__(self, n_channels, out_channels, kernel_size, stride=1, dilation=1, padding_mode='replicate', padding=1, **kwargs):
         super().__init__()
@@ -89,29 +126,24 @@ class Upsample(nn.Module):
                                         stride=1,
                                         padding=1)
 
-    def forward(self, x):
+    def forward(self, x, conv_carry_in=None, conv_carry_out=None):
         scale_factor = self.scale_factor
         if isinstance(scale_factor, (int, float)):
             scale_factor = (scale_factor,) * (x.ndim - 2)
 
         if x.ndim == 5 and scale_factor[0] > 1.0:
-            t = x.shape[2]
-            if t > 1:
-                a, b = x.split((1, t - 1), dim=2)
-                del x
-                b = interpolate_up(b, scale_factor)
-            else:
-                a = x
-
-            a = interpolate_up(a.squeeze(2), scale_factor=scale_factor[1:]).unsqueeze(2)
-            if t > 1:
-                x = torch.cat((a, b), dim=2)
-            else:
-                x = a
+            results = []
+            if conv_carry_in is None:
+                first = x[:, :, :1, :, :]
+                results.append(interpolate_up(first.squeeze(2), scale_factor=scale_factor[1:]).unsqueeze(2))
+                x = x[:, :, 1:, :, :]
+            if x.shape[2] > 0:
+                results.append(interpolate_up(x, scale_factor))
+            x = torch_cat_if_needed(results, dim=2)
         else:
             x = interpolate_up(x, scale_factor)
         if self.with_conv:
-            x = self.conv(x)
+            x = conv_carry_causal_3d([x], self.conv, conv_carry_in, conv_carry_out)
         return x
 
 
@@ -127,17 +159,20 @@ class Downsample(nn.Module):
                                         stride=stride,
                                         padding=0)
 
-    def forward(self, x):
+    def forward(self, x, conv_carry_in=None, conv_carry_out=None):
         if self.with_conv:
-            if x.ndim == 4:
+            if isinstance(self.conv, CarriedConv3d):
+                x = conv_carry_causal_3d([x], self.conv, conv_carry_in, conv_carry_out)
+            elif x.ndim == 4:
                 pad = (0, 1, 0, 1)
                 mode = "constant"
                 x = torch.nn.functional.pad(x, pad, mode=mode, value=0)
+                x = self.conv(x)
             elif x.ndim == 5:
                 pad = (1, 1, 1, 1, 2, 0)
                 mode = "replicate"
                 x = torch.nn.functional.pad(x, pad, mode=mode)
-            x = self.conv(x)
+                x = self.conv(x)
         else:
             x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
         return x
@@ -183,23 +218,23 @@ class ResnetBlock(nn.Module):
                                                     stride=1,
                                                     padding=0)
 
-    def forward(self, x, temb=None):
+    def forward(self, x, temb=None, conv_carry_in=None, conv_carry_out=None):
         h = x
         h = self.norm1(h)
-        h = self.swish(h)
-        h = self.conv1(h)
+        h = [ self.swish(h) ]
+        h = conv_carry_causal_3d(h, self.conv1, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out)
 
         if temb is not None:
             h = h + self.temb_proj(self.swish(temb))[:,:,None,None]
 
         h = self.norm2(h)
         h = self.swish(h)
-        h = self.dropout(h)
-        h = self.conv2(h)
+        h = [ self.dropout(h) ]
+        h = conv_carry_causal_3d(h, self.conv2, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out)
 
         if self.in_channels != self.out_channels:
             if self.use_conv_shortcut:
-                x = self.conv_shortcut(x)
+                x = conv_carry_causal_3d([x], self.conv_shortcut, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out)
             else:
                 x = self.nin_shortcut(x)
 
@@ -279,6 +314,7 @@ def pytorch_attention(q, k, v):
     orig_shape = q.shape
     B = orig_shape[0]
     C = orig_shape[1]
+    oom_fallback = False
     q, k, v = map(
         lambda t: t.view(B, 1, C, -1).transpose(2, 3).contiguous(),
         (q, k, v),
@@ -289,6 +325,8 @@ def pytorch_attention(q, k, v):
         out = out.transpose(2, 3).reshape(orig_shape)
     except model_management.OOM_EXCEPTION:
         logging.warning("scaled_dot_product_attention OOMed: switched to slice attention")
+        oom_fallback = True
+    if oom_fallback:
         out = slice_attention(q.view(B, -1, C), k.view(B, -1, C).transpose(1, 2), v.view(B, -1, C).transpose(1, 2)).reshape(orig_shape)
     return out
 
@@ -517,9 +555,14 @@ class Encoder(nn.Module):
         self.num_res_blocks = num_res_blocks
         self.resolution = resolution
         self.in_channels = in_channels
+        self.carried = False
 
         if conv3d:
-            conv_op = VideoConv3d
+            if not attn_resolutions:
+                conv_op = CarriedConv3d
+                self.carried = True
+            else:
+                conv_op = VideoConv3d
             mid_attn_conv_op = ops.Conv3d
         else:
             conv_op = ops.Conv2d
@@ -532,6 +575,7 @@ class Encoder(nn.Module):
                                        stride=1,
                                        padding=1)
 
+        self.time_compress = 1
         curr_res = resolution
         in_ch_mult = (1,)+tuple(ch_mult)
         self.in_ch_mult = in_ch_mult
@@ -558,10 +602,15 @@ class Encoder(nn.Module):
                 if time_compress is not None:
                     if (self.num_resolutions - 1 - i_level) > math.log2(time_compress):
                         stride = (1, 2, 2)
+                else:
+                    self.time_compress *= 2
                 down.downsample = Downsample(block_in, resamp_with_conv, stride=stride, conv_op=conv_op)
                 curr_res = curr_res // 2
             self.down.append(down)
 
+        if time_compress is not None:
+            self.time_compress = time_compress
+
         # middle
         self.mid = nn.Module()
         self.mid.block_1 = ResnetBlock(in_channels=block_in,
@@ -587,15 +636,42 @@ class Encoder(nn.Module):
     def forward(self, x):
         # timestep embedding
         temb = None
-        # downsampling
-        h = self.conv_in(x)
-        for i_level in range(self.num_resolutions):
-            for i_block in range(self.num_res_blocks):
-                h = self.down[i_level].block[i_block](h, temb)
-                if len(self.down[i_level].attn) > 0:
-                    h = self.down[i_level].attn[i_block](h)
-            if i_level != self.num_resolutions-1:
-                h = self.down[i_level].downsample(h)
+
+        if self.carried:
+            xl = [x[:, :, :1, :, :]]
+            if x.shape[2] > self.time_compress:
+                tc = self.time_compress
+                xl += torch.split(x[:, :, 1: 1 + ((x.shape[2] - 1) // tc) * tc, :, :], tc * 2, dim = 2)
+            x = xl
+        else:
+            x = [x]
+        out = []
+
+        conv_carry_in = None
+
+        for i, x1 in enumerate(x):
+            conv_carry_out = []
+            if i == len(x) - 1:
+                conv_carry_out = None
+
+            # downsampling
+            x1 = [ x1 ]
+            h1 = conv_carry_causal_3d(x1, self.conv_in, conv_carry_in, conv_carry_out)
+
+            for i_level in range(self.num_resolutions):
+                for i_block in range(self.num_res_blocks):
+                    h1 = self.down[i_level].block[i_block](h1, temb, conv_carry_in, conv_carry_out)
+                    if len(self.down[i_level].attn) > 0:
+                        assert i == 0 #carried should not happen if attn exists
+                        h1 = self.down[i_level].attn[i_block](h1)
+                if i_level != self.num_resolutions-1:
+                    h1 = self.down[i_level].downsample(h1, conv_carry_in, conv_carry_out)
+
+            out.append(h1)
+            conv_carry_in = conv_carry_out
+
+        h = torch_cat_if_needed(out, dim=2)
+        del out
 
         # middle
         h = self.mid.block_1(h, temb)
@@ -604,15 +680,15 @@ class Encoder(nn.Module):
 
         # end
         h = self.norm_out(h)
-        h = nonlinearity(h)
-        h = self.conv_out(h)
+        h = [ nonlinearity(h) ]
+        h = conv_carry_causal_3d(h, self.conv_out)
         return h
 
 
 class Decoder(nn.Module):
     def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
                  attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
-                 resolution, z_channels, give_pre_end=False, tanh_out=False, use_linear_attn=False,
+                 resolution, z_channels, tanh_out=False, use_linear_attn=False,
                  conv_out_op=ops.Conv2d,
                  resnet_op=ResnetBlock,
                  attn_op=AttnBlock,
@@ -626,12 +702,18 @@ class Decoder(nn.Module):
         self.num_res_blocks = num_res_blocks
         self.resolution = resolution
         self.in_channels = in_channels
-        self.give_pre_end = give_pre_end
         self.tanh_out = tanh_out
+        self.carried = False
 
         if conv3d:
-            conv_op = VideoConv3d
-            conv_out_op = VideoConv3d
+            if not attn_resolutions and resnet_op == ResnetBlock:
+                conv_op = CarriedConv3d
+                conv_out_op = CarriedConv3d
+                self.carried = True
+            else:
+                conv_op = VideoConv3d
+                conv_out_op = VideoConv3d
+
             mid_attn_conv_op = ops.Conv3d
         else:
             conv_op = ops.Conv2d
@@ -706,29 +788,43 @@ class Decoder(nn.Module):
         temb = None
 
         # z to block_in
-        h = self.conv_in(z)
+        h = conv_carry_causal_3d([z], self.conv_in)
 
         # middle
         h = self.mid.block_1(h, temb, **kwargs)
         h = self.mid.attn_1(h, **kwargs)
         h = self.mid.block_2(h, temb, **kwargs)
 
+        if self.carried:
+            h = torch.split(h, 2, dim=2)
+        else:
+            h = [ h ]
+        out = []
+
+        conv_carry_in = None
+
         # upsampling
-        for i_level in reversed(range(self.num_resolutions)):
-            for i_block in range(self.num_res_blocks+1):
-                h = self.up[i_level].block[i_block](h, temb, **kwargs)
-                if len(self.up[i_level].attn) > 0:
-                    h = self.up[i_level].attn[i_block](h, **kwargs)
-            if i_level != 0:
-                h = self.up[i_level].upsample(h)
+        for i, h1 in enumerate(h):
+            conv_carry_out = []
+            if i == len(h) - 1:
+                conv_carry_out = None
+            for i_level in reversed(range(self.num_resolutions)):
+                for i_block in range(self.num_res_blocks+1):
+                    h1 = self.up[i_level].block[i_block](h1, temb, conv_carry_in, conv_carry_out, **kwargs)
+                    if len(self.up[i_level].attn) > 0:
+                        assert i == 0 #carried should not happen if attn exists
+                        h1 = self.up[i_level].attn[i_block](h1, **kwargs)
+                if i_level != 0:
+                    h1 = self.up[i_level].upsample(h1, conv_carry_in, conv_carry_out)
 
-        # end
-        if self.give_pre_end:
-            return h
+            h1 = self.norm_out(h1)
+            h1 = [ nonlinearity(h1) ]
+            h1 = conv_carry_causal_3d(h1, self.conv_out, conv_carry_in, conv_carry_out)
+            if self.tanh_out:
+                h1 = torch.tanh(h1)
+            out.append(h1)
+            conv_carry_in = conv_carry_out
 
-        h = self.norm_out(h)
-        h = nonlinearity(h)
-        h = self.conv_out(h, **kwargs)
-        if self.tanh_out:
-            h = torch.tanh(h)
-        return h
+        out = torch_cat_if_needed(out, dim=2)
+
+        return out
diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py
index 427ea19c1..8c75670cd 100644
--- a/comfy/ldm/qwen_image/model.py
+++ b/comfy/ldm/qwen_image/model.py
@@ -439,7 +439,10 @@ class QwenImageTransformer2DModel(nn.Module):
         patches = transformer_options.get("patches", {})
         blocks_replace = patches_replace.get("dit", {})
 
+        transformer_options["total_blocks"] = len(self.transformer_blocks)
+        transformer_options["block_type"] = "double"
         for i, block in enumerate(self.transformer_blocks):
+            transformer_options["block_index"] = i
             if ("double_block", i) in blocks_replace:
                 def block_wrap(args):
                     out = {}
diff --git a/comfy/lora.py b/comfy/lora.py
index 36d26293a..3a9077869 100644
--- a/comfy/lora.py
+++ b/comfy/lora.py
@@ -313,6 +313,15 @@ def model_lora_keys_unet(model, key_map={}):
                 key_map["transformer.{}".format(key_lora)] = k
                 key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = k #SimpleTuner lycoris format
 
+    if isinstance(model, comfy.model_base.Lumina2):
+        diffusers_keys = comfy.utils.z_image_to_diffusers(model.model_config.unet_config, output_prefix="diffusion_model.")
+        for k in diffusers_keys:
+            if k.endswith(".weight"):
+                to = diffusers_keys[k]
+                key_lora = k[:-len(".weight")]
+                key_map["diffusion_model.{}".format(key_lora)] = to
+                key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = to
+
     return key_map
 
 
diff --git a/comfy/model_base.py b/comfy/model_base.py
index bf8d9b2e4..d0fd6a1c8 100644
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -899,12 +899,13 @@ class Flux(BaseModel):
         attention_mask = kwargs.get("attention_mask", None)
         if attention_mask is not None:
             shape = kwargs["noise"].shape
-            mask_ref_size = kwargs["attention_mask_img_shape"]
-            # the model will pad to the patch size, and then divide
-            # essentially dividing and rounding up
-            (h_tok, w_tok) = (math.ceil(shape[2] / self.diffusion_model.patch_size), math.ceil(shape[3] / self.diffusion_model.patch_size))
-            attention_mask = utils.upscale_dit_mask(attention_mask, mask_ref_size, (h_tok, w_tok))
-            out['attention_mask'] = comfy.conds.CONDRegular(attention_mask)
+            mask_ref_size = kwargs.get("attention_mask_img_shape", None)
+            if mask_ref_size is not None:
+                # the model will pad to the patch size, and then divide
+                # essentially dividing and rounding up
+                (h_tok, w_tok) = (math.ceil(shape[2] / self.diffusion_model.patch_size), math.ceil(shape[3] / self.diffusion_model.patch_size))
+                attention_mask = utils.upscale_dit_mask(attention_mask, mask_ref_size, (h_tok, w_tok))
+                out['attention_mask'] = comfy.conds.CONDRegular(attention_mask)
 
         guidance = kwargs.get("guidance", 3.5)
         if guidance is not None:
@@ -926,9 +927,19 @@ class Flux(BaseModel):
         out = {}
         ref_latents = kwargs.get("reference_latents", None)
         if ref_latents is not None:
-            out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16])
+            out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()[2:]), ref_latents))])
         return out
 
+class Flux2(Flux):
+    def extra_conds(self, **kwargs):
+        # Extends Flux conds; a cross_attn with shape[1] < 512 is zero-padded at the front of its second-to-last dim.
+        conds = super().extra_conds(**kwargs)
+        text_emb = kwargs.get("cross_attn", None)
+        if text_emb is not None:
+            shortfall = 512 - text_emb.shape[1]
+            padded = torch.nn.functional.pad(text_emb, (0, 0, shortfall, 0)) if shortfall > 0 else text_emb
+            conds['c_crossattn'] = comfy.conds.CONDRegular(padded)
+        return conds
 
 class GenmoMochi(BaseModel):
     def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
@@ -1104,9 +1115,13 @@ class Lumina2(BaseModel):
             if torch.numel(attention_mask) != attention_mask.sum():
                 out['attention_mask'] = comfy.conds.CONDRegular(attention_mask)
             out['num_tokens'] = comfy.conds.CONDConstant(max(1, torch.sum(attention_mask).item()))
+
         cross_attn = kwargs.get("cross_attn", None)
         if cross_attn is not None:
             out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
+            if 'num_tokens' not in out:
+                out['num_tokens'] = comfy.conds.CONDConstant(cross_attn.shape[1])
+
         return out
 
 class WAN21(BaseModel):
@@ -1541,3 +1556,94 @@ class HunyuanImage21Refiner(HunyuanImage21):
         out = super().extra_conds(**kwargs)
         out['disable_time_r'] = comfy.conds.CONDConstant(True)
         return out
+
+class HunyuanVideo15(HunyuanVideo):  # HunyuanVideo subclass adding image/mask concat conditioning and extra text/vision conds
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+        super().__init__(model_config, model_type, device=device)
+
+    def concat_cond(self, **kwargs):  # returns cat(image latents, mask) on dim 1, or None when extra_channels == 0
+        noise = kwargs.get("noise", None)
+        extra_channels = self.diffusion_model.img_in.proj.weight.shape[1] - noise.shape[1] - 1 #noise 32 img cond 32 + mask 1
+        if extra_channels == 0:
+            return None
+
+        image = kwargs.get("concat_latent_image", None)
+        device = kwargs["device"]
+
+        if image is None:  # no conditioning latent supplied: zeros shaped like noise but with extra_channels channels
+            shape_image = list(noise.shape)
+            shape_image[1] = extra_channels
+            image = torch.zeros(shape_image, dtype=noise.dtype, layout=noise.layout, device=noise.device)
+        else:
+            latent_dim = self.latent_format.latent_channels
+            image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
+            for i in range(0, image.shape[1], latent_dim):  # process_latent_in is applied per group of latent_dim channels
+                image[:, i: i + latent_dim] = self.process_latent_in(image[:, i: i + latent_dim])
+            image = utils.resize_to_batch_size(image, noise.shape[0])
+
+        mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))  # concat_mask wins; denoise_mask is the fallback
+        if mask is None:
+            mask = torch.zeros_like(noise)[:, :1]  # single-channel all-zero mask
+        else:
+            mask = 1.0 - mask  # the supplied mask is inverted before use
+            mask = utils.common_upscale(mask.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
+            if mask.shape[-3] < noise.shape[-3]:
+                mask = torch.nn.functional.pad(mask, (0, 0, 0, 0, 0, noise.shape[-3] - mask.shape[-3]), mode='constant', value=0)  # zero-pad the end of dim -3 up to noise's size
+            mask = utils.resize_to_batch_size(mask, noise.shape[0])
+
+        return torch.cat((image, mask), dim=1)
+
+    def extra_conds(self, **kwargs):  # parent conds plus attention_mask, c_crossattn, txt_byt5, guidance and clip_fea when provided
+        out = super().extra_conds(**kwargs)
+        attention_mask = kwargs.get("attention_mask", None)
+        if attention_mask is not None:
+            if torch.numel(attention_mask) != attention_mask.sum():  # not forwarded when the sum equals the element count (e.g. an all-ones mask)
+                out['attention_mask'] = comfy.conds.CONDRegular(attention_mask)
+        cross_attn = kwargs.get("cross_attn", None)
+        if cross_attn is not None:
+            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
+
+        conditioning_byt5small = kwargs.get("conditioning_byt5small", None)
+        if conditioning_byt5small is not None:
+            out['txt_byt5'] = comfy.conds.CONDRegular(conditioning_byt5small)
+
+        guidance = kwargs.get("guidance", 6.0)  # defaults to 6.0; an explicit None leaves out['guidance'] untouched here
+        if guidance is not None:
+            out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance]))
+
+        clip_vision_output = kwargs.get("clip_vision_output", None)
+        if clip_vision_output is not None:
+            out['clip_fea'] = comfy.conds.CONDRegular(clip_vision_output.last_hidden_state)
+
+        return out
+
+class HunyuanVideo15_SR_Distilled(HunyuanVideo15):  # overrides concat_cond and forces the disable_time_r cond to False
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+        super().__init__(model_config, model_type, device=device)
+
+    def concat_cond(self, **kwargs):  # returns the conditioning tensor: zeros if no concat_latent_image, else the resized latent with one channel slice rescaled
+        noise = kwargs.get("noise", None)
+        image = kwargs.get("concat_latent_image", None)
+        noise_augmentation = kwargs.get("noise_augmentation", 0.0)
+        device = kwargs["device"]
+
+        if image is None:
+            image = torch.zeros([noise.shape[0], noise.shape[1] * 2 + 2, noise.shape[-3], noise.shape[-2], noise.shape[-1]], device=comfy.model_management.intermediate_device())  # all-zero placeholder with 2*C + 2 channels (C = noise.shape[1]); dtype is torch's default
+        else:
+            image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
+            #image = self.process_latent_in(image) # scaling wasn't applied in reference code
+            image = utils.resize_to_batch_size(image, noise.shape[0])
+            lq_image_slice = slice(noise.shape[1] + 1, 2 * noise.shape[1] + 1)  # channels [C + 1, 2*C + 1) of image
+            if noise_augmentation > 0:
+                generator = torch.Generator(device="cpu")  # CPU generator; the sample is moved to image.device afterwards
+                generator.manual_seed(kwargs.get("seed", 0) - 10)  # seed is the "seed" kwarg (default 0) offset by -10
+                noise = torch.randn(image[:, lq_image_slice].shape, generator=generator, dtype=image.dtype, device="cpu").to(image.device)  # rebinds `noise`; the input noise is not read after this line
+                image[:, lq_image_slice] = noise_augmentation * noise + min(1.0 - noise_augmentation, 0.75) * image[:, lq_image_slice]
+            else:
+                image[:, lq_image_slice] = 0.75 * image[:, lq_image_slice]  # NOTE(review): in-place write; confirm image cannot alias the caller's concat_latent_image
+        return image
+
+    def extra_conds(self, **kwargs):
+        out = super().extra_conds(**kwargs)
+        out['disable_time_r'] = comfy.conds.CONDConstant(False)  # always set, overriding any value the parent put there
+        return out
diff --git a/comfy/model_detection.py b/comfy/model_detection.py
index 11f286db3..f7f342002 100644
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@@ -186,30 +186,71 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
 
         guidance_keys = list(filter(lambda a: a.startswith("{}guidance_in.".format(key_prefix)), state_dict_keys))
         dit_config["guidance_embed"] = len(guidance_keys) > 0
+
+        # HunyuanVideo 1.5
+        if '{}cond_type_embedding.weight'.format(key_prefix) in state_dict_keys:
+            dit_config["use_cond_type_embedding"] = True
+        else:
+            dit_config["use_cond_type_embedding"] = False
+        if '{}vision_in.proj.0.weight'.format(key_prefix) in state_dict_keys:
+            dit_config["vision_in_dim"] = state_dict['{}vision_in.proj.0.weight'.format(key_prefix)].shape[0]
+        else:
+            dit_config["vision_in_dim"] = None
         return dit_config
 
     if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in state_dict_keys and ('{}img_in.weight'.format(key_prefix) in state_dict_keys or f"{key_prefix}distilled_guidance_layer.norms.0.scale" in state_dict_keys): #Flux, Chroma or Chroma Radiance (has no img_in.weight)
         dit_config = {}
-        dit_config["image_model"] = "flux"
+        if '{}double_stream_modulation_img.lin.weight'.format(key_prefix) in state_dict_keys:
+            dit_config["image_model"] = "flux2"
+            dit_config["axes_dim"] = [32, 32, 32, 32]
+            dit_config["num_heads"] = 48
+            dit_config["mlp_ratio"] = 3.0
+            dit_config["theta"] = 2000
+            dit_config["out_channels"] = 128
+            dit_config["global_modulation"] = True
+            dit_config["mlp_silu_act"] = True
+            dit_config["qkv_bias"] = False
+            dit_config["ops_bias"] = False
+            dit_config["default_ref_method"] = "index"
+            dit_config["ref_index_scale"] = 10.0
+            dit_config["txt_ids_dims"] = [3]
+            patch_size = 1
+        else:
+            dit_config["image_model"] = "flux"
+            dit_config["axes_dim"] = [16, 56, 56]
+            dit_config["num_heads"] = 24
+            dit_config["mlp_ratio"] = 4.0
+            dit_config["theta"] = 10000
+            dit_config["out_channels"] = 16
+            dit_config["qkv_bias"] = True
+            dit_config["txt_ids_dims"] = []
+            patch_size = 2
+
         dit_config["in_channels"] = 16
-        patch_size = 2
+        dit_config["hidden_size"] = 3072
+        dit_config["context_in_dim"] = 4096
+
         dit_config["patch_size"] = patch_size
         in_key = "{}img_in.weight".format(key_prefix)
         if in_key in state_dict_keys:
-            dit_config["in_channels"] = state_dict[in_key].shape[1] // (patch_size * patch_size)
-        dit_config["out_channels"] = 16
+            w = state_dict[in_key]
+            dit_config["in_channels"] = w.shape[1] // (patch_size * patch_size)
+            dit_config["hidden_size"] = w.shape[0]
+
+        txt_in_key = "{}txt_in.weight".format(key_prefix)
+        if txt_in_key in state_dict_keys:
+            w = state_dict[txt_in_key]
+            dit_config["context_in_dim"] = w.shape[1]
+            dit_config["hidden_size"] = w.shape[0]
+
         vec_in_key = '{}vector_in.in_layer.weight'.format(key_prefix)
         if vec_in_key in state_dict_keys:
             dit_config["vec_in_dim"] = state_dict[vec_in_key].shape[1]
-        dit_config["context_in_dim"] = 4096
-        dit_config["hidden_size"] = 3072
-        dit_config["mlp_ratio"] = 4.0
-        dit_config["num_heads"] = 24
+        else:
+            dit_config["vec_in_dim"] = None
+
         dit_config["depth"] = count_blocks(state_dict_keys, '{}double_blocks.'.format(key_prefix) + '{}.')
         dit_config["depth_single_blocks"] = count_blocks(state_dict_keys, '{}single_blocks.'.format(key_prefix) + '{}.')
-        dit_config["axes_dim"] = [16, 56, 56]
-        dit_config["theta"] = 10000
-        dit_config["qkv_bias"] = True
         if '{}distilled_guidance_layer.0.norms.0.scale'.format(key_prefix) in state_dict_keys or '{}distilled_guidance_layer.norms.0.scale'.format(key_prefix) in state_dict_keys: #Chroma
             dit_config["image_model"] = "chroma"
             dit_config["in_channels"] = 64
@@ -232,6 +273,11 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
                 dit_config["nerf_embedder_dtype"] = torch.float32
         else:
             dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys
+            dit_config["yak_mlp"] = '{}double_blocks.0.img_mlp.gate_proj.weight'.format(key_prefix) in state_dict_keys
+            dit_config["txt_norm"] = "{}txt_norm.scale".format(key_prefix) in state_dict_keys
+            if dit_config["yak_mlp"] and dit_config["txt_norm"]:  # Ovis model
+                dit_config["txt_ids_dims"] = [1, 2]
+
         return dit_config
 
     if '{}t5_yproj.weight'.format(key_prefix) in state_dict_keys: #Genmo mochi preview
@@ -378,14 +424,31 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
         dit_config["image_model"] = "lumina2"
         dit_config["patch_size"] = 2
         dit_config["in_channels"] = 16
-        dit_config["dim"] = 2304
-        dit_config["cap_feat_dim"] = state_dict['{}cap_embedder.1.weight'.format(key_prefix)].shape[1]
+        w = state_dict['{}cap_embedder.1.weight'.format(key_prefix)]
+        dit_config["dim"] = w.shape[0]
+        dit_config["cap_feat_dim"] = w.shape[1]
         dit_config["n_layers"] = count_blocks(state_dict_keys, '{}layers.'.format(key_prefix) + '{}.')
-        dit_config["n_heads"] = 24
-        dit_config["n_kv_heads"] = 8
         dit_config["qk_norm"] = True
-        dit_config["axes_dims"] = [32, 32, 32]
-        dit_config["axes_lens"] = [300, 512, 512]
+
+        if dit_config["dim"] == 2304: # Original Lumina 2
+            dit_config["n_heads"] = 24
+            dit_config["n_kv_heads"] = 8
+            dit_config["axes_dims"] = [32, 32, 32]
+            dit_config["axes_lens"] = [300, 512, 512]
+            dit_config["rope_theta"] = 10000.0
+            dit_config["ffn_dim_multiplier"] = 4.0
+        elif dit_config["dim"] == 3840:  # Z image
+            dit_config["n_heads"] = 30
+            dit_config["n_kv_heads"] = 30
+            dit_config["axes_dims"] = [32, 48, 48]
+            dit_config["axes_lens"] = [1536, 512, 512]
+            dit_config["rope_theta"] = 256.0
+            dit_config["ffn_dim_multiplier"] = (8.0 / 3.0)
+            dit_config["z_image_modulation"] = True
+            dit_config["time_scale"] = 1000.0
+            if '{}cap_pad_token'.format(key_prefix) in state_dict_keys:
+                dit_config["pad_tokens_multiple"] = 32
+
         return dit_config
 
     if '{}head.modulation'.format(key_prefix) in state_dict_keys:  # Wan 2.1
diff --git a/comfy/model_management.py b/comfy/model_management.py
index a21df54b3..aeddbaefe 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -689,7 +689,7 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu
             loaded_memory = loaded_model.model_loaded_memory()
             current_free_mem = get_free_memory(torch_dev) + loaded_memory
 
-            lowvram_model_memory = max(128 * 1024 * 1024, (current_free_mem - minimum_memory_required), min(current_free_mem * MIN_WEIGHT_MEMORY_RATIO, current_free_mem - minimum_inference_memory()))
+            lowvram_model_memory = max(0, (current_free_mem - minimum_memory_required), min(current_free_mem * MIN_WEIGHT_MEMORY_RATIO, current_free_mem - minimum_inference_memory()))
             lowvram_model_memory = lowvram_model_memory - loaded_memory
 
             if lowvram_model_memory == 0:
@@ -1012,9 +1012,18 @@ def force_channels_last():
 
 
 STREAMS = {}
-NUM_STREAMS = 1
-if args.async_offload:
-    NUM_STREAMS = 2
+NUM_STREAMS = 0  # 0 disables async offloading: get_offload_stream returns None in that case
+if args.async_offload is not None:
+    NUM_STREAMS = args.async_offload  # an explicitly supplied value is used as the stream count
+else:
+    # Not specified: default to 2 streams on Nvidia, stay at 0 elsewhere
+    if is_nvidia():
+        NUM_STREAMS = 2
+
+if args.disable_async_offload:  # checked last, so it overrides both the explicit value and the Nvidia default
+    NUM_STREAMS = 0
+
+if NUM_STREAMS > 0:
     logging.info("Using async weight offloading with {} streams".format(NUM_STREAMS))
 
 def current_stream(device):
@@ -1030,7 +1039,10 @@ def current_stream(device):
 stream_counters = {}
 def get_offload_stream(device):
     stream_counter = stream_counters.get(device, 0)
-    if NUM_STREAMS <= 1:
+    if NUM_STREAMS == 0:
+        return None
+
+    if torch.compiler.is_compiling():
         return None
 
     if device in STREAMS:
@@ -1043,7 +1055,9 @@ def get_offload_stream(device):
     elif is_device_cuda(device):
         ss = []
         for k in range(NUM_STREAMS):
-            ss.append(torch.cuda.Stream(device=device, priority=0))
+            s1 = torch.cuda.Stream(device=device, priority=0)
+            s1.as_context = torch.cuda.stream
+            ss.append(s1)
         STREAMS[device] = ss
         s = ss[stream_counter]
         stream_counters[device] = stream_counter
@@ -1051,7 +1065,9 @@ def get_offload_stream(device):
     elif is_device_xpu(device):
         ss = []
         for k in range(NUM_STREAMS):
-            ss.append(torch.xpu.Stream(device=device, priority=0))
+            s1 = torch.xpu.Stream(device=device, priority=0)
+            s1.as_context = torch.xpu.stream
+            ss.append(s1)
         STREAMS[device] = ss
         s = ss[stream_counter]
         stream_counters[device] = stream_counter
@@ -1069,12 +1085,19 @@ def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, str
             if dtype is None or weight.dtype == dtype:
                 return weight
         if stream is not None:
-            with stream:
+            wf_context = stream
+            if hasattr(wf_context, "as_context"):
+                wf_context = wf_context.as_context(stream)
+            with wf_context:
                 return weight.to(dtype=dtype, copy=copy)
         return weight.to(dtype=dtype, copy=copy)
 
+
     if stream is not None:
-        with stream:
+        wf_context = stream
+        if hasattr(wf_context, "as_context"):
+            wf_context = wf_context.as_context(stream)
+        with wf_context:
             r = torch.empty_like(weight, dtype=dtype, device=device)
             r.copy_(weight, non_blocking=non_blocking)
     else:
@@ -1098,13 +1121,14 @@ if not args.disable_pinned_memory:
             MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.95
         logging.info("Enabled pinned memory {}".format(MAX_PINNED_MEMORY // (1024 * 1024)))
 
+PINNING_ALLOWED_TYPES = {"Parameter", "QuantizedTensor"}  # class names compared against type(tensor).__name__ in pin_memory
 
 def pin_memory(tensor):
     global TOTAL_PINNED_MEMORY
     if MAX_PINNED_MEMORY <= 0:
         return False
 
-    if type(tensor) is not torch.nn.parameter.Parameter:
+    if type(tensor).__name__ not in PINNING_ALLOWED_TYPES:
         return False
 
     if not is_device_cpu(tensor.device):
@@ -1124,6 +1148,9 @@ def pin_memory(tensor):
         return False
 
     ptr = tensor.data_ptr()
+    if ptr == 0:
+        return False
+
     if torch.cuda.cudart().cudaHostRegister(ptr, size, 1) == 0:
         PINNED_MEMORY[ptr] = size
         TOTAL_PINNED_MEMORY += size
diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py
index cf1b0d441..df2d8e827 100644
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -132,7 +132,7 @@ class LowVramPatch:
     def __call__(self, weight):
         intermediate_dtype = weight.dtype
         if self.convert_func is not None:
-            weight = self.convert_func(weight.to(dtype=torch.float32, copy=True), inplace=True)
+            weight = self.convert_func(weight, inplace=False)
 
         if intermediate_dtype not in [torch.float32, torch.float16, torch.bfloat16]: #intermediate_dtype has to be one that is supported in math ops
             intermediate_dtype = torch.float32
@@ -148,6 +148,15 @@ class LowVramPatch:
         else:
             return out
 
+# The patch logic above may upcast the weight to fp32 and then do math on it, so budget 3x the weight's fp32 size.
+LOWVRAM_PATCH_ESTIMATE_MATH_FACTOR = 3
+
+def low_vram_patch_estimate_vram(model, key):
+    # Estimated bytes needed to patch `key`: element count x fp32 item size x math factor (0 when the weight is missing).
+    weight, _set_func, _convert_func = get_key_weight(model, key)
+    fp32_bytes = 0 if weight is None else weight.numel() * torch.float32.itemsize
+    return fp32_bytes * LOWVRAM_PATCH_ESTIMATE_MATH_FACTOR
+
 def get_key_weight(model, key):
     set_func = None
     convert_func = None
@@ -231,7 +240,6 @@ class ModelPatcher:
         self.object_patches_backup = {}
         self.weight_wrapper_patches = {}
         self.model_options = {"transformer_options":{}}
-        self.model_size()
         self.load_device = load_device
         self.offload_device = offload_device
         self.weight_inplace_update = weight_inplace_update
@@ -270,6 +278,9 @@ class ModelPatcher:
         if not hasattr(self.model, 'current_weight_patches_uuid'):
             self.model.current_weight_patches_uuid = None
 
+        if not hasattr(self.model, 'model_offload_buffer_memory'):
+            self.model.model_offload_buffer_memory = 0
+
     def model_size(self):
         if self.size > 0:
             return self.size
@@ -286,7 +297,7 @@ class ModelPatcher:
         return self.model.lowvram_patch_counter
 
     def clone(self):
-        n = self.__class__(self.model, self.load_device, self.offload_device, self.size, weight_inplace_update=self.weight_inplace_update)
+        n = self.__class__(self.model, self.load_device, self.offload_device, self.model_size(), weight_inplace_update=self.weight_inplace_update)
         n.patches = {}
         for k in self.patches:
             n.patches[k] = self.patches[k][:]
@@ -663,7 +674,16 @@ class ModelPatcher:
                     skip = True # skip random weights in non leaf modules
                     break
             if not skip and (hasattr(m, "comfy_cast_weights") or len(params) > 0):
-                loading.append((comfy.model_management.module_size(m), n, m, params))
+                module_mem = comfy.model_management.module_size(m)
+                module_offload_mem = module_mem
+                if hasattr(m, "comfy_cast_weights"):
+                    weight_key = "{}.weight".format(n)
+                    bias_key = "{}.bias".format(n)
+                    if weight_key in self.patches:
+                        module_offload_mem += low_vram_patch_estimate_vram(self.model, weight_key)
+                    if bias_key in self.patches:
+                        module_offload_mem += low_vram_patch_estimate_vram(self.model, bias_key)
+                loading.append((module_offload_mem, module_mem, n, m, params))
         return loading
 
     def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False, full_load=False):
@@ -677,20 +697,22 @@ class ModelPatcher:
 
             load_completely = []
             offloaded = []
+            offload_buffer = 0
             loading.sort(reverse=True)
             for x in loading:
-                n = x[1]
-                m = x[2]
-                params = x[3]
-                module_mem = x[0]
+                module_offload_mem, module_mem, n, m, params = x
 
                 lowvram_weight = False
 
+                potential_offload = max(offload_buffer, module_offload_mem + (comfy.model_management.NUM_STREAMS * module_mem))
+                lowvram_fits = mem_counter + module_mem + potential_offload < lowvram_model_memory
+
                 weight_key = "{}.weight".format(n)
                 bias_key = "{}.bias".format(n)
 
                 if not full_load and hasattr(m, "comfy_cast_weights"):
-                    if mem_counter + module_mem >= lowvram_model_memory:
+                    if not lowvram_fits:
+                        offload_buffer = potential_offload
                         lowvram_weight = True
                         lowvram_counter += 1
                         lowvram_mem_counter += module_mem
@@ -724,9 +746,11 @@ class ModelPatcher:
                     if hasattr(m, "comfy_cast_weights"):
                         wipe_lowvram_weight(m)
 
-                    if full_load or mem_counter + module_mem < lowvram_model_memory:
+                    if full_load or lowvram_fits:
                         mem_counter += module_mem
                         load_completely.append((module_mem, n, m, params))
+                    else:
+                        offload_buffer = potential_offload
 
                 if cast_weight and hasattr(m, "comfy_cast_weights"):
                     m.prev_comfy_cast_weights = m.comfy_cast_weights
@@ -767,7 +791,7 @@ class ModelPatcher:
                     self.pin_weight_to_device("{}.{}".format(n, param))
 
             if lowvram_counter > 0:
-                logging.info("loaded partially; {:.2f} MB usable, {:.2f} MB loaded, {:.2f} MB offloaded, lowvram patches: {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), lowvram_mem_counter / (1024 * 1024), patch_counter))
+                logging.info("loaded partially; {:.2f} MB usable, {:.2f} MB loaded, {:.2f} MB offloaded, {:.2f} MB buffer reserved, lowvram patches: {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), lowvram_mem_counter / (1024 * 1024), offload_buffer / (1024 * 1024), patch_counter))
                 self.model.model_lowvram = True
             else:
                 logging.info("loaded completely; {:.2f} MB usable, {:.2f} MB loaded, full load: {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), full_load))
@@ -779,6 +803,7 @@ class ModelPatcher:
             self.model.lowvram_patch_counter += patch_counter
             self.model.device = device_to
             self.model.model_loaded_weight_memory = mem_counter
+            self.model.model_offload_buffer_memory = offload_buffer
             self.model.current_weight_patches_uuid = self.patches_uuid
 
             for callback in self.get_all_callbacks(CallbacksMP.ON_LOAD):
@@ -832,6 +857,7 @@ class ModelPatcher:
                 self.model.to(device_to)
                 self.model.device = device_to
             self.model.model_loaded_weight_memory = 0
+            self.model.model_offload_buffer_memory = 0
 
             for m in self.model.modules():
                 if hasattr(m, "comfy_patched_weights"):
@@ -850,13 +876,14 @@ class ModelPatcher:
             patch_counter = 0
             unload_list = self._load_list()
             unload_list.sort()
+            offload_buffer = self.model.model_offload_buffer_memory
+
             for unload in unload_list:
-                if memory_to_free < memory_freed:
+                if memory_to_free + offload_buffer - self.model.model_offload_buffer_memory < memory_freed:
                     break
-                module_mem = unload[0]
-                n = unload[1]
-                m = unload[2]
-                params = unload[3]
+                module_offload_mem, module_mem, n, m, params = unload
+
+                potential_offload = module_offload_mem + (comfy.model_management.NUM_STREAMS * module_mem)
 
                 lowvram_possible = hasattr(m, "comfy_cast_weights")
                 if hasattr(m, "comfy_patched_weights") and m.comfy_patched_weights == True:
@@ -907,15 +934,18 @@ class ModelPatcher:
                             m.comfy_cast_weights = True
                         m.comfy_patched_weights = False
                         memory_freed += module_mem
+                        offload_buffer = max(offload_buffer, potential_offload)
                         logging.debug("freed {}".format(n))
 
                         for param in params:
                             self.pin_weight_to_device("{}.{}".format(n, param))
 
+
             self.model.model_lowvram = True
             self.model.lowvram_patch_counter += patch_counter
             self.model.model_loaded_weight_memory -= memory_freed
-            logging.info("loaded partially: {:.2f} MB loaded, lowvram patches: {}".format(self.model.model_loaded_weight_memory / (1024 * 1024), self.model.lowvram_patch_counter))
+            self.model.model_offload_buffer_memory = offload_buffer
+            logging.info("Unloaded partially: {:.2f} MB freed, {:.2f} MB remains loaded, {:.2f} MB buffer reserved, lowvram patches: {}".format(memory_freed / (1024 * 1024), self.model.model_loaded_weight_memory / (1024 * 1024), offload_buffer / (1024 * 1024), self.model.lowvram_patch_counter))
             return memory_freed
 
     def partially_load(self, device_to, extra_memory=0, force_patch_weights=False):
diff --git a/comfy/ops.py b/comfy/ops.py
index 2a90a5ba2..eae434e68 100644
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -58,7 +58,8 @@ except (ModuleNotFoundError, TypeError):
 NVIDIA_MEMORY_CONV_BUG_WORKAROUND = False
 try:
     if comfy.model_management.is_nvidia():
-        if torch.backends.cudnn.version() >= 91002 and comfy.model_management.torch_version_numeric >= (2, 9) and comfy.model_management.torch_version_numeric <= (2, 10):
+        cudnn_version = torch.backends.cudnn.version()
+        if (cudnn_version >= 91002 and cudnn_version < 91500) and comfy.model_management.torch_version_numeric >= (2, 9) and comfy.model_management.torch_version_numeric <= (2, 10):
             #TODO: change upper bound version once it's fixed'
             NVIDIA_MEMORY_CONV_BUG_WORKAROUND = True
             logging.info("working around nvidia conv3d memory bug.")
@@ -94,6 +95,8 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of
 
     if offload_stream is not None:
         wf_context = offload_stream
+        if hasattr(wf_context, "as_context"):
+            wf_context = wf_context.as_context(offload_stream)
     else:
         wf_context = contextlib.nullcontext()
 
@@ -108,20 +111,24 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of
     if s.bias is not None:
         bias = comfy.model_management.cast_to(s.bias, bias_dtype, device, non_blocking=non_blocking, copy=bias_has_function, stream=offload_stream)
 
-        if bias_has_function:
-            with wf_context:
-                for f in s.bias_function:
-                    bias = f(bias)
+    comfy.model_management.sync_stream(device, offload_stream)
+
+    bias_a = bias
+    weight_a = weight
+
+    if s.bias is not None:
+        for f in s.bias_function:
+            bias = f(bias)
 
     if weight_has_function or weight.dtype != dtype:
-        with wf_context:
-            weight = weight.to(dtype=dtype)
-            for f in s.weight_function:
-                weight = f(weight)
+        weight = weight.to(dtype=dtype)
+        if isinstance(weight, QuantizedTensor):
+            weight = weight.dequantize()
+        for f in s.weight_function:
+            weight = f(weight)
 
-    comfy.model_management.sync_stream(device, offload_stream)
     if offloadable:
-        return weight, bias, offload_stream
+        return weight, bias, (offload_stream, weight_a, bias_a)
     else:
         #Legacy function signature
         return weight, bias
@@ -130,13 +137,16 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of
 def uncast_bias_weight(s, weight, bias, offload_stream):
     if offload_stream is None:
         return
-    if weight is not None:
-        device = weight.device
+    os, weight_a, bias_a = offload_stream  # 3-tuple as returned by cast_bias_weight when offloadable; `os` is a local stream here, not the os module
+    if os is None:  # no offload stream was used for this cast, nothing to synchronize
+        return
+    if weight_a is not None:  # device is taken from the tuple's tensors; the weight/bias arguments are not read
+        device = weight_a.device
     else:
-        if bias is None:
+        if bias_a is None:
             return
-        device = bias.device
-    offload_stream.wait_stream(comfy.model_management.current_stream(device))
+        device = bias_a.device
+    os.wait_stream(comfy.model_management.current_stream(device))  # the offload stream waits on the device's current stream
 
 
 class CastWeightBiasOp:
@@ -501,7 +511,7 @@ def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None
                     weight *= self.scale_weight.to(device=weight.device, dtype=weight.dtype)
                     return weight
                 else:
-                    return weight * self.scale_weight.to(device=weight.device, dtype=weight.dtype)
+                    return weight.to(dtype=torch.float32) * self.scale_weight.to(device=weight.device, dtype=torch.float32)
 
             def set_weight(self, weight, inplace_update=False, seed=None, return_weight=False, **kwargs):
                 weight = comfy.float.stochastic_rounding(weight / self.scale_weight.to(device=weight.device, dtype=weight.dtype), self.weight.dtype, seed=seed)
@@ -539,115 +549,136 @@ if CUBLAS_IS_AVAILABLE:
 # ==============================================================================
 from .quant_ops import QuantizedTensor, QUANT_ALGOS
 
-class MixedPrecisionOps(disable_weight_init):
-    _layer_quant_config = {}
-    _compute_dtype = torch.bfloat16
 
-    class Linear(torch.nn.Module, CastWeightBiasOp):
-        def __init__(
-            self,
-            in_features: int,
-            out_features: int,
-            bias: bool = True,
-            device=None,
-            dtype=None,
-        ) -> None:
-            super().__init__()
+def mixed_precision_ops(layer_quant_config={}, compute_dtype=torch.bfloat16, full_precision_mm=False):
+    class MixedPrecisionOps(manual_cast):
+        _layer_quant_config = layer_quant_config
+        _compute_dtype = compute_dtype
+        _full_precision_mm = full_precision_mm
 
-            self.factory_kwargs = {"device": device, "dtype": MixedPrecisionOps._compute_dtype}
-            # self.factory_kwargs = {"device": device, "dtype": dtype}
+        class Linear(torch.nn.Module, CastWeightBiasOp):
+            def __init__(
+                self,
+                in_features: int,
+                out_features: int,
+                bias: bool = True,
+                device=None,
+                dtype=None,
+            ) -> None:
+                super().__init__()
 
-            self.in_features = in_features
-            self.out_features = out_features
-            if bias:
-                self.bias = torch.nn.Parameter(torch.empty(out_features, **self.factory_kwargs))
-            else:
-                self.register_parameter("bias", None)
+                self.factory_kwargs = {"device": device, "dtype": MixedPrecisionOps._compute_dtype}
+                # self.factory_kwargs = {"device": device, "dtype": dtype}
 
-            self.tensor_class = None
+                self.in_features = in_features
+                self.out_features = out_features
+                if bias:
+                    self.bias = torch.nn.Parameter(torch.empty(out_features, **self.factory_kwargs))
+                else:
+                    self.register_parameter("bias", None)
 
-        def reset_parameters(self):
-            return None
+                self.tensor_class = None
+                self._full_precision_mm = MixedPrecisionOps._full_precision_mm
 
-        def _load_from_state_dict(self, state_dict, prefix, local_metadata,
-                                  strict, missing_keys, unexpected_keys, error_msgs):
+            def reset_parameters(self):
+                return None
 
-            device = self.factory_kwargs["device"]
-            layer_name = prefix.rstrip('.')
-            weight_key = f"{prefix}weight"
-            weight = state_dict.pop(weight_key, None)
-            if weight is None:
-                raise ValueError(f"Missing weight for layer {layer_name}")
+            def _load_from_state_dict(self, state_dict, prefix, local_metadata,
+                                    strict, missing_keys, unexpected_keys, error_msgs):
 
-            manually_loaded_keys = [weight_key]
+                device = self.factory_kwargs["device"]
+                layer_name = prefix.rstrip('.')
+                weight_key = f"{prefix}weight"
+                weight = state_dict.pop(weight_key, None)
+                if weight is None:
+                    raise ValueError(f"Missing weight for layer {layer_name}")
 
-            if layer_name not in MixedPrecisionOps._layer_quant_config:
-                self.weight = torch.nn.Parameter(weight.to(device=device, dtype=MixedPrecisionOps._compute_dtype), requires_grad=False)
-            else:
-                quant_format = MixedPrecisionOps._layer_quant_config[layer_name].get("format", None)
-                if quant_format is None:
-                    raise ValueError(f"Unknown quantization format for layer {layer_name}")
+                manually_loaded_keys = [weight_key]
 
-                qconfig = QUANT_ALGOS[quant_format]
-                self.layout_type = qconfig["comfy_tensor_layout"]
+                if layer_name not in MixedPrecisionOps._layer_quant_config:
+                    self.weight = torch.nn.Parameter(weight.to(device=device, dtype=MixedPrecisionOps._compute_dtype), requires_grad=False)
+                else:
+                    quant_format = MixedPrecisionOps._layer_quant_config[layer_name].get("format", None)
+                    if quant_format is None:
+                        raise ValueError(f"Unknown quantization format for layer {layer_name}")
 
-                weight_scale_key = f"{prefix}weight_scale"
-                layout_params = {
-                    'scale': state_dict.pop(weight_scale_key, None),
-                    'orig_dtype': MixedPrecisionOps._compute_dtype,
-                    'block_size': qconfig.get("group_size", None),
-                }
-                if layout_params['scale'] is not None:
-                    manually_loaded_keys.append(weight_scale_key)
+                    qconfig = QUANT_ALGOS[quant_format]
+                    self.layout_type = qconfig["comfy_tensor_layout"]
 
-                self.weight = torch.nn.Parameter(
-                    QuantizedTensor(weight.to(device=device), self.layout_type, layout_params),
-                    requires_grad=False
-                )
+                    weight_scale_key = f"{prefix}weight_scale"
+                    layout_params = {
+                        'scale': state_dict.pop(weight_scale_key, None),
+                        'orig_dtype': MixedPrecisionOps._compute_dtype,
+                        'block_size': qconfig.get("group_size", None),
+                    }
+                    if layout_params['scale'] is not None:
+                        manually_loaded_keys.append(weight_scale_key)
 
-                for param_name in qconfig["parameters"]:
-                    param_key = f"{prefix}{param_name}"
-                    _v = state_dict.pop(param_key, None)
-                    if _v is None:
-                        continue
-                    setattr(self, param_name, torch.nn.Parameter(_v.to(device=device), requires_grad=False))
-                    manually_loaded_keys.append(param_key)
+                    self.weight = torch.nn.Parameter(
+                        QuantizedTensor(weight.to(device=device), self.layout_type, layout_params),
+                        requires_grad=False
+                    )
 
-            super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
+                    for param_name in qconfig["parameters"]:
+                        param_key = f"{prefix}{param_name}"
+                        _v = state_dict.pop(param_key, None)
+                        if _v is None:
+                            continue
+                        setattr(self, param_name, torch.nn.Parameter(_v.to(device=device), requires_grad=False))
+                        manually_loaded_keys.append(param_key)
 
-            for key in manually_loaded_keys:
-                if key in missing_keys:
-                    missing_keys.remove(key)
+                super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
 
-        def _forward(self, input, weight, bias):
-            return torch.nn.functional.linear(input, weight, bias)
+                for key in manually_loaded_keys:
+                    if key in missing_keys:
+                        missing_keys.remove(key)
 
-        def forward_comfy_cast_weights(self, input):
-            weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)
-            x = self._forward(input, weight, bias)
-            uncast_bias_weight(self, weight, bias, offload_stream)
-            return x
+            def _forward(self, input, weight, bias):
+                return torch.nn.functional.linear(input, weight, bias)
 
-        def forward(self, input, *args, **kwargs):
-            run_every_op()
+            def forward_comfy_cast_weights(self, input):
+                weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)
+                x = self._forward(input, weight, bias)
+                uncast_bias_weight(self, weight, bias, offload_stream)
+                return x
 
-            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
-                return self.forward_comfy_cast_weights(input, *args, **kwargs)
-            if (getattr(self, 'layout_type', None) is not None and
-                getattr(self, 'input_scale', None) is not None and
-                not isinstance(input, QuantizedTensor)):
-                input = QuantizedTensor.from_float(input, self.layout_type, scale=self.input_scale, dtype=self.weight.dtype)
-            return self._forward(input, self.weight, self.bias)
+            def forward(self, input, *args, **kwargs):
+                run_every_op()
 
+                if self._full_precision_mm or self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+                    return self.forward_comfy_cast_weights(input, *args, **kwargs)
+                if (getattr(self, 'layout_type', None) is not None and
+                    getattr(self, 'input_scale', None) is not None and
+                    not isinstance(input, QuantizedTensor)):
+                    input = QuantizedTensor.from_float(input, self.layout_type, scale=self.input_scale, dtype=self.weight.dtype)
+                return self._forward(input, self.weight, self.bias)
+
+            def convert_weight(self, weight, inplace=False, **kwargs):
+                if isinstance(weight, QuantizedTensor):
+                    return weight.dequantize()
+                else:
+                    return weight
+
+            def set_weight(self, weight, inplace_update=False, seed=None, return_weight=False, **kwargs):
+                if getattr(self, 'layout_type', None) is not None:
+                    weight = QuantizedTensor.from_float(weight, self.layout_type, scale=None, dtype=self.weight.dtype, stochastic_rounding=seed, inplace_ops=True)
+                else:
+                    weight = weight.to(self.weight.dtype)
+                if return_weight:
+                    return weight
+
+                assert inplace_update is False  # TODO: eventually remove the inplace_update stuff
+                self.weight = torch.nn.Parameter(weight, requires_grad=False)
+
+    return MixedPrecisionOps
 
 def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, scaled_fp8=None, model_config=None):
-    if model_config and hasattr(model_config, 'layer_quant_config') and model_config.layer_quant_config:
-        MixedPrecisionOps._layer_quant_config = model_config.layer_quant_config
-        MixedPrecisionOps._compute_dtype = compute_dtype
-        logging.info(f"Using mixed precision operations: {len(model_config.layer_quant_config)} quantized layers")
-        return MixedPrecisionOps
+    fp8_compute = comfy.model_management.supports_fp8_compute(load_device) # TODO: if we support more ops this needs to be more granular
+
+    if model_config and hasattr(model_config, 'layer_quant_config') and model_config.layer_quant_config:
+        logging.info(f"Using mixed precision operations: {len(model_config.layer_quant_config)} quantized layers")
+        return mixed_precision_ops(model_config.layer_quant_config, compute_dtype, full_precision_mm=not fp8_compute)
 
-    fp8_compute = comfy.model_management.supports_fp8_compute(load_device)
     if scaled_fp8 is not None:
         return scaled_fp8_ops(fp8_matrix_mult=fp8_compute and fp8_optimizations, scale_input=fp8_optimizations, override_dtype=scaled_fp8)
 
diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py
index 1d058bece..bb1fb860c 100644
--- a/comfy/quant_ops.py
+++ b/comfy/quant_ops.py
@@ -1,6 +1,7 @@
 import torch
 import logging
 from typing import Tuple, Dict
+import comfy.float
 
 _LAYOUT_REGISTRY = {}
 _GENERIC_UTILS = {}
@@ -228,6 +229,14 @@ class QuantizedTensor(torch.Tensor):
         new_kwargs = dequant_arg(kwargs)
         return func(*new_args, **new_kwargs)
 
+    def data_ptr(self):
+        return self._qdata.data_ptr()
+
+    def is_pinned(self):
+        return self._qdata.is_pinned()
+
+    def is_contiguous(self, *arg, **kwargs):
+        return self._qdata.is_contiguous(*arg, **kwargs)
 
 # ==============================================================================
 # Generic Utilities (Layout-Agnostic Operations)
@@ -338,6 +347,18 @@ def generic_copy_(func, args, kwargs):
     return func(*args, **kwargs)
 
 
+@register_generic_util(torch.ops.aten.to.dtype)
+def generic_to_dtype(func, args, kwargs):
+    """Handle .to(dtype) calls - dtype conversion only."""
+    src = args[0]
+    if isinstance(src, QuantizedTensor):
+        # For dtype-only conversion, just change the orig_dtype, no real cast is needed
+        target_dtype = args[1] if len(args) > 1 else kwargs.get('dtype')
+        src._layout_params["orig_dtype"] = target_dtype
+        return src
+    return func(*args, **kwargs)
+
+
 @register_generic_util(torch.ops.aten._has_compatible_shallow_copy_type.default)
 def generic_has_compatible_shallow_copy_type(func, args, kwargs):
     return True
@@ -373,7 +394,7 @@ class TensorCoreFP8Layout(QuantizedLayout):
     - orig_dtype: Original dtype before quantization (for casting back)
     """
     @classmethod
-    def quantize(cls, tensor, scale=None, dtype=torch.float8_e4m3fn):
+    def quantize(cls, tensor, scale=None, dtype=torch.float8_e4m3fn, stochastic_rounding=0, inplace_ops=False):
         orig_dtype = tensor.dtype
 
         if scale is None:
@@ -383,22 +404,29 @@ class TensorCoreFP8Layout(QuantizedLayout):
             scale = torch.tensor(scale)
         scale = scale.to(device=tensor.device, dtype=torch.float32)
 
-        tensor_scaled = tensor * (1.0 / scale).to(tensor.dtype)
-        # TODO: uncomment this if it's actually needed because the clamp has a small performance penality'
-        # lp_amax = torch.finfo(dtype).max
-        # torch.clamp(tensor_scaled, min=-lp_amax, max=lp_amax, out=tensor_scaled)
-        qdata = tensor_scaled.to(dtype, memory_format=torch.contiguous_format)
+        if inplace_ops:
+            tensor *= (1.0 / scale).to(tensor.dtype)
+        else:
+            tensor = tensor * (1.0 / scale).to(tensor.dtype)
+
+        if stochastic_rounding > 0:
+            tensor = comfy.float.stochastic_rounding(tensor, dtype=dtype, seed=stochastic_rounding)
+        else:
+            lp_amax = torch.finfo(dtype).max
+            torch.clamp(tensor, min=-lp_amax, max=lp_amax, out=tensor)
+            tensor = tensor.to(dtype, memory_format=torch.contiguous_format)
 
         layout_params = {
             'scale': scale,
             'orig_dtype': orig_dtype
         }
-        return qdata, layout_params
+        return tensor, layout_params
 
     @staticmethod
     def dequantize(qdata, scale, orig_dtype, **kwargs):
         plain_tensor = torch.ops.aten._to_copy.default(qdata, dtype=orig_dtype)
-        return plain_tensor * scale
+        plain_tensor.mul_(scale)
+        return plain_tensor
 
     @classmethod
     def get_plain_tensors(cls, qtensor):
diff --git a/comfy/sd.py b/comfy/sd.py
index 2ea8cf761..ea790ad81 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -52,6 +52,8 @@ import comfy.text_encoders.ace
 import comfy.text_encoders.omnigen2
 import comfy.text_encoders.qwen_image
 import comfy.text_encoders.hunyuan_image
+import comfy.text_encoders.z_image
+import comfy.text_encoders.ovis
 
 import comfy.model_patcher
 import comfy.lora
@@ -59,6 +61,8 @@ import comfy.lora_convert
 import comfy.hooks
 import comfy.t2i_adapter.adapter
 import comfy.taesd.taesd
+import comfy.taesd.taehv
+import comfy.latent_formats
 
 import comfy.ldm.flux.redux
 
@@ -356,7 +360,7 @@ class VAE:
 
                     self.memory_used_encode = lambda shape, dtype: (700 * shape[2] * shape[3]) * model_management.dtype_size(dtype)
                     self.memory_used_decode = lambda shape, dtype: (700 * shape[2] * shape[3] * 32 * 32) * model_management.dtype_size(dtype)
-                elif sd['decoder.conv_in.weight'].shape[1] == 32:
+                elif sd['decoder.conv_in.weight'].shape[1] == 32 and sd['decoder.conv_in.weight'].ndim == 5:
                     ddconfig = {"block_out_channels": [128, 256, 512, 1024, 1024], "in_channels": 3, "out_channels": 3, "num_res_blocks": 2, "ffactor_spatial": 16, "ffactor_temporal": 4, "downsample_match_channel": True, "upsample_match_channel": True, "refiner_vae": False}
                     self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1]
                     self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32]
@@ -382,6 +386,17 @@ class VAE:
                         self.upscale_ratio = 4
 
                     self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1]
+                    if 'decoder.post_quant_conv.weight' in sd:
+                        sd = comfy.utils.state_dict_prefix_replace(sd, {"decoder.post_quant_conv.": "post_quant_conv.", "encoder.quant_conv.": "quant_conv."})
+
+                    if 'bn.running_mean' in sd:
+                        ddconfig["batch_norm_latent"] = True
+                        self.downscale_ratio *= 2
+                        self.upscale_ratio *= 2
+                        self.latent_channels *= 4
+                        old_memory_used_decode = self.memory_used_decode
+                        self.memory_used_decode = lambda shape, dtype: old_memory_used_decode(shape, dtype) *  4.0
+
                     if 'post_quant_conv.weight' in sd:
                         self.first_stage_model = AutoencoderKL(ddconfig=ddconfig, embed_dim=sd['post_quant_conv.weight'].shape[1])
                     else:
@@ -441,20 +456,20 @@ class VAE:
             elif "decoder.conv_in.conv.weight" in sd and sd['decoder.conv_in.conv.weight'].shape[1] == 32:
                 ddconfig = {"block_out_channels": [128, 256, 512, 1024, 1024], "in_channels": 3, "out_channels": 3, "num_res_blocks": 2, "ffactor_spatial": 16, "ffactor_temporal": 4, "downsample_match_channel": True, "upsample_match_channel": True}
                 ddconfig['z_channels'] = sd["decoder.conv_in.conv.weight"].shape[1]
-                self.latent_channels = 64
+                self.latent_channels = 32
                 self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 16, 16)
                 self.upscale_index_formula = (4, 16, 16)
                 self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 16, 16)
                 self.downscale_index_formula = (4, 16, 16)
                 self.latent_dim = 3
-                self.not_video = True
+                self.not_video = False
                 self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32]
                 self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.EmptyRegularizer"},
                                                             encoder_config={'target': "comfy.ldm.hunyuan_video.vae_refiner.Encoder", 'params': ddconfig},
                                                             decoder_config={'target': "comfy.ldm.hunyuan_video.vae_refiner.Decoder", 'params': ddconfig})
 
-                self.memory_used_encode = lambda shape, dtype: (1400 * shape[-2] * shape[-1]) * model_management.dtype_size(dtype)
-                self.memory_used_decode = lambda shape, dtype: (1400 * shape[-3] * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype)
+                self.memory_used_encode = lambda shape, dtype: (1400 * 9 * shape[-2] * shape[-1]) * model_management.dtype_size(dtype)
+                self.memory_used_decode = lambda shape, dtype: (2800 * 4 * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype)
             elif "decoder.conv_in.conv.weight" in sd:
                 ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
                 ddconfig["conv3d"] = True
@@ -496,13 +511,14 @@ class VAE:
                     self.memory_used_encode = lambda shape, dtype: 3300 * shape[3] * shape[4] * model_management.dtype_size(dtype)
                     self.memory_used_decode = lambda shape, dtype: 8000 * shape[3] * shape[4] * (16 * 16) * model_management.dtype_size(dtype)
                 else:  # Wan 2.1 VAE
+                    dim = sd["decoder.head.0.gamma"].shape[0]
                     self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8)
                     self.upscale_index_formula = (4, 8, 8)
                     self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8)
                     self.downscale_index_formula = (4, 8, 8)
                     self.latent_dim = 3
                     self.latent_channels = 16
-                    ddconfig = {"dim": 96, "z_dim": self.latent_channels, "dim_mult": [1, 2, 4, 4], "num_res_blocks": 2, "attn_scales": [], "temperal_downsample": [False, True, True], "dropout": 0.0}
+                    ddconfig = {"dim": dim, "z_dim": self.latent_channels, "dim_mult": [1, 2, 4, 4], "num_res_blocks": 2, "attn_scales": [], "temperal_downsample": [False, True, True], "dropout": 0.0}
                     self.first_stage_model = comfy.ldm.wan.vae.WanVAE(**ddconfig)
                     self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32]
                     self.memory_used_encode = lambda shape, dtype: 6000 * shape[3] * shape[4] * model_management.dtype_size(dtype)
@@ -572,6 +588,35 @@ class VAE:
                 self.process_input = lambda audio: audio
                 self.working_dtypes = [torch.float32]
                 self.crop_input = False
+            elif "decoder.22.bias" in sd: # taehv, taew and lighttae
+                self.latent_channels = sd["decoder.1.weight"].shape[1]
+                self.latent_dim = 3
+                self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 16, 16)
+                self.upscale_index_formula = (4, 16, 16)
+                self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 16, 16)
+                self.downscale_index_formula = (4, 16, 16)
+                if self.latent_channels == 48: # Wan 2.2
+                    self.first_stage_model = comfy.taesd.taehv.TAEHV(latent_channels=self.latent_channels, latent_format=None) # taehv doesn't need scaling
+                    self.process_input = lambda image: (_ for _ in ()).throw(NotImplementedError("This light tae doesn't support encoding currently"))
+                    self.process_output = lambda image: image
+                    self.memory_used_decode = lambda shape, dtype: (1800 * (max(1, (shape[-3] ** 0.7 * 0.1)) * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype))
+                elif self.latent_channels == 32 and sd["decoder.22.bias"].shape[0] == 12: # lighttae_hv15
+                    self.first_stage_model = comfy.taesd.taehv.TAEHV(latent_channels=self.latent_channels, latent_format=comfy.latent_formats.HunyuanVideo15)
+                    self.process_input = lambda image: (_ for _ in ()).throw(NotImplementedError("This light tae doesn't support encoding currently"))
+                    self.memory_used_decode = lambda shape, dtype: (1200 * (max(1, (shape[-3] ** 0.7 * 0.05)) * shape[-2] * shape[-1] * 32 * 32) * model_management.dtype_size(dtype))
+                else:
+                    if sd["decoder.1.weight"].dtype == torch.float16: # taehv currently only available in float16, so assume it's not lighttaew2_1 as otherwise state dicts are identical
+                        latent_format=comfy.latent_formats.HunyuanVideo
+                    else:
+                        latent_format=None # lighttaew2_1 doesn't need scaling
+                    self.first_stage_model = comfy.taesd.taehv.TAEHV(latent_channels=self.latent_channels, latent_format=latent_format)
+                    self.process_input = self.process_output = lambda image: image
+                    self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8)
+                    self.upscale_index_formula = (4, 8, 8)
+                    self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8)
+                    self.downscale_index_formula = (4, 8, 8)
+                    self.memory_used_encode = lambda shape, dtype: (700 * (max(1, (shape[-3] ** 0.66 * 0.11)) * shape[-2] * shape[-1]) * model_management.dtype_size(dtype))
+                    self.memory_used_decode = lambda shape, dtype: (50 * (max(1, (shape[-3] ** 0.65 * 0.26)) * shape[-2] * shape[-1] * 32 * 32) * model_management.dtype_size(dtype))
             else:
                 logging.warning("WARNING: No VAE weights detected, VAE not initalized.")
                 self.first_stage_model = None
@@ -911,12 +956,19 @@ class CLIPType(Enum):
     OMNIGEN2 = 17
     QWEN_IMAGE = 18
     HUNYUAN_IMAGE = 19
+    HUNYUAN_VIDEO_15 = 20
+    OVIS = 21
 
 
 def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
     clip_data = []
     for p in ckpt_paths:
-        clip_data.append(comfy.utils.load_torch_file(p, safe_load=True))
+        sd, metadata = comfy.utils.load_torch_file(p, safe_load=True, return_metadata=True)
+        if metadata is not None:
+            quant_metadata = metadata.get("_quantization_metadata", None)
+            if quant_metadata is not None:
+                sd["_quantization_metadata"] = quant_metadata
+        clip_data.append(sd)
     return load_text_encoder_state_dicts(clip_data, embedding_directory=embedding_directory, clip_type=clip_type, model_options=model_options)
 
 
@@ -934,6 +986,11 @@ class TEModel(Enum):
     QWEN25_7B = 11
     BYT5_SMALL_GLYPH = 12
     GEMMA_3_4B = 13
+    MISTRAL3_24B = 14
+    MISTRAL3_24B_PRUNED_FLUX2 = 15
+    QWEN3_4B = 16
+    QWEN3_2B = 17
+
 
 def detect_te_model(sd):
     if "text_model.encoder.layers.30.mlp.fc1.weight" in sd:
@@ -966,6 +1023,18 @@ def detect_te_model(sd):
         if weight.shape[0] == 512:
             return TEModel.QWEN25_7B
     if "model.layers.0.post_attention_layernorm.weight" in sd:
+        weight = sd['model.layers.0.post_attention_layernorm.weight']
+        if 'model.layers.0.self_attn.q_norm.weight' in sd:
+            if weight.shape[0] == 2560:
+                return TEModel.QWEN3_4B
+            elif weight.shape[0] == 2048:
+                return TEModel.QWEN3_2B
+        if weight.shape[0] == 5120:
+            if "model.layers.39.post_attention_layernorm.weight" in sd:
+                return TEModel.MISTRAL3_24B
+            else:
+                return TEModel.MISTRAL3_24B_PRUNED_FLUX2
+
         return TEModel.LLAMA3_8
     return None
 
@@ -1080,6 +1149,16 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
             else:
                 clip_target.clip = comfy.text_encoders.qwen_image.te(**llama_detect(clip_data))
                 clip_target.tokenizer = comfy.text_encoders.qwen_image.QwenImageTokenizer
+        elif te_model == TEModel.MISTRAL3_24B or te_model == TEModel.MISTRAL3_24B_PRUNED_FLUX2:
+            clip_target.clip = comfy.text_encoders.flux.flux2_te(**llama_detect(clip_data), pruned=te_model == TEModel.MISTRAL3_24B_PRUNED_FLUX2)
+            clip_target.tokenizer = comfy.text_encoders.flux.Flux2Tokenizer
+            tokenizer_data["tekken_model"] = clip_data[0].get("tekken_model", None)
+        elif te_model == TEModel.QWEN3_4B:
+            clip_target.clip = comfy.text_encoders.z_image.te(**llama_detect(clip_data))
+            clip_target.tokenizer = comfy.text_encoders.z_image.ZImageTokenizer
+        elif te_model == TEModel.QWEN3_2B:
+            clip_target.clip = comfy.text_encoders.ovis.te(**llama_detect(clip_data))
+            clip_target.tokenizer = comfy.text_encoders.ovis.OvisTokenizer
         else:
             # clip_l
             if clip_type == CLIPType.SD3:
@@ -1126,6 +1205,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
         elif clip_type == CLIPType.HUNYUAN_IMAGE:
             clip_target.clip = comfy.text_encoders.hunyuan_image.te(**llama_detect(clip_data))
             clip_target.tokenizer = comfy.text_encoders.hunyuan_image.HunyuanImageTokenizer
+        elif clip_type == CLIPType.HUNYUAN_VIDEO_15:
+            clip_target.clip = comfy.text_encoders.hunyuan_image.te(**llama_detect(clip_data))
+            clip_target.tokenizer = comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer
         else:
             clip_target.clip = sdxl_clip.SDXLClipModel
             clip_target.tokenizer = sdxl_clip.SDXLTokenizer
@@ -1138,6 +1220,8 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
 
     parameters = 0
     for c in clip_data:
+        if "_quantization_metadata" in c:
+            c.pop("_quantization_metadata")
         parameters += comfy.utils.calculate_parameters(c)
         tokenizer_data, model_options = comfy.text_encoders.long_clipl.model_options_long_clip(c, tokenizer_data, model_options)
 
diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py
index f8a7c2a1b..0fc9ab3db 100644
--- a/comfy/sd1_clip.py
+++ b/comfy/sd1_clip.py
@@ -90,7 +90,6 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
                  special_tokens={"start": 49406, "end": 49407, "pad": 49407}, layer_norm_hidden_state=True, enable_attention_masks=False, zero_out_masked=False,
                  return_projected_pooled=True, return_attention_masks=False, model_options={}):  # clip-vit-base-patch32
         super().__init__()
-        assert layer in self.LAYERS
 
         if textmodel_json_config is None:
             textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_clip_config.json")
@@ -109,13 +108,23 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
 
         operations = model_options.get("custom_operations", None)
         scaled_fp8 = None
+        quantization_metadata = model_options.get("quantization_metadata", None)
 
         if operations is None:
-            scaled_fp8 = model_options.get("scaled_fp8", None)
-            if scaled_fp8 is not None:
-                operations = comfy.ops.scaled_fp8_ops(fp8_matrix_mult=False, override_dtype=scaled_fp8)
+            layer_quant_config = None
+            if quantization_metadata is not None:
+                layer_quant_config = json.loads(quantization_metadata).get("layers", None)
+
+            if layer_quant_config is not None:
+                operations = comfy.ops.mixed_precision_ops(layer_quant_config, dtype, full_precision_mm=True)
+                logging.info(f"Using MixedPrecisionOps for text encoder: {len(layer_quant_config)} quantized layers")
             else:
-                operations = comfy.ops.manual_cast
+                # Fallback to scaled_fp8_ops for backward compatibility
+                scaled_fp8 = model_options.get("scaled_fp8", None)
+                if scaled_fp8 is not None:
+                    operations = comfy.ops.scaled_fp8_ops(fp8_matrix_mult=False, override_dtype=scaled_fp8)
+                else:
+                    operations = comfy.ops.manual_cast
 
         self.operations = operations
         self.transformer = model_class(config, dtype, device, self.operations)
@@ -154,7 +163,7 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
     def set_clip_options(self, options):
         layer_idx = options.get("layer", self.layer_idx)
         self.return_projected_pooled = options.get("projected_pooled", self.return_projected_pooled)
-        if self.layer == "all":
+        if isinstance(self.layer, list) or self.layer == "all":
             pass
         elif layer_idx is None or abs(layer_idx) > self.num_layers:
             self.layer = "last"
@@ -256,7 +265,9 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
         if self.enable_attention_masks:
             attention_mask_model = attention_mask
 
-        if self.layer == "all":
+        if isinstance(self.layer, list):
+            intermediate_output = self.layer
+        elif self.layer == "all":
             intermediate_output = "all"
         else:
             intermediate_output = self.layer_idx
@@ -460,7 +471,7 @@ def load_embed(embedding_name, embedding_directory, embedding_size, embed_key=No
     return embed_out
 
 class SDTokenizer:
-    def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, tokenizer_data={}, tokenizer_args={}):
+    def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, pad_left=False, tokenizer_data={}, tokenizer_args={}):
         if tokenizer_path is None:
             tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_tokenizer")
         self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path, **tokenizer_args)
@@ -468,6 +479,7 @@ class SDTokenizer:
         self.min_length = tokenizer_data.get("{}_min_length".format(embedding_key), min_length)
         self.end_token = None
         self.min_padding = min_padding
+        self.pad_left = pad_left
 
         empty = self.tokenizer('')["input_ids"]
         self.tokenizer_adds_end_token = has_end_token
@@ -522,6 +534,12 @@ class SDTokenizer:
                 return (embed, "{} {}".format(embedding_name[len(stripped):], leftover))
         return (embed, leftover)
 
+    def pad_tokens(self, tokens, amount):
+        padding = [(self.pad_token, 1.0, 0)] * amount  # (pad_token, weight 1.0, word id 0); empty list when amount <= 0
+        if self.pad_left:
+            tokens[:0] = padding  # prepend in place with one slice assignment instead of repeated insert(0, ...)
+        else:
+            tokens.extend(padding)  # append in place
 
     def tokenize_with_weights(self, text:str, return_word_ids=False, tokenizer_options={}, **kwargs):
         '''
@@ -600,7 +618,7 @@ class SDTokenizer:
                         if self.end_token is not None:
                             batch.append((self.end_token, 1.0, 0))
                         if self.pad_to_max_length:
-                            batch.extend([(self.pad_token, 1.0, 0)] * (remaining_length))
+                            self.pad_tokens(batch, remaining_length)
                     #start new batch
                     batch = []
                     if self.start_token is not None:
@@ -614,11 +632,11 @@ class SDTokenizer:
         if self.end_token is not None:
             batch.append((self.end_token, 1.0, 0))
         if min_padding is not None:
-            batch.extend([(self.pad_token, 1.0, 0)] * min_padding)
+            self.pad_tokens(batch, min_padding)
         if self.pad_to_max_length and len(batch) < self.max_length:
-            batch.extend([(self.pad_token, 1.0, 0)] * (self.max_length - len(batch)))
+            self.pad_tokens(batch, self.max_length - len(batch))
         if min_length is not None and len(batch) < min_length:
-            batch.extend([(self.pad_token, 1.0, 0)] * (min_length - len(batch)))
+            self.pad_tokens(batch, min_length - len(batch))
 
         if not return_word_ids:
             batched_tokens = [[(t, w) for t, w,_ in x] for x in batched_tokens]
diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index 3943b0dd3..ff0e287f4 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -22,6 +22,7 @@ import comfy.text_encoders.omnigen2
 import comfy.text_encoders.higgsv2
 import comfy.text_encoders.qwen_image
 import comfy.text_encoders.hunyuan_image
+import comfy.text_encoders.z_image
 
 from . import supported_models_base
 from . import latent_formats
@@ -742,6 +743,37 @@ class FluxSchnell(Flux):
         out = model_base.Flux(self, model_type=model_base.ModelType.FLOW, device=device)
         return out
 
+class Flux2(Flux):  # Flux subclass for the "flux2" image model; overrides sampling, latent format and key prefixes
+    unet_config = {
+        "image_model": "flux2",
+    }
+
+    sampling_settings = {
+        "shift": 2.02,
+    }
+
+    unet_extra_config = {}
+    latent_format = latent_formats.Flux2
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def __init__(self, unet_config):
+        super().__init__(unet_config)
+        self.memory_usage_factor = self.memory_usage_factor * (2.0 * 2.0) * 2.36  # scales the value present after the parent __init__
+
+    def get_model(self, state_dict, prefix="", device=None):  # state_dict and prefix are not used in this override
+        out = model_base.Flux2(self, device=device)
+        return out
+
+    def clip_target(self, state_dict={}):
+        return None # TODO: no text-encoder target is returned yet; every line below this return is unreachable
+        pref = self.text_encoder_key_prefix[0]
+        t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
+        return supported_models_base.ClipTarget(comfy.text_encoders.flux.FluxTokenizer, comfy.text_encoders.flux.flux_clip(**t5_detect))
+
 class GenmoMochi(supported_models_base.BASE):
     unet_config = {
         "image_model": "mochi_preview",
@@ -964,7 +996,7 @@ class Lumina2(supported_models_base.BASE):
         "shift": 6.0,
     }
 
-    memory_usage_factor = 1.2
+    memory_usage_factor = 1.4
 
     unet_extra_config = {}
     latent_format = latent_formats.Flux
@@ -983,6 +1015,26 @@ class Lumina2(supported_models_base.BASE):
         hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}gemma2_2b.transformer.".format(pref))
         return supported_models_base.ClipTarget(comfy.text_encoders.lumina2.LuminaTokenizer, comfy.text_encoders.lumina2.te(**hunyuan_detect))
 
+class ZImage(Lumina2):  # Lumina2 subclass; its unet_config additionally pins "dim": 3840
+    unet_config = {
+        "image_model": "lumina2",
+        "dim": 3840,
+    }
+
+    sampling_settings = {
+        "multiplier": 1.0,
+        "shift": 3.0,
+    }
+
+    memory_usage_factor = 1.7
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
+
+    def clip_target(self, state_dict={}):  # pairs ZImageTokenizer with the z_image text encoder
+        pref = self.text_encoder_key_prefix[0]  # attribute is not defined in this class; it comes from a base class
+        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen3_4b.transformer.".format(pref))
+        return supported_models_base.ClipTarget(comfy.text_encoders.z_image.ZImageTokenizer, comfy.text_encoders.z_image.te(**hunyuan_detect))  # detection result is passed on as keyword arguments
+
 class WAN21_T2V(supported_models_base.BASE):
     unet_config = {
         "image_model": "wan2.1",
@@ -1391,6 +1443,54 @@ class HunyuanImage21Refiner(HunyuanVideo):
         out = model_base.HunyuanImage21Refiner(self, device=device)
         return out
 
-models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Higgsv2]
+class HunyuanVideo15(HunyuanVideo):  # HunyuanVideo subclass whose unet_config also pins "vision_in_dim": 1152
+    unet_config = {
+        "image_model": "hunyuan_video",
+        "vision_in_dim": 1152,
+    }
+
+    sampling_settings = {
+        "shift": 7.0,
+    }
+    memory_usage_factor = 4.0 #TODO
+    supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]
+
+    latent_format = latent_formats.HunyuanVideo15
+
+    def get_model(self, state_dict, prefix="", device=None):  # state_dict and prefix are not used in this override
+        out = model_base.HunyuanVideo15(self, device=device)
+        return out
+
+    def clip_target(self, state_dict={}):  # pairs HunyuanVideo15Tokenizer with the hunyuan_image text encoder
+        pref = self.text_encoder_key_prefix[0]  # attribute is not defined in this class; it comes from a base class
+        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
+        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer, comfy.text_encoders.hunyuan_image.te(**hunyuan_detect))
+
+
+class HunyuanVideo15_SR_Distilled(HunyuanVideo):  # derives from HunyuanVideo (not HunyuanVideo15); unet_config also pins "in_channels": 98
+    unet_config = {
+        "image_model": "hunyuan_video",
+        "vision_in_dim": 1152,
+        "in_channels": 98,
+    }
+
+    sampling_settings = {
+        "shift": 2.0,
+    }
+    memory_usage_factor = 4.0 #TODO
+    supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]
+
+    latent_format = latent_formats.HunyuanVideo15
+
+    def get_model(self, state_dict, prefix="", device=None):  # state_dict and prefix are not used in this override
+        out = model_base.HunyuanVideo15_SR_Distilled(self, device=device)
+        return out
+
+    def clip_target(self, state_dict={}):  # same body as HunyuanVideo15.clip_target
+        pref = self.text_encoder_key_prefix[0]  # attribute is not defined in this class; it comes from a base class
+        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
+        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer, comfy.text_encoders.hunyuan_image.te(**hunyuan_detect))
+
+models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Flux2, Higgsv2]
 
 models += [SVD_img2vid]
diff --git a/comfy/taesd/taehv.py b/comfy/taesd/taehv.py
new file mode 100644
index 000000000..3dfe1e4d4
--- /dev/null
+++ b/comfy/taesd/taehv.py
@@ -0,0 +1,171 @@
+# Tiny AutoEncoder for HunyuanVideo and WanVideo https://github.com/madebyollin/taehv
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from tqdm.auto import tqdm
+from collections import namedtuple, deque
+
+import comfy.ops
+operations=comfy.ops.disable_weight_init
+
+DecoderResult = namedtuple("DecoderResult", ("frame", "memory"))
+TWorkItem = namedtuple("TWorkItem", ("input_tensor", "block_index"))
+
+def conv(n_in, n_out, **kwargs):  # Conv2d helper: third positional arg 3 (kernel size, assuming torch.nn.Conv2d argument order) and padding=1
+    return operations.Conv2d(n_in, n_out, 3, padding=1, **kwargs)  # extra kwargs (e.g. stride, bias) are forwarded unchanged
+
+class Clamp(nn.Module):
+    def forward(self, x):  # smooth saturation: the result is bounded to [-3, 3]
+        return 3 * torch.tanh(x / 3)
+
+class MemBlock(nn.Module):  # conv block with a skip path that also receives a second tensor, `past`
+    def __init__(self, n_in, n_out, act_func):
+        super().__init__()
+        self.conv = nn.Sequential(conv(n_in * 2, n_out), act_func, conv(n_out, n_out), act_func, conv(n_out, n_out))  # n_in * 2: the input is cat([x, past]) on dim 1
+        self.skip = operations.Conv2d(n_in, n_out, 1, bias=False) if n_in != n_out else nn.Identity()  # projection only when channel counts differ
+        self.act = act_func  # the same act_func object is also used inside self.conv
+    def forward(self, x, past):  # `past` is expected to match x's shape so the concatenation yields n_in * 2 channels
+        return self.act(self.conv(torch.cat([x, past], 1)) + self.skip(x))
+
+class TPool(nn.Module):  # folds `stride` consecutive dim-0 entries into the channel dim, then mixes them with a bias-free conv
+    def __init__(self, n_f, stride):
+        super().__init__()
+        self.conv = operations.Conv2d(n_f * stride, n_f, 1, bias=False)
+        self.stride = stride
+    def forward(self, x):
+        _, ch, h, w = x.shape  # x must be 4-D
+        return self.conv(x.reshape(-1, ch * self.stride, h, w))
+
+class TGrow(nn.Module):  # bias-free conv widens channels by `stride`; the extra channels are then unfolded into dim 0
+    def __init__(self, n_f, stride):
+        super().__init__()
+        self.conv = operations.Conv2d(n_f, n_f * stride, 1, bias=False)
+        self.stride = stride
+    def forward(self, x):
+        _, ch, h, w = x.shape  # x must be 4-D; ch is the channel count before the conv
+        grown = self.conv(x)
+        return grown.reshape(-1, ch, h, w)
+
+def apply_model_with_memblocks(model, x, parallel, show_progress_bar):
+    """Run the blocks of `model` over x of shape (B, T, C, H, W), either all timesteps at once or one timestep at a time."""
+    B, T, C, H, W = x.shape
+    if parallel:
+        x = x.reshape(B*T, C, H, W)  # fold time into the batch dim so every block sees all timesteps at once
+        # parallel over input timesteps, iterate over blocks
+        for b in tqdm(model, disable=not show_progress_bar):
+            if isinstance(b, MemBlock):
+                BT, C, H, W = x.shape
+                T = BT // B  # T can change between blocks (TPool / TGrow reshape dim 0)
+                _x = x.reshape(B, T, C, H, W)
+                mem = F.pad(_x, (0,0,0,0,0,0,1,0), value=0)[:,:T].reshape(x.shape)  # x shifted by one step along T, zeros for the first step
+                x = b(x, mem)
+            else:
+                x = b(x)
+        BT, C, H, W = x.shape
+        T = BT // B
+        x = x.view(B, T, C, H, W)
+    else:
+        out = []  # finished per-timestep outputs, stacked along dim 1 at the end
+        work_queue = deque([TWorkItem(xt, 0) for t, xt in enumerate(x.reshape(B, T * C, H, W).chunk(T, dim=1))])  # one (B, C, H, W) item per timestep, all starting at block 0
+        progress_bar = tqdm(range(T), disable=not show_progress_bar)
+        mem = [None] * len(model)  # per-block state, indexed by block position
+        while work_queue:
+            xt, i = work_queue.popleft()
+            if i == 0:
+                progress_bar.update(1)  # progress is counted per input timestep entering block 0
+            if i == len(model):  # ran past the last block: xt is a finished output
+                out.append(xt)
+                del xt
+            else:
+                b = model[i]
+                if isinstance(b, MemBlock):
+                    if mem[i] is None:
+                        xt_new = b(xt, xt * 0)  # nothing stored yet for this block: use zeros as the past
+                        mem[i] = xt.detach().clone()
+                    else:
+                        xt_new = b(xt, mem[i])
+                        mem[i] = xt.detach().clone()  # this block's input becomes the past for its next item
+                    del xt
+                    work_queue.appendleft(TWorkItem(xt_new, i+1))  # appendleft: continue with this item before later timesteps
+                elif isinstance(b, TPool):
+                    if mem[i] is None:
+                        mem[i] = []
+                    mem[i].append(xt.detach().clone())  # collect inputs until b.stride of them are available
+                    if len(mem[i]) == b.stride:
+                        B, C, H, W = xt.shape
+                        xt = b(torch.cat(mem[i], 1).view(B*b.stride, C, H, W))
+                        mem[i] = []
+                        work_queue.appendleft(TWorkItem(xt, i+1))
+                elif isinstance(b, TGrow):
+                    xt = b(xt)
+                    NT, C, H, W = xt.shape
+                    for xt_next in reversed(xt.view(B, b.stride*C, H, W).chunk(b.stride, 1)):  # reversed + appendleft keeps the chunks in their original order
+                        work_queue.appendleft(TWorkItem(xt_next, i+1))
+                    del xt
+                else:
+                    xt = b(xt)
+                    work_queue.appendleft(TWorkItem(xt, i+1))
+        progress_bar.close()
+        x = torch.stack(out, 1)
+    return x
+
+
+class TAEHV(nn.Module):
+    """Tiny video autoencoder: encode() and decode() both take and return tensors laid out as [B, C, T, H, W]."""
+    def __init__(self, latent_channels, parallel=False, decoder_time_upscale=(True, True), decoder_space_upscale=(True, True, True), latent_format=None, show_progress_bar=True):
+        super().__init__()
+        self.image_channels = 3
+        self.latent_channels = latent_channels
+        self.parallel = parallel
+        self.latent_format = latent_format
+        self.show_progress_bar = show_progress_bar  # stored through the class-level property setter below
+        self.process_in = latent_format().process_in if latent_format is not None else (lambda x: x)
+        self.process_out = latent_format().process_out if latent_format is not None else (lambda x: x)
+        # Wan 2.2 (48 channels) and HunyuanVideo1.5 (32 channels) use 2x2 pixel patches; everything else uses patch_size 1
+        self.patch_size = 2 if self.latent_channels in [48, 32] else 1
+        if self.latent_channels == 32: # HunyuanVideo1.5
+            act_func = nn.LeakyReLU(0.2, inplace=True)
+        else: # HunyuanVideo, Wan 2.1
+            act_func = nn.ReLU(inplace=True)
+
+        self.encoder = nn.Sequential(
+            conv(self.image_channels*self.patch_size**2, 64), act_func,
+            TPool(64, 2), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func),
+            TPool(64, 2), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func),
+            TPool(64, 1), conv(64, 64, stride=2, bias=False), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func), MemBlock(64, 64, act_func),
+            conv(64, self.latent_channels),
+        )
+        n_f = [256, 128, 64, 64]
+        self.frames_to_trim = 2**sum(decoder_time_upscale) - 1  # leading frames dropped by decode(): 2**(enabled time upscales) - 1
+        self.decoder = nn.Sequential(
+            Clamp(), conv(self.latent_channels, n_f[0]), act_func,
+            MemBlock(n_f[0], n_f[0], act_func), MemBlock(n_f[0], n_f[0], act_func), MemBlock(n_f[0], n_f[0], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[0] else 1), TGrow(n_f[0], 1), conv(n_f[0], n_f[1], bias=False),
+            MemBlock(n_f[1], n_f[1], act_func), MemBlock(n_f[1], n_f[1], act_func), MemBlock(n_f[1], n_f[1], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[1] else 1), TGrow(n_f[1], 2 if decoder_time_upscale[0] else 1), conv(n_f[1], n_f[2], bias=False),
+            MemBlock(n_f[2], n_f[2], act_func), MemBlock(n_f[2], n_f[2], act_func), MemBlock(n_f[2], n_f[2], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[2] else 1), TGrow(n_f[2], 2 if decoder_time_upscale[1] else 1), conv(n_f[2], n_f[3], bias=False),
+            act_func, conv(n_f[3], self.image_channels*self.patch_size**2),
+        )
+    @property
+    def show_progress_bar(self):
+        return self._show_progress_bar
+
+    @show_progress_bar.setter
+    def show_progress_bar(self, value):
+        self._show_progress_bar = value
+
+    def encode(self, x, **kwargs):
+        x = x.movedim(2, 1)  # [B, C, T, H, W] -> [B, T, C, H, W]
+        if self.patch_size > 1: x = F.pixel_unshuffle(x, self.patch_size)  # after movedim so pixels fold into C (dim -3), mirroring pixel_shuffle in decode()
+        if x.shape[1] % 4 != 0:
+            # pad at end to multiple of 4
+            n_pad = 4 - x.shape[1] % 4
+            padding = x[:, -1:].repeat_interleave(n_pad, dim=1)  # repeat the last frame
+            x = torch.cat([x, padding], 1)
+        x = apply_model_with_memblocks(self.encoder, x, self.parallel, self.show_progress_bar).movedim(2, 1)
+        return self.process_out(x)
+
+    def decode(self, x, **kwargs):
+        x = self.process_in(x).movedim(2, 1)  # [B, C, T, H, W] -> [B, T, C, H, W]
+        x = apply_model_with_memblocks(self.decoder, x, self.parallel, self.show_progress_bar)
+        if self.patch_size > 1: x = F.pixel_shuffle(x, self.patch_size)
+        return x[:, self.frames_to_trim:].movedim(2, 1)  # back to [B, C, T, H, W]
diff --git a/comfy/text_encoders/flux.py b/comfy/text_encoders/flux.py
index d61ef6668..99f4812bb 100644
--- a/comfy/text_encoders/flux.py
+++ b/comfy/text_encoders/flux.py
@@ -1,10 +1,13 @@
 from comfy import sd1_clip
 import comfy.text_encoders.t5
 import comfy.text_encoders.sd3_clip
+import comfy.text_encoders.llama
 import comfy.model_management
-from transformers import T5TokenizerFast
+from transformers import T5TokenizerFast, LlamaTokenizerFast
 import torch
 import os
+import json
+import base64
 
 class T5XXLTokenizer(sd1_clip.SDTokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
@@ -68,3 +71,106 @@ def flux_clip(dtype_t5=None, t5xxl_scaled_fp8=None):
                 model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
             super().__init__(dtype_t5=dtype_t5, device=device, dtype=dtype, model_options=model_options)
     return FluxClipModel_
+
+def load_mistral_tokenizer(data):  # builds LlamaTokenizerFast keyword arguments from a Mistral vocab JSON (str/bytes, or a tensor holding its bytes)
+    if torch.is_tensor(data):
+        data = data.numpy().tobytes()
+
+    try:
+        from transformers.integrations.mistral import MistralConverter
+    except ModuleNotFoundError:  # fall back to the other module path that provides MistralConverter
+        from transformers.models.pixtral.convert_pixtral_weights_to_hf import MistralConverter
+
+    mistral_vocab = json.loads(data)
+
+    special_tokens = {}
+    vocab = {}
+
+    max_vocab = mistral_vocab["config"]["default_vocab_size"]
+    max_vocab -= len(mistral_vocab["special_tokens"])  # regular entries with rank >= this limit are skipped below
+
+    for w in mistral_vocab["vocab"]:
+        r = w["rank"]
+        if r >= max_vocab:
+            continue
+
+        vocab[base64.b64decode(w["token_bytes"])] = r  # keys are the decoded token bytes
+
+    for w in mistral_vocab["special_tokens"]:
+        if "token_bytes" in w:
+            special_tokens[base64.b64decode(w["token_bytes"])] = w["rank"]
+        else:
+            special_tokens[w["token_str"]] = w["rank"]
+
+    all_special = []
+    for v in special_tokens:
+        all_special.append(v)  # snapshot of the special-token keys before the regular vocab is merged in
+
+    special_tokens.update(vocab)  # merged mapping: special tokens first; a regular entry wins on a key clash
+    vocab = special_tokens
+    return {"tokenizer_object": MistralConverter(vocab=vocab, additional_special_tokens=all_special).converted(), "legacy": False}
+
+class MistralTokenizerClass:  # adapter exposing from_pretrained() so it can be passed as SDTokenizer's tokenizer_class
+    @staticmethod
+    def from_pretrained(path, **kwargs):  # `path` is ignored; the tokenizer is built from kwargs only
+        return LlamaTokenizerFast(**kwargs)
+
+class Mistral3Tokenizer(sd1_clip.SDTokenizer):  # SDTokenizer configured from the "tekken_model" entry of tokenizer_data
+    def __init__(self, embedding_directory=None, tokenizer_data={}):  # embedding_directory is accepted but not forwarded to the base class
+        self.tekken_data = tokenizer_data.get("tekken_model", None)  # kept so state_dict() can return it unchanged
+        super().__init__("", pad_with_end=False, embedding_size=5120, embedding_key='mistral3_24b', tokenizer_class=MistralTokenizerClass, has_end_token=False, pad_to_max_length=False, pad_token=11, max_length=99999999, min_length=1, pad_left=True, tokenizer_args=load_mistral_tokenizer(self.tekken_data), tokenizer_data=tokenizer_data)  # pad_left=True: padding goes before the tokens
+
+    def state_dict(self):
+        return {"tekken_model": self.tekken_data}  # same key the data was read from
+
+class Flux2Tokenizer(sd1_clip.SD1Tokenizer):  # wraps Mistral3Tokenizer and inserts the prompt into a chat template before tokenizing
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="mistral3_24b", tokenizer=Mistral3Tokenizer)
+        self.llama_template = '[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]{}[/INST]'  # default template; "{}" receives the prompt
+
+    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, **kwargs):  # llama_template, when given, replaces the default template for this call
+        if llama_template is None:
+            llama_text = self.llama_template.format(text)
+        else:
+            llama_text = llama_template.format(text)
+
+        tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs)  # disable_weights=True is always passed on
+        return tokens
+
+class Mistral3_24BModel(sd1_clip.SDClipModel):  # SDClipModel using comfy.text_encoders.llama.Mistral3Small24B as its model_class
+    def __init__(self, device="cpu", layer=[10, 20, 30], layer_idx=None, dtype=None, attention_mask=True, model_options={}):  # NOTE(review): the list default is shared across calls; this block only passes it on
+        textmodel_json_config = {}  # a dict of config overrides rather than a path to a JSON file
+        num_layers = model_options.get("num_layers", None)
+        if num_layers is not None:
+            textmodel_json_config["num_hidden_layers"] = num_layers
+            if num_layers < 40:  # 40 is Mistral3Small24BConfig's default num_hidden_layers
+                textmodel_json_config["final_norm"] = False
+        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"start": 1, "pad": 0}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Mistral3Small24B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
+
+class Flux2TEModel(sd1_clip.SD1ClipModel):  # SD1ClipModel wrapper around Mistral3_24BModel that merges three intermediate outputs
+    def __init__(self, device="cpu", dtype=None, model_options={}, name="mistral3_24b", clip_model=Mistral3_24BModel):
+        super().__init__(device=device, dtype=dtype, name=name, clip_model=clip_model, model_options=model_options)
+
+    def encode_token_weights(self, token_weight_pairs):
+        out, pooled, extra = super().encode_token_weights(token_weight_pairs)  # presumably out is (batch, layers, tokens, hidden) - TODO confirm
+
+        out = torch.stack((out[:, 0], out[:, 1], out[:, 2]), dim=1)  # keep exactly the first three entries of dim 1
+        out = out.movedim(1, 2)  # move that dim behind the following one
+        out = out.reshape(out.shape[0], out.shape[1], -1)  # flatten everything after dim 1, joining the three entries in the last dim
+        return out, pooled, extra
+
+def flux2_te(dtype_llama=None, llama_scaled_fp8=None, llama_quantization_metadata=None, pruned=False):
+    # Returns a Flux2TEModel subclass with the given dtype / fp8 / quantization / pruning settings baked in.
+    class Flux2TEModel_(Flux2TEModel):
+        def __init__(self, device="cpu", dtype=None, model_options={}):
+            model_options = model_options.copy()  # never write into the caller's dict or the shared {} default
+            if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options:
+                model_options["scaled_fp8"] = llama_scaled_fp8
+            if dtype_llama is not None:
+                dtype = dtype_llama
+            if llama_quantization_metadata is not None:
+                model_options["quantization_metadata"] = llama_quantization_metadata
+            if pruned:
+                model_options["num_layers"] = 30  # pruned variant: 30 hidden layers
+            super().__init__(device=device, dtype=dtype, model_options=model_options)
+    return Flux2TEModel_
diff --git a/comfy/text_encoders/hunyuan_video.py b/comfy/text_encoders/hunyuan_video.py
index b02148b33..0110517bb 100644
--- a/comfy/text_encoders/hunyuan_video.py
+++ b/comfy/text_encoders/hunyuan_video.py
@@ -1,6 +1,7 @@
 from comfy import sd1_clip
 import comfy.model_management
 import comfy.text_encoders.llama
+from .hunyuan_image import HunyuanImageTokenizer
 from transformers import LlamaTokenizerFast
 import torch
 import os
@@ -17,6 +18,9 @@ def llama_detect(state_dict, prefix=""):
     if scaled_fp8_key in state_dict:
         out["llama_scaled_fp8"] = state_dict[scaled_fp8_key].dtype
 
+    if "_quantization_metadata" in state_dict:
+        out["llama_quantization_metadata"] = state_dict["_quantization_metadata"]
+
     return out
 
 
@@ -73,6 +77,14 @@ class HunyuanVideoTokenizer:
         return {}
 
 
+class HunyuanVideo15Tokenizer(HunyuanImageTokenizer):  # HunyuanImageTokenizer with a video-description template
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
+        self.llama_template = "<|im_start|>system\nYou are a helpful assistant. Describe the video by detailing the following aspects:\n1. The main content and theme of the video.\n2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.\n3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.\n4. background environment, light, style and atmosphere.\n5. camera angles, movements, and transitions used in the video.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"  # overrides the template set by the parent __init__
+
+    def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
+        return super().tokenize_with_weights(text, return_word_ids, prevent_empty_text=True, **kwargs)  # always passes prevent_empty_text=True to the parent
+
 class HunyuanVideoClipModel(torch.nn.Module):
     def __init__(self, dtype_llama=None, device="cpu", dtype=None, model_options={}):
         super().__init__()
diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py
index a5cfcc7da..82eec6dc8 100644
--- a/comfy/text_encoders/llama.py
+++ b/comfy/text_encoders/llama.py
@@ -44,6 +44,29 @@ class Llama2Config:
     q_norm = None
     k_norm = None
     rope_scale = None
+    final_norm: bool = True
+
+@dataclass
+class Mistral3Small24BConfig:  # defaults used by Mistral3Small24B; annotated names are dataclass fields and can be overridden via Mistral3Small24BConfig(**config_dict)
+    vocab_size: int = 131072
+    hidden_size: int = 5120
+    intermediate_size: int = 32768
+    num_hidden_layers: int = 40
+    num_attention_heads: int = 32
+    num_key_value_heads: int = 8
+    max_position_embeddings: int = 8192
+    rms_norm_eps: float = 1e-5
+    rope_theta: float = 1000000000.0
+    transformer_type: str = "llama"
+    head_dim = 128  # no annotation from here on: plain class attributes, not dataclass fields, so the constructor does not accept them
+    rms_norm_add = False
+    mlp_activation = "silu"
+    qkv_bias = False
+    rope_dims = None
+    q_norm = None
+    k_norm = None
+    rope_scale = None
+    final_norm: bool = True  # when False, Llama2_ sets self.norm to None and skips the final norm
 
 @dataclass
 class Qwen25_3BConfig:
@@ -65,6 +88,51 @@ class Qwen25_3BConfig:
     q_norm = None
     k_norm = None
     rope_scale = None
+    final_norm: bool = True
+
+@dataclass
+class Qwen3_4BConfig:  # defaults used by Qwen3_4B; annotated names are dataclass fields and can be overridden via Qwen3_4BConfig(**config_dict)
+    vocab_size: int = 151936
+    hidden_size: int = 2560
+    intermediate_size: int = 9728
+    num_hidden_layers: int = 36
+    num_attention_heads: int = 32
+    num_key_value_heads: int = 8
+    max_position_embeddings: int = 40960
+    rms_norm_eps: float = 1e-6
+    rope_theta: float = 1000000.0
+    transformer_type: str = "llama"
+    head_dim = 128  # no annotation from here on: plain class attributes, not dataclass fields, so the constructor does not accept them
+    rms_norm_add = False
+    mlp_activation = "silu"
+    qkv_bias = False
+    rope_dims = None
+    q_norm = "gemma3"
+    k_norm = "gemma3"
+    rope_scale = None
+    final_norm: bool = True  # when False, Llama2_ sets self.norm to None and skips the final norm
+
+@dataclass
+class Ovis25_2BConfig:  # annotated names are dataclass fields and can be overridden through the constructor
+    vocab_size: int = 151936
+    hidden_size: int = 2048
+    intermediate_size: int = 6144
+    num_hidden_layers: int = 28
+    num_attention_heads: int = 16
+    num_key_value_heads: int = 8
+    max_position_embeddings: int = 40960
+    rms_norm_eps: float = 1e-6
+    rope_theta: float = 1000000.0
+    transformer_type: str = "llama"
+    head_dim = 128  # no annotation from here on: plain class attributes, not dataclass fields, so the constructor does not accept them
+    rms_norm_add = False
+    mlp_activation = "silu"
+    qkv_bias = False
+    rope_dims = None
+    q_norm = "gemma3"
+    k_norm = "gemma3"
+    rope_scale = None
+    final_norm: bool = True  # when False, Llama2_ sets self.norm to None and skips the final norm
 
 @dataclass
 class Qwen25_7BVLI_Config:
@@ -86,6 +154,7 @@ class Qwen25_7BVLI_Config:
     q_norm = None
     k_norm = None
     rope_scale = None
+    final_norm: bool = True
 
 @dataclass
 class Gemma2_2B_Config:
@@ -108,6 +177,7 @@ class Gemma2_2B_Config:
     k_norm = None
     sliding_attention = None
     rope_scale = None
+    final_norm: bool = True
 
 @dataclass
 class Gemma3_4B_Config:
@@ -130,6 +200,7 @@ class Gemma3_4B_Config:
     k_norm = "gemma3"
     sliding_attention = [False, False, False, False, False, 1024]
     rope_scale = [1.0, 8.0]
+    final_norm: bool = True
 
 class RMSNorm(nn.Module):
     def __init__(self, dim: int, eps: float = 1e-5, add=False, device=None, dtype=None):
@@ -441,7 +512,12 @@ class Llama2_(nn.Module):
             transformer(config, index=i, device=device, dtype=dtype, ops=ops)
             for i in range(config.num_hidden_layers)
         ])
-        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
+
+        if config.final_norm:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
+        else:
+            self.norm = None
+
         # self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype)
 
     def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, position_ids=None, embeds_info=[]):
@@ -477,8 +553,12 @@ class Llama2_(nn.Module):
 
         intermediate = None
         all_intermediate = None
+        only_layers = None
         if intermediate_output is not None:
-            if intermediate_output == "all":
+            if isinstance(intermediate_output, list):
+                all_intermediate = []
+                only_layers = set(intermediate_output)
+            elif intermediate_output == "all":
                 all_intermediate = []
                 intermediate_output = None
             elif intermediate_output < 0:
@@ -486,7 +566,8 @@ class Llama2_(nn.Module):
 
         for i, layer in enumerate(self.layers):
             if all_intermediate is not None:
-                all_intermediate.append(x.unsqueeze(1).clone())
+                if only_layers is None or (i in only_layers):
+                    all_intermediate.append(x.unsqueeze(1).clone())
             x = layer(
                 x=x,
                 attention_mask=mask,
@@ -496,14 +577,17 @@ class Llama2_(nn.Module):
             if i == intermediate_output:
                 intermediate = x.clone()
 
-        x = self.norm(x)
+        if self.norm is not None:
+            x = self.norm(x)
+
         if all_intermediate is not None:
-            all_intermediate.append(x.unsqueeze(1).clone())
+            if only_layers is None or ((i + 1) in only_layers):
+                all_intermediate.append(x.unsqueeze(1).clone())
 
         if all_intermediate is not None:
             intermediate = torch.cat(all_intermediate, dim=1)
 
-        if intermediate is not None and final_layer_norm_intermediate:
+        if intermediate is not None and final_layer_norm_intermediate and self.norm is not None:
             intermediate = self.norm(intermediate)
 
         return x, intermediate
@@ -528,6 +612,15 @@ class Llama2(BaseLlama, torch.nn.Module):
         self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
         self.dtype = dtype
 
+class Mistral3Small24B(BaseLlama, torch.nn.Module):
+    def __init__(self, config_dict, dtype, device, operations):
+        super().__init__()
+        config = Mistral3Small24BConfig(**config_dict)
+        self.num_layers = config.num_hidden_layers
+
+        self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
+        self.dtype = dtype
+
 class Qwen25_3B(BaseLlama, torch.nn.Module):
     def __init__(self, config_dict, dtype, device, operations):
         super().__init__()
@@ -537,6 +630,24 @@ class Qwen25_3B(BaseLlama, torch.nn.Module):
         self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
         self.dtype = dtype
 
+class Qwen3_4B(BaseLlama, torch.nn.Module):
+    def __init__(self, config_dict, dtype, device, operations):
+        super().__init__()
+        config = Qwen3_4BConfig(**config_dict)
+        self.num_layers = config.num_hidden_layers
+
+        self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
+        self.dtype = dtype
+
+class Ovis25_2B(BaseLlama, torch.nn.Module):
+    def __init__(self, config_dict, dtype, device, operations):
+        super().__init__()
+        config = Ovis25_2BConfig(**config_dict)
+        self.num_layers = config.num_hidden_layers
+
+        self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
+        self.dtype = dtype
+
 class Qwen25_7BVLI(BaseLlama, torch.nn.Module):
     def __init__(self, config_dict, dtype, device, operations):
         super().__init__()
diff --git a/comfy/text_encoders/ovis.py b/comfy/text_encoders/ovis.py
new file mode 100644
index 000000000..81c9bd51c
--- /dev/null
+++ b/comfy/text_encoders/ovis.py
@@ -0,0 +1,69 @@
+from transformers import Qwen2Tokenizer
+import comfy.text_encoders.llama
+from comfy import sd1_clip
+import os
+import torch
+import numbers
+
+class Qwen3Tokenizer(sd1_clip.SDTokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "qwen25_tokenizer")
+        super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2048, embedding_key='qwen3_2b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=284, pad_token=151643, tokenizer_data=tokenizer_data)
+
+
+class OvisTokenizer(sd1_clip.SD1Tokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen3_2b", tokenizer=Qwen3Tokenizer)
+        self.llama_template = "<|im_start|>user\nDescribe the image by detailing the color, quantity, text, shape, size, texture, spatial relationships of the objects and background: {}<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
+
+    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, **kwargs):
+        if llama_template is None:
+            llama_text = self.llama_template.format(text)
+        else:
+            llama_text = llama_template.format(text)
+
+        tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs)
+        return tokens
+
+class Ovis25_2BModel(sd1_clip.SDClipModel):
+    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}):
+        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Ovis25_2B, enable_attention_masks=attention_mask, return_attention_masks=False, zero_out_masked=True, model_options=model_options)
+
+
+class OvisTEModel(sd1_clip.SD1ClipModel):  # SD1ClipModel wrapper that registers Ovis25_2BModel under the name "qwen3_2b"
+    def __init__(self, device="cpu", dtype=None, model_options={}):
+        super().__init__(device=device, dtype=dtype, name="qwen3_2b", clip_model=Ovis25_2BModel, model_options=model_options)
+
+    def encode_token_weights(self, token_weight_pairs, template_end=-1):  # encode, then drop the positions before template_end
+        out, pooled = super().encode_token_weights(token_weight_pairs)
+        tok_pairs = token_weight_pairs["qwen3_2b"][0]  # only the first entry is scanned
+        count_im_start = 0  # despite the name, this counts matches of token id 4004 below
+        if template_end == -1:  # no explicit cut point was passed: search for one
+            for i, v in enumerate(tok_pairs):
+                elem = v[0]
+                if not torch.is_tensor(elem):  # tensor entries are skipped; only integer ids are compared
+                    if isinstance(elem, numbers.Integral):
+                        if elem == 4004 and count_im_start < 1:  # first occurrence only; presumably a token of the prompt template -- TODO confirm which
+                            template_end = i
+                            count_im_start += 1
+            # NOTE(review): if 4004 never occurs, template_end is still -1 here, so the slice below keeps only the last position (or everything, if tok_pairs[0] is 25).
+            if out.shape[1] > (template_end + 1):
+                if tok_pairs[template_end + 1][0] == 25:  # advance past a directly following token 25 (presumably ":" -- TODO confirm)
+                    template_end += 1
+
+        out = out[:, template_end:]  # cut along dim 1
+        return out, pooled, {}  # the third element is always an empty dict
+
+
+def te(dtype_llama=None, llama_scaled_fp8=None, llama_quantization_metadata=None):  # returns an OvisTEModel subclass
+    class OvisTEModel_(OvisTEModel):  # bakes the te() arguments into the constructor
+        def __init__(self, device="cpu", dtype=None, model_options={}):
+            model_options = model_options.copy()  # never write into the shared default dict or the caller's dict
+            if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options:
+                model_options["scaled_fp8"] = llama_scaled_fp8  # an existing caller-supplied value wins
+            if dtype_llama is not None:
+                dtype = dtype_llama  # replaces whatever dtype the caller passed
+            if llama_quantization_metadata is not None:
+                model_options["quantization_metadata"] = llama_quantization_metadata
+            super().__init__(device=device, dtype=dtype, model_options=model_options)
+    return OvisTEModel_
diff --git a/comfy/text_encoders/qwen25_tokenizer/tokenizer_config.json b/comfy/text_encoders/qwen25_tokenizer/tokenizer_config.json
index 67688e82c..df5b5d7fe 100644
--- a/comfy/text_encoders/qwen25_tokenizer/tokenizer_config.json
+++ b/comfy/text_encoders/qwen25_tokenizer/tokenizer_config.json
@@ -179,36 +179,36 @@
       "special": false
     },
     "151665": {
-      "content": "<|img|>",
+      "content": "<tool_response>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
-      "special": true
+      "special": false
     },
     "151666": {
-      "content": "<|endofimg|>",
+      "content": "</tool_response>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
-      "special": true
+      "special": false
     },
     "151667": {
-      "content": "<|meta|>",
+      "content": "<think>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
-      "special": true
+      "special": false
     },
     "151668": {
-      "content": "<|endofmeta|>",
+      "content": "</think>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
-      "special": true
+      "special": false
     }
   },
   "additional_special_tokens": [
diff --git a/comfy/text_encoders/qwen_image.py b/comfy/text_encoders/qwen_image.py
index 40fa67937..c0d32a6ef 100644
--- a/comfy/text_encoders/qwen_image.py
+++ b/comfy/text_encoders/qwen_image.py
@@ -17,12 +17,14 @@ class QwenImageTokenizer(sd1_clip.SD1Tokenizer):
         self.llama_template = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
         self.llama_template_images = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"
 
-    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], **kwargs):
+    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, **kwargs):
         skip_template = False
         if text.startswith('<|im_start|>'):
             skip_template = True
         if text.startswith('<|start_header_id|>'):
             skip_template = True
+        if prevent_empty_text and text == '':
+            text = ' '
 
         if skip_template:
             llama_text = text
diff --git a/comfy/text_encoders/z_image.py b/comfy/text_encoders/z_image.py
new file mode 100644
index 000000000..bb9273b20
--- /dev/null
+++ b/comfy/text_encoders/z_image.py
@@ -0,0 +1,48 @@
+from transformers import Qwen2Tokenizer
+import comfy.text_encoders.llama
+from comfy import sd1_clip
+import os
+
+class Qwen3Tokenizer(sd1_clip.SDTokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "qwen25_tokenizer")
+        super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2560, embedding_key='qwen3_4b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=151643, tokenizer_data=tokenizer_data)
+
+
+class ZImageTokenizer(sd1_clip.SD1Tokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen3_4b", tokenizer=Qwen3Tokenizer)
+        self.llama_template = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
+
+    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, **kwargs):
+        if llama_template is None:
+            llama_text = self.llama_template.format(text)
+        else:
+            llama_text = llama_template.format(text)
+
+        tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs)
+        return tokens
+
+
+class Qwen3_4BModel(sd1_clip.SDClipModel):
+    def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options={}):
+        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Qwen3_4B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
+
+
+class ZImageTEModel(sd1_clip.SD1ClipModel):
+    def __init__(self, device="cpu", dtype=None, model_options={}):
+        super().__init__(device=device, dtype=dtype, name="qwen3_4b", clip_model=Qwen3_4BModel, model_options=model_options)
+
+
+def te(dtype_llama=None, llama_scaled_fp8=None, llama_quantization_metadata=None):  # returns a ZImageTEModel subclass
+    class ZImageTEModel_(ZImageTEModel):  # bakes the te() arguments into the constructor
+        def __init__(self, device="cpu", dtype=None, model_options={}):
+            model_options = model_options.copy()  # never write into the shared default dict or the caller's dict
+            if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options:
+                model_options["scaled_fp8"] = llama_scaled_fp8  # an existing caller-supplied value wins
+            if dtype_llama is not None:
+                dtype = dtype_llama  # replaces whatever dtype the caller passed
+            if llama_quantization_metadata is not None:
+                model_options["quantization_metadata"] = llama_quantization_metadata
+            super().__init__(device=device, dtype=dtype, model_options=model_options)
+    return ZImageTEModel_
diff --git a/comfy/utils.py b/comfy/utils.py
index 4bd281057..37485e497 100644
--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -675,6 +675,72 @@ def flux_to_diffusers(mmdit_config, output_prefix=""):
 
     return key_map
 
+def z_image_to_diffusers(mmdit_config, output_prefix=""):  # builds and returns a key-rename dict; values carry output_prefix
+    n_layers = mmdit_config.get("n_layers", 0)
+    hidden_size = mmdit_config.get("dim", 0)
+    n_context_refiner = mmdit_config.get("n_refiner_layers", 2)  # both refiner counts read the same config entry
+    n_noise_refiner = mmdit_config.get("n_refiner_layers", 2)
+    key_map = {}
+    # Adds the entries of one block: names under prefix_from become the dict keys, names under prefix_to the values.
+    def add_block_keys(prefix_from, prefix_to, has_adaln=True):
+        for end in ("weight", "bias"):
+            k = "{}.attention.".format(prefix_from)
+            qkv = "{}.attention.qkv.{}".format(prefix_to, end)
+            key_map["{}to_q.{}".format(k, end)] = (qkv, (0, 0, hidden_size))  # tuple is presumably (dim, start, length) inside the fused qkv tensor -- TODO confirm with the consumer
+            key_map["{}to_k.{}".format(k, end)] = (qkv, (0, hidden_size, hidden_size))
+            key_map["{}to_v.{}".format(k, end)] = (qkv, (0, hidden_size * 2, hidden_size))
+        # The remaining per-block parameters are one-to-one renames (most names are unchanged).
+        block_map = {
+            "attention.norm_q.weight": "attention.q_norm.weight",
+            "attention.norm_k.weight": "attention.k_norm.weight",
+            "attention.to_out.0.weight": "attention.out.weight",
+            "attention.to_out.0.bias": "attention.out.bias",
+            "attention_norm1.weight": "attention_norm1.weight",
+            "attention_norm2.weight": "attention_norm2.weight",
+            "feed_forward.w1.weight": "feed_forward.w1.weight",
+            "feed_forward.w2.weight": "feed_forward.w2.weight",
+            "feed_forward.w3.weight": "feed_forward.w3.weight",
+            "ffn_norm1.weight": "ffn_norm1.weight",
+            "ffn_norm2.weight": "ffn_norm2.weight",
+        }
+        if has_adaln:  # every call in this function leaves has_adaln at its default True
+            block_map["adaLN_modulation.0.weight"] = "adaLN_modulation.0.weight"
+            block_map["adaLN_modulation.0.bias"] = "adaLN_modulation.0.bias"
+        for k, v in block_map.items():
+            key_map["{}.{}".format(prefix_from, k)] = "{}.{}".format(prefix_to, v)
+
+    for i in range(n_layers):
+        add_block_keys("layers.{}".format(i), "{}layers.{}".format(output_prefix, i))
+
+    for i in range(n_context_refiner):
+        add_block_keys("context_refiner.{}".format(i), "{}context_refiner.{}".format(output_prefix, i))
+
+    for i in range(n_noise_refiner):
+        add_block_keys("noise_refiner.{}".format(i), "{}noise_refiner.{}".format(output_prefix, i))
+    # Parameters outside the blocks, as (value name that gets output_prefix, dict key) pairs.
+    MAP_BASIC = [
+        ("final_layer.linear.weight", "all_final_layer.2-1.linear.weight"),
+        ("final_layer.linear.bias", "all_final_layer.2-1.linear.bias"),
+        ("final_layer.adaLN_modulation.1.weight", "all_final_layer.2-1.adaLN_modulation.1.weight"),
+        ("final_layer.adaLN_modulation.1.bias", "all_final_layer.2-1.adaLN_modulation.1.bias"),
+        ("x_embedder.weight", "all_x_embedder.2-1.weight"),
+        ("x_embedder.bias", "all_x_embedder.2-1.bias"),
+        ("x_pad_token", "x_pad_token"),
+        ("cap_embedder.0.weight", "cap_embedder.0.weight"),
+        ("cap_embedder.1.weight", "cap_embedder.1.weight"),
+        ("cap_embedder.1.bias", "cap_embedder.1.bias"),
+        ("cap_pad_token", "cap_pad_token"),
+        ("t_embedder.mlp.0.weight", "t_embedder.mlp.0.weight"),
+        ("t_embedder.mlp.0.bias", "t_embedder.mlp.0.bias"),
+        ("t_embedder.mlp.2.weight", "t_embedder.mlp.2.weight"),
+        ("t_embedder.mlp.2.bias", "t_embedder.mlp.2.bias"),
+    ]
+
+    for c, diffusers in MAP_BASIC:  # note the pair order: the second element becomes the key
+        key_map[diffusers] = "{}{}".format(output_prefix, c)
+
+    return key_map
+
 def repeat_to_batch_size(tensor, batch_size, dim=0):
     if tensor.shape[dim] > batch_size:
         return tensor.narrow(dim, 0, batch_size)
diff --git a/comfy/weight_adapter/lora.py b/comfy/weight_adapter/lora.py
index 4db004e50..3cc60bb1b 100644
--- a/comfy/weight_adapter/lora.py
+++ b/comfy/weight_adapter/lora.py
@@ -194,6 +194,7 @@ class LoRAAdapter(WeightAdapterBase):
             lora_diff = torch.mm(
                 mat1.flatten(start_dim=1), mat2.flatten(start_dim=1)
             ).reshape(weight.shape)
+            del mat1, mat2
             if dora_scale is not None:
                 weight = weight_decompose(
                     dora_scale,
diff --git a/comfy_api/feature_flags.py b/comfy_api/feature_flags.py
index 0d4389a6e..bfb77eb5f 100644
--- a/comfy_api/feature_flags.py
+++ b/comfy_api/feature_flags.py
@@ -13,6 +13,7 @@ from comfy.cli_args import args
 SERVER_FEATURE_FLAGS: Dict[str, Any] = {
     "supports_preview_metadata": True,
     "max_upload_size": args.max_upload_size * 1024 * 1024, # Convert MB to bytes
+    "extension": {"manager": {"supports_v4": True}},
 }
 
 
diff --git a/comfy_api/internal/async_to_sync.py b/comfy_api/internal/async_to_sync.py
index f5f805a62..257ade82e 100644
--- a/comfy_api/internal/async_to_sync.py
+++ b/comfy_api/internal/async_to_sync.py
@@ -8,7 +8,7 @@ import os
 import textwrap
 import threading
 from enum import Enum
-from typing import Optional, Type, get_origin, get_args
+from typing import Optional, Type, get_origin, get_args, get_type_hints
 
 
 class TypeTracker:
@@ -220,11 +220,18 @@ class AsyncToSyncConverter:
             self._async_instance = async_class(*args, **kwargs)
 
             # Handle annotated class attributes (like execution: Execution)
-            # Get all annotations from the class hierarchy
-            all_annotations = {}
-            for base_class in reversed(inspect.getmro(async_class)):
-                if hasattr(base_class, "__annotations__"):
-                    all_annotations.update(base_class.__annotations__)
+            # Get all annotations from the class hierarchy and resolve string annotations
+            try:
+                # get_type_hints resolves string annotations to actual type objects
+                # This handles classes using 'from __future__ import annotations'
+                all_annotations = get_type_hints(async_class)
+            except Exception:
+                # Fallback to raw annotations if get_type_hints fails
+                # (e.g., for undefined forward references)
+                all_annotations = {}
+                for base_class in reversed(inspect.getmro(async_class)):
+                    if hasattr(base_class, "__annotations__"):
+                        all_annotations.update(base_class.__annotations__)
 
             # For each annotated attribute, check if it needs to be created or wrapped
             for attr_name, attr_type in all_annotations.items():
@@ -625,15 +632,19 @@ class AsyncToSyncConverter:
         """Extract class attributes that are classes themselves."""
         class_attributes = []
 
+        # Get resolved type hints to handle string annotations
+        try:
+            type_hints = get_type_hints(async_class)
+        except Exception:
+            type_hints = {}
+
         # Look for class attributes that are classes
         for name, attr in sorted(inspect.getmembers(async_class)):
             if isinstance(attr, type) and not name.startswith("_"):
                 class_attributes.append((name, attr))
-            elif (
-                hasattr(async_class, "__annotations__")
-                and name in async_class.__annotations__
-            ):
-                annotation = async_class.__annotations__[name]
+            elif name in type_hints:
+                # Use resolved type hint instead of raw annotation
+                annotation = type_hints[name]
                 if isinstance(annotation, type):
                     class_attributes.append((name, annotation))
 
@@ -908,11 +919,15 @@ class AsyncToSyncConverter:
             attribute_mappings = {}
 
             # First check annotations for typed attributes (including from parent classes)
-            # Collect all annotations from the class hierarchy
-            all_annotations = {}
-            for base_class in reversed(inspect.getmro(async_class)):
-                if hasattr(base_class, "__annotations__"):
-                    all_annotations.update(base_class.__annotations__)
+            # Resolve string annotations to actual types
+            try:
+                all_annotations = get_type_hints(async_class)
+            except Exception:
+                # Fallback to raw annotations
+                all_annotations = {}
+                for base_class in reversed(inspect.getmro(async_class)):
+                    if hasattr(base_class, "__annotations__"):
+                        all_annotations.update(base_class.__annotations__)
 
             for attr_name, attr_type in sorted(all_annotations.items()):
                 for class_name, class_type in class_attributes:
diff --git a/comfy_api/latest/__init__.py b/comfy_api/latest/__init__.py
index b7a3fa9c1..0fa01d1e7 100644
--- a/comfy_api/latest/__init__.py
+++ b/comfy_api/latest/__init__.py
@@ -7,9 +7,9 @@ from comfy_api.internal.singleton import ProxiedSingleton
 from comfy_api.internal.async_to_sync import create_sync_class
 from comfy_api.latest._input import ImageInput, AudioInput, MaskInput, LatentInput, VideoInput
 from comfy_api.latest._input_impl import VideoFromFile, VideoFromComponents
-from comfy_api.latest._util import VideoCodec, VideoContainer, VideoComponents
-from . import _io as io
-from . import _ui as ui
+from comfy_api.latest._util import VideoCodec, VideoContainer, VideoComponents, MESH, VOXEL
+from . import _io_public as io
+from . import _ui_public as ui
 # from comfy_api.latest._resources import _RESOURCES as resources  #noqa: F401
 from comfy_execution.utils import get_executing_context
 from comfy_execution.progress import get_progress_state, PreviewImageTuple
@@ -104,6 +104,8 @@ class Types:
     VideoCodec = VideoCodec
     VideoContainer = VideoContainer
     VideoComponents = VideoComponents
+    MESH = MESH
+    VOXEL = VOXEL
 
 ComfyAPI = ComfyAPI_latest
 
diff --git a/comfy_api/latest/_input/video_types.py b/comfy_api/latest/_input/video_types.py
index a335df4d0..87c81d73a 100644
--- a/comfy_api/latest/_input/video_types.py
+++ b/comfy_api/latest/_input/video_types.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 from abc import ABC, abstractmethod
+from fractions import Fraction
 from typing import Optional, Union, IO
 import io
 import av
@@ -72,6 +73,33 @@ class VideoInput(ABC):
         frame_count = components.images.shape[0]
         return float(frame_count / components.frame_rate)
 
+    def get_frame_count(self) -> int:
+        """
+        Returns the number of frames in the video.
+
+        Default implementation uses :meth:`get_components`, which may require
+        loading all frames into memory. File-based implementations should
+        override this method and use container/stream metadata instead.
+
+        Returns:
+            Total number of frames as an integer.
+        """
+        return int(self.get_components().images.shape[0])
+
+    def get_frame_rate(self) -> Fraction:
+        """
+        Returns the frame rate of the video.
+
+        Default implementation materializes the video into memory via
+        `get_components()`. Subclasses that can inspect the underlying
+        container (e.g. `VideoFromFile`) should override this with a more
+        efficient implementation.
+
+        Returns:
+            Frame rate as a Fraction.
+        """
+        return self.get_components().frame_rate
+
     def get_container_format(self) -> str:
         """
         Returns the container format of the video (e.g., 'mp4', 'mov', 'avi').
diff --git a/comfy_api/latest/_input_impl/video_types.py b/comfy_api/latest/_input_impl/video_types.py
index f646504c8..a4cd3737d 100644
--- a/comfy_api/latest/_input_impl/video_types.py
+++ b/comfy_api/latest/_input_impl/video_types.py
@@ -121,6 +121,71 @@ class VideoFromFile(VideoInput):
 
         raise ValueError(f"Could not determine duration for file '{self.__file}'")
 
+    def get_frame_count(self) -> int:
+        """
+        Returns the number of frames in the video without materializing them as
+        torch tensors.
+        """
+        if isinstance(self.__file, io.BytesIO):
+            self.__file.seek(0)
+
+        with av.open(self.__file, mode="r") as container:
+            video_stream = self._get_first_video_stream(container)
+            # 1. Prefer the frames field if available
+            if video_stream.frames and video_stream.frames > 0:
+                return int(video_stream.frames)
+
+            # 2. Try to estimate from duration and average_rate using only metadata
+            if container.duration is not None and video_stream.average_rate:
+                duration_seconds = float(container.duration / av.time_base)
+                estimated_frames = int(round(duration_seconds * float(video_stream.average_rate)))
+                if estimated_frames > 0:
+                    return estimated_frames
+
+            if (
+                getattr(video_stream, "duration", None) is not None
+                and getattr(video_stream, "time_base", None) is not None
+                and video_stream.average_rate
+            ):
+                duration_seconds = float(video_stream.duration * video_stream.time_base)
+                estimated_frames = int(round(duration_seconds * float(video_stream.average_rate)))
+                if estimated_frames > 0:
+                    return estimated_frames
+
+            # 3. Last resort: decode frames and count them (streaming)
+            frame_count = 0
+            container.seek(0)
+            for packet in container.demux(video_stream):
+                for _ in packet.decode():
+                    frame_count += 1
+
+            if frame_count == 0:
+                raise ValueError(f"Could not determine frame count for file '{self.__file}'")
+            return frame_count
+
+    def get_frame_rate(self) -> Fraction:
+        """
+        Returns the average frame rate of the video using container metadata
+        without decoding all frames.
+        """
+        if isinstance(self.__file, io.BytesIO):
+            self.__file.seek(0)
+
+        with av.open(self.__file, mode="r") as container:
+            video_stream = self._get_first_video_stream(container)
+            # Preferred: use PyAV's average_rate (usually already a Fraction-like)
+            if video_stream.average_rate:
+                return Fraction(video_stream.average_rate)
+
+            # Fallback: estimate from frames + duration if available
+            if video_stream.frames and container.duration:
+                duration_seconds = float(container.duration / av.time_base)
+                if duration_seconds > 0:
+                    return Fraction(video_stream.frames / duration_seconds).limit_denominator()
+
+            # Last resort: match get_components_internal default
+            return Fraction(1)
+
     def get_container_format(self) -> str:
         """
         Returns the container format of the video (e.g., 'mp4', 'mov', 'avi').
@@ -238,6 +303,13 @@ class VideoFromFile(VideoInput):
                         packet.stream = stream_map[packet.stream]
                         output_container.mux(packet)
 
+    def _get_first_video_stream(self, container: InputContainer):
+        video_stream = next((s for s in container.streams if s.type == "video"), None)
+        if video_stream is None:
+            raise ValueError(f"No video stream found in file '{self.__file}'")
+        return video_stream
+
+
 class VideoFromComponents(VideoInput):
     """
     Class representing video input from tensors.
@@ -264,7 +336,10 @@ class VideoFromComponents(VideoInput):
             raise ValueError("Only MP4 format is supported for now")
         if codec != VideoCodec.AUTO and codec != VideoCodec.H264:
             raise ValueError("Only H264 codec is supported for now")
-        with av.open(path, mode='w', options={'movflags': 'use_metadata_tags'}) as output:
+        extra_kwargs = {}
+        if isinstance(format, VideoContainer) and format != VideoContainer.AUTO:
+            extra_kwargs["format"] = format.value
+        with av.open(path, mode='w', options={'movflags': 'use_metadata_tags'}, **extra_kwargs) as output:
             # Add metadata before writing any streams
             if metadata is not None:
                 for key, value in metadata.items():
diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py
index 0b701260f..257f07c42 100644
--- a/comfy_api/latest/_io.py
+++ b/comfy_api/latest/_io.py
@@ -4,6 +4,7 @@ import copy
 import inspect
 from abc import ABC, abstractmethod
 from collections import Counter
+from collections.abc import Iterable
 from dataclasses import asdict, dataclass
 from enum import Enum
 from typing import Any, Callable, Literal, TypedDict, TypeVar, TYPE_CHECKING
@@ -27,6 +28,7 @@ from comfy_api.internal import (_ComfyNodeInternal, _NodeOutputInternal, classpr
     prune_dict, shallow_clone_class)
 from comfy_api.latest._resources import Resources, ResourcesLocal
 from comfy_execution.graph_utils import ExecutionBlocker
+from ._util import MESH, VOXEL
 
 # from comfy_extras.nodes_images import SVG as SVG_ # NOTE: needs to be moved before can be imported due to circular reference
 
@@ -149,6 +151,9 @@ class _IO_V3:
     def __init__(self):
         pass
 
+    def validate(self):
+        pass
+
     @property
     def io_type(self):
         return self.Parent.io_type
@@ -181,6 +186,9 @@ class Input(_IO_V3):
     def get_io_type(self):
         return _StringIOType(self.io_type)
 
+    def get_all(self) -> list[Input]:
+        return [self]
+
 class WidgetInput(Input):
     '''
     Base class for a V3 Input with widget.
@@ -628,6 +636,10 @@ class UpscaleModel(ComfyTypeIO):
     if TYPE_CHECKING:
         Type = ImageModelDescriptor
 
+@comfytype(io_type="LATENT_UPSCALE_MODEL")
+class LatentUpscaleModel(ComfyTypeIO):
+    Type = Any
+
 @comfytype(io_type="AUDIO")
 class Audio(ComfyTypeIO):
     class AudioDict(TypedDict):
@@ -656,11 +668,11 @@ class LossMap(ComfyTypeIO):
 
 @comfytype(io_type="VOXEL")
 class Voxel(ComfyTypeIO):
-    Type = Any # TODO: VOXEL class is defined in comfy_extras/nodes_hunyuan3d.py; should be moved to somewhere else before referenced directly in v3
+    Type = VOXEL
 
 @comfytype(io_type="MESH")
 class Mesh(ComfyTypeIO):
-    Type = Any # TODO: MESH class is defined in comfy_extras/nodes_hunyuan3d.py; should be moved to somewhere else before referenced directly in v3
+    Type = MESH
 
 @comfytype(io_type="HOOKS")
 class Hooks(ComfyTypeIO):
@@ -809,13 +821,61 @@ class MultiType:
             else:
                 return super().as_dict()
 
+@comfytype(io_type="COMFY_MATCHTYPE_V3")
+class MatchType(ComfyTypeIO):
+    '''IO type whose Input/Output each carry a Template (template_id + allowed ComfyTypes).'''
+    class Template:
+        '''Pairs a template_id with its allowed ComfyTypes; raises ValueError on a non-ComfyType entry.'''
+        def __init__(self, template_id: str, allowed_types: _ComfyType | list[_ComfyType] = AnyType):
+            self.template_id = template_id
+            # account for syntactic sugar; iterables are copied so a one-shot iterator survives validation
+            if not isinstance(allowed_types, Iterable):
+                allowed_types = [allowed_types]
+            allowed_types = list(allowed_types)
+            for t in allowed_types:
+                is_class = isinstance(t, type)
+                if not (issubclass(t, _ComfyType) if is_class else isinstance(t, _ComfyType)):
+                    bad_name = t.__name__ if is_class else t.__class__.__name__
+                    raise ValueError(f"Allowed types must be a ComfyType or a list of ComfyTypes, got {bad_name}")
+            self.allowed_types = allowed_types
+
+        def as_dict(self):
+            '''Return template_id and the allowed io_types joined with commas.'''
+            allowed = ",".join([t.io_type for t in self.allowed_types])
+            return {"template_id": self.template_id, "allowed_types": allowed}
+
+    class Input(Input):
+        '''Input bound to a MatchType.Template.'''
+        def __init__(self, id: str, template: MatchType.Template,
+                    display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, extra_dict=None):
+            super().__init__(id, display_name, optional, tooltip, lazy, extra_dict)
+            self.template = template
+
+        def as_dict(self):
+            '''Base Input dict merged with prune_dict of the "template" entry.'''
+            return super().as_dict() | prune_dict({"template": self.template.as_dict()})
+
+    class Output(Output):
+        '''Output bound to a MatchType.Template.'''
+        def __init__(self, template: MatchType.Template, id: str=None, display_name: str=None, tooltip: str=None,
+                     is_output_list=False):
+            super().__init__(id, display_name, tooltip, is_output_list)
+            self.template = template
+
+        def as_dict(self):
+            '''Base Output dict merged with prune_dict of the "template" entry.'''
+            return super().as_dict() | prune_dict({"template": self.template.as_dict()})
+
 class DynamicInput(Input, ABC):
     '''
     Abstract class for dynamic input registration.
     '''
-    @abstractmethod
     def get_dynamic(self) -> list[Input]:
-        ...
+        return []
+
+    def expand_schema_for_dynamic(self, d: dict[str, Any], live_inputs: dict[str, Any], curr_prefix=''):
+        pass
+
 
 class DynamicOutput(Output, ABC):
     '''
@@ -825,99 +885,223 @@ class DynamicOutput(Output, ABC):
                  is_output_list=False):
         super().__init__(id, display_name, tooltip, is_output_list)
 
-    @abstractmethod
     def get_dynamic(self) -> list[Output]:
-        ...
+        return []
 
 
 @comfytype(io_type="COMFY_AUTOGROW_V3")
-class AutogrowDynamic(ComfyTypeI):
-    Type = list[Any]
-    class Input(DynamicInput):
-        def __init__(self, id: str, template_input: Input, min: int=1, max: int=None,
-                     display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, extra_dict=None):
-            super().__init__(id, display_name, optional, tooltip, lazy, extra_dict)
-            self.template_input = template_input
-            if min is not None:
-                assert(min >= 1)
-            if max is not None:
-                assert(max >= 1)
+class Autogrow(ComfyTypeI):
+    Type = dict[str, Any]
+    _MaxNames = 100  # NOTE: max 100 names for sanity
+
+    class _AutogrowTemplate:
+        '''Keeps a copy of the template input and per-name copies of it in cached_inputs.'''
+        def __init__(self, input: Input):
+            # dynamic inputs are not allowed as the template input
+            assert(not isinstance(input, DynamicInput))
+            template_copy = copy.copy(input)
+            if isinstance(template_copy, WidgetInput):
+                template_copy.force_input = True
+            self.input = template_copy
+            self.names: list[str] = []
+            self.cached_inputs = {}
+
+        def _create_input(self, input: Input, name: str):
+            # the copy is taken from self.input; the `input` argument is not read
+            clone = copy.copy(self.input)
+            clone.id = name
+            return clone
+
+        def _create_cached_inputs(self):
+            # one renamed copy of the template input per entry in self.names
+            self.cached_inputs.update((n, self._create_input(self.input, n)) for n in self.names)
+
+        def get_all(self) -> list[Input]:
+            return [*self.cached_inputs.values()]
+
+        def as_dict(self):
+            return prune_dict({"input": create_input_dict_v1([self.input])})
+
+        def validate(self):
+            # validation is delegated to the template input
+            self.input.validate()
+
+        def expand_schema_for_dynamic(self, d: dict[str, Any], live_inputs: dict[str, Any], curr_prefix=''):
+            # only cached inputs whose name appears in live_inputs are added
+            present = [inp for key, inp in self.cached_inputs.items() if key in live_inputs]
+            add_to_input_dict_v1(d, present, live_inputs, curr_prefix)
+            add_dynamic_id_mapping(d, present, curr_prefix)
+
+    class TemplatePrefix(_AutogrowTemplate):
+        def __init__(self, input: Input, prefix: str, min: int=1, max: int=10):
+            super().__init__(input)
+            self.prefix = prefix
+            assert(min >= 0)
+            assert(max >= 1)
+            assert(max <= Autogrow._MaxNames)
             self.min = min
             self.max = max
+            self.names = [f"{self.prefix}{i}" for i in range(self.max)]
+            self._create_cached_inputs()
+
+        def as_dict(self):
+            return super().as_dict() | prune_dict({
+                "prefix": self.prefix,
+                "min": self.min,
+                "max": self.max,
+            })
+
+    class TemplateNames(_AutogrowTemplate):
+        '''Autogrow template whose inputs use explicitly listed names.'''
+        def __init__(self, input: Input, names: list[str], min: int=1):
+            super().__init__(input)
+            # names beyond Autogrow._MaxNames are dropped
+            self.names = names[:Autogrow._MaxNames]
+            assert(min >= 0)
+            self.min = min
+            self._create_cached_inputs()
+
+        def as_dict(self):
+            base = super().as_dict()
+            return base | prune_dict({"names": self.names, "min": self.min})
+
+    class Input(DynamicInput):
+        def __init__(self, id: str, template: Autogrow.TemplatePrefix | Autogrow.TemplateNames,
+                     display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, extra_dict=None):
+            super().__init__(id, display_name, optional, tooltip, lazy, extra_dict)
+            self.template = template
+
+        def as_dict(self):
+            return super().as_dict() | prune_dict({
+                "template": self.template.as_dict(),
+            })
 
         def get_dynamic(self) -> list[Input]:
-            curr_count = 1
-            new_inputs = []
-            for i in range(self.min):
-                new_input = copy.copy(self.template_input)
-                new_input.id = f"{new_input.id}{curr_count}_${self.id}_ag$"
-                if new_input.display_name is not None:
-                    new_input.display_name = f"{new_input.display_name}{curr_count}"
-                new_input.optional = self.optional or new_input.optional
-                if isinstance(self.template_input, WidgetInput):
-                    new_input.force_input = True
-                new_inputs.append(new_input)
-                curr_count += 1
-            # pretend to expand up to max
-            for i in range(curr_count-1, self.max):
-                new_input = copy.copy(self.template_input)
-                new_input.id = f"{new_input.id}{curr_count}_${self.id}_ag$"
-                if new_input.display_name is not None:
-                    new_input.display_name = f"{new_input.display_name}{curr_count}"
-                new_input.optional = True
-                if isinstance(self.template_input, WidgetInput):
-                    new_input.force_input = True
-                new_inputs.append(new_input)
-                curr_count += 1
-            return new_inputs
+            return self.template.get_all()
 
-@comfytype(io_type="COMFY_COMBODYNAMIC_V3")
-class ComboDynamic(ComfyTypeI):
-    class Input(DynamicInput):
-        def __init__(self, id: str):
-            pass
+        def get_all(self) -> list[Input]:
+            return [self] + self.template.get_all()
 
-@comfytype(io_type="COMFY_MATCHTYPE_V3")
-class MatchType(ComfyTypeIO):
-    class Template:
-        def __init__(self, template_id: str, allowed_types: _ComfyType | list[_ComfyType]):
-            self.template_id = template_id
-            self.allowed_types = [allowed_types] if isinstance(allowed_types, _ComfyType) else allowed_types
+        def validate(self):
+            self.template.validate()
+
+        def expand_schema_for_dynamic(self, d: dict[str, Any], live_inputs: dict[str, Any], curr_prefix=''):
+            # need to remove self from expected inputs dictionary; replaced by template inputs in frontend
+            for section in d.values():
+                section.pop(self.id, None)
+            # the template's entries are added under "<curr_prefix><id>."
+            nested_prefix = f"{curr_prefix}{self.id}."
+            self.template.expand_schema_for_dynamic(d, live_inputs, nested_prefix)
+
+@comfytype(io_type="COMFY_DYNAMICCOMBO_V3")
+class DynamicCombo(ComfyTypeI):
+    Type = dict[str, Any]
+
+    class Option:
+        def __init__(self, key: str, inputs: list[Input]):
+            self.key = key
+            self.inputs = inputs
 
         def as_dict(self):
             return {
-                "template_id": self.template_id,
-                "allowed_types": "".join(t.io_type for t in self.allowed_types),
+                "key": self.key,
+                "inputs": create_input_dict_v1(self.inputs),
             }
 
     class Input(DynamicInput):
-        def __init__(self, id: str, template: MatchType.Template,
+        def __init__(self, id: str, options: list[DynamicCombo.Option],
                     display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, extra_dict=None):
             super().__init__(id, display_name, optional, tooltip, lazy, extra_dict)
-            self.template = template
+            self.options = options
+
+        def expand_schema_for_dynamic(self, d: dict[str, Any], live_inputs: dict[str, Any], curr_prefix=''):
+            '''Add the inputs of the option selected in live_inputs; no-op when absent or unmatched.'''
+            # check if dynamic input's id is in live_inputs
+            if self.id not in live_inputs:
+                return
+            chosen_key = live_inputs[self.id]
+            # first option whose key equals the live value, if any
+            selected_option = next((opt for opt in self.options if opt.key == chosen_key), None)
+            if selected_option is None:
+                return
+            nested_prefix = f"{curr_prefix}{self.id}."
+            add_to_input_dict_v1(d, selected_option.inputs, live_inputs, nested_prefix)
+            add_dynamic_id_mapping(d, selected_option.inputs, nested_prefix, self)
 
         def get_dynamic(self) -> list[Input]:
-            return [self]
+            return [input for option in self.options for input in option.inputs]
+
+        def get_all(self) -> list[Input]:
+            return [self] + [input for option in self.options for input in option.inputs]
 
         def as_dict(self):
             return super().as_dict() | prune_dict({
-                "template": self.template.as_dict(),
+                "options": [o.as_dict() for o in self.options],
             })
 
-    class Output(DynamicOutput):
-        def __init__(self, id: str, template: MatchType.Template, display_name: str=None, tooltip: str=None,
-                     is_output_list=False):
-            super().__init__(id, display_name, tooltip, is_output_list)
-            self.template = template
+        def validate(self):
+            # make sure all nested inputs are validated
+            for option in self.options:
+                for input in option.inputs:
+                    input.validate()
 
-        def get_dynamic(self) -> list[Output]:
-            return [self]
+@comfytype(io_type="COMFY_DYNAMICSLOT_V3")
+class DynamicSlot(ComfyTypeI):
+    Type = dict[str, Any]
+
+    class Input(DynamicInput):
+        def __init__(self, slot: Input, inputs: list[Input],
+                    display_name: str=None, tooltip: str=None, lazy: bool=None, extra_dict=None):
+            assert(not isinstance(slot, DynamicInput))
+            self.slot = copy.copy(slot)
+            self.slot.display_name = slot.display_name if slot.display_name is not None else display_name
+            optional = True
+            self.slot.tooltip = slot.tooltip if slot.tooltip is not None else tooltip
+            self.slot.lazy = slot.lazy if slot.lazy is not None else lazy
+            self.slot.extra_dict = slot.extra_dict if slot.extra_dict is not None else extra_dict
+            super().__init__(slot.id, self.slot.display_name, optional, self.slot.tooltip, self.slot.lazy, self.slot.extra_dict)
+            self.inputs = inputs
+            self.force_input = None
+            # force widget inputs to have no widgets, otherwise this would be awkward
+            if isinstance(self.slot, WidgetInput):
+                self.force_input = True
+                self.slot.force_input = True
+
+        def expand_schema_for_dynamic(self, d: dict[str, Any], live_inputs: dict[str, Any], curr_prefix=''):
+            if self.id in live_inputs:
+                curr_prefix = f"{curr_prefix}{self.id}."
+                add_to_input_dict_v1(d, self.inputs, live_inputs, curr_prefix)
+                add_dynamic_id_mapping(d, [self.slot] + self.inputs, curr_prefix)
+
+        def get_dynamic(self) -> list[Input]:
+            return [self.slot] + self.inputs
+
+        def get_all(self) -> list[Input]:
+            return [self] + [self.slot] + self.inputs
 
         def as_dict(self):
             return super().as_dict() | prune_dict({
-                "template": self.template.as_dict(),
+                "slotType": str(self.slot.get_io_type()),
+                "inputs": create_input_dict_v1(self.inputs),
+                "forceInput": self.force_input,
             })
 
+        def validate(self):
+            self.slot.validate()
+            for input in self.inputs:
+                input.validate()
+
+def add_dynamic_id_mapping(d: dict[str, Any], inputs: list[Input], curr_prefix: str, self: DynamicInput=None):
+    '''Record id -> "<curr_prefix><id>" in d["dynamic_paths"] for self (if given) and each non-dynamic input.'''
+    path_map = d.setdefault("dynamic_paths", {})
+    if self is not None:
+        path_map[self.id] = f"{curr_prefix}{self.id}"
+    # DynamicInput entries in `inputs` get no mapping here
+    path_map.update({f"{inp.id}": f"{curr_prefix}{inp.id}" for inp in inputs if not isinstance(inp, DynamicInput)})
+
+class V3Data(TypedDict):  # extra data INPUT_TYPES returns next to the input dict and schema
+    hidden_inputs: dict[str, Any]  # read by PREPARE_CLASS_CLONE through HiddenHolder.from_dict
+    dynamic_paths: dict[str, Any]  # input id -> dotted path, as written by add_dynamic_id_mapping
 
 class HiddenHolder:
     def __init__(self, unique_id: str, prompt: Any,
@@ -979,6 +1163,7 @@ class NodeInfoV1:
     output_is_list: list[bool]=None
     output_name: list[str]=None
     output_tooltips: list[str]=None
+    output_matchtypes: list[str]=None
     name: str=None
     display_name: str=None
     description: str=None
@@ -1056,7 +1241,11 @@ class Schema:
         '''Validate the schema:
         - verify ids on inputs and outputs are unique - both internally and in relation to each other
         '''
-        input_ids = [i.id for i in self.inputs] if self.inputs is not None else []
+        nested_inputs: list[Input] = []
+        if self.inputs is not None:
+            for input in self.inputs:
+                nested_inputs.extend(input.get_all())
+        input_ids = [i.id for i in nested_inputs] if nested_inputs is not None else []
         output_ids = [o.id for o in self.outputs] if self.outputs is not None else []
         input_set = set(input_ids)
         output_set = set(output_ids)
@@ -1072,6 +1261,13 @@ class Schema:
             issues.append(f"Ids must be unique between inputs and outputs, but {intersection} are not.")
         if len(issues) > 0:
             raise ValueError("\n".join(issues))
+        # validate inputs and outputs
+        if self.inputs is not None:
+            for input in self.inputs:
+                input.validate()
+        if self.outputs is not None:
+            for output in self.outputs:
+                output.validate()
 
     def finalize(self):
         """Add hidden based on selected schema options, and give outputs without ids default ids."""
@@ -1097,19 +1293,10 @@ class Schema:
                 if output.id is None:
                     output.id = f"_{i}_{output.io_type}_"
 
-    def get_v1_info(self, cls) -> NodeInfoV1:
+    def get_v1_info(self, cls, live_inputs: dict[str, Any]=None) -> NodeInfoV1:
+        # NOTE: live_inputs will not be used anymore very soon and this will be done another way
         # get V1 inputs
-        input = {
-            "required": {}
-        }
-        if self.inputs:
-            for i in self.inputs:
-                if isinstance(i, DynamicInput):
-                    dynamic_inputs = i.get_dynamic()
-                    for d in dynamic_inputs:
-                        add_to_dict_v1(d, input)
-                else:
-                    add_to_dict_v1(i, input)
+        input = create_input_dict_v1(self.inputs, live_inputs)
         if self.hidden:
             for hidden in self.hidden:
                 input.setdefault("hidden", {})[hidden.name] = (hidden.value,)
@@ -1118,12 +1305,24 @@ class Schema:
         output_is_list = []
         output_name = []
         output_tooltips = []
+        output_matchtypes = []
+        any_matchtypes = False
         if self.outputs:
             for o in self.outputs:
                 output.append(o.io_type)
                 output_is_list.append(o.is_output_list)
                 output_name.append(o.display_name if o.display_name else o.io_type)
                 output_tooltips.append(o.tooltip if o.tooltip else None)
+                # special handling for MatchType
+                if isinstance(o, MatchType.Output):
+                    output_matchtypes.append(o.template.template_id)
+                    any_matchtypes = True
+                else:
+                    output_matchtypes.append(None)
+
+        # clear out lists that are all None
+        if not any_matchtypes:
+            output_matchtypes = None
 
         info = NodeInfoV1(
             input=input,
@@ -1132,6 +1331,7 @@ class Schema:
             output_is_list=output_is_list,
             output_name=output_name,
             output_tooltips=output_tooltips,
+            output_matchtypes=output_matchtypes,
             name=self.node_id,
             display_name=self.display_name,
             category=self.category,
@@ -1177,16 +1377,57 @@ class Schema:
         return info
 
 
-def add_to_dict_v1(i: Input, input: dict):
+def create_input_dict_v1(inputs: list[Input], live_inputs: dict[str, Any]=None) -> dict:
+    '''Build an input dict that starts with an empty "required" section and then receives `inputs`.'''
+    v1_inputs = {"required": {}}
+    # filled in place by add_to_input_dict_v1
+    add_to_input_dict_v1(v1_inputs, inputs, live_inputs)
+    return v1_inputs
+
+def add_to_input_dict_v1(d: dict[str, Any], inputs: list[Input], live_inputs: dict[str, Any]=None, curr_prefix=''):
+    '''Add each input to d via add_to_dict_v1; with live_inputs, dynamic inputs also expand themselves.
+    None or empty `inputs` is a no-op, matching the `if self.inputs:` guard of the code this replaced.'''
+    for i in inputs or ():
+        add_to_dict_v1(i, d)
+        # dynamic inputs add their nested entries only when live values are supplied
+        if live_inputs is not None and isinstance(i, DynamicInput):
+            i.expand_schema_for_dynamic(d, live_inputs, curr_prefix)
+
+def add_to_dict_v1(i: Input, d: dict, dynamic_dict: dict=None):
     key = "optional" if i.optional else "required"
     as_dict = i.as_dict()
     # for v1, we don't want to include the optional key
     as_dict.pop("optional", None)
-    input.setdefault(key, {})[i.id] = (i.get_io_type(), as_dict)
+    if dynamic_dict is None:
+        value = (i.get_io_type(), as_dict)
+    else:
+        value = (i.get_io_type(), as_dict, dynamic_dict)
+    d.setdefault(key, {})[i.id] = value
 
 def add_to_dict_v3(io: Input | Output, d: dict):
     d[io.id] = (io.get_io_type(), io.as_dict())
 
+def build_nested_inputs(values: dict[str, Any], v3_data: V3Data):
+    '''Return a copy of values with flat keys regrouped into nested dicts.
+
+    Each v3_data["dynamic_paths"] entry maps a flat key to a dotted path; the key's value
+    (None when the key is absent) is popped and stored at that path. Without
+    "dynamic_paths", values itself is returned unchanged.
+    '''
+    paths = v3_data.get("dynamic_paths", None)
+    if paths is None:
+        return values
+    flat = values.copy()
+    nested = {}
+    for key, path in paths.items():
+        *parents, leaf = path.split(".")
+        node = nested
+        # walk/create the intermediate dicts, then place the value on the last segment
+        for part in parents:
+            node = node.setdefault(part, {})
+        node[leaf] = flat.pop(key, None)
+    flat.update(nested)
+    return flat
 
 
 class _ComfyNodeBaseInternal(_ComfyNodeInternal):
@@ -1306,12 +1547,12 @@ class _ComfyNodeBaseInternal(_ComfyNodeInternal):
 
     @final
     @classmethod
-    def PREPARE_CLASS_CLONE(cls, hidden_inputs: dict) -> type[ComfyNode]:
+    def PREPARE_CLASS_CLONE(cls, v3_data: V3Data) -> type[ComfyNode]:
         """Creates clone of real node class to prevent monkey-patching."""
         c_type: type[ComfyNode] = cls if is_class(cls) else type(cls)
         type_clone: type[ComfyNode] = shallow_clone_class(c_type)
         # set hidden
-        type_clone.hidden = HiddenHolder.from_dict(hidden_inputs)
+        type_clone.hidden = HiddenHolder.from_dict(v3_data["hidden_inputs"])
         return type_clone
 
     @final
@@ -1428,14 +1669,18 @@ class _ComfyNodeBaseInternal(_ComfyNodeInternal):
 
     @final
     @classmethod
-    def INPUT_TYPES(cls, include_hidden=True, return_schema=False) -> dict[str, dict] | tuple[dict[str, dict], Schema]:
+    def INPUT_TYPES(cls, include_hidden=True, return_schema=False, live_inputs=None) -> dict[str, dict] | tuple[dict[str, dict], Schema, V3Data]:
         schema = cls.FINALIZE_SCHEMA()
-        info = schema.get_v1_info(cls)
+        info = schema.get_v1_info(cls, live_inputs)
         input = info.input
         if not include_hidden:
             input.pop("hidden", None)
         if return_schema:
-            return input, schema
+            v3_data: V3Data = {}
+            dynamic = input.pop("dynamic_paths", None)
+            if dynamic is not None:
+                v3_data["dynamic_paths"] = dynamic
+            return input, schema, v3_data
         return input
 
     @final
@@ -1508,7 +1753,7 @@ class ComfyNode(_ComfyNodeBaseInternal):
         raise NotImplementedError
 
     @classmethod
-    def validate_inputs(cls, **kwargs) -> bool:
+    def validate_inputs(cls, **kwargs) -> bool | str:
         """Optionally, define this function to validate inputs; equivalent to V1's VALIDATE_INPUTS."""
         raise NotImplementedError
 
@@ -1623,6 +1868,7 @@ __all__ = [
     "StyleModel",
     "Gligen",
     "UpscaleModel",
+    "LatentUpscaleModel",
     "Audio",
     "Video",
     "SVG",
@@ -1646,6 +1892,10 @@ __all__ = [
     "SEGS",
     "AnyType",
     "MultiType",
+    # Dynamic Types
+    "MatchType",
+    # "DynamicCombo",
+    # "Autogrow",
     # Other classes
     "HiddenHolder",
     "Hidden",
@@ -1656,4 +1906,5 @@ __all__ = [
     "NodeOutput",
     "add_to_dict_v1",
     "add_to_dict_v3",
+    "V3Data",
 ]
diff --git a/comfy_api/latest/_io_public.py b/comfy_api/latest/_io_public.py
new file mode 100644
index 000000000..43c7680f3
--- /dev/null
+++ b/comfy_api/latest/_io_public.py
@@ -0,0 +1 @@
+from ._io import *  # noqa: F403
diff --git a/comfy_api/latest/_ui_public.py b/comfy_api/latest/_ui_public.py
new file mode 100644
index 000000000..85b11d78b
--- /dev/null
+++ b/comfy_api/latest/_ui_public.py
@@ -0,0 +1 @@
+from ._ui import *  # noqa: F403
diff --git a/comfy_api/latest/_util/__init__.py b/comfy_api/latest/_util/__init__.py
index 9019c46db..fc5431dda 100644
--- a/comfy_api/latest/_util/__init__.py
+++ b/comfy_api/latest/_util/__init__.py
@@ -1,8 +1,11 @@
 from .video_types import VideoContainer, VideoCodec, VideoComponents
+from .geometry_types import VOXEL, MESH
 
 __all__ = [
     # Utility Types
     "VideoContainer",
     "VideoCodec",
     "VideoComponents",
+    "VOXEL",
+    "MESH",
 ]
diff --git a/comfy_api/latest/_util/geometry_types.py b/comfy_api/latest/_util/geometry_types.py
new file mode 100644
index 000000000..385122778
--- /dev/null
+++ b/comfy_api/latest/_util/geometry_types.py
@@ -0,0 +1,12 @@
+import torch
+
+
+class VOXEL:  # container that stores the given tensor as .data
+    def __init__(self, data: torch.Tensor):
+        self.data = data
+
+
+class MESH:  # container that stores the given tensors as .vertices and .faces
+    def __init__(self, vertices: torch.Tensor, faces: torch.Tensor):
+        self.vertices = vertices
+        self.faces = faces
diff --git a/comfy_api/v0_0_2/__init__.py b/comfy_api/v0_0_2/__init__.py
index de0f95001..c4fa1d971 100644
--- a/comfy_api/v0_0_2/__init__.py
+++ b/comfy_api/v0_0_2/__init__.py
@@ -6,7 +6,7 @@ from comfy_api.latest import (
 )
 from typing import Type, TYPE_CHECKING
 from comfy_api.internal.async_to_sync import create_sync_class
-from comfy_api.latest import io, ui, ComfyExtension  #noqa: F401
+from comfy_api.latest import io, ui, IO, UI, ComfyExtension  #noqa: F401
 
 
 class ComfyAPIAdapter_v0_0_2(ComfyAPI_latest):
@@ -42,4 +42,8 @@ __all__ = [
     "InputImpl",
     "Types",
     "ComfyExtension",
+    "io",
+    "IO",
+    "ui",
+    "UI",
 ]
diff --git a/comfy_api_nodes/apis/bfl_api.py b/comfy_api_nodes/apis/bfl_api.py
index 0fc8c0607..d8d3557b3 100644
--- a/comfy_api_nodes/apis/bfl_api.py
+++ b/comfy_api_nodes/apis/bfl_api.py
@@ -70,6 +70,29 @@ class BFLFluxProGenerateRequest(BaseModel):
     # )
 
 
+class Flux2ProGenerateRequest(BaseModel):  # request model: only `prompt` is required, every other field has a default
+    prompt: str = Field(...)
+    width: int = Field(1024, description="Must be a multiple of 32.")
+    height: int = Field(768, description="Must be a multiple of 32.")
+    seed: int | None = Field(None)
+    prompt_upsampling: bool | None = Field(None)
+    input_image: str | None = Field(None, description="Base64 encoded image for image-to-image generation")
+    input_image_2: str | None = Field(None, description="Base64 encoded image for image-to-image generation")
+    input_image_3: str | None = Field(None, description="Base64 encoded image for image-to-image generation")
+    input_image_4: str | None = Field(None, description="Base64 encoded image for image-to-image generation")
+    input_image_5: str | None = Field(None, description="Base64 encoded image for image-to-image generation")
+    input_image_6: str | None = Field(None, description="Base64 encoded image for image-to-image generation")
+    input_image_7: str | None = Field(None, description="Base64 encoded image for image-to-image generation")
+    input_image_8: str | None = Field(None, description="Base64 encoded image for image-to-image generation")
+    input_image_9: str | None = Field(None, description="Base64 encoded image for image-to-image generation")
+    safety_tolerance: int | None = Field(  # bounded to 0..5 by ge/le below
+        5, description="Tolerance level for input and output moderation. Value 0 being most strict.", ge=0, le=5
+    )
+    output_format: str | None = Field(
+        "png", description="Output format for the generated image. Can be 'jpeg' or 'png'."
+    )
+
+
 class BFLFluxKontextProGenerateRequest(BaseModel):
     prompt: str = Field(..., description='The text prompt for what you wannt to edit.')
     input_image: Optional[str] = Field(None, description='Image to edit in base64 format')
@@ -109,8 +132,9 @@ class BFLFluxProUltraGenerateRequest(BaseModel):
 
 
 class BFLFluxProGenerateResponse(BaseModel):
-    id: str = Field(..., description='The unique identifier for the generation task.')
-    polling_url: str = Field(..., description='URL to poll for the generation result.')
+    id: str = Field(..., description="The unique identifier for the generation task.")
+    polling_url: str = Field(..., description="URL to poll for the generation result.")
+    cost: float | None = Field(None, description="Price in cents")
 
 
 class BFLStatus(str, Enum):
diff --git a/comfy_api_nodes/apis/gemini_api.py b/comfy_api_nodes/apis/gemini_api.py
index 2bf28bf93..a380ecc86 100644
--- a/comfy_api_nodes/apis/gemini_api.py
+++ b/comfy_api_nodes/apis/gemini_api.py
@@ -1,22 +1,236 @@
-from typing import Optional
+from datetime import date
+from enum import Enum
+from typing import Any
 
-from comfy_api_nodes.apis import GeminiGenerationConfig, GeminiContent, GeminiSafetySetting, GeminiSystemInstructionContent, GeminiTool, GeminiVideoMetadata
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
+
+
+class GeminiSafetyCategory(str, Enum):
+    HARM_CATEGORY_SEXUALLY_EXPLICIT = "HARM_CATEGORY_SEXUALLY_EXPLICIT"
+    HARM_CATEGORY_HATE_SPEECH = "HARM_CATEGORY_HATE_SPEECH"
+    HARM_CATEGORY_HARASSMENT = "HARM_CATEGORY_HARASSMENT"
+    HARM_CATEGORY_DANGEROUS_CONTENT = "HARM_CATEGORY_DANGEROUS_CONTENT"
+
+
+class GeminiSafetyThreshold(str, Enum):
+    OFF = "OFF"
+    BLOCK_NONE = "BLOCK_NONE"
+    BLOCK_LOW_AND_ABOVE = "BLOCK_LOW_AND_ABOVE"
+    BLOCK_MEDIUM_AND_ABOVE = "BLOCK_MEDIUM_AND_ABOVE"
+    BLOCK_ONLY_HIGH = "BLOCK_ONLY_HIGH"
+
+
+class GeminiSafetySetting(BaseModel):
+    category: GeminiSafetyCategory
+    threshold: GeminiSafetyThreshold
+
+
+class GeminiRole(str, Enum):
+    user = "user"
+    model = "model"
+
+
+class GeminiMimeType(str, Enum):
+    application_pdf = "application/pdf"
+    audio_mpeg = "audio/mpeg"
+    audio_mp3 = "audio/mp3"
+    audio_wav = "audio/wav"
+    image_png = "image/png"
+    image_jpeg = "image/jpeg"
+    image_webp = "image/webp"
+    text_plain = "text/plain"
+    video_mov = "video/mov"
+    video_mpeg = "video/mpeg"
+    video_mp4 = "video/mp4"
+    video_mpg = "video/mpg"
+    video_avi = "video/avi"
+    video_wmv = "video/wmv"
+    video_mpegps = "video/mpegps"
+    video_flv = "video/flv"
+
+
+class GeminiInlineData(BaseModel):
+    data: str | None = Field(
+        None,
+        description="The base64 encoding of the image, PDF, or video to include inline in the prompt. "
+        "When including media inline, you must also specify the media type (mimeType) of the data. Size limit: 20MB",
+    )
+    mimeType: GeminiMimeType | None = Field(None)
+
+
+class GeminiFileData(BaseModel):
+    fileUri: str | None = Field(None)
+    mimeType: GeminiMimeType | None = Field(None)
+
+
+class GeminiPart(BaseModel):
+    inlineData: GeminiInlineData | None = Field(None)
+    fileData: GeminiFileData | None = Field(None)
+    text: str | None = Field(None)
+
+
+class GeminiTextPart(BaseModel):
+    text: str | None = Field(None)
+
+
+class GeminiContent(BaseModel):  # One message: the author's role plus the list of parts it is made of.
+    parts: list[GeminiPart] = Field([])  # optional; defaults to an empty list
+    role: GeminiRole = Field(..., examples=["user"])  # required; one of the GeminiRole values ("user" / "model")
+
+
+class GeminiSystemInstructionContent(BaseModel):  # Text-only content (GeminiTextPart parts) used for `systemInstruction`.
+    parts: list[GeminiTextPart] = Field(
+        ...,
+        description="A list of ordered parts that make up a single message. "
+        "Different parts may have different IANA MIME types.",
+    )
+    role: GeminiRole = Field(  # NOTE(review): required here (`...`) although the description says it may be left unset
+        ...,
+        description="The identity of the entity that creates the message. "
+        "The following values are supported: "
+        "user: This indicates that the message is sent by a real person, typically a user-generated message. "
+        "model: This indicates that the message is generated by the model. "
+        "The model value is used to insert messages from model into the conversation during multi-turn conversations. "
+        "For non-multi-turn conversations, this field can be left blank or unset.",
+    )
+
+
+class GeminiFunctionDeclaration(BaseModel):
+    description: str | None = Field(None)
+    name: str = Field(...)
+    parameters: dict[str, Any] = Field(..., description="JSON schema for the function parameters")
+
+
+class GeminiTool(BaseModel):
+    functionDeclarations: list[GeminiFunctionDeclaration] | None = Field(None)
+
+
+class GeminiOffset(BaseModel):
+    nanos: int | None = Field(None, ge=0, le=999999999)
+    seconds: int | None = Field(None, ge=-315576000000, le=315576000000)
+
+
+class GeminiVideoMetadata(BaseModel):
+    endOffset: GeminiOffset | None = Field(None)
+    startOffset: GeminiOffset | None = Field(None)
+
+
+class GeminiGenerationConfig(BaseModel):  # Generation options; every field is optional and defaults to None.
+    maxOutputTokens: int | None = Field(None, ge=16, le=8192)  # validated to 16..8192 (inclusive) when provided
+    seed: int | None = Field(None)
+    stopSequences: list[str] | None = Field(None)
+    temperature: float | None = Field(None, ge=0.0, le=2.0)  # validated to 0.0..2.0 (inclusive) when provided
+    topK: int | None = Field(None, ge=1)  # must be >= 1 when provided
+    topP: float | None = Field(None, ge=0.0, le=1.0)  # validated to 0.0..1.0 (inclusive) when provided
 
 
 class GeminiImageConfig(BaseModel):
-    aspectRatio: Optional[str] = None
+    aspectRatio: str | None = Field(None)
+    imageSize: str | None = Field(None)
 
 
 class GeminiImageGenerationConfig(GeminiGenerationConfig):
-    responseModalities: Optional[list[str]] = None
-    imageConfig: Optional[GeminiImageConfig] = None
+    responseModalities: list[str] | None = Field(None)
+    imageConfig: GeminiImageConfig | None = Field(None)
 
 
 class GeminiImageGenerateContentRequest(BaseModel):
-    contents: list[GeminiContent]
-    generationConfig: Optional[GeminiImageGenerationConfig] = None
-    safetySettings: Optional[list[GeminiSafetySetting]] = None
-    systemInstruction: Optional[GeminiSystemInstructionContent] = None
-    tools: Optional[list[GeminiTool]] = None
-    videoMetadata: Optional[GeminiVideoMetadata] = None
+    contents: list[GeminiContent] = Field(...)
+    generationConfig: GeminiImageGenerationConfig | None = Field(None)
+    safetySettings: list[GeminiSafetySetting] | None = Field(None)
+    systemInstruction: GeminiSystemInstructionContent | None = Field(None)
+    tools: list[GeminiTool] | None = Field(None)
+    videoMetadata: GeminiVideoMetadata | None = Field(None)
+
+
+class GeminiGenerateContentRequest(BaseModel):
+    contents: list[GeminiContent] = Field(...)
+    generationConfig: GeminiGenerationConfig | None = Field(None)
+    safetySettings: list[GeminiSafetySetting] | None = Field(None)
+    systemInstruction: GeminiSystemInstructionContent | None = Field(None)
+    tools: list[GeminiTool] | None = Field(None)
+    videoMetadata: GeminiVideoMetadata | None = Field(None)
+
+
+class Modality(str, Enum):
+    MODALITY_UNSPECIFIED = "MODALITY_UNSPECIFIED"
+    TEXT = "TEXT"
+    IMAGE = "IMAGE"
+    VIDEO = "VIDEO"
+    AUDIO = "AUDIO"
+    DOCUMENT = "DOCUMENT"
+
+
+class ModalityTokenCount(BaseModel):
+    modality: Modality | None = None
+    tokenCount: int | None = Field(None, description="Number of tokens for the given modality.")
+
+
+class Probability(str, Enum):
+    NEGLIGIBLE = "NEGLIGIBLE"
+    LOW = "LOW"
+    MEDIUM = "MEDIUM"
+    HIGH = "HIGH"
+    UNKNOWN = "UNKNOWN"
+
+
+class GeminiSafetyRating(BaseModel):
+    category: GeminiSafetyCategory | None = None
+    probability: Probability | None = Field(
+        None,
+        description="The probability that the content violates the specified safety category",
+    )
+
+
+class GeminiCitation(BaseModel):
+    authors: list[str] | None = None
+    endIndex: int | None = None
+    license: str | None = None
+    publicationDate: date | None = None
+    startIndex: int | None = None
+    title: str | None = None
+    uri: str | None = None
+
+
+class GeminiCitationMetadata(BaseModel):
+    citations: list[GeminiCitation] | None = None
+
+
+class GeminiCandidate(BaseModel):
+    citationMetadata: GeminiCitationMetadata | None = None
+    content: GeminiContent | None = None
+    finishReason: str | None = None
+    safetyRatings: list[GeminiSafetyRating] | None = None
+
+
+class GeminiPromptFeedback(BaseModel):
+    blockReason: str | None = None
+    blockReasonMessage: str | None = None
+    safetyRatings: list[GeminiSafetyRating] | None = None
+
+
+class GeminiUsageMetadata(BaseModel):
+    cachedContentTokenCount: int | None = Field(
+        None,
+        description="Output only. Number of tokens in the cached part in the input (the cached content).",
+    )
+    candidatesTokenCount: int | None = Field(None, description="Number of tokens in the response(s).")
+    candidatesTokensDetails: list[ModalityTokenCount] | None = Field(
+        None, description="Breakdown of candidate tokens by modality."
+    )
+    promptTokenCount: int | None = Field(
+        None,
+        description="Number of tokens in the request. When cachedContent is set, this is still the total effective prompt size meaning this includes the number of tokens in the cached content.",
+    )
+    promptTokensDetails: list[ModalityTokenCount] | None = Field(
+        None, description="Breakdown of prompt tokens by modality."
+    )
+    thoughtsTokenCount: int | None = Field(None, description="Number of tokens present in thoughts output.")
+    toolUsePromptTokenCount: int | None = Field(None, description="Number of tokens present in tool-use prompt(s).")
+
+
+class GeminiGenerateContentResponse(BaseModel):
+    candidates: list[GeminiCandidate] | None = Field(None)
+    promptFeedback: GeminiPromptFeedback | None = Field(None)
+    usageMetadata: GeminiUsageMetadata | None = Field(None)
+    modelVersion: str | None = Field(None)
diff --git a/comfy_api_nodes/apis/kling_api.py b/comfy_api_nodes/apis/kling_api.py
new file mode 100644
index 000000000..0a3b447c5
--- /dev/null
+++ b/comfy_api_nodes/apis/kling_api.py
@@ -0,0 +1,66 @@
+from pydantic import BaseModel, Field
+
+
+class OmniProText2VideoRequest(BaseModel):
+    model_name: str = Field(..., description="kling-video-o1")
+    aspect_ratio: str = Field(..., description="'16:9', '9:16' or '1:1'")
+    duration: str = Field(..., description="'5' or '10'")
+    prompt: str = Field(...)
+    mode: str = Field("pro")
+
+
+class OmniParamImage(BaseModel):
+    image_url: str = Field(...)
+    type: str | None = Field(None, description="Can be 'first_frame' or 'end_frame'")
+
+
+class OmniParamVideo(BaseModel):  # Item type of OmniProReferences2VideoRequest.video_list.
+    video_url: str = Field(...)
+    refer_type: str | None = Field(..., description="Can be 'base' or 'feature'")  # key is required, but None is accepted
+    keep_original_sound: str = Field(..., description="'yes' or 'no'")  # string flag, not a bool
+
+
+class OmniProFirstLastFrameRequest(BaseModel):
+    model_name: str = Field(..., description="kling-video-o1")
+    image_list: list[OmniParamImage] = Field(..., min_length=1, max_length=7)
+    duration: str = Field(..., description="'5' or '10'")
+    prompt: str = Field(...)
+    mode: str = Field("pro")
+
+
+class OmniProReferences2VideoRequest(BaseModel):  # Request with optional reference images and at most one reference video.
+    model_name: str = Field(..., description="kling-video-o1")
+    aspect_ratio: str | None = Field(..., description="'16:9', '9:16' or '1:1'")  # key is required; None is accepted
+    image_list: list[OmniParamImage] | None = Field(  # only max_length=7 is validated; the "4 with video" rule is not
+        None, max_length=7, description="Max length 4 when video is present."
+    )
+    video_list: list[OmniParamVideo] | None = Field(None, max_length=1)  # at most one video
+    duration: str | None = Field(..., description="From 3 to 10.")  # key is required; None accepted; range not validated
+    prompt: str = Field(...)
+    mode: str = Field("pro")  # defaults to "pro" when omitted
+
+
+class TaskStatusVideoResult(BaseModel):
+    duration: str | None = Field(None, description="Total video duration")
+    id: str | None = Field(None, description="Generated video ID")
+    url: str | None = Field(None, description="URL for generated video")
+
+
+class TaskStatusVideoResults(BaseModel):
+    videos: list[TaskStatusVideoResult] | None = Field(None)
+
+
+class TaskStatusVideoResponseData(BaseModel):
+    created_at: int | None = Field(None, description="Task creation time")
+    updated_at: int | None = Field(None, description="Task update time")
+    task_status: str | None = None
+    task_status_msg: str | None = Field(None, description="Additional failure reason. Only for polling endpoint.")
+    task_id: str | None = Field(None, description="Task ID")
+    task_result: TaskStatusVideoResults | None = Field(None)
+
+
+class TaskStatusVideoResponse(BaseModel):
+    code: int | None = Field(None, description="Error code")
+    message: str | None = Field(None, description="Error message")
+    request_id: str | None = Field(None, description="Request ID")
+    data: TaskStatusVideoResponseData | None = Field(None)
diff --git a/comfy_api_nodes/apis/topaz_api.py b/comfy_api_nodes/apis/topaz_api.py
new file mode 100644
index 000000000..4d9e62e72
--- /dev/null
+++ b/comfy_api_nodes/apis/topaz_api.py
@@ -0,0 +1,133 @@
+from typing import Optional, Union
+
+from pydantic import BaseModel, Field
+
+
+class ImageEnhanceRequest(BaseModel):  # Image enhancement request; source_url is the only required field.
+    model: str = Field("Reimagine")
+    output_format: str = Field("jpeg")
+    subject_detection: str = Field("All")
+    face_enhancement: bool = Field(True)
+    face_enhancement_creativity: float = Field(0, description="Is ignored if face_enhancement is false")
+    face_enhancement_strength: float = Field(0.8, description="Is ignored if face_enhancement is false")
+    source_url: str = Field(...)  # required
+    output_width: Optional[int] = Field(None)
+    output_height: Optional[int] = Field(None)
+    crop_to_fill: bool = Field(False)
+    prompt: Optional[str] = Field(None, description="Text prompt for creative upscaling guidance")
+    creativity: int = Field(3, description="Creativity settings range from 1 to 9")  # range is documented, not validated
+    face_preservation: str = Field("true", description="To preserve the identity of characters")  # string, not a bool
+    color_preservation: str = Field("true", description="To preserve the original color")  # string, not a bool
+
+
+class ImageAsyncTaskResponse(BaseModel):
+    process_id: str = Field(...)
+
+
+class ImageStatusResponse(BaseModel):
+    process_id: str = Field(...)
+    status: str = Field(...)
+    progress: Optional[int] = Field(None)
+    credits: int = Field(...)
+
+
+class ImageDownloadResponse(BaseModel):
+    download_url: str = Field(...)
+    expiry: int = Field(...)
+
+
+class Resolution(BaseModel):
+    width: int = Field(...)
+    height: int = Field(...)
+
+
+class CreateCreateVideoRequestSource(BaseModel):
+    container: str = Field(...)
+    size: int = Field(..., description="Size of the video file in bytes")
+    duration: int = Field(..., description="Duration of the video file in seconds")
+    frameCount: int = Field(..., description="Total number of frames in the video")
+    frameRate: int = Field(...)
+    resolution: Resolution = Field(...)
+
+
+class VideoFrameInterpolationFilter(BaseModel):
+    model: str = Field(...)
+    slowmo: Optional[int] = Field(None)
+    fps: int = Field(...)
+    duplicate: bool = Field(...)
+    duplicate_threshold: float = Field(...)
+
+
+class VideoEnhancementFilter(BaseModel):
+    model: str = Field(...)
+    auto: Optional[str] = Field(None, description="Auto, Manual, Relative")
+    focusFixLevel: Optional[str] = Field(None, description="Downscales video input for correction of blurred subjects")
+    compression: Optional[float] = Field(None, description="Strength of compression recovery")
+    details: Optional[float] = Field(None, description="Amount of detail reconstruction")
+    prenoise: Optional[float] = Field(None, description="Amount of noise to add to input to reduce over-smoothing")
+    noise: Optional[float] = Field(None, description="Amount of noise reduction")
+    halo: Optional[float] = Field(None, description="Amount of halo reduction")
+    preblur: Optional[float] = Field(None, description="Anti-aliasing and deblurring strength")
+    blur: Optional[float] = Field(None, description="Amount of sharpness applied")
+    grain: Optional[float] = Field(None, description="Grain after AI model processing")
+    grainSize: Optional[float] = Field(None, description="Size of generated grain")
+    recoverOriginalDetailValue: Optional[float] = Field(None, description="Source details into the output video")
+    creativity: Optional[str] = Field(None, description="Creativity level(high, low) for slc-1 only")
+    isOptimizedMode: Optional[bool] = Field(None, description="Set to true for Starlight Creative (slc-1) only")
+
+
+class OutputInformationVideo(BaseModel):  # Type of CreateVideoRequest.output; every key must be supplied by the caller.
+    resolution: Resolution = Field(...)
+    frameRate: int = Field(...)
+    audioCodec: Optional[str] = Field(..., description="Required if audioTransfer is Copy or Convert")  # key always required; None accepted
+    audioTransfer: str = Field(..., description="Copy, Convert, None")
+    dynamicCompressionLevel: str = Field(..., description="Low, Mid, High")
+
+
+class Overrides(BaseModel):
+    isPaidDiffusion: bool = Field(True)
+
+
+class CreateVideoRequest(BaseModel):  # Video job request: source description, filter list and output settings.
+    source: CreateCreateVideoRequestSource = Field(...)
+    filters: list[Union[VideoFrameInterpolationFilter, VideoEnhancementFilter]] = Field(...)  # items may be either filter model
+    output: OutputInformationVideo = Field(...)
+    overrides: Overrides = Field(Overrides(isPaidDiffusion=True))  # NOTE(review): model-instance default; presumably copied per instance by pydantic - confirm
+
+
+class CreateVideoResponse(BaseModel):
+    requestId: str = Field(...)
+
+
+class VideoAcceptResponse(BaseModel):
+    uploadId: str = Field(...)
+    urls: list[str] = Field(...)
+
+
+class VideoCompleteUploadRequestPart(BaseModel):
+    partNum: int = Field(...)
+    eTag: str = Field(...)
+
+
+class VideoCompleteUploadRequest(BaseModel):
+    uploadResults: list[VideoCompleteUploadRequestPart] = Field(...)
+
+
+class VideoCompleteUploadResponse(BaseModel):
+    message: str = Field(..., description="Confirmation message")
+
+
+class VideoStatusResponseEstimates(BaseModel):
+    cost: list[int] = Field(...)
+
+
+class VideoStatusResponseDownloadUrl(BaseModel):
+    url: str = Field(...)
+
+
+class VideoStatusResponse(BaseModel):  # Video job status payload; status is the only required field.
+    status: str = Field(...)
+    estimates: Optional[VideoStatusResponseEstimates] = Field(None)
+    progress: Optional[float] = Field(None)
+    message: Optional[str] = Field("")  # defaults to an empty string rather than None
+    download: Optional[VideoStatusResponseDownloadUrl] = Field(None)
diff --git a/comfy_api_nodes/apis/veo_api.py b/comfy_api_nodes/apis/veo_api.py
index a55137afb..8328d1aa4 100644
--- a/comfy_api_nodes/apis/veo_api.py
+++ b/comfy_api_nodes/apis/veo_api.py
@@ -1,34 +1,21 @@
-from typing import Optional, Union
-from enum import Enum
+from typing import Optional
 
 from pydantic import BaseModel, Field
 
 
-class Image2(BaseModel):
-    bytesBase64Encoded: str
-    gcsUri: Optional[str] = None
-    mimeType: Optional[str] = None
+class VeoRequestInstanceImage(BaseModel):
+    bytesBase64Encoded: str | None = Field(None)
+    gcsUri: str | None = Field(None)
+    mimeType: str | None = Field(None)
 
 
-class Image3(BaseModel):
-    bytesBase64Encoded: Optional[str] = None
-    gcsUri: str
-    mimeType: Optional[str] = None
-
-
-class Instance1(BaseModel):
-    image: Optional[Union[Image2, Image3]] = Field(
-        None, description='Optional image to guide video generation'
-    )
+class VeoRequestInstance(BaseModel):
+    image: VeoRequestInstanceImage | None = Field(None)
+    lastFrame: VeoRequestInstanceImage | None = Field(None)
     prompt: str = Field(..., description='Text description of the video')
 
 
-class PersonGeneration1(str, Enum):
-    ALLOW = 'ALLOW'
-    BLOCK = 'BLOCK'
-
-
-class Parameters1(BaseModel):
+class VeoRequestParameters(BaseModel):
     aspectRatio: Optional[str] = Field(None, examples=['16:9'])
     durationSeconds: Optional[int] = None
     enhancePrompt: Optional[bool] = None
@@ -37,17 +24,18 @@ class Parameters1(BaseModel):
         description='Generate audio for the video. Only supported by veo 3 models.',
     )
     negativePrompt: Optional[str] = None
-    personGeneration: Optional[PersonGeneration1] = None
+    personGeneration: str | None = Field(None, description="ALLOW or BLOCK")
     sampleCount: Optional[int] = None
     seed: Optional[int] = None
     storageUri: Optional[str] = Field(
         None, description='Optional Cloud Storage URI to upload the video'
     )
+    resolution: str | None = Field(None)
 
 
 class VeoGenVidRequest(BaseModel):
-    instances: Optional[list[Instance1]] = None
-    parameters: Optional[Parameters1] = None
+    instances: list[VeoRequestInstance] | None = Field(None)
+    parameters: VeoRequestParameters | None = Field(None)
 
 
 class VeoGenVidResponse(BaseModel):
diff --git a/comfy_api_nodes/nodes_bfl.py b/comfy_api_nodes/nodes_bfl.py
index 1740fb377..8826dea0c 100644
--- a/comfy_api_nodes/nodes_bfl.py
+++ b/comfy_api_nodes/nodes_bfl.py
@@ -1,7 +1,7 @@
 from inspect import cleandoc
-from typing import Optional
 
 import torch
+from pydantic import BaseModel
 from typing_extensions import override
 
 from comfy_api.latest import IO, ComfyExtension
@@ -9,15 +9,16 @@ from comfy_api_nodes.apis.bfl_api import (
     BFLFluxExpandImageRequest,
     BFLFluxFillImageRequest,
     BFLFluxKontextProGenerateRequest,
-    BFLFluxProGenerateRequest,
     BFLFluxProGenerateResponse,
     BFLFluxProUltraGenerateRequest,
     BFLFluxStatusResponse,
     BFLStatus,
+    Flux2ProGenerateRequest,
 )
 from comfy_api_nodes.util import (
     ApiEndpoint,
     download_url_to_image_tensor,
+    get_number_of_images,
     poll_op,
     resize_mask_to_image,
     sync_op,
@@ -116,7 +117,7 @@ class FluxProUltraImageNode(IO.ComfyNode):
         prompt_upsampling: bool = False,
         raw: bool = False,
         seed: int = 0,
-        image_prompt: Optional[torch.Tensor] = None,
+        image_prompt: torch.Tensor | None = None,
         image_prompt_strength: float = 0.1,
     ) -> IO.NodeOutput:
         if image_prompt is None:
@@ -230,7 +231,7 @@ class FluxKontextProImageNode(IO.ComfyNode):
         aspect_ratio: str,
         guidance: float,
         steps: int,
-        input_image: Optional[torch.Tensor] = None,
+        input_image: torch.Tensor | None = None,
         seed=0,
         prompt_upsampling=False,
     ) -> IO.NodeOutput:
@@ -280,124 +281,6 @@ class FluxKontextMaxImageNode(FluxKontextProImageNode):
     DISPLAY_NAME = "Flux.1 Kontext [max] Image"
 
 
-class FluxProImageNode(IO.ComfyNode):
-    """
-    Generates images synchronously based on prompt and resolution.
-    """
-
-    @classmethod
-    def define_schema(cls) -> IO.Schema:
-        return IO.Schema(
-            node_id="FluxProImageNode",
-            display_name="Flux 1.1 [pro] Image",
-            category="api node/image/BFL",
-            description=cleandoc(cls.__doc__ or ""),
-            inputs=[
-                IO.String.Input(
-                    "prompt",
-                    multiline=True,
-                    default="",
-                    tooltip="Prompt for the image generation",
-                ),
-                IO.Boolean.Input(
-                    "prompt_upsampling",
-                    default=False,
-                    tooltip="Whether to perform upsampling on the prompt. "
-                    "If active, automatically modifies the prompt for more creative generation, "
-                    "but results are nondeterministic (same seed will not produce exactly the same result).",
-                ),
-                IO.Int.Input(
-                    "width",
-                    default=1024,
-                    min=256,
-                    max=1440,
-                    step=32,
-                ),
-                IO.Int.Input(
-                    "height",
-                    default=768,
-                    min=256,
-                    max=1440,
-                    step=32,
-                ),
-                IO.Int.Input(
-                    "seed",
-                    default=0,
-                    min=0,
-                    max=0xFFFFFFFFFFFFFFFF,
-                    control_after_generate=True,
-                    tooltip="The random seed used for creating the noise.",
-                ),
-                IO.Image.Input(
-                    "image_prompt",
-                    optional=True,
-                ),
-                # "image_prompt_strength": (
-                #     IO.FLOAT,
-                #     {
-                #         "default": 0.1,
-                #         "min": 0.0,
-                #         "max": 1.0,
-                #         "step": 0.01,
-                #         "tooltip": "Blend between the prompt and the image prompt.",
-                #     },
-                # ),
-            ],
-            outputs=[IO.Image.Output()],
-            hidden=[
-                IO.Hidden.auth_token_comfy_org,
-                IO.Hidden.api_key_comfy_org,
-                IO.Hidden.unique_id,
-            ],
-            is_api_node=True,
-        )
-
-    @classmethod
-    async def execute(
-        cls,
-        prompt: str,
-        prompt_upsampling,
-        width: int,
-        height: int,
-        seed=0,
-        image_prompt=None,
-        # image_prompt_strength=0.1,
-    ) -> IO.NodeOutput:
-        image_prompt = image_prompt if image_prompt is None else tensor_to_base64_string(image_prompt)
-        initial_response = await sync_op(
-            cls,
-            ApiEndpoint(
-                path="/proxy/bfl/flux-pro-1.1/generate",
-                method="POST",
-            ),
-            response_model=BFLFluxProGenerateResponse,
-            data=BFLFluxProGenerateRequest(
-                prompt=prompt,
-                prompt_upsampling=prompt_upsampling,
-                width=width,
-                height=height,
-                seed=seed,
-                image_prompt=image_prompt,
-            ),
-        )
-        response = await poll_op(
-            cls,
-            ApiEndpoint(initial_response.polling_url),
-            response_model=BFLFluxStatusResponse,
-            status_extractor=lambda r: r.status,
-            progress_extractor=lambda r: r.progress,
-            completed_statuses=[BFLStatus.ready],
-            failed_statuses=[
-                BFLStatus.request_moderated,
-                BFLStatus.content_moderated,
-                BFLStatus.error,
-                BFLStatus.task_not_found,
-            ],
-            queued_statuses=[],
-        )
-        return IO.NodeOutput(await download_url_to_image_tensor(response.result["sample"]))
-
-
 class FluxProExpandNode(IO.ComfyNode):
     """
     Outpaints image based on prompt.
@@ -640,16 +523,125 @@ class FluxProFillNode(IO.ComfyNode):
         return IO.NodeOutput(await download_url_to_image_tensor(response.result["sample"]))
 
 
+class Flux2ProImageNode(IO.ComfyNode):  # Flux.2 [pro] API node: prompt plus up to 9 optional reference images.
+
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        return IO.Schema(
+            node_id="Flux2ProImageNode",
+            display_name="Flux.2 [pro] Image",
+            category="api node/image/BFL",
+            description="Generates images synchronously based on prompt and resolution.",
+            inputs=[
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    default="",
+                    tooltip="Prompt for the image generation or edit",
+                ),
+                IO.Int.Input(
+                    "width",
+                    default=1024,
+                    min=256,
+                    max=2048,
+                    step=32,
+                ),
+                IO.Int.Input(
+                    "height",
+                    default=768,
+                    min=256,
+                    max=2048,
+                    step=32,
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=0xFFFFFFFFFFFFFFFF,
+                    control_after_generate=True,
+                    tooltip="The random seed used for creating the noise.",
+                ),
+                IO.Boolean.Input(
+                    "prompt_upsampling",
+                    default=False,
+                    tooltip="Whether to perform upsampling on the prompt. "
+                    "If active, automatically modifies the prompt for more creative generation, "
+                    "but results are nondeterministic (same seed will not produce exactly the same result).",
+                ),
+                IO.Image.Input("images", optional=True, tooltip="Up to 9 images to be used as references."),
+            ],
+            outputs=[IO.Image.Output()],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(  # submit the request, poll its polling_url until a terminal status, then download the image
+        cls,
+        prompt: str,
+        width: int,
+        height: int,
+        seed: int,
+        prompt_upsampling: bool,
+        images: torch.Tensor | None = None,
+    ) -> IO.NodeOutput:
+        reference_images = {}  # extra request kwargs keyed input_image, input_image_2, ... (one per reference image)
+        if images is not None:
+            if get_number_of_images(images) > 9:  # keep this limit in sync with the "images" tooltip above
+                raise ValueError("The current maximum number of supported images is 9.")
+            for image_index in range(images.shape[0]):  # index 0 -> "input_image", index k -> "input_image_{k + 1}"
+                key_name = f"input_image_{image_index + 1}" if image_index else "input_image"
+                reference_images[key_name] = tensor_to_base64_string(images[image_index], total_pixels=2048 * 2048)
+        initial_response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/bfl/flux-2-pro/generate", method="POST"),
+            response_model=BFLFluxProGenerateResponse,
+            data=Flux2ProGenerateRequest(
+                prompt=prompt,
+                width=width,
+                height=height,
+                seed=seed,
+                prompt_upsampling=prompt_upsampling,
+                **reference_images,
+            ),
+        )
+
+        def price_extractor(_r: BaseModel) -> float | None:  # ignores the polled response; uses the initial response's cost
+            return None if initial_response.cost is None else initial_response.cost / 100  # cost is in cents
+
+        response = await poll_op(
+            cls,
+            ApiEndpoint(initial_response.polling_url),
+            response_model=BFLFluxStatusResponse,
+            status_extractor=lambda r: r.status,
+            progress_extractor=lambda r: r.progress,
+            price_extractor=price_extractor,
+            completed_statuses=[BFLStatus.ready],
+            failed_statuses=[
+                BFLStatus.request_moderated,
+                BFLStatus.content_moderated,
+                BFLStatus.error,
+                BFLStatus.task_not_found,
+            ],
+            queued_statuses=[],
+        )
+        return IO.NodeOutput(await download_url_to_image_tensor(response.result["sample"]))
+
+
 class BFLExtension(ComfyExtension):
     @override
     async def get_node_list(self) -> list[type[IO.ComfyNode]]:
         return [
             FluxProUltraImageNode,
-            # FluxProImageNode,
             FluxKontextProImageNode,
             FluxKontextMaxImageNode,
             FluxProExpandNode,
             FluxProFillNode,
+            Flux2ProImageNode,
         ]
 
 
diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py
index 67f2469ad..08f7b0f64 100644
--- a/comfy_api_nodes/nodes_gemini.py
+++ b/comfy_api_nodes/nodes_gemini.py
@@ -3,16 +3,11 @@ API Nodes for Gemini Multimodal LLM Usage via Remote API
 See: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference
 """
 
-from __future__ import annotations
-
 import base64
-import json
 import os
-import time
-import uuid
 from enum import Enum
 from io import BytesIO
-from typing import Literal, Optional
+from typing import Literal
 
 import torch
 from typing_extensions import override
@@ -20,29 +15,31 @@ from typing_extensions import override
 import folder_paths
 from comfy_api.latest import IO, ComfyExtension, Input
 from comfy_api.util import VideoCodec, VideoContainer
-from comfy_api_nodes.apis import (
+from comfy_api_nodes.apis.gemini_api import (
     GeminiContent,
+    GeminiFileData,
     GeminiGenerateContentRequest,
     GeminiGenerateContentResponse,
-    GeminiInlineData,
-    GeminiMimeType,
-    GeminiPart,
-)
-from comfy_api_nodes.apis.gemini_api import (
     GeminiImageConfig,
     GeminiImageGenerateContentRequest,
     GeminiImageGenerationConfig,
+    GeminiInlineData,
+    GeminiMimeType,
+    GeminiPart,
+    GeminiRole,
+    Modality,
 )
 from comfy_api_nodes.util import (
     ApiEndpoint,
     audio_to_base64_string,
     bytesio_to_image_tensor,
+    get_number_of_images,
     sync_op,
     tensor_to_base64_string,
+    upload_images_to_comfyapi,
     validate_string,
     video_to_base64_string,
 )
-from server import PromptServer
 
 GEMINI_BASE_ENDPOINT = "/proxy/vertexai/gemini"
 GEMINI_MAX_INPUT_FILE_SIZE = 20 * 1024 * 1024  # 20 MB
@@ -57,6 +54,7 @@ class GeminiModel(str, Enum):
     gemini_2_5_flash_preview_04_17 = "gemini-2.5-flash-preview-04-17"
     gemini_2_5_pro = "gemini-2.5-pro"
     gemini_2_5_flash = "gemini-2.5-flash"
+    gemini_3_0_pro = "gemini-3-pro-preview"
 
 
 class GeminiImageModel(str, Enum):
@@ -68,24 +66,43 @@ class GeminiImageModel(str, Enum):
     gemini_2_5_flash_image = "gemini-2.5-flash-image"
 
 
-def create_image_parts(image_input: torch.Tensor) -> list[GeminiPart]:
-    """
-    Convert image tensor input to Gemini API compatible parts.
-
-    Args:
-        image_input: Batch of image tensors from ComfyUI.
-
-    Returns:
-        List of GeminiPart objects containing the encoded images.
-    """
+async def create_image_parts(
+    cls: type[IO.ComfyNode],
+    images: torch.Tensor,
+    image_limit: int = 0,
+) -> list[GeminiPart]:
     image_parts: list[GeminiPart] = []
-    for image_index in range(image_input.shape[0]):
-        image_as_b64 = tensor_to_base64_string(image_input[image_index].unsqueeze(0))
+    if image_limit < 0:
+        raise ValueError("image_limit must be greater than or equal to 0 when creating Gemini image parts.")
+    total_images = get_number_of_images(images)
+    if total_images <= 0:
+        raise ValueError("No images provided to create_image_parts; at least one image is required.")
+
+    # If image_limit == 0 --> use all images; otherwise clamp to image_limit.
+    effective_max = total_images if image_limit == 0 else min(total_images, image_limit)
+
+    # Number of images we'll send as URLs (fileData)
+    num_url_images = min(effective_max, 10)  # Vertex API max number of image links
+    reference_images_urls = await upload_images_to_comfyapi(
+        cls,
+        images,
+        max_images=num_url_images,
+    )
+    for reference_image_url in reference_images_urls:
+        image_parts.append(
+            GeminiPart(
+                fileData=GeminiFileData(
+                    mimeType=GeminiMimeType.image_png,
+                    fileUri=reference_image_url,
+                )
+            )
+        )
+    for idx in range(num_url_images, effective_max):
         image_parts.append(
             GeminiPart(
                 inlineData=GeminiInlineData(
                     mimeType=GeminiMimeType.image_png,
-                    data=image_as_b64,
+                    data=tensor_to_base64_string(images[idx]),
                 )
             )
         )
@@ -103,6 +120,16 @@ def get_parts_by_type(response: GeminiGenerateContentResponse, part_type: Litera
     Returns:
         List of response parts matching the requested type.
     """
+    if response.candidates is None:
+        if response.promptFeedback and response.promptFeedback.blockReason:
+            feedback = response.promptFeedback
+            raise ValueError(
+                f"Gemini API blocked the request. Reason: {feedback.blockReason} ({feedback.blockReasonMessage})"
+            )
+        raise ValueError(
+            "Gemini API returned no response candidates. If you are using the `IMAGE` modality, "
+            "try changing it to `IMAGE+TEXT` to view the model's reasoning and understand why image generation failed."
+        )
     parts = []
     for part in response.candidates[0].content.parts:
         if part_type == "text" and hasattr(part, "text") and part.text:
@@ -139,6 +166,50 @@ def get_image_from_response(response: GeminiGenerateContentResponse) -> torch.Te
     return torch.cat(image_tensors, dim=0)
 
 
+def calculate_tokens_price(response: GeminiGenerateContentResponse) -> float | None:
+    """Estimate the price of a Gemini response from its token usage metadata.
+
+    Args:
+        response: Parsed Gemini generate-content response.
+
+    Returns:
+        The estimated price, or None when the model version is missing or
+        unknown or when the response carries no usage metadata.
+    """
+    # Price per 1,000,000 tokens as (input, text output, image output),
+    # see https://cloud.google.com/vertex-ai/generative-ai/pricing
+    prices_per_million: dict[str, tuple[float, float, float]] = {
+        "gemini-2.5-pro-preview-05-06": (1.25, 10.0, 0.0),
+        "gemini-2.5-pro": (1.25, 10.0, 0.0),
+        "gemini-2.5-flash-preview-04-17": (0.30, 2.50, 0.0),
+        "gemini-2.5-flash": (0.30, 2.50, 0.0),
+        "gemini-2.5-flash-image-preview": (0.30, 2.50, 30.0),
+        "gemini-2.5-flash-image": (0.30, 2.50, 30.0),
+        "gemini-3-pro-preview": (2.0, 12.0, 0.0),
+        "gemini-3-pro-image-preview": (2.0, 12.0, 120.0),
+    }
+    if not response.modelVersion:
+        return None
+    prices = prices_per_million.get(response.modelVersion)
+    usage = response.usageMetadata
+    if prices is None or usage is None:
+        return None
+    input_tokens_price, output_text_tokens_price, output_image_tokens_price = prices
+
+    # Guard against counts the API may omit: a missing count contributes nothing to the price.
+    final_price = (usage.promptTokenCount or 0) * input_tokens_price
+    for details in usage.candidatesTokensDetails or []:
+        tokens = details.tokenCount or 0
+        if details.modality == Modality.IMAGE:
+            final_price += output_image_tokens_price * tokens  # for Nano Banana models
+        else:
+            final_price += output_text_tokens_price * tokens
+    # Thinking tokens are charged at the text-output rate.
+    if usage.thoughtsTokenCount:
+        final_price += output_text_tokens_price * usage.thoughtsTokenCount
+    return final_price / 1_000_000.0
+
+
 class GeminiNode(IO.ComfyNode):
     """
     Node to generate text responses from a Gemini model.
@@ -272,10 +343,10 @@ class GeminiNode(IO.ComfyNode):
         prompt: str,
         model: str,
         seed: int,
-        images: Optional[torch.Tensor] = None,
-        audio: Optional[Input.Audio] = None,
-        video: Optional[Input.Video] = None,
-        files: Optional[list[GeminiPart]] = None,
+        images: torch.Tensor | None = None,
+        audio: Input.Audio | None = None,
+        video: Input.Video | None = None,
+        files: list[GeminiPart] | None = None,
     ) -> IO.NodeOutput:
         validate_string(prompt, strip_whitespace=False)
 
@@ -284,8 +355,7 @@ class GeminiNode(IO.ComfyNode):
 
         # Add other modal parts
         if images is not None:
-            image_parts = create_image_parts(images)
-            parts.extend(image_parts)
+            parts.extend(await create_image_parts(cls, images))
         if audio is not None:
             parts.extend(cls.create_audio_parts(audio))
         if video is not None:
@@ -300,39 +370,16 @@ class GeminiNode(IO.ComfyNode):
             data=GeminiGenerateContentRequest(
                 contents=[
                     GeminiContent(
-                        role="user",
+                        role=GeminiRole.user,
                         parts=parts,
                     )
                 ]
             ),
             response_model=GeminiGenerateContentResponse,
+            price_extractor=calculate_tokens_price,
         )
 
-        # Get result output
         output_text = get_text_from_response(response)
-        if output_text:
-            # Not a true chat history like the OpenAI Chat node. It is emulated so the frontend can show a copy button.
-            render_spec = {
-                "node_id": cls.hidden.unique_id,
-                "component": "ChatHistoryWidget",
-                "props": {
-                    "history": json.dumps(
-                        [
-                            {
-                                "prompt": prompt,
-                                "response": output_text,
-                                "response_id": str(uuid.uuid4()),
-                                "timestamp": time.time(),
-                            }
-                        ]
-                    ),
-                },
-            }
-            PromptServer.instance.send_sync(
-                "display_component",
-                render_spec,
-            )
-
         return IO.NodeOutput(output_text or "Empty response from Gemini model...")
 
 
@@ -406,7 +453,7 @@ class GeminiInputFiles(IO.ComfyNode):
         )
 
     @classmethod
-    def execute(cls, file: str, GEMINI_INPUT_FILES: Optional[list[GeminiPart]] = None) -> IO.NodeOutput:
+    def execute(cls, file: str, GEMINI_INPUT_FILES: list[GeminiPart] | None = None) -> IO.NodeOutput:
         """Loads and formats input files for Gemini API."""
         if GEMINI_INPUT_FILES is None:
             GEMINI_INPUT_FILES = []
@@ -421,7 +468,7 @@ class GeminiImage(IO.ComfyNode):
     def define_schema(cls):
         return IO.Schema(
             node_id="GeminiImageNode",
-            display_name="Google Gemini Image",
+            display_name="Nano Banana (Google Gemini Image)",
             category="api node/image/Gemini",
             description="Edit images synchronously via Google API.",
             inputs=[
@@ -469,6 +516,13 @@ class GeminiImage(IO.ComfyNode):
                     "or otherwise generates 1:1 squares.",
                     optional=True,
                 ),
+                IO.Combo.Input(
+                    "response_modalities",
+                    options=["IMAGE+TEXT", "IMAGE"],
+                    tooltip="Choose 'IMAGE' for image-only output, or "
+                    "'IMAGE+TEXT' to return both the generated image and a text response.",
+                    optional=True,
+                ),
             ],
             outputs=[
                 IO.Image.Output(),
@@ -488,9 +542,10 @@ class GeminiImage(IO.ComfyNode):
         prompt: str,
         model: str,
         seed: int,
-        images: Optional[torch.Tensor] = None,
-        files: Optional[list[GeminiPart]] = None,
+        images: torch.Tensor | None = None,
+        files: list[GeminiPart] | None = None,
         aspect_ratio: str = "auto",
+        response_modalities: str = "IMAGE+TEXT",
     ) -> IO.NodeOutput:
         validate_string(prompt, strip_whitespace=True, min_length=1)
         parts: list[GeminiPart] = [GeminiPart(text=prompt)]
@@ -500,8 +555,7 @@ class GeminiImage(IO.ComfyNode):
         image_config = GeminiImageConfig(aspectRatio=aspect_ratio)
 
         if images is not None:
-            image_parts = create_image_parts(images)
-            parts.extend(image_parts)
+            parts.extend(await create_image_parts(cls, images))
         if files is not None:
             parts.extend(files)
 
@@ -510,43 +564,137 @@ class GeminiImage(IO.ComfyNode):
             endpoint=ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"),
             data=GeminiImageGenerateContentRequest(
                 contents=[
-                    GeminiContent(role="user", parts=parts),
+                    GeminiContent(role=GeminiRole.user, parts=parts),
                 ],
                 generationConfig=GeminiImageGenerationConfig(
-                    responseModalities=["TEXT", "IMAGE"],
+                    responseModalities=(["IMAGE"] if response_modalities == "IMAGE" else ["TEXT", "IMAGE"]),
                     imageConfig=None if aspect_ratio == "auto" else image_config,
                 ),
             ),
             response_model=GeminiGenerateContentResponse,
+            price_extractor=calculate_tokens_price,
+        )
+        return IO.NodeOutput(get_image_from_response(response), get_text_from_response(response))
+
+
+class GeminiImage2(IO.ComfyNode):
+    """API node that generates or edits images with the `gemini-3-pro-image-preview` model."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the node's inputs, the image and text outputs, and the hidden auth fields."""
+        return IO.Schema(
+            node_id="GeminiImage2Node",
+            display_name="Nano Banana Pro (Google Gemini Image)",
+            category="api node/image/Gemini",
+            description="Generate or edit images synchronously via Google Vertex API.",
+            inputs=[
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    tooltip="Text prompt describing the image to generate or the edits to apply. "
+                    "Include any constraints, styles, or details the model should follow.",
+                    default="",
+                ),
+                IO.Combo.Input("model", options=["gemini-3-pro-image-preview"]),
+                IO.Int.Input(
+                    "seed",
+                    default=42,
+                    min=0,
+                    max=0xFFFFFFFFFFFFFFFF,
+                    control_after_generate=True,
+                    tooltip="When the seed is fixed to a specific value, the model makes a best effort to provide "
+                    "the same response for repeated requests. Deterministic output isn't guaranteed. "
+                    "Also, changing the model or parameter settings, such as the temperature, "
+                    "can cause variations in the response even when you use the same seed value. "
+                    "By default, a random seed value is used.",
+                ),
+                IO.Combo.Input(
+                    "aspect_ratio",
+                    options=["auto", "1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"],
+                    default="auto",
+                    tooltip="If set to 'auto', matches your input image's aspect ratio; "
+                    "if no image is provided, a 16:9 image is usually generated.",
+                ),
+                IO.Combo.Input(
+                    "resolution",
+                    options=["1K", "2K", "4K"],
+                    tooltip="Target output resolution. For 2K/4K the native Gemini upscaler is used.",
+                ),
+                IO.Combo.Input(
+                    "response_modalities",
+                    options=["IMAGE+TEXT", "IMAGE"],
+                    tooltip="Choose 'IMAGE' for image-only output, or "
+                    "'IMAGE+TEXT' to return both the generated image and a text response.",
+                ),
+                # execute() rejects batches of more than 14 reference images.
+                IO.Image.Input(
+                    "images",
+                    optional=True,
+                    tooltip="Optional reference image(s). "
+                    "To include multiple images, use the Batch Images node (up to 14).",
+                ),
+                IO.Custom("GEMINI_INPUT_FILES").Input(
+                    "files",
+                    optional=True,
+                    tooltip="Optional file(s) to use as context for the model. "
+                    "Accepts inputs from the Gemini Generate Content Input Files node.",
+                ),
+            ],
+            outputs=[
+                IO.Image.Output(),
+                IO.String.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
         )
 
-        output_image = get_image_from_response(response)
-        output_text = get_text_from_response(response)
-        if output_text:
-            # Not a true chat history like the OpenAI Chat node. It is emulated so the frontend can show a copy button.
-            render_spec = {
-                "node_id": cls.hidden.unique_id,
-                "component": "ChatHistoryWidget",
-                "props": {
-                    "history": json.dumps(
-                        [
-                            {
-                                "prompt": prompt,
-                                "response": output_text,
-                                "response_id": str(uuid.uuid4()),
-                                "timestamp": time.time(),
-                            }
-                        ]
-                    ),
-                },
-            }
-            PromptServer.instance.send_sync(
-                "display_component",
-                render_spec,
-            )
+    @classmethod
+    async def execute(
+        cls,
+        prompt: str,
+        model: str,
+        seed: int,
+        aspect_ratio: str,
+        resolution: str,
+        response_modalities: str,
+        images: torch.Tensor | None = None,
+        files: list[GeminiPart] | None = None,
+    ) -> IO.NodeOutput:
+        """Send the prompt plus optional images/files to Gemini and return the image and text outputs."""
+        # NOTE(review): `seed` is accepted but never forwarded in the request below - confirm this is intended.
+        validate_string(prompt, strip_whitespace=True, min_length=1)
 
-        output_text = output_text or "Empty response from Gemini model..."
-        return IO.NodeOutput(output_image, output_text)
+        parts: list[GeminiPart] = [GeminiPart(text=prompt)]
+        if images is not None:
+            if get_number_of_images(images) > 14:
+                raise ValueError("The current maximum number of supported images is 14.")
+            parts.extend(await create_image_parts(cls, images))
+        if files is not None:
+            parts.extend(files)
+
+        image_config = GeminiImageConfig(imageSize=resolution)
+        if aspect_ratio != "auto":
+            image_config.aspectRatio = aspect_ratio
+
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"),
+            data=GeminiImageGenerateContentRequest(
+                contents=[GeminiContent(role=GeminiRole.user, parts=parts)],
+                generationConfig=GeminiImageGenerationConfig(
+                    responseModalities=(["IMAGE"] if response_modalities == "IMAGE" else ["TEXT", "IMAGE"]),
+                    imageConfig=image_config,
+                ),
+            ),
+            response_model=GeminiGenerateContentResponse,
+            price_extractor=calculate_tokens_price,
+        )
+        return IO.NodeOutput(get_image_from_response(response), get_text_from_response(response))
 
 
 class GeminiExtension(ComfyExtension):
@@ -555,6 +703,7 @@ class GeminiExtension(ComfyExtension):
         return [
             GeminiNode,
             GeminiImage,
+            GeminiImage2,
             GeminiInputFiles,
         ]
 
diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py
index 7b23e9cf9..850c44db6 100644
--- a/comfy_api_nodes/nodes_kling.py
+++ b/comfy_api_nodes/nodes_kling.py
@@ -4,15 +4,13 @@ For source of truth on the allowed permutations of request fields, please refere
 - [Compatibility Table](https://app.klingai.com/global/dev/document-api/apiReference/model/skillsMap)
 """
 
-from __future__ import annotations
-from typing import Optional, TypeVar
-import math
 import logging
-
-from typing_extensions import override
+import math
 
 import torch
+from typing_extensions import override
 
+from comfy_api.latest import IO, ComfyExtension, Input, InputImpl
 from comfy_api_nodes.apis import (
     KlingCameraControl,
     KlingCameraConfig,
@@ -50,25 +48,31 @@ from comfy_api_nodes.apis import (
     KlingCharacterEffectModelName,
     KlingSingleImageEffectModelName,
 )
+from comfy_api_nodes.apis.kling_api import (
+    OmniParamImage,
+    OmniParamVideo,
+    OmniProFirstLastFrameRequest,
+    OmniProReferences2VideoRequest,
+    OmniProText2VideoRequest,
+    TaskStatusVideoResponse,
+)
 from comfy_api_nodes.util import (
-    validate_image_dimensions,
+    ApiEndpoint,
+    download_url_to_image_tensor,
+    download_url_to_video_output,
+    get_number_of_images,
+    poll_op,
+    sync_op,
+    tensor_to_base64_string,
+    upload_audio_to_comfyapi,
+    upload_images_to_comfyapi,
+    upload_video_to_comfyapi,
     validate_image_aspect_ratio,
+    validate_image_dimensions,
+    validate_string,
     validate_video_dimensions,
     validate_video_duration,
-    tensor_to_base64_string,
-    validate_string,
-    upload_audio_to_comfyapi,
-    download_url_to_image_tensor,
-    upload_video_to_comfyapi,
-    download_url_to_video_output,
-    sync_op,
-    ApiEndpoint,
-    poll_op,
 )
-from comfy_api.input_impl import VideoFromFile
-from comfy_api.input.basic_types import AudioInput
-from comfy_api.input.video_types import VideoInput
-from comfy_api.latest import ComfyExtension, IO
 
 KLING_API_VERSION = "v1"
 PATH_TEXT_TO_VIDEO = f"/proxy/kling/{KLING_API_VERSION}/videos/text2video"
@@ -94,8 +98,6 @@ AVERAGE_DURATION_IMAGE_GEN = 32
 AVERAGE_DURATION_VIDEO_EFFECTS = 320
 AVERAGE_DURATION_VIDEO_EXTEND = 320
 
-R = TypeVar("R")
-
 
 MODE_TEXT2VIDEO = {
     "standard mode / 5s duration / kling-v1": ("std", "5", "kling-v1"),
@@ -130,6 +132,8 @@ MODE_START_END_FRAME = {
     "pro mode / 10s duration / kling-v1-6": ("pro", "10", "kling-v1-6"),
     "pro mode / 5s duration / kling-v2-1": ("pro", "5", "kling-v2-1"),
     "pro mode / 10s duration / kling-v2-1": ("pro", "10", "kling-v2-1"),
+    "pro mode / 5s duration / kling-v2-5-turbo": ("pro", "5", "kling-v2-5-turbo"),
+    "pro mode / 10s duration / kling-v2-5-turbo": ("pro", "10", "kling-v2-5-turbo"),
 }
 """
 Returns a mapping of mode strings to their corresponding (mode, duration, model_name) tuples.
@@ -206,6 +210,20 @@ VOICES_CONFIG = {
 }
 
 
+async def finish_omni_video_task(cls: type[IO.ComfyNode], response: TaskStatusVideoResponse) -> IO.NodeOutput:
+    """Raise on a truthy task-creation code; otherwise poll the omni-video task and return its first video."""
+    if response.code:
+        failure = f"Kling request failed. Code: {response.code}, Message: {response.message}, Data: {response.data}"
+        raise RuntimeError(failure)
+    finished_task = await poll_op(
+        cls,
+        ApiEndpoint(path=f"/proxy/kling/v1/videos/omni-video/{response.data.task_id}"),
+        response_model=TaskStatusVideoResponse,
+        status_extractor=lambda polled: (polled.data.task_status if polled.data else None),
+    )
+    return IO.NodeOutput(await download_url_to_video_output(finished_task.data.task_result.videos[0].url))
+
+
 def is_valid_camera_control_configs(configs: list[float]) -> bool:
     """Verifies that at least one camera control configuration is non-zero."""
     return any(not math.isclose(value, 0.0) for value in configs)
@@ -296,7 +314,7 @@ def get_video_from_response(response) -> KlingVideoResult:
     return video
 
 
-def get_video_url_from_response(response) -> Optional[str]:
+def get_video_url_from_response(response) -> str | None:
     """Returns the first video url from the Kling video generation task result.
     Will not raise an error if the response is not valid.
     """
@@ -315,7 +333,7 @@ def get_images_from_response(response) -> list[KlingImageResult]:
     return images
 
 
-def get_images_urls_from_response(response) -> Optional[str]:
+def get_images_urls_from_response(response) -> str | None:
     """Returns the list of image urls from the Kling image generation task result.
     Will not raise an error if the response is not valid. If there is only one image, returns the url as a string. If there are multiple images, returns a list of urls.
     """
@@ -349,7 +367,7 @@ async def execute_text2video(
     model_mode: str,
     duration: str,
     aspect_ratio: str,
-    camera_control: Optional[KlingCameraControl] = None,
+    camera_control: KlingCameraControl | None = None,
 ) -> IO.NodeOutput:
     validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_T2V)
     task_creation_response = await sync_op(
@@ -394,8 +412,8 @@ async def execute_image2video(
     model_mode: str,
     aspect_ratio: str,
     duration: str,
-    camera_control: Optional[KlingCameraControl] = None,
-    end_frame: Optional[torch.Tensor] = None,
+    camera_control: KlingCameraControl | None = None,
+    end_frame: torch.Tensor | None = None,
 ) -> IO.NodeOutput:
     validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_I2V)
     validate_input_image(start_frame)
@@ -451,9 +469,9 @@ async def execute_video_effect(
     model_name: str,
     duration: KlingVideoGenDuration,
     image_1: torch.Tensor,
-    image_2: Optional[torch.Tensor] = None,
-    model_mode: Optional[KlingVideoGenMode] = None,
-) -> tuple[VideoFromFile, str, str]:
+    image_2: torch.Tensor | None = None,
+    model_mode: KlingVideoGenMode | None = None,
+) -> tuple[InputImpl.VideoFromFile, str, str]:
     if dual_character:
         request_input_field = KlingDualCharacterEffectInput(
             model_name=model_name,
@@ -499,13 +517,13 @@ async def execute_video_effect(
 
 async def execute_lipsync(
     cls: type[IO.ComfyNode],
-    video: VideoInput,
-    audio: Optional[AudioInput] = None,
-    voice_language: Optional[str] = None,
-    model_mode: Optional[str] = None,
-    text: Optional[str] = None,
-    voice_speed: Optional[float] = None,
-    voice_id: Optional[str] = None,
+    video: Input.Video,
+    audio: Input.Audio | None = None,
+    voice_language: str | None = None,
+    model_mode: str | None = None,
+    text: str | None = None,
+    voice_speed: float | None = None,
+    voice_id: str | None = None,
 ) -> IO.NodeOutput:
     if text:
         validate_string(text, field_name="Text", max_length=MAX_PROMPT_LENGTH_LIP_SYNC)
@@ -518,7 +536,9 @@ async def execute_lipsync(
 
     # Upload the audio file to Comfy API and get download URL
     if audio:
-        audio_url = await upload_audio_to_comfyapi(cls, audio)
+        audio_url = await upload_audio_to_comfyapi(
+            cls, audio, container_format="mp3", codec_name="libmp3lame", mime_type="audio/mpeg", filename="output.mp3"
+        )
         logging.info("Uploaded audio to Comfy API. URL: %s", audio_url)
     else:
         audio_url = None
@@ -738,6 +758,386 @@ class KlingTextToVideoNode(IO.ComfyNode):
         )
 
 
+class OmniProTextToVideoNode(IO.ComfyNode):
+    """API node that generates a video from a text prompt.
+
+    The task is created through the Kling omni-video proxy endpoint and handed to
+    finish_omni_video_task, which polls it and downloads the resulting video.
+    """
+
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        """Declare the node's inputs, its single video output and the hidden auth fields."""
+        prompt_input = IO.String.Input(
+            "prompt",
+            multiline=True,
+            tooltip="A text prompt describing the video content. "
+            "This can include both positive and negative descriptions.",
+        )
+        return IO.Schema(
+            node_id="KlingOmniProTextToVideoNode",
+            display_name="Kling Omni Text to Video (Pro)",
+            category="api node/video/Kling",
+            description="Use text prompts to generate videos with the latest Kling model.",
+            inputs=[
+                IO.Combo.Input("model_name", options=["kling-video-o1"]),
+                prompt_input,
+                IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "1:1"]),
+                IO.Combo.Input("duration", options=[5, 10]),
+            ],
+            outputs=[IO.Video.Output()],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(cls, model_name: str, prompt: str, aspect_ratio: str, duration: int) -> IO.NodeOutput:
+        """Validate the prompt, submit the generation task and return the finished video."""
+        validate_string(prompt, min_length=1, max_length=2500)
+
+        create_task = ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST")
+        # The combo options are ints, but the duration is sent as a string.
+        request = OmniProText2VideoRequest(
+            model_name=model_name,
+            prompt=prompt,
+            aspect_ratio=aspect_ratio,
+            duration=str(duration),
+        )
+        response = await sync_op(cls, create_task, response_model=TaskStatusVideoResponse, data=request)
+        # Error handling, polling and the download all happen in finish_omni_video_task.
+        return await finish_omni_video_task(cls, response)
+
+
+class OmniProFirstLastFrameNode(IO.ComfyNode):
+    """API node that generates a Kling omni video from a first frame plus an end frame or reference images."""
+
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        """Declare the node's inputs, its single video output and the hidden auth fields."""
+        return IO.Schema(
+            node_id="KlingOmniProFirstLastFrameNode",
+            display_name="Kling Omni First-Last-Frame to Video (Pro)",
+            category="api node/video/Kling",
+            description="Use a start frame, an optional end frame, or reference images with the latest Kling model.",
+            inputs=[
+                IO.Combo.Input("model_name", options=["kling-video-o1"]),
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    tooltip="A text prompt describing the video content. "
+                    "This can include both positive and negative descriptions.",
+                ),
+                IO.Combo.Input("duration", options=["5", "10"]),
+                IO.Image.Input("first_frame"),
+                IO.Image.Input(
+                    "end_frame",
+                    optional=True,
+                    tooltip="An optional end frame for the video. "
+                    "This cannot be used simultaneously with 'reference_images'.",
+                ),
+                IO.Image.Input("reference_images", optional=True, tooltip="Up to 6 additional reference images."),
+            ],
+            outputs=[IO.Video.Output()],
+            hidden=[IO.Hidden.auth_token_comfy_org, IO.Hidden.api_key_comfy_org, IO.Hidden.unique_id],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model_name: str,
+        prompt: str,
+        duration: str,
+        first_frame: Input.Image,
+        end_frame: Input.Image | None = None,
+        reference_images: Input.Image | None = None,
+    ) -> IO.NodeOutput:
+        """Upload the frames, create the omni-video task and return the generated video.
+
+        Raises:
+            ValueError: If 'end_frame' is combined with 'reference_images', or more than 6 references are given.
+        """
+        validate_string(prompt, min_length=1, max_length=2500)
+        if end_frame is not None and reference_images is not None:
+            raise ValueError("The 'end_frame' input cannot be used simultaneously with 'reference_images'.")
+        # Every image gets the same checks: minimum 300x300 and aspect-ratio bounds (1, 2.5) / (2.5, 1).
+        validate_image_dimensions(first_frame, min_width=300, min_height=300)
+        validate_image_aspect_ratio(first_frame, (1, 2.5), (2.5, 1))
+        image_list: list[OmniParamImage] = [
+            OmniParamImage(
+                # Only the first uploaded URL is used as the frame.
+                image_url=(await upload_images_to_comfyapi(cls, first_frame, wait_label="Uploading first frame"))[0],
+                type="first_frame",
+            )
+        ]
+        if end_frame is not None:
+            validate_image_dimensions(end_frame, min_width=300, min_height=300)
+            validate_image_aspect_ratio(end_frame, (1, 2.5), (2.5, 1))
+            image_list.append(
+                OmniParamImage(
+                    image_url=(await upload_images_to_comfyapi(cls, end_frame, wait_label="Uploading end frame"))[0],
+                    type="end_frame",
+                )
+            )
+        if reference_images is not None:
+            if get_number_of_images(reference_images) > 6:
+                raise ValueError("The maximum number of reference images allowed is 6.")
+            for i in reference_images:
+                validate_image_dimensions(i, min_width=300, min_height=300)
+                validate_image_aspect_ratio(i, (1, 2.5), (2.5, 1))
+            # Reference images are sent without a "type"; only the first/end frames are tagged.
+            for i in await upload_images_to_comfyapi(cls, reference_images, wait_label="Uploading reference frame(s)"):
+                image_list.append(OmniParamImage(image_url=i))
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"),
+            response_model=TaskStatusVideoResponse,
+            data=OmniProFirstLastFrameRequest(
+                model_name=model_name,
+                prompt=prompt,
+                duration=str(duration),
+                image_list=image_list,
+            ),
+        )
+        return await finish_omni_video_task(cls, response)
+
+
+class OmniProImageToVideoNode(IO.ComfyNode):
+    """Kling Omni (Pro) node that turns a prompt plus reference images into a video."""
+
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        """Describe the node's inputs, its single video output and the hidden fields."""
+        prompt_input = IO.String.Input(
+            "prompt",
+            multiline=True,
+            tooltip="A text prompt describing the video content. "
+            "This can include both positive and negative descriptions.",
+        )
+        images_input = IO.Image.Input("reference_images", tooltip="Up to 7 reference images.")
+        return IO.Schema(
+            node_id="KlingOmniProImageToVideoNode",
+            display_name="Kling Omni Image to Video (Pro)",
+            category="api node/video/Kling",
+            description="Use up to 7 reference images to generate a video with the latest Kling model.",
+            inputs=[
+                IO.Combo.Input("model_name", options=["kling-video-o1"]),
+                prompt_input,
+                IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "1:1"]),
+                IO.Int.Input("duration", default=3, min=3, max=10, display_mode=IO.NumberDisplay.slider),
+                images_input,
+            ],
+            outputs=[IO.Video.Output()],
+            hidden=[IO.Hidden.auth_token_comfy_org, IO.Hidden.api_key_comfy_org, IO.Hidden.unique_id],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model_name: str,
+        prompt: str,
+        aspect_ratio: str,
+        duration: int,
+        reference_images: Input.Image,
+    ) -> IO.NodeOutput:
+        """Validate the inputs, upload the reference images and run the omni-video task.
+
+        Raises:
+            ValueError: if more than 7 reference images are supplied.
+        """
+        validate_string(prompt, min_length=1, max_length=2500)
+        if get_number_of_images(reference_images) > 7:
+            raise ValueError("The maximum number of reference images is 7.")
+        # Every image is checked before anything is uploaded.
+        for image in reference_images:
+            validate_image_dimensions(image, min_width=300, min_height=300)
+            validate_image_aspect_ratio(image, (1, 2.5), (2.5, 1))
+        uploaded_urls = await upload_images_to_comfyapi(cls, reference_images, wait_label="Uploading reference image")
+        request = OmniProReferences2VideoRequest(
+            model_name=model_name,
+            prompt=prompt,
+            aspect_ratio=aspect_ratio,
+            duration=str(duration),
+            image_list=[OmniParamImage(image_url=url) for url in uploaded_urls],
+        )
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"),
+            response_model=TaskStatusVideoResponse,
+            data=request,
+        )
+        return await finish_omni_video_task(cls, response)
+
+
+class OmniProVideoToVideoNode(IO.ComfyNode):
+    """Kling Omni (Pro) node that generates a video from a reference video and optional images."""
+
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        """Describe the node's inputs, its single video output and the hidden fields."""
+        prompt_input = IO.String.Input(
+            "prompt",
+            multiline=True,
+            tooltip="A text prompt describing the video content. "
+            "This can include both positive and negative descriptions.",
+        )
+        images_input = IO.Image.Input(
+            "reference_images",
+            tooltip="Up to 4 additional reference images.",
+            optional=True,
+        )
+        return IO.Schema(
+            node_id="KlingOmniProVideoToVideoNode",
+            display_name="Kling Omni Video to Video (Pro)",
+            category="api node/video/Kling",
+            description="Use a video and up to 4 reference images to generate a video with the latest Kling model.",
+            inputs=[
+                IO.Combo.Input("model_name", options=["kling-video-o1"]),
+                prompt_input,
+                IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "1:1"]),
+                IO.Int.Input("duration", default=3, min=3, max=10, display_mode=IO.NumberDisplay.slider),
+                IO.Video.Input("reference_video", tooltip="Video to use as a reference."),
+                IO.Boolean.Input("keep_original_sound", default=True),
+                images_input,
+            ],
+            outputs=[IO.Video.Output()],
+            hidden=[IO.Hidden.auth_token_comfy_org, IO.Hidden.api_key_comfy_org, IO.Hidden.unique_id],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model_name: str,
+        prompt: str,
+        aspect_ratio: str,
+        duration: int,
+        reference_video: Input.Video,
+        keep_original_sound: bool,
+        reference_images: Input.Image | None = None,
+    ) -> IO.NodeOutput:
+        """Validate the inputs, upload the media and run the omni-video task.
+
+        Raises:
+            ValueError: if more than 4 reference images are supplied.
+        """
+        validate_string(prompt, min_length=1, max_length=2500)
+        validate_video_duration(reference_video, min_duration=3.0, max_duration=10.05)
+        validate_video_dimensions(reference_video, min_width=720, min_height=720, max_width=2160, max_height=2160)
+        image_list: list[OmniParamImage] = []
+        if reference_images is not None:
+            if get_number_of_images(reference_images) > 4:
+                raise ValueError("The maximum number of reference images allowed with a video input is 4.")
+            for image in reference_images:
+                validate_image_dimensions(image, min_width=300, min_height=300)
+                validate_image_aspect_ratio(image, (1, 2.5), (2.5, 1))
+            urls = await upload_images_to_comfyapi(cls, reference_images, wait_label="Uploading reference image")
+            image_list.extend(OmniParamImage(image_url=url) for url in urls)
+        video_url = await upload_video_to_comfyapi(cls, reference_video, wait_label="Uploading reference video")
+        sound = "yes" if keep_original_sound else "no"  # the request takes a "yes"/"no" string, not a bool
+        reference = OmniParamVideo(video_url=video_url, refer_type="feature", keep_original_sound=sound)
+        request = OmniProReferences2VideoRequest(
+            model_name=model_name,
+            prompt=prompt,
+            aspect_ratio=aspect_ratio,
+            duration=str(duration),
+            image_list=image_list or None,  # None rather than an empty list when no images were given
+            video_list=[reference],
+        )
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"),
+            response_model=TaskStatusVideoResponse,
+            data=request,
+        )
+        return await finish_omni_video_task(cls, response)
+
+
+class OmniProEditVideoNode(IO.ComfyNode):
+    """Kling Omni (Pro) node that edits an existing video, optionally guided by reference images."""
+
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        """Describe the node's inputs, its single video output and the hidden fields."""
+        prompt_input = IO.String.Input(
+            "prompt",
+            multiline=True,
+            tooltip="A text prompt describing the video content. "
+            "This can include both positive and negative descriptions.",
+        )
+        images_input = IO.Image.Input(
+            "reference_images",
+            tooltip="Up to 4 additional reference images.",
+            optional=True,
+        )
+        return IO.Schema(
+            node_id="KlingOmniProEditVideoNode",
+            display_name="Kling Omni Edit Video (Pro)",
+            category="api node/video/Kling",
+            description="Edit an existing video with the latest model from Kling.",
+            inputs=[
+                IO.Combo.Input("model_name", options=["kling-video-o1"]),
+                prompt_input,
+                IO.Video.Input("video", tooltip="Video for editing. The output video length will be the same."),
+                IO.Boolean.Input("keep_original_sound", default=True),
+                images_input,
+            ],
+            outputs=[IO.Video.Output()],
+            hidden=[IO.Hidden.auth_token_comfy_org, IO.Hidden.api_key_comfy_org, IO.Hidden.unique_id],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model_name: str,
+        prompt: str,
+        video: Input.Video,
+        keep_original_sound: bool,
+        reference_images: Input.Image | None = None,
+    ) -> IO.NodeOutput:
+        """Validate the inputs, upload the media and run the omni-video edit task.
+
+        Raises:
+            ValueError: if more than 4 reference images are supplied.
+        """
+        validate_string(prompt, min_length=1, max_length=2500)
+        validate_video_duration(video, min_duration=3.0, max_duration=10.05)
+        validate_video_dimensions(video, min_width=720, min_height=720, max_width=2160, max_height=2160)
+        image_list: list[OmniParamImage] = []
+        if reference_images is not None:
+            if get_number_of_images(reference_images) > 4:
+                raise ValueError("The maximum number of reference images allowed with a video input is 4.")
+            for image in reference_images:
+                validate_image_dimensions(image, min_width=300, min_height=300)
+                validate_image_aspect_ratio(image, (1, 2.5), (2.5, 1))
+            urls = await upload_images_to_comfyapi(cls, reference_images, wait_label="Uploading reference image")
+            image_list.extend(OmniParamImage(image_url=url) for url in urls)
+        video_url = await upload_video_to_comfyapi(cls, video, wait_label="Uploading base video")
+        sound = "yes" if keep_original_sound else "no"  # the request takes a "yes"/"no" string, not a bool
+        base_video = OmniParamVideo(video_url=video_url, refer_type="base", keep_original_sound=sound)
+        request = OmniProReferences2VideoRequest(
+            model_name=model_name,
+            prompt=prompt,
+            aspect_ratio=None,  # this node has no aspect_ratio or duration inputs, so both are sent as None
+            duration=None,
+            image_list=image_list or None,  # None rather than an empty list when no images were given
+            video_list=[base_video],
+        )
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"),
+            response_model=TaskStatusVideoResponse,
+            data=request,
+        )
+        return await finish_omni_video_task(cls, response)
+
+
 class KlingCameraControlT2VNode(IO.ComfyNode):
     """
     Kling Text to Video Camera Control Node. This node is a text to video node, but it supports controlling the camera.
@@ -785,7 +1185,7 @@ class KlingCameraControlT2VNode(IO.ComfyNode):
         negative_prompt: str,
         cfg_scale: float,
         aspect_ratio: str,
-        camera_control: Optional[KlingCameraControl] = None,
+        camera_control: KlingCameraControl | None = None,
     ) -> IO.NodeOutput:
         return await execute_text2video(
             cls,
@@ -852,8 +1252,8 @@ class KlingImage2VideoNode(IO.ComfyNode):
         mode: str,
         aspect_ratio: str,
         duration: str,
-        camera_control: Optional[KlingCameraControl] = None,
-        end_frame: Optional[torch.Tensor] = None,
+        camera_control: KlingCameraControl | None = None,
+        end_frame: torch.Tensor | None = None,
     ) -> IO.NodeOutput:
         return await execute_image2video(
             cls,
@@ -963,15 +1363,11 @@ class KlingStartEndFrameNode(IO.ComfyNode):
                 IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"),
                 IO.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"),
                 IO.Float.Input("cfg_scale", default=0.5, min=0.0, max=1.0),
-                IO.Combo.Input(
-                    "aspect_ratio",
-                    options=[i.value for i in KlingVideoGenAspectRatio],
-                    default="16:9",
-                ),
+                IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "1:1"]),
                 IO.Combo.Input(
                     "mode",
                     options=modes,
-                    default=modes[2],
+                    default=modes[8],
                     tooltip="The configuration to use for the video generation following the format: mode / duration / model_name.",
                 ),
             ],
@@ -1168,7 +1564,10 @@ class KlingSingleImageVideoEffectNode(IO.ComfyNode):
             category="api node/video/Kling",
             description="Achieve different special effects when generating a video based on the effect_scene.",
             inputs=[
-                IO.Image.Input("image", tooltip=" Reference Image. URL or Base64 encoded string (without data:image prefix). File size cannot exceed 10MB, resolution not less than 300*300px, aspect ratio between 1:2.5 ~ 2.5:1"),
+                IO.Image.Input(
+                    "image",
+                    tooltip=" Reference Image. URL or Base64 encoded string (without data:image prefix). File size cannot exceed 10MB, resolution not less than 300*300px, aspect ratio between 1:2.5 ~ 2.5:1",
+                ),
                 IO.Combo.Input(
                     "effect_scene",
                     options=[i.value for i in KlingSingleImageEffectsScene],
@@ -1252,8 +1651,8 @@ class KlingLipSyncAudioToVideoNode(IO.ComfyNode):
     @classmethod
     async def execute(
         cls,
-        video: VideoInput,
-        audio: AudioInput,
+        video: Input.Video,
+        audio: Input.Audio,
         voice_language: str,
     ) -> IO.NodeOutput:
         return await execute_lipsync(
@@ -1312,7 +1711,7 @@ class KlingLipSyncTextToVideoNode(IO.ComfyNode):
     @classmethod
     async def execute(
         cls,
-        video: VideoInput,
+        video: Input.Video,
         text: str,
         voice: str,
         voice_speed: float,
@@ -1469,7 +1868,7 @@ class KlingImageGenerationNode(IO.ComfyNode):
         human_fidelity: float,
         n: int,
         aspect_ratio: KlingImageGenAspectRatio,
-        image: Optional[torch.Tensor] = None,
+        image: torch.Tensor | None = None,
     ) -> IO.NodeOutput:
         validate_string(prompt, field_name="prompt", min_length=1, max_length=MAX_PROMPT_LENGTH_IMAGE_GEN)
         validate_string(negative_prompt, field_name="negative_prompt", max_length=MAX_PROMPT_LENGTH_IMAGE_GEN)
@@ -1531,6 +1930,11 @@ class KlingExtension(ComfyExtension):
             KlingImageGenerationNode,
             KlingSingleImageVideoEffectNode,
             KlingDualCharacterVideoEffectNode,
+            OmniProTextToVideoNode,
+            OmniProFirstLastFrameNode,
+            OmniProImageToVideoNode,
+            OmniProVideoToVideoNode,
+            OmniProEditVideoNode,
         ]
 
 
diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py
index acf35d276..c8da5464b 100644
--- a/comfy_api_nodes/nodes_openai.py
+++ b/comfy_api_nodes/nodes_openai.py
@@ -1,15 +1,10 @@
 from io import BytesIO
-from typing import Optional, Union
-import json
 import os
-import time
-import uuid
 from enum import Enum
 from inspect import cleandoc
 import numpy as np
 import torch
 from PIL import Image
-from server import PromptServer
 import folder_paths
 import base64
 from comfy_api.latest import IO, ComfyExtension
@@ -587,11 +582,11 @@ class OpenAIChatNode(IO.ComfyNode):
     def create_input_message_contents(
         cls,
         prompt: str,
-        image: Optional[torch.Tensor] = None,
-        files: Optional[list[InputFileContent]] = None,
+        image: torch.Tensor | None = None,
+        files: list[InputFileContent] | None = None,
     ) -> InputMessageContentList:
         """Create a list of input message contents from prompt and optional image."""
-        content_list: list[Union[InputContent, InputTextContent, InputImageContent, InputFileContent]] = [
+        content_list: list[InputContent | InputTextContent | InputImageContent | InputFileContent] = [
             InputTextContent(text=prompt, type="input_text"),
         ]
         if image is not None:
@@ -617,9 +612,9 @@ class OpenAIChatNode(IO.ComfyNode):
         prompt: str,
         persist_context: bool = False,
         model: SupportedOpenAIModel = SupportedOpenAIModel.gpt_5.value,
-        images: Optional[torch.Tensor] = None,
-        files: Optional[list[InputFileContent]] = None,
-        advanced_options: Optional[CreateModelResponseProperties] = None,
+        images: torch.Tensor | None = None,
+        files: list[InputFileContent] | None = None,
+        advanced_options: CreateModelResponseProperties | None = None,
     ) -> IO.NodeOutput:
         validate_string(prompt, strip_whitespace=False)
 
@@ -660,30 +655,7 @@ class OpenAIChatNode(IO.ComfyNode):
                 status_extractor=lambda response: response.status,
                 completed_statuses=["incomplete", "completed"]
             )
-        output_text = cls.get_text_from_message_content(cls.get_message_content_from_response(result_response))
-
-        # Update history
-        render_spec = {
-            "node_id": cls.hidden.unique_id,
-            "component": "ChatHistoryWidget",
-            "props": {
-                "history": json.dumps(
-                    [
-                        {
-                            "prompt": prompt,
-                            "response": output_text,
-                            "response_id": str(uuid.uuid4()),
-                            "timestamp": time.time(),
-                        }
-                    ]
-                ),
-            },
-        }
-        PromptServer.instance.send_sync(
-            "display_component",
-            render_spec,
-        )
-        return IO.NodeOutput(output_text)
+        return IO.NodeOutput(cls.get_text_from_message_content(cls.get_message_content_from_response(result_response)))
 
 
 class OpenAIInputFiles(IO.ComfyNode):
@@ -790,8 +762,8 @@ class OpenAIChatConfig(IO.ComfyNode):
     def execute(
         cls,
         truncation: bool,
-        instructions: Optional[str] = None,
-        max_output_tokens: Optional[int] = None,
+        instructions: str | None = None,
+        max_output_tokens: int | None = None,
     ) -> IO.NodeOutput:
         """
         Configure advanced options for the OpenAI Chat Node.
diff --git a/comfy_api_nodes/nodes_topaz.py b/comfy_api_nodes/nodes_topaz.py
new file mode 100644
index 000000000..f522756e5
--- /dev/null
+++ b/comfy_api_nodes/nodes_topaz.py
@@ -0,0 +1,418 @@
+import builtins
+from io import BytesIO
+
+import aiohttp
+import torch
+from typing_extensions import override
+
+from comfy_api.latest import IO, ComfyExtension, Input
+from comfy_api_nodes.apis import topaz_api
+from comfy_api_nodes.util import (
+    ApiEndpoint,
+    download_url_to_image_tensor,
+    download_url_to_video_output,
+    get_fs_object_size,
+    get_number_of_images,
+    poll_op,
+    sync_op,
+    upload_images_to_comfyapi,
+    validate_container_format_is_mp4,
+)
+
+UPSCALER_MODELS_MAP = {  # UI label -> model identifier passed to the video enhancement filter
+    "Starlight (Astra) Fast": "slf-1",
+    "Starlight (Astra) Creative": "slc-1",
+}
+UPSCALER_VALUES_MAP = {  # UI label -> pixel value used for the requested output resolution
+    "FullHD (1080p)": 1920,
+    "4K (2160p)": 3840,
+}
+
+
+class TopazImageEnhance(IO.ComfyNode):
+    """API node that sends a single image to the Topaz enhance endpoint and returns the result."""
+
+    @classmethod
+    def define_schema(cls):
+        """Describe the node's inputs, its single image output and the hidden fields."""
+        # Keyword arguments shared by the two face-enhancement floats and the two output-size ints.
+        unit_float = dict(min=0.0, max=1.0, step=0.01, display_mode=IO.NumberDisplay.number, optional=True)
+        size_int = dict(default=0, min=0, max=32000, step=1, display_mode=IO.NumberDisplay.number, optional=True)
+        return IO.Schema(
+            node_id="TopazImageEnhance",
+            display_name="Topaz Image Enhance",
+            category="api node/image/Topaz",
+            description="Industry-standard upscaling and image enhancement.",
+            inputs=[
+                IO.Combo.Input("model", options=["Reimagine"]),
+                IO.Image.Input("image"),
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    default="",
+                    tooltip="Optional text prompt for creative upscaling guidance.",
+                    optional=True,
+                ),
+                IO.Combo.Input("subject_detection", options=["All", "Foreground", "Background"], optional=True),
+                IO.Boolean.Input(
+                    "face_enhancement",
+                    default=True,
+                    optional=True,
+                    tooltip="Enhance faces (if present) during processing.",
+                ),
+                IO.Float.Input(
+                    "face_enhancement_creativity",
+                    default=0.0,
+                    tooltip="Set the creativity level for face enhancement.",
+                    **unit_float,
+                ),
+                IO.Float.Input(
+                    "face_enhancement_strength",
+                    default=1.0,
+                    tooltip="Controls how sharp enhanced faces are relative to the background.",
+                    **unit_float,
+                ),
+                IO.Boolean.Input(
+                    "crop_to_fill",
+                    default=False,
+                    optional=True,
+                    tooltip="By default, the image is letterboxed when the output aspect ratio differs. "
+                    "Enable to crop the image to fill the output dimensions.",
+                ),
+                IO.Int.Input(
+                    "output_width",
+                    tooltip="Zero value means to calculate automatically (usually it will be original size or output_height if specified).",
+                    **size_int,
+                ),
+                IO.Int.Input(
+                    "output_height",
+                    tooltip="Zero value means to output in the same height as original or output width.",
+                    **size_int,
+                ),
+                IO.Int.Input(
+                    "creativity",
+                    default=3,
+                    min=1,
+                    max=9,
+                    step=1,
+                    display_mode=IO.NumberDisplay.slider,
+                    optional=True,
+                ),
+                IO.Boolean.Input(
+                    "face_preservation",
+                    default=True,
+                    optional=True,
+                    tooltip="Preserve subjects' facial identity.",
+                ),
+                IO.Boolean.Input(
+                    "color_preservation",
+                    default=True,
+                    optional=True,
+                    tooltip="Preserve the original colors.",
+                ),
+            ],
+            outputs=[IO.Image.Output()],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model: str,
+        image: torch.Tensor,
+        prompt: str = "",
+        subject_detection: str = "All",
+        face_enhancement: bool = True,
+        face_enhancement_creativity: float = 1.0,
+        face_enhancement_strength: float = 0.8,
+        crop_to_fill: bool = False,
+        output_width: int = 0,
+        output_height: int = 0,
+        creativity: int = 3,
+        face_preservation: bool = True,
+        color_preservation: bool = True,
+    ) -> IO.NodeOutput:
+        """Enhance a single image through the Topaz proxy API.
+
+        Steps: upload the image, create an async enhance task, poll its status, then request the
+        download URL and load the result as an image tensor. Zero ``output_width`` /
+        ``output_height`` values are sent as ``None``.
+
+        Raises:
+            ValueError: if ``image`` does not contain exactly one image.
+        """
+        # NOTE(review): the Python defaults for face_enhancement_creativity (1.0) and
+        # face_enhancement_strength (0.8) differ from the schema defaults (0.0 and 1.0) - confirm intent.
+        if get_number_of_images(image) != 1:
+            raise ValueError("Only one input image is supported.")
+        # The first uploaded URL is handed to Topaz as the source of the enhancement.
+        uploaded_urls = await upload_images_to_comfyapi(cls, image, max_images=1, mime_type="image/png")
+        request = topaz_api.ImageEnhanceRequest(
+            model=model,
+            prompt=prompt,
+            subject_detection=subject_detection,
+            face_enhancement=face_enhancement,
+            face_enhancement_creativity=face_enhancement_creativity,
+            face_enhancement_strength=face_enhancement_strength,
+            crop_to_fill=crop_to_fill,
+            output_width=output_width or None,
+            output_height=output_height or None,
+            creativity=creativity,
+            # These two booleans are sent as the lowercase strings "true" / "false".
+            face_preservation=str(face_preservation).lower(),
+            color_preservation=str(color_preservation).lower(),
+            source_url=uploaded_urls[0],
+            output_format="png",
+        )
+        task = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/topaz/image/v1/enhance-gen/async", method="POST"),
+            response_model=topaz_api.ImageAsyncTaskResponse,
+            data=request,
+            content_type="multipart/form-data",
+        )
+        process_id = task.process_id
+
+        # Poll the status endpoint; the value returned by poll_op is not used here.
+        await poll_op(
+            cls,
+            poll_endpoint=ApiEndpoint(path=f"/proxy/topaz/image/v1/status/{process_id}"),
+            response_model=topaz_api.ImageStatusResponse,
+            status_extractor=lambda status: status.status,
+            progress_extractor=lambda status: getattr(status, "progress", 0),
+            # NOTE(review): presumably converts Topaz credits into a price - confirm the 0.08 rate.
+            price_extractor=lambda status: status.credits * 0.08,
+            poll_interval=8.0,
+            max_poll_attempts=160,
+            estimated_duration=60,
+        )
+
+        # Ask for the download URL of the processed image, then load it as a tensor.
+        download = await sync_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/topaz/image/v1/download/{process_id}"),
+            response_model=topaz_api.ImageDownloadResponse,
+            monitor_progress=False,
+        )
+        return IO.NodeOutput(await download_url_to_image_tensor(download.download_url))
+
+
+class TopazVideoEnhance(IO.ComfyNode):
+    """API node that upscales and/or frame-interpolates an MP4 video through the Topaz proxy API."""
+
+    @classmethod
+    def define_schema(cls):
+        """Describe the node's inputs, its single video output and the hidden fields."""
+        return IO.Schema(
+            node_id="TopazVideoEnhance",
+            display_name="Topaz Video Enhance",
+            category="api node/video/Topaz",
+            description="Breathe new life into video with powerful upscaling and recovery technology.",
+            inputs=[
+                IO.Video.Input("video"),
+                IO.Boolean.Input("upscaler_enabled", default=True),
+                IO.Combo.Input("upscaler_model", options=list(UPSCALER_MODELS_MAP.keys())),
+                IO.Combo.Input("upscaler_resolution", options=list(UPSCALER_VALUES_MAP.keys())),
+                IO.Combo.Input(
+                    "upscaler_creativity",
+                    options=["low", "middle", "high"],
+                    default="low",
+                    tooltip="Creativity level (applies only to Starlight (Astra) Creative).",
+                    optional=True,
+                ),
+                IO.Boolean.Input("interpolation_enabled", default=False, optional=True),
+                IO.Combo.Input("interpolation_model", options=["apo-8"], default="apo-8", optional=True),
+                IO.Int.Input(
+                    "interpolation_slowmo",
+                    default=1,
+                    min=1,
+                    max=16,
+                    display_mode=IO.NumberDisplay.number,
+                    tooltip="Slow-motion factor applied to the input video. "
+                    "For example, 2 makes the output twice as slow and doubles the duration.",
+                    optional=True,
+                ),
+                IO.Int.Input(
+                    "interpolation_frame_rate",
+                    default=60,
+                    min=15,
+                    max=240,
+                    display_mode=IO.NumberDisplay.number,
+                    tooltip="Output frame rate.",
+                    optional=True,
+                ),
+                IO.Boolean.Input(
+                    "interpolation_duplicate",
+                    default=False,
+                    tooltip="Analyze the input for duplicate frames and remove them.",
+                    optional=True,
+                ),
+                IO.Float.Input(
+                    "interpolation_duplicate_threshold",
+                    default=0.01,
+                    min=0.001,
+                    max=0.1,
+                    step=0.001,
+                    display_mode=IO.NumberDisplay.number,
+                    tooltip="Detection sensitivity for duplicate frames.",
+                    optional=True,
+                ),
+                IO.Combo.Input(
+                    "dynamic_compression_level",
+                    options=["Low", "Mid", "High"],
+                    default="Low",
+                    tooltip="CQP level.",
+                    optional=True,
+                ),
+            ],
+            outputs=[IO.Video.Output()],
+            hidden=[IO.Hidden.auth_token_comfy_org, IO.Hidden.api_key_comfy_org, IO.Hidden.unique_id],
+            is_api_node=True,
+        )
+
+    @staticmethod
+    def _upscaled_resolution(src_width: int, src_height: int, long_side: int) -> tuple[int, int]:
+        """Return an even (width, height) that fits the source aspect ratio into a 16:9 target frame."""
+        short_side = long_side * 9 // 16  # nominal "p" value: 1920 -> 1080, 3840 -> 2160
+        scale = min(short_side / min(src_width, src_height), long_side / max(src_width, src_height))
+        width, height = round(src_width * scale), round(src_height * scale)
+        return width + width % 2, height + height % 2  # round odd sizes up to the next even value
+
+    @classmethod
+    async def execute(
+        cls,
+        video: Input.Video,
+        upscaler_enabled: bool,
+        upscaler_model: str,
+        upscaler_resolution: str,
+        upscaler_creativity: str = "low",
+        interpolation_enabled: bool = False,
+        interpolation_model: str = "apo-8",
+        interpolation_slowmo: int = 1,
+        interpolation_frame_rate: int = 60,
+        interpolation_duplicate: bool = False,
+        interpolation_duplicate_threshold: float = 0.01,
+        dynamic_compression_level: str = "Low",
+    ) -> IO.NodeOutput:
+        """Create a Topaz video task, upload the source file, wait for the result and download it.
+
+        Raises:
+            ValueError: if both upscaling and interpolation are disabled.
+            NotImplementedError: if the API returns more than one upload URL.
+        """
+        if upscaler_enabled is False and interpolation_enabled is False:
+            raise ValueError("There is nothing to do: both upscaling and interpolation are disabled.")
+        validate_container_format_is_mp4(video)
+        src_width, src_height = video.get_dimensions()
+        src_frame_rate = int(video.get_frame_rate())
+        duration_sec = video.get_duration()
+        src_video_stream = video.get_stream_source()
+        target_width, target_height = src_width, src_height  # unchanged unless upscaling is enabled
+        target_frame_rate = src_frame_rate  # unchanged unless interpolation is enabled
+        filters = []
+        if upscaler_enabled:
+            # Keep the source aspect ratio; using the map value for both sides requested a square frame.
+            long_side = UPSCALER_VALUES_MAP[upscaler_resolution]
+            target_width, target_height = cls._upscaled_resolution(src_width, src_height, long_side)
+            filters.append(
+                topaz_api.VideoEnhancementFilter(
+                    model=UPSCALER_MODELS_MAP[upscaler_model],
+                    creativity=(upscaler_creativity if UPSCALER_MODELS_MAP[upscaler_model] == "slc-1" else None),
+                    isOptimizedMode=(True if UPSCALER_MODELS_MAP[upscaler_model] == "slc-1" else None),
+                ),
+            )
+        if interpolation_enabled:
+            target_frame_rate = interpolation_frame_rate
+            filters.append(
+                topaz_api.VideoFrameInterpolationFilter(
+                    model=interpolation_model,
+                    slowmo=interpolation_slowmo,
+                    fps=interpolation_frame_rate,
+                    duplicate=interpolation_duplicate,
+                    duplicate_threshold=interpolation_duplicate_threshold,
+                ),
+            )
+        initial_res = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/topaz/video/", method="POST"),
+            response_model=topaz_api.CreateVideoResponse,
+            data=topaz_api.CreateVideoRequest(
+                source=topaz_api.CreateCreateVideoRequestSource(
+                    container="mp4",
+                    size=get_fs_object_size(src_video_stream),
+                    duration=int(duration_sec),
+                    frameCount=video.get_frame_count(),
+                    frameRate=src_frame_rate,
+                    resolution=topaz_api.Resolution(width=src_width, height=src_height),
+                ),
+                filters=filters,
+                output=topaz_api.OutputInformationVideo(
+                    resolution=topaz_api.Resolution(width=target_width, height=target_height),
+                    frameRate=target_frame_rate,
+                    audioCodec="AAC",
+                    audioTransfer="Copy",
+                    dynamicCompressionLevel=dynamic_compression_level,
+                ),
+            ),
+            wait_label="Creating task",
+            final_label_on_success="Task created",
+        )
+        upload_res = await sync_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/topaz/video/{initial_res.requestId}/accept", method="PATCH"),
+            response_model=topaz_api.VideoAcceptResponse,
+            wait_label="Preparing upload",
+            final_label_on_success="Upload started",
+        )
+        if len(upload_res.urls) > 1:  # only a single upload URL is handled below
+            raise NotImplementedError(
+                "Large files are not currently supported. Please open an issue in the ComfyUI repository."
+            )
+        async with aiohttp.ClientSession(headers={"Content-Type": "video/mp4"}) as session:
+            if isinstance(src_video_stream, BytesIO):
+                src_video_stream.seek(0)  # upload from the start of the in-memory buffer
+                async with session.put(upload_res.urls[0], data=src_video_stream, raise_for_status=True) as res:
+                    upload_etag = res.headers["Etag"]
+            else:
+                with builtins.open(src_video_stream, "rb") as video_file:
+                    async with session.put(upload_res.urls[0], data=video_file, raise_for_status=True) as res:
+                        upload_etag = res.headers["Etag"]  # reported back when completing the upload
+        await sync_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/topaz/video/{initial_res.requestId}/complete-upload", method="PATCH"),
+            response_model=topaz_api.VideoCompleteUploadResponse,
+            data=topaz_api.VideoCompleteUploadRequest(
+                uploadResults=[topaz_api.VideoCompleteUploadRequestPart(partNum=1, eTag=upload_etag)],
+            ),
+            wait_label="Finalizing upload",
+            final_label_on_success="Upload completed",
+        )
+        final_response = await poll_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/topaz/video/{initial_res.requestId}/status"),
+            response_model=topaz_api.VideoStatusResponse,
+            status_extractor=lambda x: x.status,
+            progress_extractor=lambda x: getattr(x, "progress", 0),
+            price_extractor=lambda x: (x.estimates.cost[0] * 0.08 if x.estimates and x.estimates.cost[0] else None),
+            poll_interval=10.0,
+            max_poll_attempts=320,
+        )
+        return IO.NodeOutput(await download_url_to_video_output(final_response.download.url))
+
+
+class TopazExtension(ComfyExtension):
+    """Extension exposing the Topaz API nodes."""
+
+    @override
+    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
+        """Return the node classes provided by this extension."""
+        return [TopazImageEnhance, TopazVideoEnhance]
+
+
+async def comfy_entrypoint() -> TopazExtension:  # NOTE(review): presumably the hook ComfyUI calls to load this module
+    return TopazExtension()  # a new extension instance is created on every call
diff --git a/comfy_api_nodes/nodes_veo2.py b/comfy_api_nodes/nodes_veo2.py
index d37e9e9b4..a54dc13ab 100644
--- a/comfy_api_nodes/nodes_veo2.py
+++ b/comfy_api_nodes/nodes_veo2.py
@@ -1,6 +1,7 @@
 import base64
 from io import BytesIO
 
+import torch
 from typing_extensions import override
 
 from comfy_api.input_impl.video_types import VideoFromFile
@@ -10,6 +11,9 @@ from comfy_api_nodes.apis.veo_api import (
     VeoGenVidPollResponse,
     VeoGenVidRequest,
     VeoGenVidResponse,
+    VeoRequestInstance,
+    VeoRequestInstanceImage,
+    VeoRequestParameters,
 )
 from comfy_api_nodes.util import (
     ApiEndpoint,
@@ -346,12 +350,163 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode):
         )
 
 
+class Veo3FirstLastFrameNode(IO.ComfyNode):
+    """API node that generates a Veo 3.1 video from a prompt plus a first and a last frame."""
+
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="Veo3FirstLastFrameNode",
+            display_name="Google Veo 3 First-Last-Frame to Video",
+            category="api node/video/Veo",
+            description="Generate video using prompt and first and last frames.",
+            inputs=[
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    default="",
+                    tooltip="Text description of the video",
+                ),
+                IO.String.Input(
+                    "negative_prompt",
+                    multiline=True,
+                    default="",
+                    tooltip="Negative text prompt to guide what to avoid in the video",
+                ),
+                IO.Combo.Input("resolution", options=["720p", "1080p"]),
+                IO.Combo.Input(
+                    "aspect_ratio",
+                    options=["16:9", "9:16"],
+                    default="16:9",
+                    tooltip="Aspect ratio of the output video",
+                ),
+                IO.Int.Input(
+                    "duration",
+                    default=8,
+                    min=4,
+                    max=8,
+                    step=2,
+                    display_mode=IO.NumberDisplay.slider,
+                    tooltip="Duration of the output video in seconds",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=0xFFFFFFFF,
+                    step=1,
+                    display_mode=IO.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="Seed for video generation",
+                ),
+                IO.Image.Input("first_frame", tooltip="Start frame"),
+                IO.Image.Input("last_frame", tooltip="End frame"),
+                IO.Combo.Input(
+                    "model",
+                    options=["veo-3.1-generate", "veo-3.1-fast-generate"],
+                    default="veo-3.1-fast-generate",
+                ),
+                IO.Boolean.Input(
+                    "generate_audio",
+                    default=True,
+                    tooltip="Generate audio for the video.",
+                ),
+            ],
+            outputs=[IO.Video.Output()],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        prompt: str,
+        negative_prompt: str,
+        resolution: str,
+        aspect_ratio: str,
+        duration: int,
+        seed: int,
+        first_frame: torch.Tensor,
+        last_frame: torch.Tensor,
+        model: str,
+        generate_audio: bool,
+    ):
+        """Submit the request, poll the operation until done, and return the resulting video output."""
+        # Resolve the UI option to the model id used in the proxy endpoint paths below.
+        model = MODELS_MAP[model]
+        initial_response = await sync_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/veo/{model}/generate", method="POST"),
+            response_model=VeoGenVidResponse,
+            data=VeoGenVidRequest(
+                instances=[
+                    VeoRequestInstance(
+                        prompt=prompt,
+                        image=VeoRequestInstanceImage(
+                            bytesBase64Encoded=tensor_to_base64_string(first_frame), mimeType="image/png"
+                        ),
+                        lastFrame=VeoRequestInstanceImage(
+                            bytesBase64Encoded=tensor_to_base64_string(last_frame), mimeType="image/png"
+                        ),
+                    ),
+                ],
+                parameters=VeoRequestParameters(
+                    aspectRatio=aspect_ratio,
+                    personGeneration="ALLOW",
+                    durationSeconds=duration,
+                    enhancePrompt=True,  # cannot be False for Veo3
+                    seed=seed,
+                    generateAudio=generate_audio,
+                    negativePrompt=negative_prompt,
+                    resolution=resolution,
+                ),
+            ),
+        )
+        poll_response = await poll_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/veo/{model}/poll", method="POST"),
+            response_model=VeoGenVidPollResponse,
+            status_extractor=lambda r: "completed" if r.done else "pending",
+            data=VeoGenVidPollRequest(operationName=initial_response.name),
+            poll_interval=5.0,
+            estimated_duration=AVERAGE_DURATION_VIDEO_GEN,
+        )
+
+        if poll_response.error:
+            raise Exception(f"Veo API error: {poll_response.error.message} (code: {poll_response.error.code})")
+
+        # NOTE(review): assumes `response` can be None on a finished operation; guarded so that case hits the final error.
+        response = poll_response.response
+        filtered_count = response.raiMediaFilteredCount if response is not None else None
+        if filtered_count:
+            reasons = response.raiMediaFilteredReasons or []
+            reason_part = f": {reasons[0]}" if reasons else ""
+            raise Exception(
+                f"Content blocked by Google's Responsible AI filters{reason_part} "
+                f"({filtered_count} video{'s' if filtered_count != 1 else ''} filtered)."
+            )
+
+        if response is not None and response.videos:
+            video = response.videos[0]
+            if video.bytesBase64Encoded:
+                return IO.NodeOutput(VideoFromFile(BytesIO(base64.b64decode(video.bytesBase64Encoded))))
+            if video.gcsUri:
+                return IO.NodeOutput(await download_url_to_video_output(video.gcsUri))
+            raise Exception("Video returned but no data or URL was provided")
+        raise Exception("Video generation completed but no video was returned")
+
+
 class VeoExtension(ComfyExtension):
     @override
     async def get_node_list(self) -> list[type[IO.ComfyNode]]:
         return [
             VeoVideoGenerationNode,
             Veo3VideoGenerationNode,
+            Veo3FirstLastFrameNode,
         ]
 
 
diff --git a/comfy_api_nodes/util/__init__.py b/comfy_api_nodes/util/__init__.py
index 21013b591..80292fb3c 100644
--- a/comfy_api_nodes/util/__init__.py
+++ b/comfy_api_nodes/util/__init__.py
@@ -36,6 +36,7 @@ from .upload_helpers import (
     upload_video_to_comfyapi,
 )
 from .validation_utils import (
+    get_image_dimensions,
     get_number_of_images,
     validate_aspect_ratio_string,
     validate_audio_duration,
@@ -82,6 +83,7 @@ __all__ = [
     "trim_video",
     "video_to_base64_string",
     # Validation utilities
+    "get_image_dimensions",
     "get_number_of_images",
     "validate_aspect_ratio_string",
     "validate_audio_duration",
diff --git a/comfy_api_nodes/util/client.py b/comfy_api_nodes/util/client.py
index 2d5dcd648..bf01d7d36 100644
--- a/comfy_api_nodes/util/client.py
+++ b/comfy_api_nodes/util/client.py
@@ -63,6 +63,7 @@ class _RequestConfig:
     estimated_total: Optional[int] = None
     final_label_on_success: Optional[str] = "Completed"
     progress_origin_ts: Optional[float] = None
+    price_extractor: Optional[Callable[[dict[str, Any]], Optional[float]]] = None
 
 
 @dataclass
@@ -77,9 +78,9 @@ class _PollUIState:
 
 
 _RETRY_STATUS = {408, 429, 500, 502, 503, 504}
-COMPLETED_STATUSES = ["succeeded", "succeed", "success", "completed", "finished", "done"]
-FAILED_STATUSES = ["cancelled", "canceled", "fail", "failed", "error"]
-QUEUED_STATUSES = ["created", "queued", "queueing", "submitted"]
+COMPLETED_STATUSES = ["succeeded", "succeed", "success", "completed", "finished", "done", "complete"]
+FAILED_STATUSES = ["cancelled", "canceled", "canceling", "fail", "failed", "error"]
+QUEUED_STATUSES = ["created", "queued", "queueing", "submitted", "initializing"]
 
 
 async def sync_op(
@@ -87,6 +88,7 @@ async def sync_op(
     endpoint: ApiEndpoint,
     *,
     response_model: Type[M],
+    price_extractor: Optional[Callable[[M], Optional[float]]] = None,
     data: Optional[BaseModel] = None,
     files: Optional[Union[dict[str, Any], list[tuple[str, Any]]]] = None,
     content_type: str = "application/json",
@@ -104,6 +106,7 @@ async def sync_op(
     raw = await sync_op_raw(
         cls,
         endpoint,
+        price_extractor=_wrap_model_extractor(response_model, price_extractor),
         data=data,
         files=files,
         content_type=content_type,
@@ -175,6 +178,7 @@ async def sync_op_raw(
     cls: type[IO.ComfyNode],
     endpoint: ApiEndpoint,
     *,
+    price_extractor: Optional[Callable[[dict[str, Any]], Optional[float]]] = None,
     data: Optional[Union[dict[str, Any], BaseModel]] = None,
     files: Optional[Union[dict[str, Any], list[tuple[str, Any]]]] = None,
     content_type: str = "application/json",
@@ -216,6 +220,7 @@ async def sync_op_raw(
         estimated_total=estimated_duration,
         final_label_on_success=final_label_on_success,
         progress_origin_ts=progress_origin_ts,
+        price_extractor=price_extractor,
     )
     return await _request_base(cfg, expect_binary=as_binary)
 
@@ -424,7 +429,9 @@ def _display_text(
     if status:
         display_lines.append(f"Status: {status.capitalize() if isinstance(status, str) else status}")
     if price is not None:
-        display_lines.append(f"Price: ${float(price):,.4f}")
+        p = f"{float(price):,.4f}".rstrip("0").rstrip(".")
+        if p != "0":
+            display_lines.append(f"Price: ${p}")
     if text is not None:
         display_lines.append(text)
     if display_lines:
@@ -580,6 +587,7 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool):
     delay = cfg.retry_delay
     operation_succeeded: bool = False
     final_elapsed_seconds: Optional[int] = None
+    extracted_price: Optional[float] = None
     while True:
         attempt += 1
         stop_event = asyncio.Event()
@@ -767,6 +775,8 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool):
                         except json.JSONDecodeError:
                             payload = {"_raw": text}
                         response_content_to_log = payload if isinstance(payload, dict) else text
+                    with contextlib.suppress(Exception):
+                        extracted_price = cfg.price_extractor(payload) if cfg.price_extractor else None
                     operation_succeeded = True
                     final_elapsed_seconds = int(time.monotonic() - start_time)
                     try:
@@ -871,7 +881,7 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool):
                         else int(time.monotonic() - start_time)
                     ),
                     estimated_total=cfg.estimated_total,
-                    price=None,
+                    price=extracted_price,
                     is_queued=False,
                     processing_elapsed_seconds=final_elapsed_seconds,
                 )
diff --git a/comfy_api_nodes/util/upload_helpers.py b/comfy_api_nodes/util/upload_helpers.py
index 632450d9b..0532bea9a 100644
--- a/comfy_api_nodes/util/upload_helpers.py
+++ b/comfy_api_nodes/util/upload_helpers.py
@@ -4,7 +4,7 @@ import logging
 import time
 import uuid
 from io import BytesIO
-from typing import Optional, Union
+from typing import Optional
 from urllib.parse import urlparse
 
 import aiohttp
@@ -48,8 +48,9 @@ async def upload_images_to_comfyapi(
     image: torch.Tensor,
     *,
     max_images: int = 8,
-    mime_type: Optional[str] = None,
-    wait_label: Optional[str] = "Uploading",
+    mime_type: str | None = None,
+    wait_label: str | None = "Uploading",
+    show_batch_index: bool = True,
 ) -> list[str]:
     """
     Uploads images to ComfyUI API and returns download URLs.
@@ -59,11 +60,18 @@ async def upload_images_to_comfyapi(
     download_urls: list[str] = []
     is_batch = len(image.shape) > 3
     batch_len = image.shape[0] if is_batch else 1
+    num_to_upload = min(batch_len, max_images)
+    batch_start_ts = time.monotonic()
 
-    for idx in range(min(batch_len, max_images)):
+    for idx in range(num_to_upload):
         tensor = image[idx] if is_batch else image
         img_io = tensor_to_bytesio(tensor, mime_type=mime_type)
-        url = await upload_file_to_comfyapi(cls, img_io, img_io.name, mime_type, wait_label)
+
+        effective_label = wait_label
+        if wait_label and show_batch_index and num_to_upload > 1:
+            effective_label = f"{wait_label} ({idx + 1}/{num_to_upload})"
+
+        url = await upload_file_to_comfyapi(cls, img_io, img_io.name, mime_type, effective_label, batch_start_ts)
         download_urls.append(url)
     return download_urls
 
@@ -95,6 +103,7 @@ async def upload_video_to_comfyapi(
     container: VideoContainer = VideoContainer.MP4,
     codec: VideoCodec = VideoCodec.H264,
     max_duration: Optional[int] = None,
+    wait_label: str | None = "Uploading",
 ) -> str:
     """
     Uploads a single video to ComfyUI API and returns its download URL.
@@ -119,15 +128,16 @@ async def upload_video_to_comfyapi(
     video.save_to(video_bytes_io, format=container, codec=codec)
     video_bytes_io.seek(0)
 
-    return await upload_file_to_comfyapi(cls, video_bytes_io, filename, upload_mime_type)
+    return await upload_file_to_comfyapi(cls, video_bytes_io, filename, upload_mime_type, wait_label)
 
 
 async def upload_file_to_comfyapi(
     cls: type[IO.ComfyNode],
     file_bytes_io: BytesIO,
     filename: str,
-    upload_mime_type: Optional[str],
-    wait_label: Optional[str] = "Uploading",
+    upload_mime_type: str | None,
+    wait_label: str | None = "Uploading",
+    progress_origin_ts: float | None = None,
 ) -> str:
     """Uploads a single file to ComfyUI API and returns its download URL."""
     if upload_mime_type is None:
@@ -148,6 +158,7 @@ async def upload_file_to_comfyapi(
         file_bytes_io,
         content_type=upload_mime_type,
         wait_label=wait_label,
+        progress_origin_ts=progress_origin_ts,
     )
     return create_resp.download_url
 
@@ -155,27 +166,18 @@ async def upload_file_to_comfyapi(
 async def upload_file(
     cls: type[IO.ComfyNode],
     upload_url: str,
-    file: Union[BytesIO, str],
+    file: BytesIO | str,
     *,
-    content_type: Optional[str] = None,
+    content_type: str | None = None,
     max_retries: int = 3,
     retry_delay: float = 1.0,
     retry_backoff: float = 2.0,
-    wait_label: Optional[str] = None,
+    wait_label: str | None = None,
+    progress_origin_ts: float | None = None,
 ) -> None:
     """
     Upload a file to a signed URL (e.g., S3 pre-signed PUT) with retries, Comfy progress display, and interruption.
 
-    Args:
-        cls: Node class (provides auth context + UI progress hooks).
-        upload_url: Pre-signed PUT URL.
-        file: BytesIO or path string.
-        content_type: Explicit MIME type. If None, we *suppress* Content-Type.
-        max_retries: Maximum retry attempts.
-        retry_delay: Initial delay in seconds.
-        retry_backoff: Exponential backoff factor.
-        wait_label: Progress label shown in Comfy UI.
-
     Raises:
         ProcessingInterrupted, LocalNetworkError, ApiServerError, Exception
     """
@@ -198,7 +200,7 @@ async def upload_file(
 
     attempt = 0
     delay = retry_delay
-    start_ts = time.monotonic()
+    start_ts = progress_origin_ts if progress_origin_ts is not None else time.monotonic()
     op_uuid = uuid.uuid4().hex[:8]
     while True:
         attempt += 1
diff --git a/comfy_execution/validation.py b/comfy_execution/validation.py
index cec105fc9..24c0b4ed7 100644
--- a/comfy_execution/validation.py
+++ b/comfy_execution/validation.py
@@ -1,4 +1,5 @@
 from __future__ import annotations
+from comfy_api.latest import IO
 
 
 def validate_node_input(
@@ -23,6 +24,11 @@ def validate_node_input(
     if not received_type != input_type:
         return True
 
+    # If the received type or input_type is a MatchType, we can return True immediately;
+    # validation for this is handled by the frontend
+    if received_type == IO.MatchType.io_type or input_type == IO.MatchType.io_type:
+        return True
+
     # Not equal, and not strings
     if not isinstance(received_type, str) or not isinstance(input_type, str):
         return False
diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py
index d011f433b..fbb080886 100644
--- a/comfy_extras/nodes_custom_sampler.py
+++ b/comfy_extras/nodes_custom_sampler.py
@@ -3,272 +3,312 @@ import comfy.samplers
 import comfy.sample
 from comfy.k_diffusion import sampling as k_diffusion_sampling
 from comfy.k_diffusion import sa_solver
-from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict
 import latent_preview
 import torch
 import comfy.utils
 import node_helpers
+from typing_extensions import override
+from comfy_api.latest import ComfyExtension, io
 
 
-class BasicScheduler:
+class BasicScheduler(io.ComfyNode):  # sigmas from the model's sampling object and a named scheduler
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"model": ("MODEL",),
-                     "scheduler": (comfy.samplers.SCHEDULER_NAMES, ),
-                     "steps": ("INT", {"default": 20, "min": 1, "max": 10000}),
-                     "denoise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
-                      }
-               }
-    RETURN_TYPES = ("SIGMAS",)
-    CATEGORY = "sampling/custom_sampling/schedulers"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="BasicScheduler",
+            category="sampling/custom_sampling/schedulers",
+            inputs=[
+                io.Model.Input("model"),
+                io.Combo.Input("scheduler", options=comfy.samplers.SCHEDULER_NAMES),
+                io.Int.Input("steps", default=20, min=1, max=10000),
+                io.Float.Input("denoise", default=1.0, min=0.0, max=1.0, step=0.01),
+            ],
+            outputs=[io.Sigmas.Output()]
+        )
 
-    FUNCTION = "get_sigmas"
-
-    def get_sigmas(self, model, scheduler, steps, denoise):
+    @classmethod
+    def execute(cls, model, scheduler, steps, denoise) -> io.NodeOutput:
         total_steps = steps
         if denoise < 1.0:
             if denoise <= 0.0:
-                return (torch.FloatTensor([]),)
+                return io.NodeOutput(torch.FloatTensor([]))  # denoise <= 0: emit an empty schedule
             total_steps = int(steps/denoise)
 
         sigmas = comfy.samplers.calculate_sigmas(model.get_model_object("model_sampling"), scheduler, total_steps).cpu()
         sigmas = sigmas[-(steps + 1):]
-        return (sigmas, )
+        return io.NodeOutput(sigmas)  # only the last steps + 1 sigmas are kept
+
+    get_sigmas = execute  # same classmethod under the old method name (formerly FUNCTION = "get_sigmas")
 
 
-class KarrasScheduler:
+class KarrasScheduler(io.ComfyNode):  # sigmas via k_diffusion get_sigmas_karras
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"steps": ("INT", {"default": 20, "min": 1, "max": 10000}),
-                     "sigma_max": ("FLOAT", {"default": 14.614642, "min": 0.0, "max": 5000.0, "step":0.01, "round": False}),
-                     "sigma_min": ("FLOAT", {"default": 0.0291675, "min": 0.0, "max": 5000.0, "step":0.01, "round": False}),
-                     "rho": ("FLOAT", {"default": 7.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
-                    }
-               }
-    RETURN_TYPES = ("SIGMAS",)
-    CATEGORY = "sampling/custom_sampling/schedulers"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="KarrasScheduler",
+            category="sampling/custom_sampling/schedulers",
+            inputs=[
+                io.Int.Input("steps", default=20, min=1, max=10000),
+                io.Float.Input("sigma_max", default=14.614642, min=0.0, max=5000.0, step=0.01, round=False),
+                io.Float.Input("sigma_min", default=0.0291675, min=0.0, max=5000.0, step=0.01, round=False),
+                io.Float.Input("rho", default=7.0, min=0.0, max=100.0, step=0.01, round=False),
+            ],
+            outputs=[io.Sigmas.Output()]
+        )
 
-    FUNCTION = "get_sigmas"
-
-    def get_sigmas(self, steps, sigma_max, sigma_min, rho):
+    @classmethod
+    def execute(cls, steps, sigma_max, sigma_min, rho) -> io.NodeOutput:
         sigmas = k_diffusion_sampling.get_sigmas_karras(n=steps, sigma_min=sigma_min, sigma_max=sigma_max, rho=rho)
-        return (sigmas, )
+        return io.NodeOutput(sigmas)
 
-class ExponentialScheduler:
+    get_sigmas = execute  # same classmethod under the old method name (formerly FUNCTION = "get_sigmas")
+
+class ExponentialScheduler(io.ComfyNode):  # sigmas via k_diffusion get_sigmas_exponential
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"steps": ("INT", {"default": 20, "min": 1, "max": 10000}),
-                     "sigma_max": ("FLOAT", {"default": 14.614642, "min": 0.0, "max": 5000.0, "step":0.01, "round": False}),
-                     "sigma_min": ("FLOAT", {"default": 0.0291675, "min": 0.0, "max": 5000.0, "step":0.01, "round": False}),
-                    }
-               }
-    RETURN_TYPES = ("SIGMAS",)
-    CATEGORY = "sampling/custom_sampling/schedulers"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="ExponentialScheduler",
+            category="sampling/custom_sampling/schedulers",
+            inputs=[
+                io.Int.Input("steps", default=20, min=1, max=10000),
+                io.Float.Input("sigma_max", default=14.614642, min=0.0, max=5000.0, step=0.01, round=False),
+                io.Float.Input("sigma_min", default=0.0291675, min=0.0, max=5000.0, step=0.01, round=False),
+            ],
+            outputs=[io.Sigmas.Output()]
+        )
 
-    FUNCTION = "get_sigmas"
-
-    def get_sigmas(self, steps, sigma_max, sigma_min):
+    @classmethod
+    def execute(cls, steps, sigma_max, sigma_min) -> io.NodeOutput:
         sigmas = k_diffusion_sampling.get_sigmas_exponential(n=steps, sigma_min=sigma_min, sigma_max=sigma_max)
-        return (sigmas, )
+        return io.NodeOutput(sigmas)
 
-class PolyexponentialScheduler:
+    get_sigmas = execute  # same classmethod under the old method name (formerly FUNCTION = "get_sigmas")
+
+class PolyexponentialScheduler(io.ComfyNode):  # sigmas via k_diffusion get_sigmas_polyexponential
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"steps": ("INT", {"default": 20, "min": 1, "max": 10000}),
-                     "sigma_max": ("FLOAT", {"default": 14.614642, "min": 0.0, "max": 5000.0, "step":0.01, "round": False}),
-                     "sigma_min": ("FLOAT", {"default": 0.0291675, "min": 0.0, "max": 5000.0, "step":0.01, "round": False}),
-                     "rho": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
-                    }
-               }
-    RETURN_TYPES = ("SIGMAS",)
-    CATEGORY = "sampling/custom_sampling/schedulers"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="PolyexponentialScheduler",
+            category="sampling/custom_sampling/schedulers",
+            inputs=[
+                io.Int.Input("steps", default=20, min=1, max=10000),
+                io.Float.Input("sigma_max", default=14.614642, min=0.0, max=5000.0, step=0.01, round=False),
+                io.Float.Input("sigma_min", default=0.0291675, min=0.0, max=5000.0, step=0.01, round=False),
+                io.Float.Input("rho", default=1.0, min=0.0, max=100.0, step=0.01, round=False),
+            ],
+            outputs=[io.Sigmas.Output()]
+        )
 
-    FUNCTION = "get_sigmas"
-
-    def get_sigmas(self, steps, sigma_max, sigma_min, rho):
+    @classmethod
+    def execute(cls, steps, sigma_max, sigma_min, rho) -> io.NodeOutput:
         sigmas = k_diffusion_sampling.get_sigmas_polyexponential(n=steps, sigma_min=sigma_min, sigma_max=sigma_max, rho=rho)
-        return (sigmas, )
+        return io.NodeOutput(sigmas)
 
-class LaplaceScheduler:
+    get_sigmas = execute  # same classmethod under the old method name (formerly FUNCTION = "get_sigmas")
+
+class LaplaceScheduler(io.ComfyNode):  # sigmas via k_diffusion get_sigmas_laplace
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"steps": ("INT", {"default": 20, "min": 1, "max": 10000}),
-                     "sigma_max": ("FLOAT", {"default": 14.614642, "min": 0.0, "max": 5000.0, "step":0.01, "round": False}),
-                     "sigma_min": ("FLOAT", {"default": 0.0291675, "min": 0.0, "max": 5000.0, "step":0.01, "round": False}),
-                     "mu": ("FLOAT", {"default": 0.0, "min": -10.0, "max": 10.0, "step":0.1, "round": False}),
-                     "beta": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 10.0, "step":0.1, "round": False}),
-                    }
-               }
-    RETURN_TYPES = ("SIGMAS",)
-    CATEGORY = "sampling/custom_sampling/schedulers"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="LaplaceScheduler",
+            category="sampling/custom_sampling/schedulers",
+            inputs=[
+                io.Int.Input("steps", default=20, min=1, max=10000),
+                io.Float.Input("sigma_max", default=14.614642, min=0.0, max=5000.0, step=0.01, round=False),
+                io.Float.Input("sigma_min", default=0.0291675, min=0.0, max=5000.0, step=0.01, round=False),
+                io.Float.Input("mu", default=0.0, min=-10.0, max=10.0, step=0.1, round=False),
+                io.Float.Input("beta", default=0.5, min=0.0, max=10.0, step=0.1, round=False),
+            ],
+            outputs=[io.Sigmas.Output()]
+        )
 
-    FUNCTION = "get_sigmas"
-
-    def get_sigmas(self, steps, sigma_max, sigma_min, mu, beta):
+    @classmethod
+    def execute(cls, steps, sigma_max, sigma_min, mu, beta) -> io.NodeOutput:
         sigmas = k_diffusion_sampling.get_sigmas_laplace(n=steps, sigma_min=sigma_min, sigma_max=sigma_max, mu=mu, beta=beta)
-        return (sigmas, )
+        return io.NodeOutput(sigmas)
+
+    get_sigmas = execute  # same classmethod under the old method name (formerly FUNCTION = "get_sigmas")
 
 
-class SDTurboScheduler:
+class SDTurboScheduler(io.ComfyNode):  # picks timesteps from 999, 899, ..., 99 and appends a final 0 sigma
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"model": ("MODEL",),
-                     "steps": ("INT", {"default": 1, "min": 1, "max": 10}),
-                     "denoise": ("FLOAT", {"default": 1.0, "min": 0, "max": 1.0, "step": 0.01}),
-                      }
-               }
-    RETURN_TYPES = ("SIGMAS",)
-    CATEGORY = "sampling/custom_sampling/schedulers"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="SDTurboScheduler",
+            category="sampling/custom_sampling/schedulers",
+            inputs=[
+                io.Model.Input("model"),
+                io.Int.Input("steps", default=1, min=1, max=10),
+                io.Float.Input("denoise", default=1.0, min=0, max=1.0, step=0.01),
+            ],
+            outputs=[io.Sigmas.Output()]
+        )
 
-    FUNCTION = "get_sigmas"
-
-    def get_sigmas(self, model, steps, denoise):
+    @classmethod
+    def execute(cls, model, steps, denoise) -> io.NodeOutput:
         start_step = 10 - int(10 * denoise)
         timesteps = torch.flip(torch.arange(1, 11) * 100 - 1, (0,))[start_step:start_step + steps]
         sigmas = model.get_model_object("model_sampling").sigma(timesteps)
         sigmas = torch.cat([sigmas, sigmas.new_zeros([1])])
-        return (sigmas, )
+        return io.NodeOutput(sigmas)
 
-class BetaSamplingScheduler:
+    get_sigmas = execute  # same classmethod under the old method name (formerly FUNCTION = "get_sigmas")
+
+class BetaSamplingScheduler(io.ComfyNode):  # sigmas via comfy.samplers.beta_scheduler
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"model": ("MODEL",),
-                     "steps": ("INT", {"default": 20, "min": 1, "max": 10000}),
-                     "alpha": ("FLOAT", {"default": 0.6, "min": 0.0, "max": 50.0, "step":0.01, "round": False}),
-                     "beta": ("FLOAT", {"default": 0.6, "min": 0.0, "max": 50.0, "step":0.01, "round": False}),
-                      }
-               }
-    RETURN_TYPES = ("SIGMAS",)
-    CATEGORY = "sampling/custom_sampling/schedulers"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="BetaSamplingScheduler",
+            category="sampling/custom_sampling/schedulers",
+            inputs=[
+                io.Model.Input("model"),
+                io.Int.Input("steps", default=20, min=1, max=10000),
+                io.Float.Input("alpha", default=0.6, min=0.0, max=50.0, step=0.01, round=False),
+                io.Float.Input("beta", default=0.6, min=0.0, max=50.0, step=0.01, round=False),
+            ],
+            outputs=[io.Sigmas.Output()]
+        )
 
-    FUNCTION = "get_sigmas"
-
-    def get_sigmas(self, model, steps, alpha, beta):
+    @classmethod
+    def execute(cls, model, steps, alpha, beta) -> io.NodeOutput:
         sigmas = comfy.samplers.beta_scheduler(model.get_model_object("model_sampling"), steps, alpha=alpha, beta=beta)
-        return (sigmas, )
+        return io.NodeOutput(sigmas)
 
-class VPScheduler:
+    get_sigmas = execute  # same classmethod under the old method name (formerly FUNCTION = "get_sigmas")
+
+class VPScheduler(io.ComfyNode):  # sigmas via k_diffusion get_sigmas_vp
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"steps": ("INT", {"default": 20, "min": 1, "max": 10000}),
-                     "beta_d": ("FLOAT", {"default": 19.9, "min": 0.0, "max": 5000.0, "step":0.01, "round": False}), #TODO: fix default values
-                     "beta_min": ("FLOAT", {"default": 0.1, "min": 0.0, "max": 5000.0, "step":0.01, "round": False}),
-                     "eps_s": ("FLOAT", {"default": 0.001, "min": 0.0, "max": 1.0, "step":0.0001, "round": False}),
-                    }
-               }
-    RETURN_TYPES = ("SIGMAS",)
-    CATEGORY = "sampling/custom_sampling/schedulers"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="VPScheduler",
+            category="sampling/custom_sampling/schedulers",
+            inputs=[
+                io.Int.Input("steps", default=20, min=1, max=10000),
+                io.Float.Input("beta_d", default=19.9, min=0.0, max=5000.0, step=0.01, round=False), #TODO: fix default values
+                io.Float.Input("beta_min", default=0.1, min=0.0, max=5000.0, step=0.01, round=False),
+                io.Float.Input("eps_s", default=0.001, min=0.0, max=1.0, step=0.0001, round=False),
+            ],
+            outputs=[io.Sigmas.Output()]
+        )
 
-    FUNCTION = "get_sigmas"
-
-    def get_sigmas(self, steps, beta_d, beta_min, eps_s):
+    @classmethod
+    def execute(cls, steps, beta_d, beta_min, eps_s) -> io.NodeOutput:
         sigmas = k_diffusion_sampling.get_sigmas_vp(n=steps, beta_d=beta_d, beta_min=beta_min, eps_s=eps_s)
-        return (sigmas, )
+        return io.NodeOutput(sigmas)
 
-class SplitSigmas:
+    get_sigmas = execute  # same classmethod under the old method name (formerly FUNCTION = "get_sigmas")
+
+class SplitSigmas(io.ComfyNode):  # splits a schedule at index `step`; both parts contain sigmas[step]
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"sigmas": ("SIGMAS", ),
-                    "step": ("INT", {"default": 0, "min": 0, "max": 10000}),
-                     }
-                }
-    RETURN_TYPES = ("SIGMAS","SIGMAS")
-    RETURN_NAMES = ("high_sigmas", "low_sigmas")
-    CATEGORY = "sampling/custom_sampling/sigmas"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="SplitSigmas",
+            category="sampling/custom_sampling/sigmas",
+            inputs=[
+                io.Sigmas.Input("sigmas"),
+                io.Int.Input("step", default=0, min=0, max=10000),
+            ],
+            outputs=[
+                io.Sigmas.Output(display_name="high_sigmas"),
+                io.Sigmas.Output(display_name="low_sigmas"),
+            ]
+        )
 
-    FUNCTION = "get_sigmas"
-
-    def get_sigmas(self, sigmas, step):
+    @classmethod
+    def execute(cls, sigmas, step) -> io.NodeOutput:
         sigmas1 = sigmas[:step + 1]
         sigmas2 = sigmas[step:]
-        return (sigmas1, sigmas2)
+        return io.NodeOutput(sigmas1, sigmas2)  # (high_sigmas, low_sigmas) in schema output order
 
-class SplitSigmasDenoise:
+    get_sigmas = execute  # same classmethod under the old method name (formerly FUNCTION = "get_sigmas")
+
+class SplitSigmasDenoise(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"sigmas": ("SIGMAS", ),
-                    "denoise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
-                     }
-                }
-    RETURN_TYPES = ("SIGMAS","SIGMAS")
-    RETURN_NAMES = ("high_sigmas", "low_sigmas")
-    CATEGORY = "sampling/custom_sampling/sigmas"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="SplitSigmasDenoise",
+            category="sampling/custom_sampling/sigmas",
+            inputs=[
+                io.Sigmas.Input("sigmas"),
+                io.Float.Input("denoise", default=1.0, min=0.0, max=1.0, step=0.01),
+            ],
+            outputs=[
+                io.Sigmas.Output(display_name="high_sigmas"),
+                io.Sigmas.Output(display_name="low_sigmas"),
+            ]
+        )
 
-    FUNCTION = "get_sigmas"
-
-    def get_sigmas(self, sigmas, denoise):
+    @classmethod
+    def execute(cls, sigmas, denoise) -> io.NodeOutput:
         steps = max(sigmas.shape[-1] - 1, 0)
         total_steps = round(steps * denoise)
-        sigmas1 = sigmas[:-(total_steps)]
+        sigmas1 = sigmas[:len(sigmas) - total_steps]  # explicit end index: `[:-0]` is `[:0]`, which emptied high_sigmas at 0 steps
         sigmas2 = sigmas[-(total_steps + 1):]
-        return (sigmas1, sigmas2)
+        return io.NodeOutput(sigmas1, sigmas2)
 
-class FlipSigmas:
+    get_sigmas = execute
+
+class FlipSigmas(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"sigmas": ("SIGMAS", ),
-                     }
-                }
-    RETURN_TYPES = ("SIGMAS",)
-    CATEGORY = "sampling/custom_sampling/sigmas"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="FlipSigmas",
+            category="sampling/custom_sampling/sigmas",
+            inputs=[io.Sigmas.Input("sigmas")],
+            outputs=[io.Sigmas.Output()]
+        )
 
-    FUNCTION = "get_sigmas"
-
-    def get_sigmas(self, sigmas):
+    @classmethod
+    def execute(cls, sigmas) -> io.NodeOutput:
         if len(sigmas) == 0:
-            return (sigmas,)
+            return io.NodeOutput(sigmas)
 
         sigmas = sigmas.flip(0)
         if sigmas[0] == 0:
             sigmas[0] = 0.0001
-        return (sigmas,)
+        return io.NodeOutput(sigmas)
 
-class SetFirstSigma:
+    get_sigmas = execute
+
+class SetFirstSigma(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"sigmas": ("SIGMAS", ),
-                     "sigma": ("FLOAT", {"default": 136.0, "min": 0.0, "max": 20000.0, "step": 0.001, "round": False}),
-                    }
-               }
-    RETURN_TYPES = ("SIGMAS",)
-    CATEGORY = "sampling/custom_sampling/sigmas"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="SetFirstSigma",
+            category="sampling/custom_sampling/sigmas",
+            inputs=[
+                io.Sigmas.Input("sigmas"),
+                io.Float.Input("sigma", default=136.0, min=0.0, max=20000.0, step=0.001, round=False),
+            ],
+            outputs=[io.Sigmas.Output()]
+        )
 
-    FUNCTION = "set_first_sigma"
-
-    def set_first_sigma(self, sigmas, sigma):
+    @classmethod
+    def execute(cls, sigmas, sigma) -> io.NodeOutput:
         sigmas = sigmas.clone()
         sigmas[0] = sigma
-        return (sigmas, )
+        return io.NodeOutput(sigmas)
 
-class ExtendIntermediateSigmas:
+    set_first_sigma = execute
+
+class ExtendIntermediateSigmas(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"sigmas": ("SIGMAS", ),
-                     "steps": ("INT", {"default": 2, "min": 1, "max": 100}),
-                     "start_at_sigma": ("FLOAT", {"default": -1.0, "min": -1.0, "max": 20000.0, "step": 0.01, "round": False}),
-                     "end_at_sigma": ("FLOAT", {"default": 12.0, "min":  0.0, "max": 20000.0, "step": 0.01, "round": False}),
-                     "spacing": (['linear', 'cosine', 'sine'],),
-                    }
-               }
-    RETURN_TYPES = ("SIGMAS",)
-    CATEGORY = "sampling/custom_sampling/sigmas"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="ExtendIntermediateSigmas",
+            category="sampling/custom_sampling/sigmas",
+            inputs=[
+                io.Sigmas.Input("sigmas"),
+                io.Int.Input("steps", default=2, min=1, max=100),
+                io.Float.Input("start_at_sigma", default=-1.0, min=-1.0, max=20000.0, step=0.01, round=False),
+                io.Float.Input("end_at_sigma", default=12.0, min=0.0, max=20000.0, step=0.01, round=False),
+                io.Combo.Input("spacing", options=['linear', 'cosine', 'sine']),
+            ],
+            outputs=[io.Sigmas.Output()]
+        )
 
-    FUNCTION = "extend"
-
-    def extend(self, sigmas: torch.Tensor, steps: int, start_at_sigma: float, end_at_sigma: float, spacing: str):
+    @classmethod
+    def execute(cls, sigmas: torch.Tensor, steps: int, start_at_sigma: float, end_at_sigma: float, spacing: str) -> io.NodeOutput:
         if start_at_sigma < 0:
             start_at_sigma = float("inf")
 
@@ -299,27 +339,27 @@ class ExtendIntermediateSigmas:
 
         extended_sigmas = torch.FloatTensor(extended_sigmas)
 
-        return (extended_sigmas,)
+        return io.NodeOutput(extended_sigmas)
+
+    extend = execute
 
 
-class SamplingPercentToSigma:
+class SamplingPercentToSigma(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(cls) -> InputTypeDict:
-        return {
-            "required": {
-                "model": (IO.MODEL, {}),
-                "sampling_percent": (IO.FLOAT, {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.0001}),
-                "return_actual_sigma": (IO.BOOLEAN, {"default": False, "tooltip": "Return the actual sigma value instead of the value used for interval checks.\nThis only affects results at 0.0 and 1.0."}),
-            }
-        }
+    def define_schema(cls):
+        return io.Schema(
+            node_id="SamplingPercentToSigma",
+            category="sampling/custom_sampling/sigmas",
+            inputs=[
+                io.Model.Input("model"),
+                io.Float.Input("sampling_percent", default=0.0, min=0.0, max=1.0, step=0.0001),
+                io.Boolean.Input("return_actual_sigma", default=False, tooltip="Return the actual sigma value instead of the value used for interval checks.\nThis only affects results at 0.0 and 1.0."),
+            ],
+            outputs=[io.Float.Output(display_name="sigma_value")]
+        )
 
-    RETURN_TYPES = (IO.FLOAT,)
-    RETURN_NAMES = ("sigma_value",)
-    CATEGORY = "sampling/custom_sampling/sigmas"
-
-    FUNCTION = "get_sigma"
-
-    def get_sigma(self, model, sampling_percent, return_actual_sigma):
+    @classmethod
+    def execute(cls, model, sampling_percent, return_actual_sigma) -> io.NodeOutput:
         model_sampling = model.get_model_object("model_sampling")
         sigma_val = model_sampling.percent_to_sigma(sampling_percent)
         if return_actual_sigma:
@@ -327,212 +367,234 @@ class SamplingPercentToSigma:
                 sigma_val = model_sampling.sigma_max.item()
             elif sampling_percent == 1.0:
                 sigma_val = model_sampling.sigma_min.item()
-        return (sigma_val,)
+        return io.NodeOutput(sigma_val)
+
+    get_sigma = execute
 
 
-class KSamplerSelect:
+class KSamplerSelect(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"sampler_name": (comfy.samplers.SAMPLER_NAMES, ),
-                      }
-               }
-    RETURN_TYPES = ("SAMPLER",)
-    CATEGORY = "sampling/custom_sampling/samplers"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="KSamplerSelect",
+            category="sampling/custom_sampling/samplers",
+            inputs=[io.Combo.Input("sampler_name", options=comfy.samplers.SAMPLER_NAMES)],
+            outputs=[io.Sampler.Output()]
+        )
 
-    FUNCTION = "get_sampler"
-
-    def get_sampler(self, sampler_name):
+    @classmethod
+    def execute(cls, sampler_name) -> io.NodeOutput:
         sampler = comfy.samplers.sampler_object(sampler_name)
-        return (sampler, )
+        return io.NodeOutput(sampler)
 
-class SamplerDPMPP_3M_SDE:
+    get_sampler = execute
+
+class SamplerDPMPP_3M_SDE(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"eta": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
-                     "s_noise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
-                     "noise_device": (['gpu', 'cpu'], ),
-                      }
-               }
-    RETURN_TYPES = ("SAMPLER",)
-    CATEGORY = "sampling/custom_sampling/samplers"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="SamplerDPMPP_3M_SDE",
+            category="sampling/custom_sampling/samplers",
+            inputs=[
+                io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False),
+                io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False),
+                io.Combo.Input("noise_device", options=['gpu', 'cpu']),
+            ],
+            outputs=[io.Sampler.Output()]
+        )
 
-    FUNCTION = "get_sampler"
-
-    def get_sampler(self, eta, s_noise, noise_device):
+    @classmethod
+    def execute(cls, eta, s_noise, noise_device) -> io.NodeOutput:
         if noise_device == 'cpu':
             sampler_name = "dpmpp_3m_sde"
         else:
             sampler_name = "dpmpp_3m_sde_gpu"
         sampler = comfy.samplers.ksampler(sampler_name, {"eta": eta, "s_noise": s_noise})
-        return (sampler, )
+        return io.NodeOutput(sampler)
 
-class SamplerDPMPP_2M_SDE:
+    get_sampler = execute
+
+class SamplerDPMPP_2M_SDE(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"solver_type": (['midpoint', 'heun'], ),
-                     "eta": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
-                     "s_noise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
-                     "noise_device": (['gpu', 'cpu'], ),
-                      }
-               }
-    RETURN_TYPES = ("SAMPLER",)
-    CATEGORY = "sampling/custom_sampling/samplers"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="SamplerDPMPP_2M_SDE",
+            category="sampling/custom_sampling/samplers",
+            inputs=[
+                io.Combo.Input("solver_type", options=['midpoint', 'heun']),
+                io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False),
+                io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False),
+                io.Combo.Input("noise_device", options=['gpu', 'cpu']),
+            ],
+            outputs=[io.Sampler.Output()]
+        )
 
-    FUNCTION = "get_sampler"
-
-    def get_sampler(self, solver_type, eta, s_noise, noise_device):
+    @classmethod
+    def execute(cls, solver_type, eta, s_noise, noise_device) -> io.NodeOutput:
         if noise_device == 'cpu':
             sampler_name = "dpmpp_2m_sde"
         else:
             sampler_name = "dpmpp_2m_sde_gpu"
         sampler = comfy.samplers.ksampler(sampler_name, {"eta": eta, "s_noise": s_noise, "solver_type": solver_type})
-        return (sampler, )
+        return io.NodeOutput(sampler)
+
+    get_sampler = execute
 
 
-class SamplerDPMPP_SDE:
+class SamplerDPMPP_SDE(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"eta": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
-                     "s_noise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
-                     "r": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
-                     "noise_device": (['gpu', 'cpu'], ),
-                      }
-               }
-    RETURN_TYPES = ("SAMPLER",)
-    CATEGORY = "sampling/custom_sampling/samplers"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="SamplerDPMPP_SDE",
+            category="sampling/custom_sampling/samplers",
+            inputs=[
+                io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False),
+                io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False),
+                io.Float.Input("r", default=0.5, min=0.0, max=100.0, step=0.01, round=False),
+                io.Combo.Input("noise_device", options=['gpu', 'cpu']),
+            ],
+            outputs=[io.Sampler.Output()]
+        )
 
-    FUNCTION = "get_sampler"
-
-    def get_sampler(self, eta, s_noise, r, noise_device):
+    @classmethod
+    def execute(cls, eta, s_noise, r, noise_device) -> io.NodeOutput:
         if noise_device == 'cpu':
             sampler_name = "dpmpp_sde"
         else:
             sampler_name = "dpmpp_sde_gpu"
         sampler = comfy.samplers.ksampler(sampler_name, {"eta": eta, "s_noise": s_noise, "r": r})
-        return (sampler, )
+        return io.NodeOutput(sampler)
 
-class SamplerDPMPP_2S_Ancestral:
+    get_sampler = execute
+
+class SamplerDPMPP_2S_Ancestral(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"eta": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
-                     "s_noise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
-                      }
-               }
-    RETURN_TYPES = ("SAMPLER",)
-    CATEGORY = "sampling/custom_sampling/samplers"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="SamplerDPMPP_2S_Ancestral",
+            category="sampling/custom_sampling/samplers",
+            inputs=[
+                io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False),
+                io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False),
+            ],
+            outputs=[io.Sampler.Output()]
+        )
 
-    FUNCTION = "get_sampler"
-
-    def get_sampler(self, eta, s_noise):
+    @classmethod
+    def execute(cls, eta, s_noise) -> io.NodeOutput:
         sampler = comfy.samplers.ksampler("dpmpp_2s_ancestral", {"eta": eta, "s_noise": s_noise})
-        return (sampler, )
+        return io.NodeOutput(sampler)
 
-class SamplerEulerAncestral:
+    get_sampler = execute
+
+class SamplerEulerAncestral(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"eta": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
-                     "s_noise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
-                      }
-               }
-    RETURN_TYPES = ("SAMPLER",)
-    CATEGORY = "sampling/custom_sampling/samplers"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="SamplerEulerAncestral",
+            category="sampling/custom_sampling/samplers",
+            inputs=[
+                io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False),
+                io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False),
+            ],
+            outputs=[io.Sampler.Output()]
+        )
 
-    FUNCTION = "get_sampler"
-
-    def get_sampler(self, eta, s_noise):
+    @classmethod
+    def execute(cls, eta, s_noise) -> io.NodeOutput:
         sampler = comfy.samplers.ksampler("euler_ancestral", {"eta": eta, "s_noise": s_noise})
-        return (sampler, )
+        return io.NodeOutput(sampler)
 
-class SamplerEulerAncestralCFGPP:
+    get_sampler = execute
+
+class SamplerEulerAncestralCFGPP(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {
-            "required": {
-                "eta": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step":0.01, "round": False}),
-                "s_noise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step":0.01, "round": False}),
-            }}
-    RETURN_TYPES = ("SAMPLER",)
-    CATEGORY = "sampling/custom_sampling/samplers"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="SamplerEulerAncestralCFGPP",
+            display_name="SamplerEulerAncestralCFG++",
+            category="sampling/custom_sampling/samplers",
+            inputs=[
+                io.Float.Input("eta", default=1.0, min=0.0, max=1.0, step=0.01, round=False),
+                io.Float.Input("s_noise", default=1.0, min=0.0, max=10.0, step=0.01, round=False),
+            ],
+            outputs=[io.Sampler.Output()]
+        )
 
-    FUNCTION = "get_sampler"
-
-    def get_sampler(self, eta, s_noise):
+    @classmethod
+    def execute(cls, eta, s_noise) -> io.NodeOutput:
         sampler = comfy.samplers.ksampler(
             "euler_ancestral_cfg_pp",
             {"eta": eta, "s_noise": s_noise})
-        return (sampler, )
+        return io.NodeOutput(sampler)
 
-class SamplerLMS:
+    get_sampler = execute
+
+class SamplerLMS(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"order": ("INT", {"default": 4, "min": 1, "max": 100}),
-                      }
-               }
-    RETURN_TYPES = ("SAMPLER",)
-    CATEGORY = "sampling/custom_sampling/samplers"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="SamplerLMS",
+            category="sampling/custom_sampling/samplers",
+            inputs=[io.Int.Input("order", default=4, min=1, max=100)],
+            outputs=[io.Sampler.Output()]
+        )
 
-    FUNCTION = "get_sampler"
-
-    def get_sampler(self, order):
+    @classmethod
+    def execute(cls, order) -> io.NodeOutput:
         sampler = comfy.samplers.ksampler("lms", {"order": order})
-        return (sampler, )
+        return io.NodeOutput(sampler)
 
-class SamplerDPMAdaptative:
+    get_sampler = execute
+
+class SamplerDPMAdaptative(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"order": ("INT", {"default": 3, "min": 2, "max": 3}),
-                     "rtol": ("FLOAT", {"default": 0.05, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
-                     "atol": ("FLOAT", {"default": 0.0078, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
-                     "h_init": ("FLOAT", {"default": 0.05, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
-                     "pcoeff": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
-                     "icoeff": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
-                     "dcoeff": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
-                     "accept_safety": ("FLOAT", {"default": 0.81, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
-                     "eta": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
-                     "s_noise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step":0.01, "round": False}),
-                      }
-               }
-    RETURN_TYPES = ("SAMPLER",)
-    CATEGORY = "sampling/custom_sampling/samplers"
+    def define_schema(cls):
+        return io.Schema(
+            node_id="SamplerDPMAdaptative",
+            category="sampling/custom_sampling/samplers",
+            inputs=[
+                io.Int.Input("order", default=3, min=2, max=3),
+                io.Float.Input("rtol", default=0.05, min=0.0, max=100.0, step=0.01, round=False),
+                io.Float.Input("atol", default=0.0078, min=0.0, max=100.0, step=0.01, round=False),
+                io.Float.Input("h_init", default=0.05, min=0.0, max=100.0, step=0.01, round=False),
+                io.Float.Input("pcoeff", default=0.0, min=0.0, max=100.0, step=0.01, round=False),
+                io.Float.Input("icoeff", default=1.0, min=0.0, max=100.0, step=0.01, round=False),
+                io.Float.Input("dcoeff", default=0.0, min=0.0, max=100.0, step=0.01, round=False),
+                io.Float.Input("accept_safety", default=0.81, min=0.0, max=100.0, step=0.01, round=False),
+                io.Float.Input("eta", default=0.0, min=0.0, max=100.0, step=0.01, round=False),
+                io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False),
+            ],
+            outputs=[io.Sampler.Output()]
+        )
 
-    FUNCTION = "get_sampler"
-
-    def get_sampler(self, order, rtol, atol, h_init, pcoeff, icoeff, dcoeff, accept_safety, eta, s_noise):
+    @classmethod
+    def execute(cls, order, rtol, atol, h_init, pcoeff, icoeff, dcoeff, accept_safety, eta, s_noise) -> io.NodeOutput:
         sampler = comfy.samplers.ksampler("dpm_adaptive", {"order": order, "rtol": rtol, "atol": atol, "h_init": h_init, "pcoeff": pcoeff,
                                                               "icoeff": icoeff, "dcoeff": dcoeff, "accept_safety": accept_safety, "eta": eta,
                                                               "s_noise":s_noise })
-        return (sampler, )
+        return io.NodeOutput(sampler)
+
+    get_sampler = execute
 
 
-class SamplerER_SDE(ComfyNodeABC):
+class SamplerER_SDE(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(cls) -> InputTypeDict:
-        return {
-            "required": {
-                "solver_type": (IO.COMBO, {"options": ["ER-SDE", "Reverse-time SDE", "ODE"]}),
-                "max_stage": (IO.INT, {"default": 3, "min": 1, "max": 3}),
-                "eta": (
-                    IO.FLOAT,
-                    {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.01, "round": False, "tooltip": "Stochastic strength of reverse-time SDE.\nWhen eta=0, it reduces to deterministic ODE. This setting doesn't apply to ER-SDE solver type."},
-                ),
-                "s_noise": (IO.FLOAT, {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.01, "round": False}),
-            }
-        }
+    def define_schema(cls):
+        return io.Schema(
+            node_id="SamplerER_SDE",
+            category="sampling/custom_sampling/samplers",
+            inputs=[
+                io.Combo.Input("solver_type", options=["ER-SDE", "Reverse-time SDE", "ODE"]),
+                io.Int.Input("max_stage", default=3, min=1, max=3),
+                io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False, tooltip="Stochastic strength of reverse-time SDE.\nWhen eta=0, it reduces to deterministic ODE. This setting doesn't apply to ER-SDE solver type."),
+                io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False),
+            ],
+            outputs=[io.Sampler.Output()]
+        )
 
-    RETURN_TYPES = (IO.SAMPLER,)
-    CATEGORY = "sampling/custom_sampling/samplers"
-
-    FUNCTION = "get_sampler"
-
-    def get_sampler(self, solver_type, max_stage, eta, s_noise):
+    @classmethod
+    def execute(cls, solver_type, max_stage, eta, s_noise) -> io.NodeOutput:
         if solver_type == "ODE" or (solver_type == "Reverse-time SDE" and eta == 0):
             eta = 0
             s_noise = 0
@@ -548,32 +610,33 @@ class SamplerER_SDE(ComfyNodeABC):
 
         sampler_name = "er_sde"
         sampler = comfy.samplers.ksampler(sampler_name, {"s_noise": s_noise, "noise_scaler": noise_scaler, "max_stage": max_stage})
-        return (sampler,)
+        return io.NodeOutput(sampler)
+
+    get_sampler = execute
 
 
-class SamplerSASolver(ComfyNodeABC):
+class SamplerSASolver(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(cls) -> InputTypeDict:
-        return {
-            "required": {
-                "model": (IO.MODEL, {}),
-                "eta": (IO.FLOAT, {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01, "round": False},),
-                "sde_start_percent": (IO.FLOAT, {"default": 0.2, "min": 0.0, "max": 1.0, "step": 0.001},),
-                "sde_end_percent": (IO.FLOAT, {"default": 0.8, "min": 0.0, "max": 1.0, "step": 0.001},),
-                "s_noise": (IO.FLOAT, {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.01, "round": False},),
-                "predictor_order": (IO.INT, {"default": 3, "min": 1, "max": 6}),
-                "corrector_order": (IO.INT, {"default": 4, "min": 0, "max": 6}),
-                "use_pece": (IO.BOOLEAN, {}),
-                "simple_order_2": (IO.BOOLEAN, {}),
-            }
-        }
+    def define_schema(cls):
+        return io.Schema(
+            node_id="SamplerSASolver",
+            category="sampling/custom_sampling/samplers",
+            inputs=[
+                io.Model.Input("model"),
+                io.Float.Input("eta", default=1.0, min=0.0, max=10.0, step=0.01, round=False),
+                io.Float.Input("sde_start_percent", default=0.2, min=0.0, max=1.0, step=0.001),
+                io.Float.Input("sde_end_percent", default=0.8, min=0.0, max=1.0, step=0.001),
+                io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False),
+                io.Int.Input("predictor_order", default=3, min=1, max=6),
+                io.Int.Input("corrector_order", default=4, min=0, max=6),
+                io.Boolean.Input("use_pece"),
+                io.Boolean.Input("simple_order_2"),
+            ],
+            outputs=[io.Sampler.Output()]
+        )
 
-    RETURN_TYPES = (IO.SAMPLER,)
-    CATEGORY = "sampling/custom_sampling/samplers"
-
-    FUNCTION = "get_sampler"
-
-    def get_sampler(self, model, eta, sde_start_percent, sde_end_percent, s_noise, predictor_order, corrector_order, use_pece, simple_order_2):
+    @classmethod
+    def execute(cls, model, eta, sde_start_percent, sde_end_percent, s_noise, predictor_order, corrector_order, use_pece, simple_order_2) -> io.NodeOutput:
         model_sampling = model.get_model_object("model_sampling")
         start_sigma = model_sampling.percent_to_sigma(sde_start_percent)
         end_sigma = model_sampling.percent_to_sigma(sde_end_percent)
@@ -591,7 +654,9 @@ class SamplerSASolver(ComfyNodeABC):
                 "simple_order_2": simple_order_2,
             },
         )
-        return (sampler,)
+        return io.NodeOutput(sampler)
+
+    get_sampler = execute
 
 
 class Noise_EmptyNoise:
@@ -612,30 +677,31 @@ class Noise_RandomNoise:
         batch_inds = input_latent["batch_index"] if "batch_index" in input_latent else None
         return comfy.sample.prepare_noise(latent_image, self.seed, batch_inds)
 
-class SamplerCustom:
+class SamplerCustom(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"model": ("MODEL",),
-                    "add_noise": ("BOOLEAN", {"default": True}),
-                    "noise_seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff, "control_after_generate": True}),
-                    "cfg": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01}),
-                    "positive": ("CONDITIONING", ),
-                    "negative": ("CONDITIONING", ),
-                    "sampler": ("SAMPLER", ),
-                    "sigmas": ("SIGMAS", ),
-                    "latent_image": ("LATENT", ),
-                     }
-                }
+    def define_schema(cls):
+        return io.Schema(
+            node_id="SamplerCustom",
+            category="sampling/custom_sampling",
+            inputs=[
+                io.Model.Input("model"),
+                io.Boolean.Input("add_noise", default=True),
+                io.Int.Input("noise_seed", default=0, min=0, max=0xffffffffffffffff, control_after_generate=True),
+                io.Float.Input("cfg", default=8.0, min=0.0, max=100.0, step=0.1, round=0.01),
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Sampler.Input("sampler"),
+                io.Sigmas.Input("sigmas"),
+                io.Latent.Input("latent_image"),
+            ],
+            outputs=[
+                io.Latent.Output(display_name="output"),
+                io.Latent.Output(display_name="denoised_output"),
+            ]
+        )
 
-    RETURN_TYPES = ("LATENT","LATENT")
-    RETURN_NAMES = ("output", "denoised_output")
-
-    FUNCTION = "sample"
-
-    CATEGORY = "sampling/custom_sampling"
-
-    def sample(self, model, add_noise, noise_seed, cfg, positive, negative, sampler, sigmas, latent_image):
+    @classmethod
+    def execute(cls, model, add_noise, noise_seed, cfg, positive, negative, sampler, sigmas, latent_image) -> io.NodeOutput:
         latent = latent_image
         latent_image = latent["samples"]
         latent = latent.copy()
@@ -664,52 +730,58 @@ class SamplerCustom:
             out_denoised["samples"] = model.model.process_latent_out(x0_output["x0"].cpu())
         else:
             out_denoised = out
-        return (out, out_denoised)
+        return io.NodeOutput(out, out_denoised)
+
+    sample = execute
 
 class Guider_Basic(comfy.samplers.CFGGuider):
     def set_conds(self, positive):
         self.inner_set_conds({"positive": positive})
 
-class BasicGuider:
+class BasicGuider(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"model": ("MODEL",),
-                    "conditioning": ("CONDITIONING", ),
-                     }
-                }
+    def define_schema(cls):
+        return io.Schema(
+            node_id="BasicGuider",
+            category="sampling/custom_sampling/guiders",
+            inputs=[
+                io.Model.Input("model"),
+                io.Conditioning.Input("conditioning"),
+            ],
+            outputs=[io.Guider.Output()]
+        )
 
-    RETURN_TYPES = ("GUIDER",)
-
-    FUNCTION = "get_guider"
-    CATEGORY = "sampling/custom_sampling/guiders"
-
-    def get_guider(self, model, conditioning):
+    @classmethod
+    def execute(cls, model, conditioning) -> io.NodeOutput:
         guider = Guider_Basic(model)
         guider.set_conds(conditioning)
-        return (guider,)
+        return io.NodeOutput(guider)
 
-class CFGGuider:
+    get_guider = execute
+
+class CFGGuider(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"model": ("MODEL",),
-                    "positive": ("CONDITIONING", ),
-                    "negative": ("CONDITIONING", ),
-                    "cfg": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01}),
-                     }
-                }
+    def define_schema(cls):
+        return io.Schema(
+            node_id="CFGGuider",
+            category="sampling/custom_sampling/guiders",
+            inputs=[
+                io.Model.Input("model"),
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Float.Input("cfg", default=8.0, min=0.0, max=100.0, step=0.1, round=0.01),
+            ],
+            outputs=[io.Guider.Output()]
+        )
 
-    RETURN_TYPES = ("GUIDER",)
-
-    FUNCTION = "get_guider"
-    CATEGORY = "sampling/custom_sampling/guiders"
-
-    def get_guider(self, model, positive, negative, cfg):
+    @classmethod
+    def execute(cls, model, positive, negative, cfg) -> io.NodeOutput:
         guider = comfy.samplers.CFGGuider(model)
         guider.set_conds(positive, negative)
         guider.set_cfg(cfg)
-        return (guider,)
+        return io.NodeOutput(guider)
+
+    get_guider = execute
 
 class Guider_DualCFG(comfy.samplers.CFGGuider):
     def set_cfg(self, cfg1, cfg2, nested=False):
@@ -740,84 +812,88 @@ class Guider_DualCFG(comfy.samplers.CFGGuider):
             out = comfy.samplers.calc_cond_batch(self.inner_model, [negative_cond, middle_cond, positive_cond], x, timestep, model_options)
             return comfy.samplers.cfg_function(self.inner_model, out[1], out[0], self.cfg2, x, timestep, model_options=model_options, cond=middle_cond, uncond=negative_cond) + (out[2] - out[1]) * self.cfg1
 
-class DualCFGGuider:
+class DualCFGGuider(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"model": ("MODEL",),
-                    "cond1": ("CONDITIONING", ),
-                    "cond2": ("CONDITIONING", ),
-                    "negative": ("CONDITIONING", ),
-                    "cfg_conds": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01}),
-                    "cfg_cond2_negative": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01}),
-                    "style": (["regular", "nested"],),
-                     }
-                }
+    def define_schema(cls):
+        return io.Schema(
+            node_id="DualCFGGuider",
+            category="sampling/custom_sampling/guiders",
+            inputs=[
+                io.Model.Input("model"),
+                io.Conditioning.Input("cond1"),
+                io.Conditioning.Input("cond2"),
+                io.Conditioning.Input("negative"),
+                io.Float.Input("cfg_conds", default=8.0, min=0.0, max=100.0, step=0.1, round=0.01),
+                io.Float.Input("cfg_cond2_negative", default=8.0, min=0.0, max=100.0, step=0.1, round=0.01),
+                io.Combo.Input("style", options=["regular", "nested"]),
+            ],
+            outputs=[io.Guider.Output()]
+        )
 
-    RETURN_TYPES = ("GUIDER",)
-
-    FUNCTION = "get_guider"
-    CATEGORY = "sampling/custom_sampling/guiders"
-
-    def get_guider(self, model, cond1, cond2, negative, cfg_conds, cfg_cond2_negative, style):
+    @classmethod
+    def execute(cls, model, cond1, cond2, negative, cfg_conds, cfg_cond2_negative, style) -> io.NodeOutput:
         guider = Guider_DualCFG(model)
         guider.set_conds(cond1, cond2, negative)
         guider.set_cfg(cfg_conds, cfg_cond2_negative, nested=(style == "nested"))
-        return (guider,)
+        return io.NodeOutput(guider)
 
-class DisableNoise:
+    get_guider = execute
+
+class DisableNoise(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":{
-                     }
-                }
+    def define_schema(cls):
+        return io.Schema(
+            node_id="DisableNoise",
+            category="sampling/custom_sampling/noise",
+            inputs=[],
+            outputs=[io.Noise.Output()]
+        )
 
-    RETURN_TYPES = ("NOISE",)
-    FUNCTION = "get_noise"
-    CATEGORY = "sampling/custom_sampling/noise"
-
-    def get_noise(self):
-        return (Noise_EmptyNoise(),)
-
-
-class RandomNoise(DisableNoise):
     @classmethod
-    def INPUT_TYPES(s):
-        return {
-            "required": {
-                "noise_seed": ("INT", {
-                    "default": 0,
-                    "min": 0,
-                    "max": 0xffffffffffffffff,
-                    "control_after_generate": True,
-                }),
-            }
-        }
+    def execute(cls) -> io.NodeOutput:
+        return io.NodeOutput(Noise_EmptyNoise())
 
-    def get_noise(self, noise_seed):
-        return (Noise_RandomNoise(noise_seed),)
+    get_noise = execute
 
 
-class SamplerCustomAdvanced:
+class RandomNoise(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"noise": ("NOISE", ),
-                    "guider": ("GUIDER", ),
-                    "sampler": ("SAMPLER", ),
-                    "sigmas": ("SIGMAS", ),
-                    "latent_image": ("LATENT", ),
-                     }
-                }
+    def define_schema(cls):
+        return io.Schema(
+            node_id="RandomNoise",
+            category="sampling/custom_sampling/noise",
+            inputs=[io.Int.Input("noise_seed", default=0, min=0, max=0xffffffffffffffff, control_after_generate=True)],
+            outputs=[io.Noise.Output()]
+        )
 
-    RETURN_TYPES = ("LATENT","LATENT")
-    RETURN_NAMES = ("output", "denoised_output")
+    @classmethod
+    def execute(cls, noise_seed) -> io.NodeOutput:
+        return io.NodeOutput(Noise_RandomNoise(noise_seed))
 
-    FUNCTION = "sample"
+    get_noise = execute
 
-    CATEGORY = "sampling/custom_sampling"
 
-    def sample(self, noise, guider, sampler, sigmas, latent_image):
+class SamplerCustomAdvanced(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="SamplerCustomAdvanced",
+            category="sampling/custom_sampling",
+            inputs=[
+                io.Noise.Input("noise"),
+                io.Guider.Input("guider"),
+                io.Sampler.Input("sampler"),
+                io.Sigmas.Input("sigmas"),
+                io.Latent.Input("latent_image"),
+            ],
+            outputs=[
+                io.Latent.Output(display_name="output"),
+                io.Latent.Output(display_name="denoised_output"),
+            ]
+        )
+
+    @classmethod
+    def execute(cls, noise, guider, sampler, sigmas, latent_image) -> io.NodeOutput:
         latent = latent_image
         latent_image = latent["samples"]
         latent = latent.copy()
@@ -842,28 +918,32 @@ class SamplerCustomAdvanced:
             out_denoised["samples"] = guider.model_patcher.model.process_latent_out(x0_output["x0"].cpu())
         else:
             out_denoised = out
-        return (out, out_denoised)
+        return io.NodeOutput(out, out_denoised)
 
-class AddNoise:
+    sample = execute
+
+class AddNoise(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required":
-                    {"model": ("MODEL",),
-                     "noise": ("NOISE", ),
-                     "sigmas": ("SIGMAS", ),
-                     "latent_image": ("LATENT", ),
-                     }
-                }
+    def define_schema(cls):
+        return io.Schema(
+            node_id="AddNoise",
+            category="_for_testing/custom_sampling/noise",
+            is_experimental=True,
+            inputs=[
+                io.Model.Input("model"),
+                io.Noise.Input("noise"),
+                io.Sigmas.Input("sigmas"),
+                io.Latent.Input("latent_image"),
+            ],
+            outputs=[
+                io.Latent.Output(),
+            ]
+        )
 
-    RETURN_TYPES = ("LATENT",)
-
-    FUNCTION = "add_noise"
-
-    CATEGORY = "_for_testing/custom_sampling/noise"
-
-    def add_noise(self, model, noise, sigmas, latent_image):
+    @classmethod
+    def execute(cls, model, noise, sigmas, latent_image) -> io.NodeOutput:
         if len(sigmas) == 0:
-            return latent_image
+            return io.NodeOutput(latent_image)
 
         latent = latent_image
         latent_image = latent["samples"]
@@ -887,46 +967,50 @@ class AddNoise:
 
         out = latent.copy()
         out["samples"] = noisy
-        return (out,)
+        return io.NodeOutput(out)
+
+    add_noise = execute
 
 
-NODE_CLASS_MAPPINGS = {
-    "SamplerCustom": SamplerCustom,
-    "BasicScheduler": BasicScheduler,
-    "KarrasScheduler": KarrasScheduler,
-    "ExponentialScheduler": ExponentialScheduler,
-    "PolyexponentialScheduler": PolyexponentialScheduler,
-    "LaplaceScheduler": LaplaceScheduler,
-    "VPScheduler": VPScheduler,
-    "BetaSamplingScheduler": BetaSamplingScheduler,
-    "SDTurboScheduler": SDTurboScheduler,
-    "KSamplerSelect": KSamplerSelect,
-    "SamplerEulerAncestral": SamplerEulerAncestral,
-    "SamplerEulerAncestralCFGPP": SamplerEulerAncestralCFGPP,
-    "SamplerLMS": SamplerLMS,
-    "SamplerDPMPP_3M_SDE": SamplerDPMPP_3M_SDE,
-    "SamplerDPMPP_2M_SDE": SamplerDPMPP_2M_SDE,
-    "SamplerDPMPP_SDE": SamplerDPMPP_SDE,
-    "SamplerDPMPP_2S_Ancestral": SamplerDPMPP_2S_Ancestral,
-    "SamplerDPMAdaptative": SamplerDPMAdaptative,
-    "SamplerER_SDE": SamplerER_SDE,
-    "SamplerSASolver": SamplerSASolver,
-    "SplitSigmas": SplitSigmas,
-    "SplitSigmasDenoise": SplitSigmasDenoise,
-    "FlipSigmas": FlipSigmas,
-    "SetFirstSigma": SetFirstSigma,
-    "ExtendIntermediateSigmas": ExtendIntermediateSigmas,
-    "SamplingPercentToSigma": SamplingPercentToSigma,
+class CustomSamplersExtension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [
+            SamplerCustom,
+            BasicScheduler,
+            KarrasScheduler,
+            ExponentialScheduler,
+            PolyexponentialScheduler,
+            LaplaceScheduler,
+            VPScheduler,
+            BetaSamplingScheduler,
+            SDTurboScheduler,
+            KSamplerSelect,
+            SamplerEulerAncestral,
+            SamplerEulerAncestralCFGPP,
+            SamplerLMS,
+            SamplerDPMPP_3M_SDE,
+            SamplerDPMPP_2M_SDE,
+            SamplerDPMPP_SDE,
+            SamplerDPMPP_2S_Ancestral,
+            SamplerDPMAdaptative,
+            SamplerER_SDE,
+            SamplerSASolver,
+            SplitSigmas,
+            SplitSigmasDenoise,
+            FlipSigmas,
+            SetFirstSigma,
+            ExtendIntermediateSigmas,
+            SamplingPercentToSigma,
+            CFGGuider,
+            DualCFGGuider,
+            BasicGuider,
+            RandomNoise,
+            DisableNoise,
+            AddNoise,
+            SamplerCustomAdvanced,
+        ]
 
-    "CFGGuider": CFGGuider,
-    "DualCFGGuider": DualCFGGuider,
-    "BasicGuider": BasicGuider,
-    "RandomNoise": RandomNoise,
-    "DisableNoise": DisableNoise,
-    "AddNoise": AddNoise,
-    "SamplerCustomAdvanced": SamplerCustomAdvanced,
-}
 
-NODE_DISPLAY_NAME_MAPPINGS = {
-    "SamplerEulerAncestralCFGPP": "SamplerEulerAncestralCFG++",
-}
+async def comfy_entrypoint() -> CustomSamplersExtension:
+    return CustomSamplersExtension()
diff --git a/comfy_extras/nodes_dataset.py b/comfy_extras/nodes_dataset.py
new file mode 100644
index 000000000..4789d7d53
--- /dev/null
+++ b/comfy_extras/nodes_dataset.py
@@ -0,0 +1,1432 @@
+import logging
+import os
+import json
+
+import numpy as np
+import torch
+from PIL import Image
+from typing_extensions import override
+
+import folder_paths
+import node_helpers
+from comfy_api.latest import ComfyExtension, io
+
+
+def load_and_process_images(image_files, input_dir):
+    """Utility function to load and process a list of images.
+
+    Args:
+        image_files: List of image filenames
+        input_dir: Base directory containing the images
+
+    Returns:
+        list[torch.Tensor]: One float32 RGB tensor of shape [1, H, W, 3]
+            per input file, with values scaled to the 0-1 range
+    """
+    if not image_files:
+        raise ValueError("No valid images found in input")
+
+    output_images = []
+
+    for file in image_files:
+        image_path = os.path.join(input_dir, file)
+        img = node_helpers.pillow(Image.open, image_path)
+
+        if img.mode == "I":
+            img = img.point(lambda i: i * (1 / 255))
+        img = img.convert("RGB")
+        img_array = np.array(img).astype(np.float32) / 255.0
+        img_tensor = torch.from_numpy(img_array)[None,]
+        output_images.append(img_tensor)
+
+    return output_images
+
+
+class LoadImageDataSetFromFolderNode(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="LoadImageDataSetFromFolder",
+            display_name="Load Image Dataset from Folder",
+            category="dataset",
+            is_experimental=True,
+            inputs=[
+                io.Combo.Input(
+                    "folder",
+                    options=folder_paths.get_input_subfolders(),
+                    tooltip="The folder to load images from.",
+                )
+            ],
+            outputs=[
+                io.Image.Output(
+                    display_name="images",
+                    is_output_list=True,
+                    tooltip="List of loaded images",
+                )
+            ],
+        )
+
+    @classmethod
+    def execute(cls, folder):
+        sub_input_dir = os.path.join(folder_paths.get_input_directory(), folder)
+        valid_extensions = [".png", ".jpg", ".jpeg", ".webp"]
+        image_files = [
+            f
+            for f in os.listdir(sub_input_dir)
+            if any(f.lower().endswith(ext) for ext in valid_extensions)
+        ]
+        output_tensor = load_and_process_images(image_files, sub_input_dir)
+        return io.NodeOutput(output_tensor)
+
+
+class LoadImageTextDataSetFromFolderNode(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="LoadImageTextDataSetFromFolder",
+            display_name="Load Image and Text Dataset from Folder",
+            category="dataset",
+            is_experimental=True,
+            inputs=[
+                io.Combo.Input(
+                    "folder",
+                    options=folder_paths.get_input_subfolders(),
+                    tooltip="The folder to load images from.",
+                )
+            ],
+            outputs=[
+                io.Image.Output(
+                    display_name="images",
+                    is_output_list=True,
+                    tooltip="List of loaded images",
+                ),
+                io.String.Output(
+                    display_name="texts",
+                    is_output_list=True,
+                    tooltip="List of text captions",
+                ),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, folder):
+        """Return the images under ``folder`` and one caption per image.
+
+        Sub-directories are scanned one level deep; a leading ``<N>_`` in the
+        directory name repeats its images N times (kohya-ss/sd-scripts layout).
+        A missing caption file yields an empty string.
+        """
+        logging.info(f"Loading images from folder: {folder}")
+        sub_input_dir = os.path.join(folder_paths.get_input_directory(), folder)
+        valid_extensions = (".png", ".jpg", ".jpeg", ".webp")
+
+        image_files = []
+        for item in os.listdir(sub_input_dir):
+            path = os.path.join(sub_input_dir, item)
+            if item.lower().endswith(valid_extensions):
+                image_files.append(path)
+            elif os.path.isdir(path):
+                # Support kohya-ss/sd-scripts folder structure
+                prefix = item.split("_")[0]
+                repeat = int(prefix) if prefix.isdigit() else 1
+                nested = [
+                    os.path.join(path, f)
+                    for f in os.listdir(path)
+                    if f.lower().endswith(valid_extensions)
+                ]
+                image_files.extend(nested * repeat)
+
+        captions = []
+        for image_file in image_files:
+            # Entries are already joined onto sub_input_dir. Swap only the
+            # trailing extension; str.replace would also hit earlier matches.
+            caption_path = os.path.splitext(image_file)[0] + ".txt"
+            if os.path.exists(caption_path):
+                with open(caption_path, "r", encoding="utf-8") as f:
+                    captions.append(f.read().strip())
+            else:
+                captions.append("")
+
+        output_tensor = load_and_process_images(image_files, sub_input_dir)
+
+        logging.info(f"Loaded {len(output_tensor)} images from {sub_input_dir}.")
+        return io.NodeOutput(output_tensor, captions)
+
+
+def save_images_to_folder(image_list, output_dir, prefix="image"):
+    """Utility function to save a list of image tensors to disk.
+
+    Args:
+        image_list: List of image tensors (each [1, H, W, C] or [H, W, C] or [C, H, W])
+        output_dir: Directory to save images to
+        prefix: Filename prefix
+
+    Returns:
+        List of saved filenames
+    """
+    os.makedirs(output_dir, exist_ok=True)
+    saved_files = []
+
+    for idx, img_tensor in enumerate(image_list):
+        # Handle different tensor shapes
+        if isinstance(img_tensor, torch.Tensor):
+            # Remove batch dimension if present [1, H, W, C] -> [H, W, C]
+            if img_tensor.dim() == 4 and img_tensor.shape[0] == 1:
+                img_tensor = img_tensor.squeeze(0)
+
+            # If tensor is [C, H, W], permute to [H, W, C]
+            if img_tensor.dim() == 3 and img_tensor.shape[0] in [1, 3, 4]:
+                if (
+                    img_tensor.shape[0] <= 4
+                    and img_tensor.shape[1] > 4
+                    and img_tensor.shape[2] > 4
+                ):
+                    img_tensor = img_tensor.permute(1, 2, 0)
+
+            # Convert to numpy and scale to 0-255
+            img_array = img_tensor.cpu().numpy()
+            img_array = np.clip(img_array * 255.0, 0, 255).astype(np.uint8)
+
+            # Convert to PIL Image
+            img = Image.fromarray(img_array)
+        else:
+            raise ValueError(f"Expected torch.Tensor, got {type(img_tensor)}")
+
+        # Save image
+        filename = f"{prefix}_{idx:05d}.png"
+        filepath = os.path.join(output_dir, filename)
+        img.save(filepath)
+        saved_files.append(filename)
+
+    return saved_files
+
+
+class SaveImageDataSetToFolderNode(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="SaveImageDataSetToFolder",
+            display_name="Save Image Dataset to Folder",
+            category="dataset",
+            is_experimental=True,
+            is_output_node=True,
+            is_input_list=True,  # Receive images as list
+            inputs=[
+                io.Image.Input("images", tooltip="List of images to save."),
+                io.String.Input(
+                    "folder_name",
+                    default="dataset",
+                    tooltip="Name of the folder to save images to (inside output directory).",
+                ),
+                io.String.Input(
+                    "filename_prefix",
+                    default="image",
+                    tooltip="Prefix for saved image filenames.",
+                ),
+            ],
+            outputs=[],
+        )
+
+    @classmethod
+    def execute(cls, images, folder_name, filename_prefix):
+        # Extract scalar values
+        folder_name = folder_name[0]
+        filename_prefix = filename_prefix[0]
+
+        output_dir = os.path.join(folder_paths.get_output_directory(), folder_name)
+        saved_files = save_images_to_folder(images, output_dir, filename_prefix)
+
+        logging.info(f"Saved {len(saved_files)} images to {output_dir}.")
+        return io.NodeOutput()
+
+
+class SaveImageTextDataSetToFolderNode(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="SaveImageTextDataSetToFolder",
+            display_name="Save Image and Text Dataset to Folder",
+            category="dataset",
+            is_experimental=True,
+            is_output_node=True,
+            is_input_list=True,  # Receive both images and texts as lists
+            inputs=[
+                io.Image.Input("images", tooltip="List of images to save."),
+                io.String.Input("texts", tooltip="List of text captions to save."),
+                io.String.Input(
+                    "folder_name",
+                    default="dataset",
+                    tooltip="Name of the folder to save images to (inside output directory).",
+                ),
+                io.String.Input(
+                    "filename_prefix",
+                    default="image",
+                    tooltip="Prefix for saved image filenames.",
+                ),
+            ],
+            outputs=[],
+        )
+
+    @classmethod
+    def execute(cls, images, texts, folder_name, filename_prefix):
+        """Save the images as PNG files and write each paired caption to a same-stem .txt file."""
+        # With is_input_list=True the scalar inputs arrive as lists; unwrap them
+        folder_name = folder_name[0]
+        filename_prefix = filename_prefix[0]
+        output_dir = os.path.join(folder_paths.get_output_directory(), folder_name)
+        saved_files = save_images_to_folder(images, output_dir, filename_prefix)
+
+        # splitext swaps only the final extension, even if the prefix contains ".png"
+        for filename, caption in zip(saved_files, texts):
+            caption_filename = os.path.splitext(filename)[0] + ".txt"
+            caption_path = os.path.join(output_dir, caption_filename)
+            with open(caption_path, "w", encoding="utf-8") as f:
+                f.write(caption)
+
+        logging.info(f"Saved {len(saved_files)} images and captions to {output_dir}.")
+        return io.NodeOutput()
+
+
+# ========== Helper Functions for Transform Nodes ==========
+
+
+def tensor_to_pil(img_tensor):
+    """Convert tensor to PIL Image."""
+    if img_tensor.dim() == 4 and img_tensor.shape[0] == 1:
+        img_tensor = img_tensor.squeeze(0)
+    img_array = (img_tensor.cpu().numpy() * 255).clip(0, 255).astype(np.uint8)
+    return Image.fromarray(img_array)
+
+
+def pil_to_tensor(img):
+    """Convert PIL Image to tensor."""
+    img_array = np.array(img).astype(np.float32) / 255.0
+    return torch.from_numpy(img_array)[None,]
+
+
+# ========== Base Classes for Transform Nodes ==========
+
+
+class ImageProcessingNode(io.ComfyNode):
+    """Base class for image processing nodes that operate on images.
+
+    Child classes should set:
+        node_id: Unique node identifier (required)
+        display_name: Display name (optional, defaults to node_id)
+        description: Node description (optional)
+        extra_inputs: List of additional io.Input objects beyond "images" (optional)
+        is_group_process: None (auto-detect), True (group), or False (individual) (optional)
+        is_output_list: None (auto-detect: list output only for group processing), True, or False (optional)
+
+    Child classes must implement ONE of:
+        _process(cls, image, **kwargs) -> tensor  (for single-item processing)
+        _group_process(cls, images, **kwargs) -> list[tensor]  (for group processing)
+    """
+
+    node_id = None
+    display_name = None
+    description = None
+    extra_inputs = []
+    is_group_process = None  # None = auto-detect, True/False = explicit
+    is_output_list = None  # None = auto-detect based on processing mode
+
+    @classmethod
+    def _detect_processing_mode(cls):
+        """Detect whether this node uses group or individual processing.
+
+        Returns:
+            bool: True if group processing, False if individual processing
+        """
+        # Explicit setting takes precedence
+        if cls.is_group_process is not None:
+            return cls.is_group_process
+
+        # Check which method is overridden by looking at the defining class in MRO
+        base_class = ImageProcessingNode
+
+        # Find which class in MRO defines _process
+        process_definer = None
+        for klass in cls.__mro__:
+            if "_process" in klass.__dict__:
+                process_definer = klass
+                break
+
+        # Find which class in MRO defines _group_process
+        group_definer = None
+        for klass in cls.__mro__:
+            if "_group_process" in klass.__dict__:
+                group_definer = klass
+                break
+
+        # Check what was overridden (not defined in base class)
+        has_process = process_definer is not None and process_definer is not base_class
+        has_group = group_definer is not None and group_definer is not base_class
+
+        if has_process and has_group:
+            raise ValueError(
+                f"{cls.__name__}: Cannot override both _process and _group_process. "
+                "Override only one, or set is_group_process explicitly."
+            )
+        if not has_process and not has_group:
+            raise ValueError(
+                f"{cls.__name__}: Must override either _process or _group_process"
+            )
+
+        return has_group
+
+    @classmethod
+    def define_schema(cls):
+        if cls.node_id is None:
+            raise NotImplementedError(f"{cls.__name__} must set node_id class variable")
+
+        is_group = cls._detect_processing_mode()
+
+        # Auto-detect is_output_list if not explicitly set
+        # Single processing: False (backend collects results into list)
+        # Group processing: True by default (can be False for single-output nodes)
+        output_is_list = (
+            cls.is_output_list if cls.is_output_list is not None else is_group
+        )
+
+        inputs = [
+            io.Image.Input(
+                "images",
+                tooltip=(
+                    "List of images to process." if is_group else "Image to process."
+                ),
+            )
+        ]
+        inputs.extend(cls.extra_inputs)
+
+        return io.Schema(
+            node_id=cls.node_id,
+            display_name=cls.display_name or cls.node_id,
+            category="dataset/image",
+            is_experimental=True,
+            is_input_list=is_group,  # True for group, False for individual
+            inputs=inputs,
+            outputs=[
+                io.Image.Output(
+                    display_name="images",
+                    is_output_list=output_is_list,
+                    tooltip="Processed images",
+                )
+            ],
+        )
+
+    @classmethod
+    def execute(cls, images, **kwargs):
+        """Execute the node. Routes to _process or _group_process based on mode."""
+        is_group = cls._detect_processing_mode()
+
+        # Extract scalar values from lists for parameters
+        params = {}
+        for k, v in kwargs.items():
+            if isinstance(v, list) and len(v) == 1:
+                params[k] = v[0]
+            else:
+                params[k] = v
+
+        if is_group:
+            # Group processing: images is list, call _group_process
+            result = cls._group_process(images, **params)
+        else:
+            # Individual processing: images is single item, call _process
+            result = cls._process(images, **params)
+
+        return io.NodeOutput(result)
+
+    @classmethod
+    def _process(cls, image, **kwargs):
+        """Override this method for single-item processing.
+
+        Args:
+            image: tensor - Single image tensor
+            **kwargs: Additional parameters (already extracted from lists)
+
+        Returns:
+            tensor - Processed image
+        """
+        raise NotImplementedError(f"{cls.__name__} must implement _process method")
+
+    @classmethod
+    def _group_process(cls, images, **kwargs):
+        """Override this method for group processing.
+
+        Args:
+            images: list[tensor] - List of image tensors
+            **kwargs: Additional parameters (already extracted from lists)
+
+        Returns:
+            list[tensor] - Processed images
+        """
+        raise NotImplementedError(
+            f"{cls.__name__} must implement _group_process method"
+        )
+
+
+class TextProcessingNode(io.ComfyNode):
+    """Base class for text processing nodes that operate on texts.
+
+    Child classes should set:
+        node_id: Unique node identifier (required)
+        display_name: Display name (optional, defaults to node_id)
+        description: Node description (optional)
+        extra_inputs: List of additional io.Input objects beyond "texts" (optional)
+        is_group_process: None (auto-detect), True (group), or False (individual) (optional)
+        is_output_list: True (list output) or False (single output); the default None is not auto-detected and is treated as falsy by execute() (optional)
+
+    Child classes must implement ONE of:
+        _process(cls, text, **kwargs) -> str  (for single-item processing)
+        _group_process(cls, texts, **kwargs) -> list[str]  (for group processing)
+    """
+
+    node_id = None
+    display_name = None
+    description = None
+    extra_inputs = []
+    is_group_process = None  # None = auto-detect, True/False = explicit
+    is_output_list = None  # None = not set; passed to the schema as-is and treated as falsy in execute()
+
+    @classmethod
+    def _detect_processing_mode(cls):
+        """Detect whether this node uses group or individual processing.
+
+        Returns:
+            bool: True if group processing, False if individual processing
+        """
+        # Explicit setting takes precedence
+        if cls.is_group_process is not None:
+            return cls.is_group_process
+
+        # Check which method is overridden by looking at the defining class in MRO
+        base_class = TextProcessingNode
+
+        # Find which class in MRO defines _process
+        process_definer = None
+        for klass in cls.__mro__:
+            if "_process" in klass.__dict__:
+                process_definer = klass
+                break
+
+        # Find which class in MRO defines _group_process
+        group_definer = None
+        for klass in cls.__mro__:
+            if "_group_process" in klass.__dict__:
+                group_definer = klass
+                break
+
+        # Check what was overridden (not defined in base class)
+        has_process = process_definer is not None and process_definer is not base_class
+        has_group = group_definer is not None and group_definer is not base_class
+
+        if has_process and has_group:
+            raise ValueError(
+                f"{cls.__name__}: Cannot override both _process and _group_process. "
+                "Override only one, or set is_group_process explicitly."
+            )
+        if not has_process and not has_group:
+            raise ValueError(
+                f"{cls.__name__}: Must override either _process or _group_process"
+            )
+
+        return has_group
+
+    @classmethod
+    def define_schema(cls):
+        if cls.node_id is None:
+            raise NotImplementedError(f"{cls.__name__} must set node_id class variable")
+
+        is_group = cls._detect_processing_mode()
+
+        inputs = [
+            io.String.Input(
+                "texts",
+                tooltip="List of texts to process." if is_group else "Text to process.",
+            )
+        ]
+        inputs.extend(cls.extra_inputs)
+
+        return io.Schema(
+            node_id=cls.node_id,
+            display_name=cls.display_name or cls.node_id,
+            category="dataset/text",
+            is_experimental=True,
+            is_input_list=is_group,  # True for group, False for individual
+            inputs=inputs,
+            outputs=[
+                io.String.Output(
+                    display_name="texts",
+                    is_output_list=cls.is_output_list,
+                    tooltip="Processed texts",
+                )
+            ],
+        )
+
+    @classmethod
+    def execute(cls, texts, **kwargs):
+        """Execute the node. Routes to _process or _group_process based on mode."""
+        is_group = cls._detect_processing_mode()
+
+        # Extract scalar values from lists for parameters
+        params = {}
+        for k, v in kwargs.items():
+            if isinstance(v, list) and len(v) == 1:
+                params[k] = v[0]
+            else:
+                params[k] = v
+
+        if is_group:
+            # Group processing: texts is list, call _group_process
+            result = cls._group_process(texts, **params)
+        else:
+            # Individual processing: texts is single item, call _process
+            result = cls._process(texts, **params)
+
+        # Wrap result based on is_output_list
+        if cls.is_output_list:
+            # Group results are already a list; an individual result is wrapped in one
+            return io.NodeOutput(result if is_group else [result])
+        else:
+            # Single output - wrap in list for NodeOutput
+            return io.NodeOutput([result])
+
+    @classmethod
+    def _process(cls, text, **kwargs):
+        """Override this method for single-item processing.
+
+        Args:
+            text: str - Single text string
+            **kwargs: Additional parameters (already extracted from lists)
+
+        Returns:
+            str - Processed text
+        """
+        raise NotImplementedError(f"{cls.__name__} must implement _process method")
+
+    @classmethod
+    def _group_process(cls, texts, **kwargs):
+        """Override this method for group processing.
+
+        Args:
+            texts: list[str] - List of text strings
+            **kwargs: Additional parameters (already extracted from lists)
+
+        Returns:
+            list[str] - Processed texts
+        """
+        raise NotImplementedError(
+            f"{cls.__name__} must implement _group_process method"
+        )
+
+
+# ========== Image Transform Nodes ==========
+
+
+class ResizeImagesByShorterEdgeNode(ImageProcessingNode):
+    node_id = "ResizeImagesByShorterEdge"
+    display_name = "Resize Images by Shorter Edge"
+    description = "Resize images so that the shorter edge matches the specified length while preserving aspect ratio."
+    extra_inputs = [
+        io.Int.Input(
+            "shorter_edge",
+            default=512,
+            min=1,
+            max=8192,
+            tooltip="Target length for the shorter edge.",
+        ),
+    ]
+
+    @classmethod
+    def _process(cls, image, shorter_edge):
+        """Resize `image` so its shorter side equals `shorter_edge`, keeping the aspect ratio."""
+        img = tensor_to_pil(image)
+        w, h = img.size
+        # Exact integer math: int(49 * (512 / 49)) truncates to 511 in floating point.
+        if w < h:
+            new_w, new_h = shorter_edge, h * shorter_edge // w
+        else:
+            new_w, new_h = w * shorter_edge // h, shorter_edge
+        img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+        return pil_to_tensor(img)
+
+
+class ResizeImagesByLongerEdgeNode(ImageProcessingNode):
+    node_id = "ResizeImagesByLongerEdge"
+    display_name = "Resize Images by Longer Edge"
+    description = "Resize images so that the longer edge matches the specified length while preserving aspect ratio."
+    extra_inputs = [
+        io.Int.Input(
+            "longer_edge",
+            default=1024,
+            min=1,
+            max=8192,
+            tooltip="Target length for the longer edge.",
+        ),
+    ]
+
+    @classmethod
+    def _process(cls, image, longer_edge):
+        """Resize `image` so its longer side equals `longer_edge`, keeping the aspect ratio."""
+        img = tensor_to_pil(image)
+        w, h = img.size
+        # Integer math is exact; max(1, ...) stops very elongated images collapsing to 0 px.
+        if w > h:
+            new_w, new_h = longer_edge, max(1, h * longer_edge // w)
+        else:
+            new_w, new_h = max(1, w * longer_edge // h), longer_edge
+        img = img.resize((new_w, new_h), Image.Resampling.LANCZOS)
+        return pil_to_tensor(img)
+
+
+class CenterCropImagesNode(ImageProcessingNode):
+    node_id = "CenterCropImages"
+    display_name = "Center Crop Images"
+    description = "Center crop all images to the specified dimensions."
+    extra_inputs = [
+        io.Int.Input("width", default=512, min=1, max=8192, tooltip="Crop width."),
+        io.Int.Input("height", default=512, min=1, max=8192, tooltip="Crop height."),
+    ]
+
+    @classmethod
+    def _process(cls, image, width, height):
+        img = tensor_to_pil(image)
+        left = max(0, (img.width - width) // 2)
+        top = max(0, (img.height - height) // 2)
+        right = min(img.width, left + width)
+        bottom = min(img.height, top + height)
+        img = img.crop((left, top, right, bottom))
+        return pil_to_tensor(img)
+
+
+class RandomCropImagesNode(ImageProcessingNode):
+    node_id = "RandomCropImages"
+    display_name = "Random Crop Images"
+    description = (
+        "Randomly crop all images to the specified dimensions (for data augmentation)."
+    )
+    extra_inputs = [
+        io.Int.Input("width", default=512, min=1, max=8192, tooltip="Crop width."),
+        io.Int.Input("height", default=512, min=1, max=8192, tooltip="Crop height."),
+        io.Int.Input(
+            "seed", default=0, min=0, max=0xFFFFFFFFFFFFFFFF, tooltip="Random seed."
+        ),
+    ]
+
+    @classmethod
+    def _process(cls, image, width, height, seed):
+        np.random.seed(seed % (2**32 - 1))
+        img = tensor_to_pil(image)
+        max_left = max(0, img.width - width)
+        max_top = max(0, img.height - height)
+        left = np.random.randint(0, max_left + 1) if max_left > 0 else 0
+        top = np.random.randint(0, max_top + 1) if max_top > 0 else 0
+        right = min(img.width, left + width)
+        bottom = min(img.height, top + height)
+        img = img.crop((left, top, right, bottom))
+        return pil_to_tensor(img)
+
+
+class NormalizeImagesNode(ImageProcessingNode):
+    node_id = "NormalizeImages"
+    display_name = "Normalize Images"
+    description = "Normalize images using mean and standard deviation."
+    extra_inputs = [
+        io.Float.Input(
+            "mean",
+            default=0.5,
+            min=0.0,
+            max=1.0,
+            tooltip="Mean value for normalization.",
+        ),
+        io.Float.Input(
+            "std",
+            default=0.5,
+            min=0.001,
+            max=1.0,
+            tooltip="Standard deviation for normalization.",
+        ),
+    ]
+
+    @classmethod
+    def _process(cls, image, mean, std):
+        return (image - mean) / std
+
+
+class AdjustBrightnessNode(ImageProcessingNode):
+    node_id = "AdjustBrightness"
+    display_name = "Adjust Brightness"
+    description = "Adjust brightness of all images."
+    extra_inputs = [
+        io.Float.Input(
+            "factor",
+            default=1.0,
+            min=0.0,
+            max=2.0,
+            tooltip="Brightness factor. 1.0 = no change, <1.0 = darker, >1.0 = brighter.",
+        ),
+    ]
+
+    @classmethod
+    def _process(cls, image, factor):
+        return (image * factor).clamp(0.0, 1.0)
+
+
+class AdjustContrastNode(ImageProcessingNode):
+    node_id = "AdjustContrast"
+    display_name = "Adjust Contrast"
+    description = "Adjust contrast of all images."
+    extra_inputs = [
+        io.Float.Input(
+            "factor",
+            default=1.0,
+            min=0.0,
+            max=2.0,
+            tooltip="Contrast factor. 1.0 = no change, <1.0 = less contrast, >1.0 = more contrast.",
+        ),
+    ]
+
+    @classmethod
+    def _process(cls, image, factor):
+        return ((image - 0.5) * factor + 0.5).clamp(0.0, 1.0)
+
+
+class ShuffleDatasetNode(ImageProcessingNode):
+    node_id = "ShuffleDataset"
+    display_name = "Shuffle Image Dataset"
+    description = "Randomly shuffle the order of images in the dataset."
+    is_group_process = True  # Requires full list to shuffle
+    extra_inputs = [
+        io.Int.Input(
+            "seed", default=0, min=0, max=0xFFFFFFFFFFFFFFFF, tooltip="Random seed."
+        ),
+    ]
+
+    @classmethod
+    def _group_process(cls, images, seed):
+        np.random.seed(seed % (2**32 - 1))
+        indices = np.random.permutation(len(images))
+        return [images[i] for i in indices]
+
+
+class ShuffleImageTextDatasetNode(io.ComfyNode):
+    """Special node that shuffles both images and texts together."""
+
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="ShuffleImageTextDataset",
+            display_name="Shuffle Image-Text Dataset",
+            category="dataset/image",
+            is_experimental=True,
+            is_input_list=True,
+            inputs=[
+                io.Image.Input("images", tooltip="List of images to shuffle."),
+                io.String.Input("texts", tooltip="List of texts to shuffle."),
+                io.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=0xFFFFFFFFFFFFFFFF,
+                    tooltip="Random seed.",
+                ),
+            ],
+            outputs=[
+                io.Image.Output(
+                    display_name="images",
+                    is_output_list=True,
+                    tooltip="Shuffled images",
+                ),
+                io.String.Output(
+                    display_name="texts", is_output_list=True, tooltip="Shuffled texts"
+                ),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, images, texts, seed):
+        seed = seed[0]  # Extract scalar
+        np.random.seed(seed % (2**32 - 1))
+        indices = np.random.permutation(len(images))
+        shuffled_images = [images[i] for i in indices]
+        shuffled_texts = [texts[i] for i in indices]
+        return io.NodeOutput(shuffled_images, shuffled_texts)
+
+
+# ========== Text Transform Nodes ==========
+
+
+class TextToLowercaseNode(TextProcessingNode):
+    node_id = "TextToLowercase"
+    display_name = "Text to Lowercase"
+    description = "Convert all texts to lowercase."
+
+    @classmethod
+    def _process(cls, text):
+        return text.lower()
+
+
+class TextToUppercaseNode(TextProcessingNode):
+    node_id = "TextToUppercase"
+    display_name = "Text to Uppercase"
+    description = "Convert all texts to uppercase."
+
+    @classmethod
+    def _process(cls, text):
+        return text.upper()
+
+
+class TruncateTextNode(TextProcessingNode):
+    node_id = "TruncateText"
+    display_name = "Truncate Text"
+    description = "Truncate all texts to a maximum length."
+    extra_inputs = [
+        io.Int.Input(
+            "max_length", default=77, min=1, max=10000, tooltip="Maximum text length."
+        ),
+    ]
+
+    @classmethod
+    def _process(cls, text, max_length):
+        return text[:max_length]
+
+
+class AddTextPrefixNode(TextProcessingNode):
+    node_id = "AddTextPrefix"
+    display_name = "Add Text Prefix"
+    description = "Add a prefix to all texts."
+    extra_inputs = [
+        io.String.Input("prefix", default="", tooltip="Prefix to add."),
+    ]
+
+    @classmethod
+    def _process(cls, text, prefix):
+        return prefix + text
+
+
+class AddTextSuffixNode(TextProcessingNode):
+    node_id = "AddTextSuffix"
+    display_name = "Add Text Suffix"
+    description = "Add a suffix to all texts."
+    extra_inputs = [
+        io.String.Input("suffix", default="", tooltip="Suffix to add."),
+    ]
+
+    @classmethod
+    def _process(cls, text, suffix):
+        return text + suffix
+
+
+class ReplaceTextNode(TextProcessingNode):
+    node_id = "ReplaceText"
+    display_name = "Replace Text"
+    description = "Replace text in all texts."
+    extra_inputs = [
+        io.String.Input("find", default="", tooltip="Text to find."),
+        io.String.Input("replace", default="", tooltip="Text to replace with."),
+    ]
+
+    @classmethod
+    def _process(cls, text, find, replace):
+        return text.replace(find, replace)
+
+
+class StripWhitespaceNode(TextProcessingNode):
+    node_id = "StripWhitespace"
+    display_name = "Strip Whitespace"
+    description = "Strip leading and trailing whitespace from all texts."
+
+    @classmethod
+    def _process(cls, text):
+        return text.strip()
+
+
+# ========== Group Processing Example Nodes ==========
+
+
+class ImageDeduplicationNode(ImageProcessingNode):
+    """Remove duplicate or very similar images from the dataset using perceptual hashing."""
+
+    node_id = "ImageDeduplication"
+    display_name = "Image Deduplication"
+    description = "Remove duplicate or very similar images from the dataset."
+    is_group_process = True  # Requires full list to compare images
+    extra_inputs = [
+        io.Float.Input(
+            "similarity_threshold",
+            default=0.95,
+            min=0.0,
+            max=1.0,
+            tooltip="Similarity threshold (0-1). Higher means more similar. Images above this threshold are considered duplicates.",
+        ),
+    ]
+
+    @classmethod
+    def _group_process(cls, images, similarity_threshold):
+        """Remove duplicate images using perceptual hashing."""
+        if len(images) == 0:
+            return []
+
+        # Compute simple perceptual hash for each image
+        def compute_hash(img_tensor):
+            """Compute a simple perceptual hash by resizing to 8x8 and comparing to average."""
+            img = tensor_to_pil(img_tensor)
+            # Resize to 8x8
+            img_small = img.resize((8, 8), Image.Resampling.LANCZOS).convert("L")
+            # Get pixels
+            pixels = list(img_small.getdata())
+            # Compute average
+            avg = sum(pixels) / len(pixels)
+            # Create hash (1 if above average, 0 otherwise)
+            hash_bits = "".join("1" if p > avg else "0" for p in pixels)
+            return hash_bits
+
+        def hamming_distance(hash1, hash2):
+            """Compute Hamming distance between two hash strings."""
+            return sum(c1 != c2 for c1, c2 in zip(hash1, hash2))
+
+        # Compute hashes for all images
+        hashes = [compute_hash(img) for img in images]
+
+        # Find duplicates
+        keep_indices = []
+        for i in range(len(images)):
+            is_duplicate = False
+            for j in keep_indices:
+                # Compare hashes
+                distance = hamming_distance(hashes[i], hashes[j])
+                similarity = 1.0 - (distance / 64.0)  # 64 bits total
+                if similarity >= similarity_threshold:
+                    is_duplicate = True
+                    logging.info(
+                        f"Image {i} is similar to image {j} (similarity: {similarity:.3f}), skipping"
+                    )
+                    break
+
+            if not is_duplicate:
+                keep_indices.append(i)
+
+        # Return only unique images
+        unique_images = [images[i] for i in keep_indices]
+        logging.info(
+            f"Deduplication: kept {len(unique_images)} out of {len(images)} images"
+        )
+        return unique_images
+
+
+class ImageGridNode(ImageProcessingNode):
+    """Combine multiple images into a single grid/collage."""
+
+    node_id = "ImageGrid"
+    display_name = "Image Grid"
+    description = "Arrange multiple images into a grid layout."
+    is_group_process = True  # Requires full list to create grid
+    is_output_list = False  # Outputs single grid image
+    extra_inputs = [
+        io.Int.Input(
+            "columns",
+            default=4,
+            min=1,
+            max=20,
+            tooltip="Number of columns in the grid.",
+        ),
+        io.Int.Input(
+            "cell_width",
+            default=256,
+            min=32,
+            max=2048,
+            tooltip="Width of each cell in the grid.",
+        ),
+        io.Int.Input(
+            "cell_height",
+            default=256,
+            min=32,
+            max=2048,
+            tooltip="Height of each cell in the grid.",
+        ),
+        io.Int.Input(
+            "padding", default=4, min=0, max=50, tooltip="Padding between images."
+        ),
+    ]
+
+    @classmethod
+    def _group_process(cls, images, columns, cell_width, cell_height, padding):
+        """Arrange images into a grid."""
+        if len(images) == 0:
+            raise ValueError("Cannot create grid from empty image list")
+
+        # Calculate grid dimensions
+        num_images = len(images)
+        rows = (num_images + columns - 1) // columns  # Ceiling division
+
+        # Calculate total grid size
+        grid_width = columns * cell_width + (columns - 1) * padding
+        grid_height = rows * cell_height + (rows - 1) * padding
+
+        # Create blank grid
+        grid = Image.new("RGB", (grid_width, grid_height), (0, 0, 0))
+
+        # Place images
+        for idx, img_tensor in enumerate(images):
+            row = idx // columns
+            col = idx % columns
+
+            # Convert to PIL and resize to cell size
+            img = tensor_to_pil(img_tensor)
+            img = img.resize((cell_width, cell_height), Image.Resampling.LANCZOS)
+
+            # Calculate position
+            x = col * (cell_width + padding)
+            y = row * (cell_height + padding)
+
+            # Paste into grid
+            grid.paste(img, (x, y))
+
+        logging.info(
+            f"Created {columns}x{rows} grid with {num_images} images ({grid_width}x{grid_height})"
+        )
+        return pil_to_tensor(grid)
+
+
+class MergeImageListsNode(ImageProcessingNode):
+    """Merge multiple image lists into a single list."""
+
+    node_id = "MergeImageLists"
+    display_name = "Merge Image Lists"
+    description = "Concatenate multiple image lists into one."
+    is_group_process = True  # Receives images as list
+
+    @classmethod
+    def _group_process(cls, images):
+        """Simply return the images list (already merged by input handling)."""
+        # When multiple list inputs are connected, they're concatenated
+        # For now, this is a simple pass-through
+        logging.info(f"Merged image list contains {len(images)} images")
+        return images
+
+
+class MergeTextListsNode(TextProcessingNode):
+    """Merge multiple text lists into a single list."""
+
+    node_id = "MergeTextLists"
+    display_name = "Merge Text Lists"
+    description = "Concatenate multiple text lists into one."
+    is_group_process = True  # Receives texts as list
+
+    @classmethod
+    def _group_process(cls, texts):
+        """Simply return the texts list (already merged by input handling)."""
+        # When multiple list inputs are connected, they're concatenated
+        # For now, this is a simple pass-through
+        logging.info(f"Merged text list contains {len(texts)} texts")
+        return texts
+
+
+# ========== Training Dataset Nodes ==========
+
+
+class MakeTrainingDataset(io.ComfyNode):
+    """Encode images with VAE and texts with CLIP to create a training dataset."""
+
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="MakeTrainingDataset",
+            display_name="Make Training Dataset",
+            category="dataset",
+            is_experimental=True,
+            is_input_list=True,  # images and texts as lists
+            inputs=[
+                io.Image.Input("images", tooltip="List of images to encode."),
+                io.Vae.Input(
+                    "vae", tooltip="VAE model for encoding images to latents."
+                ),
+                io.Clip.Input(
+                    "clip", tooltip="CLIP model for encoding text to conditioning."
+                ),
+                io.String.Input(
+                    "texts",
+                    optional=True,
+                    tooltip="List of text captions. Can be length n (matching images), 1 (repeated for all), or omitted (uses empty string).",
+                ),
+            ],
+            outputs=[
+                io.Latent.Output(
+                    display_name="latents",
+                    is_output_list=True,
+                    tooltip="List of latent dicts",
+                ),
+                io.Conditioning.Output(
+                    display_name="conditioning",
+                    is_output_list=True,
+                    tooltip="List of conditioning lists",
+                ),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, images, vae, clip, texts=None):
+        # Extract scalars (vae and clip are single values wrapped in lists)
+        vae = vae[0]
+        clip = clip[0]
+
+        # Handle text list
+        num_images = len(images)
+
+        if texts is None or len(texts) == 0:
+            # Treat as [""] for unconditional training
+            texts = [""]
+
+        if len(texts) == 1 and num_images > 1:
+            # Repeat single text for all images
+            texts = texts * num_images
+        elif len(texts) != num_images:
+            raise ValueError(
+                f"Number of texts ({len(texts)}) does not match number of images ({num_images}). "
+                f"Text list should have length {num_images}, 1, or 0."
+            )
+
+        # Encode images with VAE
+        logging.info(f"Encoding {num_images} images with VAE...")
+        latents_list = []  # list[{"samples": tensor}]
+        for img_tensor in images:
+            # img_tensor is [1, H, W, 3]
+            latent_tensor = vae.encode(img_tensor[:, :, :, :3])
+            latents_list.append({"samples": latent_tensor})
+
+        # Encode texts with CLIP
+        logging.info(f"Encoding {len(texts)} texts with CLIP...")
+        conditioning_list = []  # list[list[cond]]
+        for text in texts:
+            if text == "":
+                cond = clip.encode_from_tokens_scheduled(clip.tokenize(""))
+            else:
+                tokens = clip.tokenize(text)
+                cond = clip.encode_from_tokens_scheduled(tokens)
+            conditioning_list.append(cond)
+
+        logging.info(
+            f"Created dataset with {len(latents_list)} latents and {len(conditioning_list)} conditioning."
+        )
+        return io.NodeOutput(latents_list, conditioning_list)
+
+
+class SaveTrainingDataset(io.ComfyNode):
+    """Save encoded training dataset (latents + conditioning) to disk."""
+
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="SaveTrainingDataset",
+            display_name="Save Training Dataset",
+            category="dataset",
+            is_experimental=True,
+            is_output_node=True,
+            is_input_list=True,  # Receive lists
+            inputs=[
+                io.Latent.Input(
+                    "latents",
+                    tooltip="List of latent dicts from MakeTrainingDataset.",
+                ),
+                io.Conditioning.Input(
+                    "conditioning",
+                    tooltip="List of conditioning lists from MakeTrainingDataset.",
+                ),
+                io.String.Input(
+                    "folder_name",
+                    default="training_dataset",
+                    tooltip="Name of folder to save dataset (inside output directory).",
+                ),
+                io.Int.Input(
+                    "shard_size",
+                    default=1000,
+                    min=1,
+                    max=100000,
+                    tooltip="Number of samples per shard file.",
+                ),
+            ],
+            outputs=[],
+        )
+
+    @classmethod
+    def execute(cls, latents, conditioning, folder_name, shard_size):
+        # Extract scalars
+        folder_name = folder_name[0]
+        shard_size = shard_size[0]
+
+        # latents: list[{"samples": tensor}]
+        # conditioning: list[list[cond]]
+
+        # Validate lengths match
+        if len(latents) != len(conditioning):
+            raise ValueError(
+                f"Number of latents ({len(latents)}) does not match number of conditions ({len(conditioning)}). "
+                f"Something went wrong in dataset preparation."
+            )
+
+        # Create output directory
+        output_dir = os.path.join(folder_paths.get_output_directory(), folder_name)
+        os.makedirs(output_dir, exist_ok=True)
+
+        # Prepare data pairs
+        num_samples = len(latents)
+        num_shards = (num_samples + shard_size - 1) // shard_size  # Ceiling division
+
+        logging.info(
+            f"Saving {num_samples} samples to {num_shards} shards in {output_dir}..."
+        )
+
+        # Save data in shards
+        for shard_idx in range(num_shards):
+            start_idx = shard_idx * shard_size
+            end_idx = min(start_idx + shard_size, num_samples)
+
+            # Get shard data (list of latent dicts and conditioning lists)
+            shard_data = {
+                "latents": latents[start_idx:end_idx],
+                "conditioning": conditioning[start_idx:end_idx],
+            }
+
+            # Save shard
+            shard_filename = f"shard_{shard_idx:04d}.pkl"
+            shard_path = os.path.join(output_dir, shard_filename)
+
+            with open(shard_path, "wb") as f:
+                torch.save(shard_data, f)
+
+            logging.info(
+                f"Saved shard {shard_idx + 1}/{num_shards}: {shard_filename} ({end_idx - start_idx} samples)"
+            )
+
+        # Save metadata
+        metadata = {
+            "num_samples": num_samples,
+            "num_shards": num_shards,
+            "shard_size": shard_size,
+        }
+        metadata_path = os.path.join(output_dir, "metadata.json")
+        with open(metadata_path, "w") as f:
+            json.dump(metadata, f, indent=2)
+
+        logging.info(f"Successfully saved {num_samples} samples to {output_dir}.")
+        return io.NodeOutput()
+
+
+class LoadTrainingDataset(io.ComfyNode):
+    """Load encoded training dataset from disk."""
+
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="LoadTrainingDataset",
+            display_name="Load Training Dataset",
+            category="dataset",
+            is_experimental=True,
+            inputs=[
+                io.String.Input(
+                    "folder_name",
+                    default="training_dataset",
+                    tooltip="Name of folder containing the saved dataset (inside output directory).",
+                ),
+            ],
+            outputs=[
+                io.Latent.Output(
+                    display_name="latents",
+                    is_output_list=True,
+                    tooltip="List of latent dicts",
+                ),
+                io.Conditioning.Output(
+                    display_name="conditioning",
+                    is_output_list=True,
+                    tooltip="List of conditioning lists",
+                ),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, folder_name):
+        # Get dataset directory
+        dataset_dir = os.path.join(folder_paths.get_output_directory(), folder_name)
+
+        if not os.path.exists(dataset_dir):
+            raise ValueError(f"Dataset directory not found: {dataset_dir}")
+
+        # Find all shard files
+        shard_files = sorted(
+            [
+                f
+                for f in os.listdir(dataset_dir)
+                if f.startswith("shard_") and f.endswith(".pkl")
+            ]
+        )
+
+        if not shard_files:
+            raise ValueError(f"No shard files found in {dataset_dir}")
+
+        logging.info(f"Loading {len(shard_files)} shards from {dataset_dir}...")
+
+        # Load all shards
+        all_latents = []  # list[{"samples": tensor}]
+        all_conditioning = []  # list[list[cond]]
+
+        for shard_file in shard_files:
+            shard_path = os.path.join(dataset_dir, shard_file)
+
+            with open(shard_path, "rb") as f:
+                shard_data = torch.load(f, weights_only=True)
+
+            all_latents.extend(shard_data["latents"])
+            all_conditioning.extend(shard_data["conditioning"])
+
+            logging.info(f"Loaded {shard_file}: {len(shard_data['latents'])} samples")
+
+        logging.info(
+            f"Successfully loaded {len(all_latents)} samples from {dataset_dir}."
+        )
+        return io.NodeOutput(all_latents, all_conditioning)
+
+
+# ========== Extension Setup ==========
+
+
+class DatasetExtension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [
+            # Data loading/saving nodes
+            LoadImageDataSetFromFolderNode,
+            LoadImageTextDataSetFromFolderNode,
+            SaveImageDataSetToFolderNode,
+            SaveImageTextDataSetToFolderNode,
+            # Image transform nodes
+            ResizeImagesByShorterEdgeNode,
+            ResizeImagesByLongerEdgeNode,
+            CenterCropImagesNode,
+            RandomCropImagesNode,
+            NormalizeImagesNode,
+            AdjustBrightnessNode,
+            AdjustContrastNode,
+            ShuffleDatasetNode,
+            ShuffleImageTextDatasetNode,
+            # Text transform nodes
+            TextToLowercaseNode,
+            TextToUppercaseNode,
+            TruncateTextNode,
+            AddTextPrefixNode,
+            AddTextSuffixNode,
+            ReplaceTextNode,
+            StripWhitespaceNode,
+            # Group processing examples
+            ImageDeduplicationNode,
+            ImageGridNode,
+            MergeImageListsNode,
+            MergeTextListsNode,
+            # Training dataset nodes
+            MakeTrainingDataset,
+            SaveTrainingDataset,
+            LoadTrainingDataset,
+        ]
+
+
+async def comfy_entrypoint() -> DatasetExtension:
+    return DatasetExtension()
diff --git a/comfy_extras/nodes_easycache.py b/comfy_extras/nodes_easycache.py
index 1359e2f99..11b23ffdb 100644
--- a/comfy_extras/nodes_easycache.py
+++ b/comfy_extras/nodes_easycache.py
@@ -11,13 +11,13 @@ if TYPE_CHECKING:
 
 def easycache_forward_wrapper(executor, *args, **kwargs):
     # get values from args
-    x: torch.Tensor = args[0]
     transformer_options: dict[str] = args[-1]
     if not isinstance(transformer_options, dict):
         transformer_options = kwargs.get("transformer_options")
         if not transformer_options:
             transformer_options = args[-2]
     easycache: EasyCacheHolder = transformer_options["easycache"]
+    x: torch.Tensor = args[0][:, :easycache.output_channels]
     sigmas = transformer_options["sigmas"]
     uuids = transformer_options["uuids"]
     if sigmas is not None and easycache.is_past_end_timestep(sigmas):
@@ -82,13 +82,13 @@ def easycache_forward_wrapper(executor, *args, **kwargs):
 
 def lazycache_predict_noise_wrapper(executor, *args, **kwargs):
     # get values from args
-    x: torch.Tensor = args[0]
     timestep: float = args[1]
     model_options: dict[str] = args[2]
     easycache: LazyCacheHolder = model_options["transformer_options"]["easycache"]
     if easycache.is_past_end_timestep(timestep):
         return executor(*args, **kwargs)
     # prepare next x_prev
+    x: torch.Tensor = args[0][:, :easycache.output_channels]
     next_x_prev = x
     input_change = None
     do_easycache = easycache.should_do_easycache(timestep)
@@ -173,7 +173,7 @@ def easycache_sample_wrapper(executor, *args, **kwargs):
 
 
 class EasyCacheHolder:
-    def __init__(self, reuse_threshold: float, start_percent: float, end_percent: float, subsample_factor: int, offload_cache_diff: bool, verbose: bool=False):
+    def __init__(self, reuse_threshold: float, start_percent: float, end_percent: float, subsample_factor: int, offload_cache_diff: bool, verbose: bool=False, output_channels: int=None):
         self.name = "EasyCache"
         self.reuse_threshold = reuse_threshold
         self.start_percent = start_percent
@@ -202,6 +202,7 @@ class EasyCacheHolder:
         self.allow_mismatch = True
         self.cut_from_start = True
         self.state_metadata = None
+        self.output_channels = output_channels
 
     def is_past_end_timestep(self, timestep: float) -> bool:
         return not (timestep[0] > self.end_t).item()
@@ -264,7 +265,7 @@ class EasyCacheHolder:
                     else:
                         slicing.append(slice(None))
                 batch_slice = batch_slice + slicing
-            x[batch_slice] += self.uuid_cache_diffs[uuid].to(x.device)
+            x[tuple(batch_slice)] += self.uuid_cache_diffs[uuid].to(x.device)
         return x
 
     def update_cache_diff(self, output: torch.Tensor, x: torch.Tensor, uuids: list[UUID]):
@@ -283,7 +284,7 @@ class EasyCacheHolder:
                 else:
                     slicing.append(slice(None))
                 skip_dim = False
-            x = x[slicing]
+            x = x[tuple(slicing)]
         diff = output - x
         batch_offset = diff.shape[0] // len(uuids)
         for i, uuid in enumerate(uuids):
@@ -323,7 +324,7 @@ class EasyCacheHolder:
         return self
 
     def clone(self):
-        return EasyCacheHolder(self.reuse_threshold, self.start_percent, self.end_percent, self.subsample_factor, self.offload_cache_diff, self.verbose)
+        return EasyCacheHolder(self.reuse_threshold, self.start_percent, self.end_percent, self.subsample_factor, self.offload_cache_diff, self.verbose, output_channels=self.output_channels)
 
 
 class EasyCacheNode(io.ComfyNode):
@@ -350,7 +351,7 @@ class EasyCacheNode(io.ComfyNode):
     @classmethod
     def execute(cls, model: io.Model.Type, reuse_threshold: float, start_percent: float, end_percent: float, verbose: bool) -> io.NodeOutput:
         model = model.clone()
-        model.model_options["transformer_options"]["easycache"] = EasyCacheHolder(reuse_threshold, start_percent, end_percent, subsample_factor=8, offload_cache_diff=False, verbose=verbose)
+        model.model_options["transformer_options"]["easycache"] = EasyCacheHolder(reuse_threshold, start_percent, end_percent, subsample_factor=8, offload_cache_diff=False, verbose=verbose, output_channels=model.model.latent_format.latent_channels)
         model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.OUTER_SAMPLE, "easycache", easycache_sample_wrapper)
         model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.CALC_COND_BATCH, "easycache", easycache_calc_cond_batch_wrapper)
         model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, "easycache", easycache_forward_wrapper)
@@ -358,7 +359,7 @@ class EasyCacheNode(io.ComfyNode):
 
 
 class LazyCacheHolder:
-    def __init__(self, reuse_threshold: float, start_percent: float, end_percent: float, subsample_factor: int, offload_cache_diff: bool, verbose: bool=False):
+    def __init__(self, reuse_threshold: float, start_percent: float, end_percent: float, subsample_factor: int, offload_cache_diff: bool, verbose: bool=False, output_channels: int=None):
         self.name = "LazyCache"
         self.reuse_threshold = reuse_threshold
         self.start_percent = start_percent
@@ -382,6 +383,7 @@ class LazyCacheHolder:
         self.approx_output_change_rates = []
         self.total_steps_skipped = 0
         self.state_metadata = None
+        self.output_channels = output_channels
 
     def has_cache_diff(self) -> bool:
         return self.cache_diff is not None
@@ -456,7 +458,7 @@ class LazyCacheHolder:
         return self
 
     def clone(self):
-        return LazyCacheHolder(self.reuse_threshold, self.start_percent, self.end_percent, self.subsample_factor, self.offload_cache_diff, self.verbose)
+        return LazyCacheHolder(self.reuse_threshold, self.start_percent, self.end_percent, self.subsample_factor, self.offload_cache_diff, self.verbose, output_channels=self.output_channels)
 
 class LazyCacheNode(io.ComfyNode):
     @classmethod
@@ -482,7 +484,7 @@ class LazyCacheNode(io.ComfyNode):
     @classmethod
     def execute(cls, model: io.Model.Type, reuse_threshold: float, start_percent: float, end_percent: float, verbose: bool) -> io.NodeOutput:
         model = model.clone()
-        model.model_options["transformer_options"]["easycache"] = LazyCacheHolder(reuse_threshold, start_percent, end_percent, subsample_factor=8, offload_cache_diff=False, verbose=verbose)
+        model.model_options["transformer_options"]["easycache"] = LazyCacheHolder(reuse_threshold, start_percent, end_percent, subsample_factor=8, offload_cache_diff=False, verbose=verbose, output_channels=model.model.latent_format.latent_channels)
         model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.OUTER_SAMPLE, "lazycache", easycache_sample_wrapper)
         model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.PREDICT_NOISE, "lazycache", lazycache_predict_noise_wrapper)
         return io.NodeOutput(model)
diff --git a/comfy_extras/nodes_flux.py b/comfy_extras/nodes_flux.py
index ce1b2e89f..d9c4bba81 100644
--- a/comfy_extras/nodes_flux.py
+++ b/comfy_extras/nodes_flux.py
@@ -2,7 +2,10 @@ import node_helpers
 import comfy.utils
 from typing_extensions import override
 from comfy_api.latest import ComfyExtension, io
-
+import comfy.model_management
+import torch
+import math
+import nodes
 
 class CLIPTextEncodeFlux(io.ComfyNode):
     @classmethod
@@ -30,6 +33,27 @@ class CLIPTextEncodeFlux(io.ComfyNode):
 
     encode = execute  # TODO: remove
 
+class EmptyFlux2LatentImage(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="EmptyFlux2LatentImage",
+            display_name="Empty Flux 2 Latent",
+            category="latent",
+            inputs=[
+                io.Int.Input("width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+            ],
+            outputs=[
+                io.Latent.Output(),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, width, height, batch_size=1) -> io.NodeOutput:
+        latent = torch.zeros([batch_size, 128, height // 16, width // 16], device=comfy.model_management.intermediate_device())
+        return io.NodeOutput({"samples": latent})
 
 class FluxGuidance(io.ComfyNode):
     @classmethod
@@ -154,6 +178,58 @@ class FluxKontextMultiReferenceLatentMethod(io.ComfyNode):
     append = execute  # TODO: remove
 
 
+def generalized_time_snr_shift(t, mu: float, sigma: float):
+    return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
+
+
+def compute_empirical_mu(image_seq_len: int, num_steps: int) -> float:
+    a1, b1 = 8.73809524e-05, 1.89833333
+    a2, b2 = 0.00016927, 0.45666666
+
+    if image_seq_len > 4300:
+        mu = a2 * image_seq_len + b2
+        return float(mu)
+
+    m_200 = a2 * image_seq_len + b2
+    m_10 = a1 * image_seq_len + b1
+
+    a = (m_200 - m_10) / 190.0
+    b = m_200 - 200.0 * a
+    mu = a * num_steps + b
+
+    return float(mu)
+
+
+def get_schedule(num_steps: int, image_seq_len: int) -> list[float]:
+    mu = compute_empirical_mu(image_seq_len, num_steps)
+    timesteps = torch.linspace(1, 0, num_steps + 1)
+    timesteps = generalized_time_snr_shift(timesteps, mu, 1.0)
+    return timesteps
+
+
+class Flux2Scheduler(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="Flux2Scheduler",
+            category="sampling/custom_sampling/schedulers",
+            inputs=[
+                io.Int.Input("steps", default=20, min=1, max=4096),
+                io.Int.Input("width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=1),
+                io.Int.Input("height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=1),
+            ],
+            outputs=[
+                io.Sigmas.Output(),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, steps, width, height) -> io.NodeOutput:
+        seq_len = (width * height / (16 * 16))
+        sigmas = get_schedule(steps, round(seq_len))
+        return io.NodeOutput(sigmas)
+
+
 class FluxExtension(ComfyExtension):
     @override
     async def get_node_list(self) -> list[type[io.ComfyNode]]:
@@ -163,6 +239,8 @@ class FluxExtension(ComfyExtension):
             FluxDisableGuidance,
             FluxKontextImageScale,
             FluxKontextMultiReferenceLatentMethod,
+            EmptyFlux2LatentImage,
+            Flux2Scheduler,
         ]
 
 
diff --git a/comfy_extras/nodes_hunyuan.py b/comfy_extras/nodes_hunyuan.py
index f7c34d059..32be182f1 100644
--- a/comfy_extras/nodes_hunyuan.py
+++ b/comfy_extras/nodes_hunyuan.py
@@ -4,7 +4,8 @@ import torch
 import comfy.model_management
 from typing_extensions import override
 from comfy_api.latest import ComfyExtension, io
-
+from comfy.ldm.hunyuan_video.upsampler import HunyuanVideo15SRModel
+import folder_paths
 
 class CLIPTextEncodeHunyuanDiT(io.ComfyNode):
     @classmethod
@@ -37,6 +38,7 @@ class EmptyHunyuanLatentVideo(io.ComfyNode):
     def define_schema(cls):
         return io.Schema(
             node_id="EmptyHunyuanLatentVideo",
+            display_name="Empty HunyuanVideo 1.0 Latent",
             category="latent/video",
             inputs=[
                 io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
@@ -57,6 +59,198 @@ class EmptyHunyuanLatentVideo(io.ComfyNode):
     generate = execute  # TODO: remove
 
 
+class EmptyHunyuanVideo15Latent(EmptyHunyuanLatentVideo):
+    @classmethod
+    def define_schema(cls):
+        schema = super().define_schema()
+        schema.node_id = "EmptyHunyuanVideo15Latent"
+        schema.display_name = "Empty HunyuanVideo 1.5 Latent"
+        return schema
+
+    @classmethod
+    def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput:
+        # Using scale factor of 16 instead of 8
+        latent = torch.zeros([batch_size, 32, ((length - 1) // 4) + 1, height // 16, width // 16], device=comfy.model_management.intermediate_device())
+        return io.NodeOutput({"samples": latent})
+
+
+class HunyuanVideo15ImageToVideo(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="HunyuanVideo15ImageToVideo",
+            category="conditioning/video_models",
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Vae.Input("vae"),
+                io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("length", default=33, min=1, max=nodes.MAX_RESOLUTION, step=4),
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+                io.Image.Input("start_image", optional=True),
+                io.ClipVisionOutput.Input("clip_vision_output", optional=True),
+            ],
+            outputs=[
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
+                io.Latent.Output(display_name="latent"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None) -> io.NodeOutput:
+        latent = torch.zeros([batch_size, 32, ((length - 1) // 4) + 1, height // 16, width // 16], device=comfy.model_management.intermediate_device())
+
+        if start_image is not None:
+            start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
+
+            encoded = vae.encode(start_image[:, :, :, :3])
+            concat_latent_image = torch.zeros((latent.shape[0], 32, latent.shape[2], latent.shape[3], latent.shape[4]), device=comfy.model_management.intermediate_device())
+            concat_latent_image[:, :, :encoded.shape[2], :, :] = encoded
+
+            mask = torch.ones((1, 1, latent.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=start_image.device, dtype=start_image.dtype)
+            mask[:, :, :((start_image.shape[0] - 1) // 4) + 1] = 0.0
+
+            positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": mask})
+            negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent_image, "concat_mask": mask})
+
+        if clip_vision_output is not None:
+            positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
+            negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})
+
+        out_latent = {}
+        out_latent["samples"] = latent
+        return io.NodeOutput(positive, negative, out_latent)
+
+
+class HunyuanVideo15SuperResolution(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="HunyuanVideo15SuperResolution",
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Vae.Input("vae", optional=True),
+                io.Image.Input("start_image", optional=True),
+                io.ClipVisionOutput.Input("clip_vision_output", optional=True),
+                io.Latent.Input("latent"),
+                io.Float.Input("noise_augmentation", default=0.70, min=0.0, max=1.0, step=0.01),
+
+            ],
+            outputs=[
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
+                io.Latent.Output(display_name="latent"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, positive, negative, latent, noise_augmentation, vae=None, start_image=None, clip_vision_output=None) -> io.NodeOutput:
+        in_latent = latent["samples"]
+        in_channels = in_latent.shape[1]
+        cond_latent = torch.zeros([in_latent.shape[0], in_channels * 2 + 2, in_latent.shape[-3], in_latent.shape[-2], in_latent.shape[-1]], device=comfy.model_management.intermediate_device())
+        cond_latent[:, in_channels + 1 : 2 * in_channels + 1] = in_latent
+        cond_latent[:, 2 * in_channels + 1] = 1
+        if start_image is not None:
+            start_image = comfy.utils.common_upscale(start_image.movedim(-1, 1), in_latent.shape[-1] * 16, in_latent.shape[-2] * 16, "bilinear", "center").movedim(1, -1)
+            encoded = vae.encode(start_image[:, :, :, :3])
+            cond_latent[:, :in_channels, :encoded.shape[2], :, :] = encoded
+            cond_latent[:, in_channels + 1, 0] = 1
+
+        positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": cond_latent, "noise_augmentation": noise_augmentation})
+        negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": cond_latent, "noise_augmentation": noise_augmentation})
+        if clip_vision_output is not None:
+            positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
+            negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})
+
+        return io.NodeOutput(positive, negative, latent)
+
+
+class LatentUpscaleModelLoader(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="LatentUpscaleModelLoader",
+            display_name="Load Latent Upscale Model",
+            category="loaders",
+            inputs=[
+                io.Combo.Input("model_name", options=folder_paths.get_filename_list("latent_upscale_models")),
+            ],
+            outputs=[
+                io.LatentUpscaleModel.Output(),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, model_name) -> io.NodeOutput:
+        model_path = folder_paths.get_full_path_or_raise("latent_upscale_models", model_name)
+        sd = comfy.utils.load_torch_file(model_path, safe_load=True)
+
+        if "blocks.0.block.0.conv.weight" in sd:
+            config = {
+                "in_channels": sd["in_conv.conv.weight"].shape[1],
+                "out_channels": sd["out_conv.conv.weight"].shape[0],
+                "hidden_channels": sd["in_conv.conv.weight"].shape[0],
+                "num_blocks": len([k for k in sd.keys() if k.startswith("blocks.") and k.endswith(".block.0.conv.weight")]),
+                "global_residual": False,
+            }
+            model_type = "720p"
+        elif "up.0.block.0.conv1.conv.weight" in sd:
+            sd = {key.replace("nin_shortcut", "nin_shortcut.conv", 1): value for key, value in sd.items()}
+            config = {
+                "z_channels": sd["conv_in.conv.weight"].shape[1],
+                "out_channels": sd["conv_out.conv.weight"].shape[0],
+                "block_out_channels": tuple(sd[f"up.{i}.block.0.conv1.conv.weight"].shape[0] for i in range(len([k for k in sd.keys() if k.startswith("up.") and k.endswith(".block.0.conv1.conv.weight")]))),
+            }
+            model_type = "1080p"
+
+        model = HunyuanVideo15SRModel(model_type, config)
+        model.load_sd(sd)
+
+        return io.NodeOutput(model)
+
+
+class HunyuanVideo15LatentUpscaleWithModel(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="HunyuanVideo15LatentUpscaleWithModel",
+            display_name="Hunyuan Video 15 Latent Upscale With Model",
+            category="latent",
+            inputs=[
+                io.LatentUpscaleModel.Input("model"),
+                io.Latent.Input("samples"),
+                io.Combo.Input("upscale_method", options=["nearest-exact", "bilinear", "area", "bicubic", "bislerp"], default="bilinear"),
+                io.Int.Input("width", default=1280, min=0, max=16384, step=8),
+                io.Int.Input("height", default=720, min=0, max=16384, step=8),
+                io.Combo.Input("crop", options=["disabled", "center"]),
+            ],
+            outputs=[
+                io.Latent.Output(),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, model, samples, upscale_method, width, height, crop) -> io.NodeOutput:
+        if width == 0 and height == 0:
+            return io.NodeOutput(samples)
+        else:
+            if width == 0:
+                height = max(64, height)
+                width = max(64, round(samples["samples"].shape[-1] * height / samples["samples"].shape[-2]))
+            elif height == 0:
+                width = max(64, width)
+                height = max(64, round(samples["samples"].shape[-2] * width / samples["samples"].shape[-1]))
+            else:
+                width = max(64, width)
+                height = max(64, height)
+            s = comfy.utils.common_upscale(samples["samples"], width // 16, height // 16, upscale_method, crop)
+            s = model.resample_latent(s)
+            return io.NodeOutput({"samples": s.cpu().float()})
+
+
 PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = (
     "<|start_header_id|>system<|end_header_id|>\n\n<image>\nDescribe the video by detailing the following aspects according to the reference image: "
     "1. The main content and theme of the video."
@@ -210,6 +404,11 @@ class HunyuanExtension(ComfyExtension):
             CLIPTextEncodeHunyuanDiT,
             TextEncodeHunyuanVideo_ImageToVideo,
             EmptyHunyuanLatentVideo,
+            EmptyHunyuanVideo15Latent,
+            HunyuanVideo15ImageToVideo,
+            HunyuanVideo15SuperResolution,
+            HunyuanVideo15LatentUpscaleWithModel,
+            LatentUpscaleModelLoader,
             HunyuanImageToVideo,
             EmptyHunyuanImageLatent,
             HunyuanRefinerLatent,
diff --git a/comfy_extras/nodes_hunyuan3d.py b/comfy_extras/nodes_hunyuan3d.py
index f6e71e0a8..adca14f62 100644
--- a/comfy_extras/nodes_hunyuan3d.py
+++ b/comfy_extras/nodes_hunyuan3d.py
@@ -7,63 +7,79 @@ from comfy.ldm.modules.diffusionmodules.mmdit import get_1d_sincos_pos_embed_fro
 import folder_paths
 import comfy.model_management
 from comfy.cli_args import args
+from typing_extensions import override
+from comfy_api.latest import ComfyExtension, IO, Types
+from comfy_api.latest._util import MESH, VOXEL  # only for backward compatibility if someone import it from this file (will be removed later) # noqa
 
-class EmptyLatentHunyuan3Dv2:
+
+class EmptyLatentHunyuan3Dv2(IO.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {
-            "required": {
-                "resolution": ("INT", {"default": 3072, "min": 1, "max": 8192}),
-                "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096, "tooltip": "The number of latent images in the batch."}),
-            }
-        }
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="EmptyLatentHunyuan3Dv2",
+            category="latent/3d",
+            inputs=[
+                IO.Int.Input("resolution", default=3072, min=1, max=8192),
+                IO.Int.Input("batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."),
+            ],
+            outputs=[
+                IO.Latent.Output(),
+            ]
+        )
 
-    RETURN_TYPES = ("LATENT",)
-    FUNCTION = "generate"
-
-    CATEGORY = "latent/3d"
-
-    def generate(self, resolution, batch_size):
+    @classmethod
+    def execute(cls, resolution, batch_size) -> IO.NodeOutput:
         latent = torch.zeros([batch_size, 64, resolution], device=comfy.model_management.intermediate_device())
-        return ({"samples": latent, "type": "hunyuan3dv2"}, )
+        return IO.NodeOutput({"samples": latent, "type": "hunyuan3dv2"})
 
-class Hunyuan3Dv2Conditioning:
+    generate = execute  # TODO: remove
+
+
+class Hunyuan3Dv2Conditioning(IO.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"clip_vision_output": ("CLIP_VISION_OUTPUT",),
-                             }}
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="Hunyuan3Dv2Conditioning",
+            category="conditioning/video_models",
+            inputs=[
+                IO.ClipVisionOutput.Input("clip_vision_output"),
+            ],
+            outputs=[
+                IO.Conditioning.Output(display_name="positive"),
+                IO.Conditioning.Output(display_name="negative"),
+            ]
+        )
 
-    RETURN_TYPES = ("CONDITIONING", "CONDITIONING")
-    RETURN_NAMES = ("positive", "negative")
-
-    FUNCTION = "encode"
-
-    CATEGORY = "conditioning/video_models"
-
-    def encode(self, clip_vision_output):
+    @classmethod
+    def execute(cls, clip_vision_output) -> IO.NodeOutput:
         embeds = clip_vision_output.last_hidden_state
         positive = [[embeds, {}]]
         negative = [[torch.zeros_like(embeds), {}]]
-        return (positive, negative)
+        return IO.NodeOutput(positive, negative)
+
+    encode = execute  # TODO: remove
 
 
-class Hunyuan3Dv2ConditioningMultiView:
+class Hunyuan3Dv2ConditioningMultiView(IO.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {},
-                "optional": {"front": ("CLIP_VISION_OUTPUT",),
-                             "left": ("CLIP_VISION_OUTPUT",),
-                             "back": ("CLIP_VISION_OUTPUT",),
-                             "right": ("CLIP_VISION_OUTPUT",), }}
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="Hunyuan3Dv2ConditioningMultiView",
+            category="conditioning/video_models",
+            inputs=[
+                IO.ClipVisionOutput.Input("front", optional=True),
+                IO.ClipVisionOutput.Input("left", optional=True),
+                IO.ClipVisionOutput.Input("back", optional=True),
+                IO.ClipVisionOutput.Input("right", optional=True),
+            ],
+            outputs=[
+                IO.Conditioning.Output(display_name="positive"),
+                IO.Conditioning.Output(display_name="negative"),
+            ]
+        )
 
-    RETURN_TYPES = ("CONDITIONING", "CONDITIONING")
-    RETURN_NAMES = ("positive", "negative")
-
-    FUNCTION = "encode"
-
-    CATEGORY = "conditioning/video_models"
-
-    def encode(self, front=None, left=None, back=None, right=None):
+    @classmethod
+    def execute(cls, front=None, left=None, back=None, right=None) -> IO.NodeOutput:
         all_embeds = [front, left, back, right]
         out = []
         pos_embeds = None
@@ -76,29 +92,35 @@ class Hunyuan3Dv2ConditioningMultiView:
         embeds = torch.cat(out, dim=1)
         positive = [[embeds, {}]]
         negative = [[torch.zeros_like(embeds), {}]]
-        return (positive, negative)
+        return IO.NodeOutput(positive, negative)
+
+    encode = execute  # TODO: remove
 
 
-class VOXEL:
-    def __init__(self, data):
-        self.data = data
-
-class VAEDecodeHunyuan3D:
+class VAEDecodeHunyuan3D(IO.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"samples": ("LATENT", ),
-                             "vae": ("VAE", ),
-                             "num_chunks": ("INT", {"default": 8000, "min": 1000, "max": 500000}),
-                             "octree_resolution": ("INT", {"default": 256, "min": 16, "max": 512}),
-                             }}
-    RETURN_TYPES = ("VOXEL",)
-    FUNCTION = "decode"
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="VAEDecodeHunyuan3D",
+            category="latent/3d",
+            inputs=[
+                IO.Latent.Input("samples"),
+                IO.Vae.Input("vae"),
+                IO.Int.Input("num_chunks", default=8000, min=1000, max=500000),
+                IO.Int.Input("octree_resolution", default=256, min=16, max=512),
+            ],
+            outputs=[
+                IO.Voxel.Output(),
+            ]
+        )
 
-    CATEGORY = "latent/3d"
+    @classmethod
+    def execute(cls, vae, samples, num_chunks, octree_resolution) -> IO.NodeOutput:
+        voxels = Types.VOXEL(vae.decode(samples["samples"], vae_options={"num_chunks": num_chunks, "octree_resolution": octree_resolution}))
+        return IO.NodeOutput(voxels)
+
+    decode = execute  # TODO: remove
 
-    def decode(self, vae, samples, num_chunks, octree_resolution):
-        voxels = VOXEL(vae.decode(samples["samples"], vae_options={"num_chunks": num_chunks, "octree_resolution": octree_resolution}))
-        return (voxels, )
 
 def voxel_to_mesh(voxels, threshold=0.5, device=None):
     if device is None:
@@ -396,24 +418,24 @@ def voxel_to_mesh_surfnet(voxels, threshold=0.5, device=None):
 
     return final_vertices, faces
 
-class MESH:
-    def __init__(self, vertices, faces):
-        self.vertices = vertices
-        self.faces = faces
 
-
-class VoxelToMeshBasic:
+class VoxelToMeshBasic(IO.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"voxel": ("VOXEL", ),
-                             "threshold": ("FLOAT", {"default": 0.6, "min": -1.0, "max": 1.0, "step": 0.01}),
-                             }}
-    RETURN_TYPES = ("MESH",)
-    FUNCTION = "decode"
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="VoxelToMeshBasic",
+            category="3d",
+            inputs=[
+                IO.Voxel.Input("voxel"),
+                IO.Float.Input("threshold", default=0.6, min=-1.0, max=1.0, step=0.01),
+            ],
+            outputs=[
+                IO.Mesh.Output(),
+            ]
+        )
 
-    CATEGORY = "3d"
-
-    def decode(self, voxel, threshold):
+    @classmethod
+    def execute(cls, voxel, threshold) -> IO.NodeOutput:
         vertices = []
         faces = []
         for x in voxel.data:
@@ -421,21 +443,29 @@ class VoxelToMeshBasic:
             vertices.append(v)
             faces.append(f)
 
-        return (MESH(torch.stack(vertices), torch.stack(faces)), )
+        return IO.NodeOutput(Types.MESH(torch.stack(vertices), torch.stack(faces)))
 
-class VoxelToMesh:
+    decode = execute  # TODO: remove
+
+
+class VoxelToMesh(IO.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"voxel": ("VOXEL", ),
-                             "algorithm": (["surface net", "basic"], ),
-                             "threshold": ("FLOAT", {"default": 0.6, "min": -1.0, "max": 1.0, "step": 0.01}),
-                             }}
-    RETURN_TYPES = ("MESH",)
-    FUNCTION = "decode"
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="VoxelToMesh",
+            category="3d",
+            inputs=[
+                IO.Voxel.Input("voxel"),
+                IO.Combo.Input("algorithm", options=["surface net", "basic"]),
+                IO.Float.Input("threshold", default=0.6, min=-1.0, max=1.0, step=0.01),
+            ],
+            outputs=[
+                IO.Mesh.Output(),
+            ]
+        )
 
-    CATEGORY = "3d"
-
-    def decode(self, voxel, algorithm, threshold):
+    @classmethod
+    def execute(cls, voxel, algorithm, threshold) -> IO.NodeOutput:
         vertices = []
         faces = []
 
@@ -449,7 +479,9 @@ class VoxelToMesh:
             vertices.append(v)
             faces.append(f)
 
-        return (MESH(torch.stack(vertices), torch.stack(faces)), )
+        return IO.NodeOutput(Types.MESH(torch.stack(vertices), torch.stack(faces)))
+
+    decode = execute  # TODO: remove
 
 
 def save_glb(vertices, faces, filepath, metadata=None):
@@ -581,31 +613,32 @@ def save_glb(vertices, faces, filepath, metadata=None):
     return filepath
 
 
-class SaveGLB:
+class SaveGLB(IO.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"mesh": ("MESH", ),
-                             "filename_prefix": ("STRING", {"default": "mesh/ComfyUI"}), },
-                "hidden": {"prompt": "PROMPT", "extra_pnginfo": "EXTRA_PNGINFO"}, }
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="SaveGLB",
+            category="3d",
+            is_output_node=True,
+            inputs=[
+                IO.Mesh.Input("mesh"),
+                IO.String.Input("filename_prefix", default="mesh/ComfyUI"),
+            ],
+            hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo]
+        )
 
-    RETURN_TYPES = ()
-    FUNCTION = "save"
-
-    OUTPUT_NODE = True
-
-    CATEGORY = "3d"
-
-    def save(self, mesh, filename_prefix, prompt=None, extra_pnginfo=None):
+    @classmethod
+    def execute(cls, mesh, filename_prefix) -> IO.NodeOutput:
         full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, folder_paths.get_output_directory())
         results = []
 
         metadata = {}
         if not args.disable_metadata:
-            if prompt is not None:
-                metadata["prompt"] = json.dumps(prompt)
-            if extra_pnginfo is not None:
-                for x in extra_pnginfo:
-                    metadata[x] = json.dumps(extra_pnginfo[x])
+            if cls.hidden.prompt is not None:
+                metadata["prompt"] = json.dumps(cls.hidden.prompt)
+            if cls.hidden.extra_pnginfo is not None:
+                for x in cls.hidden.extra_pnginfo:
+                    metadata[x] = json.dumps(cls.hidden.extra_pnginfo[x])
 
         for i in range(mesh.vertices.shape[0]):
             f = f"{filename}_{counter:05}_.glb"
@@ -616,15 +649,22 @@ class SaveGLB:
                 "type": "output"
             })
             counter += 1
-        return {"ui": {"3d": results}}
+        return IO.NodeOutput(ui={"3d": results})
 
 
-NODE_CLASS_MAPPINGS = {
-    "EmptyLatentHunyuan3Dv2": EmptyLatentHunyuan3Dv2,
-    "Hunyuan3Dv2Conditioning": Hunyuan3Dv2Conditioning,
-    "Hunyuan3Dv2ConditioningMultiView": Hunyuan3Dv2ConditioningMultiView,
-    "VAEDecodeHunyuan3D": VAEDecodeHunyuan3D,
-    "VoxelToMeshBasic": VoxelToMeshBasic,
-    "VoxelToMesh": VoxelToMesh,
-    "SaveGLB": SaveGLB,
-}
+class Hunyuan3dExtension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
+        return [
+            EmptyLatentHunyuan3Dv2,
+            Hunyuan3Dv2Conditioning,
+            Hunyuan3Dv2ConditioningMultiView,
+            VAEDecodeHunyuan3D,
+            VoxelToMeshBasic,
+            VoxelToMesh,
+            SaveGLB,
+        ]
+
+
+async def comfy_entrypoint() -> Hunyuan3dExtension:
+    return Hunyuan3dExtension()
diff --git a/comfy_extras/nodes_load_3d.py b/comfy_extras/nodes_load_3d.py
index 899608149..54c66ef68 100644
--- a/comfy_extras/nodes_load_3d.py
+++ b/comfy_extras/nodes_load_3d.py
@@ -7,6 +7,10 @@ from comfy_api.input_impl import VideoFromFile
 
 from pathlib import Path
 
+from PIL import Image
+import numpy as np
+
+import uuid
 
 def normalize_path(path):
     return path.replace('\\', '/')
@@ -34,58 +38,6 @@ class Load3D():
             "height": ("INT", {"default": 1024, "min": 1, "max": 4096, "step": 1}),
         }}
 
-    RETURN_TYPES = ("IMAGE", "MASK", "STRING", "IMAGE", "IMAGE", "LOAD3D_CAMERA", IO.VIDEO)
-    RETURN_NAMES = ("image", "mask", "mesh_path", "normal", "lineart", "camera_info", "recording_video")
-
-    FUNCTION = "process"
-    EXPERIMENTAL = True
-
-    CATEGORY = "3d"
-
-    def process(self, model_file, image, **kwargs):
-        image_path = folder_paths.get_annotated_filepath(image['image'])
-        mask_path = folder_paths.get_annotated_filepath(image['mask'])
-        normal_path = folder_paths.get_annotated_filepath(image['normal'])
-        lineart_path = folder_paths.get_annotated_filepath(image['lineart'])
-
-        load_image_node = nodes.LoadImage()
-        output_image, ignore_mask = load_image_node.load_image(image=image_path)
-        ignore_image, output_mask = load_image_node.load_image(image=mask_path)
-        normal_image, ignore_mask2 = load_image_node.load_image(image=normal_path)
-        lineart_image, ignore_mask3 = load_image_node.load_image(image=lineart_path)
-
-        video = None
-
-        if image['recording'] != "":
-            recording_video_path = folder_paths.get_annotated_filepath(image['recording'])
-
-            video = VideoFromFile(recording_video_path)
-
-        return output_image, output_mask, model_file, normal_image, lineart_image, image['camera_info'], video
-
-class Load3DAnimation():
-    @classmethod
-    def INPUT_TYPES(s):
-        input_dir = os.path.join(folder_paths.get_input_directory(), "3d")
-
-        os.makedirs(input_dir, exist_ok=True)
-
-        input_path = Path(input_dir)
-        base_path = Path(folder_paths.get_input_directory())
-
-        files = [
-            normalize_path(str(file_path.relative_to(base_path)))
-            for file_path in input_path.rglob("*")
-            if file_path.suffix.lower() in {'.gltf', '.glb', '.fbx'}
-        ]
-
-        return {"required": {
-            "model_file": (sorted(files), {"file_upload": True}),
-            "image": ("LOAD_3D_ANIMATION", {}),
-            "width": ("INT", {"default": 1024, "min": 1, "max": 4096, "step": 1}),
-            "height": ("INT", {"default": 1024, "min": 1, "max": 4096, "step": 1}),
-        }}
-
     RETURN_TYPES = ("IMAGE", "MASK", "STRING", "IMAGE", "LOAD3D_CAMERA", IO.VIDEO)
     RETURN_NAMES = ("image", "mask", "mesh_path", "normal", "camera_info", "recording_video")
 
@@ -120,7 +72,8 @@ class Preview3D():
             "model_file": ("STRING", {"default": "", "multiline": False}),
         },
         "optional": {
-            "camera_info": ("LOAD3D_CAMERA", {})
+            "camera_info": ("LOAD3D_CAMERA", {}),
+            "bg_image": ("IMAGE", {})
         }}
 
     OUTPUT_NODE = True
@@ -133,50 +86,33 @@ class Preview3D():
 
     def process(self, model_file, **kwargs):
         camera_info = kwargs.get("camera_info", None)
+        bg_image = kwargs.get("bg_image", None)
+
+        bg_image_path = None
+        if bg_image is not None:
+            # Only the first image of the batch is used. Clip before the uint8 cast:
+            # the cast does not saturate, so out-of-range values would be corrupted.
+            img_array = np.clip(bg_image[0].cpu().numpy() * 255.0, 0, 255).astype(np.uint8)
+            img = Image.fromarray(img_array)
+            temp_dir = folder_paths.get_temp_directory()
+            filename = f"bg_{uuid.uuid4().hex}.png"  # random name avoids clashes between runs
+            img.save(os.path.join(temp_dir, filename), compress_level=1)
+
+            # Report the location as "temp/<filename>" rather than the absolute path just written.
+            bg_image_path = f"temp/{filename}"
 
         return {
             "ui": {
-                "result": [model_file, camera_info]
-            }
-        }
-
-class Preview3DAnimation():
-    @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {
-            "model_file": ("STRING", {"default": "", "multiline": False}),
-        },
-        "optional": {
-            "camera_info": ("LOAD3D_CAMERA", {})
-        }}
-
-    OUTPUT_NODE = True
-    RETURN_TYPES = ()
-
-    CATEGORY = "3d"
-
-    FUNCTION = "process"
-    EXPERIMENTAL = True
-
-    def process(self, model_file, **kwargs):
-        camera_info = kwargs.get("camera_info", None)
-
-        return {
-            "ui": {
-                "result": [model_file, camera_info]
+                "result": [model_file, camera_info, bg_image_path]
             }
         }
 
 NODE_CLASS_MAPPINGS = {
     "Load3D": Load3D,
-    "Load3DAnimation": Load3DAnimation,
     "Preview3D": Preview3D,
-    "Preview3DAnimation": Preview3DAnimation
 }
 
 NODE_DISPLAY_NAME_MAPPINGS = {
-    "Load3D": "Load 3D",
-    "Load3DAnimation": "Load 3D - Animation",
-    "Preview3D": "Preview 3D",
-    "Preview3DAnimation": "Preview 3D - Animation"
+    "Load3D": "Load 3D & Animation",
+    "Preview3D": "Preview 3D & Animation",
 }
diff --git a/comfy_extras/nodes_logic.py b/comfy_extras/nodes_logic.py
new file mode 100644
index 000000000..95a6ba788
--- /dev/null
+++ b/comfy_extras/nodes_logic.py
@@ -0,0 +1,155 @@
+from typing import TypedDict
+from typing_extensions import override
+from comfy_api.latest import ComfyExtension, io
+from comfy_api.latest import _io
+
+
+
+class SwitchNode(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        template = io.MatchType.Template("switch")
+        return io.Schema(
+            node_id="ComfySwitchNode",
+            display_name="Switch",
+            category="logic",
+            is_experimental=True,
+            inputs=[
+                io.Boolean.Input("switch"),
+                io.MatchType.Input("on_false", template=template, lazy=True, optional=True),
+                io.MatchType.Input("on_true", template=template, lazy=True, optional=True),
+            ],
+            outputs=[
+                io.MatchType.Output(template=template, display_name="output"),
+            ],
+        )
+
+    @classmethod
+    def check_lazy_status(cls, switch, on_false=..., on_true=...):
+        # We use ... instead of None, as None is passed for connected-but-unevaluated inputs.
+        # This trick allows us to ignore the value of the switch and still be able to run execute().
+
+        # One of the inputs may be missing, in which case we need to evaluate the other input
+        if on_false is ...:
+            return ["on_true"]
+        if on_true is ...:
+            return ["on_false"]
+        # Normal lazy switch operation
+        if switch and on_true is None:
+            return ["on_true"]
+        if not switch and on_false is None:
+            return ["on_false"]
+
+    @classmethod
+    def validate_inputs(cls, switch, on_false=..., on_true=...):
+        # This check happens before check_lazy_status(), so we can eliminate the case where
+        # both inputs are missing.
+        if on_false is ... and on_true is ...:
+            return "At least one of on_false or on_true must be connected to Switch node"
+        return True
+
+    @classmethod
+    def execute(cls, switch, on_true=..., on_false=...) -> io.NodeOutput:
+        if on_true is ...:
+            return io.NodeOutput(on_false)
+        if on_false is ...:
+            return io.NodeOutput(on_true)
+        return io.NodeOutput(on_true if switch else on_false)
+
+
+class DCTestNode(io.ComfyNode):
+    class DCValues(TypedDict):  # typed view of the "combo" argument read by execute()
+        combo: str  # name of the selected option ("option1" .. "option4")
+        string: str
+        integer: int
+        image: io.Image.Type
+        subcombo: dict[str, object]  # dict needs key and value types; values differ per sub-option
+
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="DCTestNode",
+            display_name="DCTest",
+            category="logic",
+            is_output_node=True,
+            inputs=[_io.DynamicCombo.Input("combo", options=[
+                _io.DynamicCombo.Option("option1", [io.String.Input("string")]),
+                _io.DynamicCombo.Option("option2", [io.Int.Input("integer")]),
+                _io.DynamicCombo.Option("option3", [io.Image.Input("image")]),
+                _io.DynamicCombo.Option("option4", [
+                    _io.DynamicCombo.Input("subcombo", options=[
+                        _io.DynamicCombo.Option("opt1", [io.Float.Input("float_x"), io.Float.Input("float_y")]),
+                        _io.DynamicCombo.Option("opt2", [io.Mask.Input("mask1", optional=True)]),
+                    ])
+                ])]
+            )],
+            outputs=[io.AnyType.Output()],
+        )
+
+    @classmethod
+    def execute(cls, combo: DCValues) -> io.NodeOutput:
+        combo_val = combo["combo"]
+        if combo_val == "option1":
+            return io.NodeOutput(combo["string"])
+        elif combo_val == "option2":
+            return io.NodeOutput(combo["integer"])
+        elif combo_val == "option3":
+            return io.NodeOutput(combo["image"])
+        elif combo_val == "option4":
+            return io.NodeOutput(f"{combo['subcombo']}")
+        else:
+            raise ValueError(f"Invalid combo: {combo_val}")
+
+
+class AutogrowNamesTestNode(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        template = _io.Autogrow.TemplateNames(input=io.Float.Input("float"), names=["a", "b", "c"])
+        return io.Schema(
+            node_id="AutogrowNamesTestNode",
+            display_name="AutogrowNamesTest",
+            category="logic",
+            inputs=[
+                _io.Autogrow.Input("autogrow", template=template)
+            ],
+            outputs=[io.String.Output()],
+        )
+
+    @classmethod
+    def execute(cls, autogrow: _io.Autogrow.Type) -> io.NodeOutput:
+        # Stringify every supplied value, in mapping order, and comma-join them.
+        rendered = map(str, autogrow.values())
+        return io.NodeOutput(",".join(rendered))
+
+class AutogrowPrefixTestNode(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        template = _io.Autogrow.TemplatePrefix(input=io.Float.Input("float"), prefix="float", min=1, max=10)
+        return io.Schema(
+            node_id="AutogrowPrefixTestNode",
+            display_name="AutogrowPrefixTest",
+            category="logic",
+            inputs=[
+                _io.Autogrow.Input("autogrow", template=template)
+            ],
+            outputs=[io.String.Output()],
+        )
+
+    @classmethod
+    def execute(cls, autogrow: _io.Autogrow.Type) -> io.NodeOutput:
+        # Output is the comma-separated str() of each autogrow value, in mapping order.
+        pieces = [str(value) for value in autogrow.values()]
+        return io.NodeOutput(",".join(pieces))
+
+class LogicExtension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [
+            # SwitchNode,
+            # DCTestNode,
+            # AutogrowNamesTestNode,
+            # AutogrowPrefixTestNode,
+        ]
+
+async def comfy_entrypoint() -> LogicExtension:
+    return LogicExtension()
diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py
index 783c59b6b..c61810dbf 100644
--- a/comfy_extras/nodes_model_patch.py
+++ b/comfy_extras/nodes_model_patch.py
@@ -6,6 +6,7 @@ import comfy.ops
 import comfy.model_management
 import comfy.ldm.common_dit
 import comfy.latent_formats
+import comfy.ldm.lumina.controlnet
 
 
 class BlockWiseControlBlock(torch.nn.Module):
@@ -189,6 +190,35 @@ class SigLIPMultiFeatProjModel(torch.nn.Module):
 
         return embedding
 
+def z_image_convert(sd):  # returns a new dict: attention keys renamed, to_q/to_k/to_v weights fused into qkv
+    replace_keys = {".attention.to_out.0.bias": ".attention.out.bias",
+                    ".attention.norm_k.weight": ".attention.k_norm.weight",
+                    ".attention.norm_q.weight": ".attention.q_norm.weight",
+                    ".attention.to_out.0.weight": ".attention.out.weight"
+                    }
+
+    out_sd = {}
+    for k in sorted(sd.keys()):  # sorted order visits to_k, then to_q, then to_v for a given prefix
+        w = sd[k]
+
+        k_out = k
+        if k_out.endswith(".attention.to_k.weight"):
+            cc = [w]  # starts a new group; the to_k key itself is not copied to out_sd
+            continue
+        if k_out.endswith(".attention.to_q.weight"):
+            cc = [w] + cc  # q goes in front of k; relies on the matching to_k having been seen first
+            continue
+        if k_out.endswith(".attention.to_v.weight"):
+            cc = cc + [w]  # group is now [q, k, v]
+            w = torch.cat(cc, dim=0)  # fused weight, stacked along dim 0
+            k_out = k_out.replace(".attention.to_v.weight", ".attention.qkv.weight")
+
+        for r, rr in replace_keys.items():  # plain substring renames; keys without a match pass through
+            k_out = k_out.replace(r, rr)
+        out_sd[k_out] = w
+
+    return out_sd
+
 class ModelPatchLoader:
     @classmethod
     def INPUT_TYPES(s):
@@ -211,6 +241,9 @@ class ModelPatchLoader:
         elif 'feature_embedder.mid_layer_norm.bias' in sd:
             sd = comfy.utils.state_dict_prefix_replace(sd, {"feature_embedder.": ""}, filter_keys=True)
             model = SigLIPMultiFeatProjModel(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast)
+        elif 'control_all_x_embedder.2-1.weight' in sd: # alipai z image fun controlnet
+            sd = z_image_convert(sd)
+            model = comfy.ldm.lumina.controlnet.ZImage_Control(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast)
 
         model.load_state_dict(sd)
         model = comfy.model_patcher.ModelPatcher(model, load_device=comfy.model_management.get_torch_device(), offload_device=comfy.model_management.unet_offload_device())
@@ -263,6 +296,69 @@ class DiffSynthCnetPatch:
     def models(self):
         return [self.model_patch]
 
+class ZImageControlPatch:
+    def __init__(self, model_patch, vae, image, strength):
+        self.model_patch = model_patch
+        self.vae = vae
+        self.image = image
+        self.strength = strength
+        self.encoded_image = self.encode_latent_cond(image)
+        self.encoded_image_size = (image.shape[1], image.shape[2])
+        self.temp_data = None
+
+    def encode_latent_cond(self, image):
+        latent_image = comfy.latent_formats.Flux().process_in(self.vae.encode(image))
+        return latent_image
+
+    def __call__(self, kwargs):  # block patch: adds control output into img in place, returns kwargs without img/txt
+        x = kwargs.get("x")
+        img = kwargs.get("img")
+        txt = kwargs.get("txt")
+        pe = kwargs.get("pe")
+        vec = kwargs.get("vec")
+        block_index = kwargs.get("block_index")
+        spacial_compression = self.vae.spacial_compression_encode()  # scale between x's last two dims and image size
+        if self.encoded_image is None or self.encoded_image_size != (x.shape[-2] * spacial_compression, x.shape[-1] * spacial_compression):
+            image_scaled = comfy.utils.common_upscale(self.image.movedim(-1, 1), x.shape[-1] * spacial_compression, x.shape[-2] * spacial_compression, "area", "center")
+            loaded_models = comfy.model_management.loaded_models(only_currently_used=True)  # snapshot, reloaded below
+            self.encoded_image = self.encode_latent_cond(image_scaled.movedim(1, -1))
+            self.encoded_image_size = (image_scaled.shape[-2], image_scaled.shape[-1])
+            comfy.model_management.load_models_gpu(loaded_models)  # NOTE(review): presumably the encode can unload them — confirm
+
+        cnet_index = (block_index // 5)  # control layer this block maps to
+        cnet_index_float = (block_index / 5)  # integral only when block_index is a multiple of 5
+
+        kwargs.pop("img")  # we do ops in place
+        kwargs.pop("txt")
+
+        cnet_blocks = self.model_patch.model.n_control_layers
+        if cnet_index_float > (cnet_blocks - 1):  # past the last control layer: drop the cache, change nothing
+            self.temp_data = None
+            return kwargs
+
+        if self.temp_data is None or self.temp_data[0] > cnet_index:  # no cache, or cache is ahead of this block: restart
+            self.temp_data = (-1, (None, self.model_patch.model(txt, self.encoded_image.to(img.dtype), pe, vec)))
+
+        while self.temp_data[0] < cnet_index and (self.temp_data[0] + 1) < cnet_blocks:
+            next_layer = self.temp_data[0] + 1  # advance one control block at a time until the cache reaches cnet_index
+            self.temp_data = (next_layer, self.model_patch.model.forward_control_block(next_layer, self.temp_data[1][1], img[:, :self.temp_data[1][1].shape[1]], None, pe, vec))
+
+        if cnet_index_float == self.temp_data[0]:  # inject only on blocks whose index is an exact multiple of 5
+            img[:, :self.temp_data[1][0].shape[1]] += (self.temp_data[1][0] * self.strength)
+            if cnet_blocks == self.temp_data[0] + 1:  # last control layer consumed: clear the cache
+                self.temp_data = None
+
+        return kwargs
+
+    def to(self, device_or_dtype):
+        if isinstance(device_or_dtype, torch.device):
+            self.encoded_image = self.encoded_image.to(device_or_dtype)
+            self.temp_data = None
+        return self
+
+    def models(self):
+        return [self.model_patch]
+
 class QwenImageDiffsynthControlnet:
     @classmethod
     def INPUT_TYPES(s):
@@ -289,7 +385,10 @@ class QwenImageDiffsynthControlnet:
                 mask = mask.unsqueeze(2)
             mask = 1.0 - mask
 
-        model_patched.set_model_double_block_patch(DiffSynthCnetPatch(model_patch, vae, image, strength, mask))
+        if isinstance(model_patch.model, comfy.ldm.lumina.controlnet.ZImage_Control):
+            model_patched.set_model_double_block_patch(ZImageControlPatch(model_patch, vae, image, strength))
+        else:
+            model_patched.set_model_double_block_patch(DiffSynthCnetPatch(model_patch, vae, image, strength, mask))
         return (model_patched,)
 
 
diff --git a/comfy_extras/nodes_nop.py b/comfy_extras/nodes_nop.py
new file mode 100644
index 000000000..953061bcb
--- /dev/null
+++ b/comfy_extras/nodes_nop.py
@@ -0,0 +1,39 @@
+from comfy_api.latest import ComfyExtension, io
+from typing_extensions import override
+# If you write a node that is so useless that it breaks ComfyUI it will be featured in this exclusive list
+
+# "native" block swap nodes are placebo at best and break the ComfyUI memory management system.
+# They are also considered harmful because instead of users reporting issues with the built in
+# memory management they install these stupid nodes and complain even harder. Now it completely
+# breaks with some of the new ComfyUI memory optimizations so I have made the decision to NOP it
+# out of all workflows.
+class wanBlockSwap(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="wanBlockSwap",
+            category="",
+            description="NOP",
+            inputs=[
+                io.Model.Input("model"),
+            ],
+            outputs=[
+                io.Model.Output(),
+            ],
+            is_deprecated=True,
+        )
+
+    @classmethod
+    def execute(cls, model) -> io.NodeOutput:
+        return io.NodeOutput(model)
+
+
+class NopExtension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [
+            wanBlockSwap
+        ]
+
+async def comfy_entrypoint() -> NopExtension:
+    return NopExtension()
diff --git a/comfy_extras/nodes_preview_any.py b/comfy_extras/nodes_preview_any.py
index e749fa6ae..139b07c93 100644
--- a/comfy_extras/nodes_preview_any.py
+++ b/comfy_extras/nodes_preview_any.py
@@ -39,5 +39,5 @@ NODE_CLASS_MAPPINGS = {
 }
 
 NODE_DISPLAY_NAME_MAPPINGS = {
-    "PreviewAny": "Preview Any",
+    "PreviewAny": "Preview as Text",
 }
diff --git a/comfy_extras/nodes_train.py b/comfy_extras/nodes_train.py
index 9e6ec6780..cb24ab709 100644
--- a/comfy_extras/nodes_train.py
+++ b/comfy_extras/nodes_train.py
@@ -1,15 +1,13 @@
-import datetime
-import json
 import logging
 import os
 
 import numpy as np
 import safetensors
 import torch
-from PIL import Image, ImageDraw, ImageFont
-from PIL.PngImagePlugin import PngInfo
 import torch.utils.checkpoint
-import tqdm
+from tqdm.auto import trange
+from PIL import Image, ImageDraw, ImageFont
+from typing_extensions import override
 
 import comfy.samplers
 import comfy.sd
@@ -18,9 +16,9 @@ import comfy.model_management
 import comfy_extras.nodes_custom_sampler
 import folder_paths
 import node_helpers
-from comfy.cli_args import args
-from comfy.comfy_types.node_typing import IO
 from comfy.weight_adapter import adapters, adapter_maps
+from comfy_api.latest import ComfyExtension, io, ui
+from comfy.utils import ProgressBar
 
 
 def make_batch_extra_option_dict(d, indicies, full_size=None):
@@ -56,7 +54,18 @@ def process_cond_list(d, prefix=""):
 
 
 class TrainSampler(comfy.samplers.Sampler):
-    def __init__(self, loss_fn, optimizer, loss_callback=None, batch_size=1, grad_acc=1, total_steps=1, seed=0, training_dtype=torch.bfloat16):
+    def __init__(
+        self,
+        loss_fn,
+        optimizer,
+        loss_callback=None,
+        batch_size=1,
+        grad_acc=1,
+        total_steps=1,
+        seed=0,
+        training_dtype=torch.bfloat16,
+        real_dataset=None,
+    ):
         self.loss_fn = loss_fn
         self.optimizer = optimizer
         self.loss_callback = loss_callback
@@ -65,54 +74,138 @@ class TrainSampler(comfy.samplers.Sampler):
         self.grad_acc = grad_acc
         self.seed = seed
         self.training_dtype = training_dtype
+        self.real_dataset: list[torch.Tensor] | None = real_dataset
 
-    def sample(self, model_wrap, sigmas, extra_args, callback, noise, latent_image=None, denoise_mask=None, disable_pbar=False):
+    def fwd_bwd(  # forward pass for one batch with optional backward; returns the unscaled loss
+        self,
+        model_wrap,
+        batch_sigmas,
+        batch_noise,
+        batch_latent,
+        cond,
+        indicies,
+        extra_args,
+        dataset_size,
+        bwd=True,  # False: no backward() here, the caller handles the returned loss
+    ):
+        xt = model_wrap.inner_model.model_sampling.noise_scaling(  # model input from latent + noise at batch_sigmas
+            batch_sigmas, batch_noise, batch_latent, False
+        )
+        x0 = model_wrap.inner_model.model_sampling.noise_scaling(  # same call with zero sigma and zero noise: the target
+            torch.zeros_like(batch_sigmas),
+            torch.zeros_like(batch_noise),
+            batch_latent,
+            False,
+        )
+
+        model_wrap.conds["positive"] = [cond[i] for i in indicies]  # keep only the conds of the sampled items
+        batch_extra_args = make_batch_extra_option_dict(
+            extra_args, indicies, full_size=dataset_size
+        )
+
+        with torch.autocast(xt.device.type, dtype=self.training_dtype):
+            x0_pred = model_wrap(
+                xt.requires_grad_(True),  # NOTE(review): presumably so checkpointed blocks receive gradients — confirm
+                batch_sigmas.requires_grad_(True),
+                **batch_extra_args,
+            )
+            loss = self.loss_fn(x0_pred, x0)
+        if bwd:
+            bwd_loss = loss / self.grad_acc  # scale the gradient contribution for accumulation over grad_acc steps
+            bwd_loss.backward()
+        return loss  # not divided by grad_acc
+
+    def sample(
+        self,
+        model_wrap,
+        sigmas,
+        extra_args,
+        callback,
+        noise,
+        latent_image=None,
+        denoise_mask=None,
+        disable_pbar=False,
+    ):
         model_wrap.conds = process_cond_list(model_wrap.conds)
         cond = model_wrap.conds["positive"]
         dataset_size = sigmas.size(0)
         torch.cuda.empty_cache()
-        for i in (pbar:=tqdm.trange(self.total_steps, desc="Training LoRA", smoothing=0.01, disable=not comfy.utils.PROGRESS_BAR_ENABLED)):
-            noisegen = comfy_extras.nodes_custom_sampler.Noise_RandomNoise(self.seed + i * 1000)
-            indicies = torch.randperm(dataset_size)[:self.batch_size].tolist()
-
-            batch_latent = torch.stack([latent_image[i] for i in indicies])
-            batch_noise = noisegen.generate_noise({"samples": batch_latent}).to(batch_latent.device)
-            batch_sigmas = [
-                model_wrap.inner_model.model_sampling.percent_to_sigma(
-                    torch.rand((1,)).item()
-                ) for _ in range(min(self.batch_size, dataset_size))
-            ]
-            batch_sigmas = torch.tensor(batch_sigmas).to(batch_latent.device)
-
-            xt = model_wrap.inner_model.model_sampling.noise_scaling(
-                batch_sigmas,
-                batch_noise,
-                batch_latent,
-                False
+        ui_pbar = ProgressBar(self.total_steps)
+        for i in (
+            pbar := trange(
+                self.total_steps,
+                desc="Training LoRA",
+                smoothing=0.01,
+                disable=not comfy.utils.PROGRESS_BAR_ENABLED,
             )
-            x0 = model_wrap.inner_model.model_sampling.noise_scaling(
-                torch.zeros_like(batch_sigmas),
-                torch.zeros_like(batch_noise),
-                batch_latent,
-                False
+        ):
+            noisegen = comfy_extras.nodes_custom_sampler.Noise_RandomNoise(
+                self.seed + i * 1000
             )
+            indicies = torch.randperm(dataset_size)[: self.batch_size].tolist()
 
-            model_wrap.conds["positive"] = [
-                cond[i] for i in indicies
-            ]
-            batch_extra_args = make_batch_extra_option_dict(extra_args, indicies, full_size=dataset_size)
+            if self.real_dataset is None:
+                batch_latent = torch.stack([latent_image[i] for i in indicies])
+                batch_noise = noisegen.generate_noise({"samples": batch_latent}).to(
+                    batch_latent.device
+                )
+                batch_sigmas = [
+                    model_wrap.inner_model.model_sampling.percent_to_sigma(
+                        torch.rand((1,)).item()
+                    )
+                    for _ in range(min(self.batch_size, dataset_size))
+                ]
+                batch_sigmas = torch.tensor(batch_sigmas).to(batch_latent.device)
 
-            with torch.autocast(xt.device.type, dtype=self.training_dtype):
-                x0_pred = model_wrap(xt, batch_sigmas, **batch_extra_args)
-                loss = self.loss_fn(x0_pred, x0)
-            loss.backward()
-            if self.loss_callback:
-                self.loss_callback(loss.item())
-            pbar.set_postfix({"loss": f"{loss.item():.4f}"})
+                loss = self.fwd_bwd(
+                    model_wrap,
+                    batch_sigmas,
+                    batch_noise,
+                    batch_latent,
+                    cond,
+                    indicies,
+                    extra_args,
+                    dataset_size,
+                    bwd=True,
+                )
+                if self.loss_callback:
+                    self.loss_callback(loss.item())
+                pbar.set_postfix({"loss": f"{loss.item():.4f}"})
+            else:
+                total_loss = 0
+                for index in indicies:
+                    single_latent = self.real_dataset[index].to(latent_image)
+                    batch_noise = noisegen.generate_noise(
+                        {"samples": single_latent}
+                    ).to(single_latent.device)
+                    batch_sigmas = (
+                        model_wrap.inner_model.model_sampling.percent_to_sigma(
+                            torch.rand((1,)).item()
+                        )
+                    )
+                    batch_sigmas = torch.tensor([batch_sigmas]).to(single_latent.device)
+                    loss = self.fwd_bwd(
+                        model_wrap,
+                        batch_sigmas,
+                        batch_noise,
+                        single_latent,
+                        cond,
+                        [index],
+                        extra_args,
+                        dataset_size,
+                        bwd=False,
+                    )
+                    total_loss += loss
+                total_loss = total_loss / len(indicies)  # mean per-sample loss, the scale the batched branch reports
+                (total_loss / self.grad_acc).backward()  # only the gradient is scaled for accumulation
+                if self.loss_callback:
+                    self.loss_callback(total_loss.item())
+                pbar.set_postfix({"loss": f"{total_loss.item():.4f}"})
 
-            if (i+1) % self.grad_acc == 0:
+            if (i + 1) % self.grad_acc == 0:
                 self.optimizer.step()
                 self.optimizer.zero_grad()
+            ui_pbar.update(1)  # once per step: the bar was created with total_steps, not total_steps / grad_acc
         torch.cuda.empty_cache()
         return torch.zeros_like(latent_image)
 
@@ -134,233 +227,6 @@ class BiasDiff(torch.nn.Module):
         return self.passive_memory_usage()
 
 
-def load_and_process_images(image_files, input_dir, resize_method="None", w=None, h=None):
-    """Utility function to load and process a list of images.
-
-    Args:
-        image_files: List of image filenames
-        input_dir: Base directory containing the images
-        resize_method: How to handle images of different sizes ("None", "Stretch", "Crop", "Pad")
-
-    Returns:
-        torch.Tensor: Batch of processed images
-    """
-    if not image_files:
-        raise ValueError("No valid images found in input")
-
-    output_images = []
-
-    for file in image_files:
-        image_path = os.path.join(input_dir, file)
-        img = node_helpers.pillow(Image.open, image_path)
-
-        if img.mode == "I":
-            img = img.point(lambda i: i * (1 / 255))
-        img = img.convert("RGB")
-
-        if w is None and h is None:
-            w, h = img.size[0], img.size[1]
-
-        # Resize image to first image
-        if img.size[0] != w or img.size[1] != h:
-            if resize_method == "Stretch":
-                img = img.resize((w, h), Image.Resampling.LANCZOS)
-            elif resize_method == "Crop":
-                img = img.crop((0, 0, w, h))
-            elif resize_method == "Pad":
-                img = img.resize((w, h), Image.Resampling.LANCZOS)
-            elif resize_method == "None":
-                raise ValueError(
-                    "Your input image size does not match the first image in the dataset. Either select a valid resize method or use the same size for all images."
-                )
-
-        img_array = np.array(img).astype(np.float32) / 255.0
-        img_tensor = torch.from_numpy(img_array)[None,]
-        output_images.append(img_tensor)
-
-    return torch.cat(output_images, dim=0)
-
-
-class LoadImageSetNode:
-    @classmethod
-    def INPUT_TYPES(s):
-        return {
-            "required": {
-                "images": (
-                    [
-                        f
-                        for f in os.listdir(folder_paths.get_input_directory())
-                        if f.endswith((".png", ".jpg", ".jpeg", ".webp", ".bmp", ".gif", ".jpe", ".apng", ".tif", ".tiff"))
-                    ],
-                    {"image_upload": True, "allow_batch": True},
-                )
-            },
-            "optional": {
-                "resize_method": (
-                    ["None", "Stretch", "Crop", "Pad"],
-                    {"default": "None"},
-                ),
-            },
-        }
-
-    INPUT_IS_LIST = True
-    RETURN_TYPES = ("IMAGE",)
-    FUNCTION = "load_images"
-    CATEGORY = "loaders"
-    EXPERIMENTAL = True
-    DESCRIPTION = "Loads a batch of images from a directory for training."
-
-    @classmethod
-    def VALIDATE_INPUTS(s, images, resize_method):
-        filenames = images[0] if isinstance(images[0], list) else images
-
-        for image in filenames:
-            if not folder_paths.exists_annotated_filepath(image):
-                return "Invalid image file: {}".format(image)
-        return True
-
-    def load_images(self, input_files, resize_method):
-        input_dir = folder_paths.get_input_directory()
-        valid_extensions = [".png", ".jpg", ".jpeg", ".webp", ".bmp", ".gif", ".jpe", ".apng", ".tif", ".tiff"]
-        image_files = [
-            f
-            for f in input_files
-            if any(f.lower().endswith(ext) for ext in valid_extensions)
-        ]
-        output_tensor = load_and_process_images(image_files, input_dir, resize_method)
-        return (output_tensor,)
-
-
-class LoadImageSetFromFolderNode:
-    @classmethod
-    def INPUT_TYPES(s):
-        return {
-            "required": {
-                "folder": (folder_paths.get_input_subfolders(), {"tooltip": "The folder to load images from."})
-            },
-            "optional": {
-                "resize_method": (
-                    ["None", "Stretch", "Crop", "Pad"],
-                    {"default": "None"},
-                ),
-            },
-        }
-
-    RETURN_TYPES = ("IMAGE",)
-    FUNCTION = "load_images"
-    CATEGORY = "loaders"
-    EXPERIMENTAL = True
-    DESCRIPTION = "Loads a batch of images from a directory for training."
-
-    def load_images(self, folder, resize_method):
-        sub_input_dir = os.path.join(folder_paths.get_input_directory(), folder)
-        valid_extensions = [".png", ".jpg", ".jpeg", ".webp"]
-        image_files = [
-            f
-            for f in os.listdir(sub_input_dir)
-            if any(f.lower().endswith(ext) for ext in valid_extensions)
-        ]
-        output_tensor = load_and_process_images(image_files, sub_input_dir, resize_method)
-        return (output_tensor,)
-
-
-class LoadImageTextSetFromFolderNode:
-    @classmethod
-    def INPUT_TYPES(s):
-        return {
-            "required": {
-                "folder": (folder_paths.get_input_subfolders(), {"tooltip": "The folder to load images from."}),
-                "clip": (IO.CLIP, {"tooltip": "The CLIP model used for encoding the text."}),
-            },
-            "optional": {
-                "resize_method": (
-                    ["None", "Stretch", "Crop", "Pad"],
-                    {"default": "None"},
-                ),
-                "width": (
-                    IO.INT,
-                    {
-                        "default": -1,
-                        "min": -1,
-                        "max": 10000,
-                        "step": 1,
-                        "tooltip": "The width to resize the images to. -1 means use the original width.",
-                    },
-                ),
-                "height": (
-                    IO.INT,
-                    {
-                        "default": -1,
-                        "min": -1,
-                        "max": 10000,
-                        "step": 1,
-                        "tooltip": "The height to resize the images to. -1 means use the original height.",
-                    },
-                )
-            },
-        }
-
-    RETURN_TYPES = ("IMAGE", IO.CONDITIONING,)
-    FUNCTION = "load_images"
-    CATEGORY = "loaders"
-    EXPERIMENTAL = True
-    DESCRIPTION = "Loads a batch of images and caption from a directory for training."
-
-    def load_images(self, folder, clip, resize_method, width=None, height=None):
-        if clip is None:
-            raise RuntimeError("ERROR: clip input is invalid: None\n\nIf the clip is from a checkpoint loader node your checkpoint does not contain a valid clip or text encoder model.")
-
-        logging.info(f"Loading images from folder: {folder}")
-
-        sub_input_dir = os.path.join(folder_paths.get_input_directory(), folder)
-        valid_extensions = [".png", ".jpg", ".jpeg", ".webp"]
-
-        image_files = []
-        for item in os.listdir(sub_input_dir):
-            path = os.path.join(sub_input_dir, item)
-            if any(item.lower().endswith(ext) for ext in valid_extensions):
-                image_files.append(path)
-            elif os.path.isdir(path):
-                # Support kohya-ss/sd-scripts folder structure
-                repeat = 1
-                if item.split("_")[0].isdigit():
-                    repeat = int(item.split("_")[0])
-                image_files.extend([
-                    os.path.join(path, f) for f in os.listdir(path) if any(f.lower().endswith(ext) for ext in valid_extensions)
-                ] * repeat)
-
-        caption_file_path = [
-            f.replace(os.path.splitext(f)[1], ".txt")
-            for f in image_files
-        ]
-        captions = []
-        for caption_file in caption_file_path:
-            caption_path = os.path.join(sub_input_dir, caption_file)
-            if os.path.exists(caption_path):
-                with open(caption_path, "r", encoding="utf-8") as f:
-                    caption = f.read().strip()
-                    captions.append(caption)
-            else:
-                captions.append("")
-
-        width = width if width != -1 else None
-        height = height if height != -1 else None
-        output_tensor = load_and_process_images(image_files, sub_input_dir, resize_method, width, height)
-
-        logging.info(f"Loaded {len(output_tensor)} images from {sub_input_dir}.")
-
-        logging.info(f"Encoding captions from {sub_input_dir}.")
-        conditions = []
-        empty_cond = clip.encode_from_tokens_scheduled(clip.tokenize(""))
-        for text in captions:
-            if text == "":
-                conditions.append(empty_cond)
-            tokens = clip.tokenize(text)
-            conditions.extend(clip.encode_from_tokens_scheduled(tokens))
-        logging.info(f"Encoded {len(conditions)} captions from {sub_input_dir}.")
-        return (output_tensor, conditions)
-
-
 def draw_loss_graph(loss_map, steps):
     width, height = 500, 300
     img = Image.new("RGB", (width, height), "white")
@@ -379,10 +245,14 @@ def draw_loss_graph(loss_map, steps):
     return img
 
 
-def find_all_highest_child_module_with_forward(model: torch.nn.Module, result = None, name = None):
+def find_all_highest_child_module_with_forward(
+    model: torch.nn.Module, result=None, name=None
+):
     if result is None:
         result = []
-    elif hasattr(model, "forward") and not isinstance(model, (torch.nn.ModuleList, torch.nn.Sequential, torch.nn.ModuleDict)):
+    elif hasattr(model, "forward") and not isinstance(
+        model, (torch.nn.ModuleList, torch.nn.Sequential, torch.nn.ModuleDict)
+    ):
         result.append(model)
         logging.debug(f"Found module with forward: {name} ({model.__class__.__name__})")
         return result
@@ -396,12 +266,13 @@ def patch(m):
     if not hasattr(m, "forward"):
         return
     org_forward = m.forward
+
     def fwd(args, kwargs):
         return org_forward(*args, **kwargs)
+
     def checkpointing_fwd(*args, **kwargs):
-        return torch.utils.checkpoint.checkpoint(
-            fwd, args, kwargs, use_reentrant=False
-        )
+        return torch.utils.checkpoint.checkpoint(fwd, args, kwargs, use_reentrant=False)
+
     m.org_forward = org_forward
     m.forward = checkpointing_fwd
 
@@ -412,130 +283,126 @@ def unpatch(m):
         del m.org_forward
 
 
-class TrainLoraNode:
+class TrainLoraNode(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {
-            "required": {
-                "model": (IO.MODEL, {"tooltip": "The model to train the LoRA on."}),
-                "latents": (
-                    "LATENT",
-                    {
-                        "tooltip": "The Latents to use for training, serve as dataset/input of the model."
-                    },
+    def define_schema(cls):
+        return io.Schema(
+            node_id="TrainLoraNode",
+            display_name="Train LoRA",
+            category="training",
+            is_experimental=True,
+            is_input_list=True,  # All inputs become lists
+            inputs=[
+                io.Model.Input("model", tooltip="The model to train the LoRA on."),
+                io.Latent.Input(
+                    "latents",
+                    tooltip="The Latents to use for training, serve as dataset/input of the model.",
                 ),
-                "positive": (
-                    IO.CONDITIONING,
-                    {"tooltip": "The positive conditioning to use for training."},
+                io.Conditioning.Input(
+                    "positive", tooltip="The positive conditioning to use for training."
                 ),
-                "batch_size": (
-                    IO.INT,
-                    {
-                        "default": 1,
-                        "min": 1,
-                        "max": 10000,
-                        "step": 1,
-                        "tooltip": "The batch size to use for training.",
-                    },
+                io.Int.Input(
+                    "batch_size",
+                    default=1,
+                    min=1,
+                    max=10000,
+                    tooltip="The batch size to use for training.",
                 ),
-                "grad_accumulation_steps": (
-                    IO.INT,
-                    {
-                        "default": 1,
-                        "min": 1,
-                        "max": 1024,
-                        "step": 1,
-                        "tooltip": "The number of gradient accumulation steps to use for training.",
-                    }
+                io.Int.Input(
+                    "grad_accumulation_steps",
+                    default=1,
+                    min=1,
+                    max=1024,
+                    tooltip="The number of gradient accumulation steps to use for training.",
                 ),
-                "steps": (
-                    IO.INT,
-                    {
-                        "default": 16,
-                        "min": 1,
-                        "max": 100000,
-                        "tooltip": "The number of steps to train the LoRA for.",
-                    },
+                io.Int.Input(
+                    "steps",
+                    default=16,
+                    min=1,
+                    max=100000,
+                    tooltip="The number of steps to train the LoRA for.",
                 ),
-                "learning_rate": (
-                    IO.FLOAT,
-                    {
-                        "default": 0.0005,
-                        "min": 0.0000001,
-                        "max": 1.0,
-                        "step": 0.000001,
-                        "tooltip": "The learning rate to use for training.",
-                    },
+                io.Float.Input(
+                    "learning_rate",
+                    default=0.0005,
+                    min=0.0000001,
+                    max=1.0,
+                    step=0.0000001,
+                    tooltip="The learning rate to use for training.",
                 ),
-                "rank": (
-                    IO.INT,
-                    {
-                        "default": 8,
-                        "min": 1,
-                        "max": 128,
-                        "tooltip": "The rank of the LoRA layers.",
-                    },
+                io.Int.Input(
+                    "rank",
+                    default=8,
+                    min=1,
+                    max=128,
+                    tooltip="The rank of the LoRA layers.",
                 ),
-                "optimizer": (
-                    ["AdamW", "Adam", "SGD", "RMSprop"],
-                    {
-                        "default": "AdamW",
-                        "tooltip": "The optimizer to use for training.",
-                    },
+                io.Combo.Input(
+                    "optimizer",
+                    options=["AdamW", "Adam", "SGD", "RMSprop"],
+                    default="AdamW",
+                    tooltip="The optimizer to use for training.",
                 ),
-                "loss_function": (
-                    ["MSE", "L1", "Huber", "SmoothL1"],
-                    {
-                        "default": "MSE",
-                        "tooltip": "The loss function to use for training.",
-                    },
+                io.Combo.Input(
+                    "loss_function",
+                    options=["MSE", "L1", "Huber", "SmoothL1"],
+                    default="MSE",
+                    tooltip="The loss function to use for training.",
                 ),
-                "seed": (
-                    IO.INT,
-                    {
-                        "default": 0,
-                        "min": 0,
-                        "max": 0xFFFFFFFFFFFFFFFF,
-                        "tooltip": "The seed to use for training (used in generator for LoRA weight initialization and noise sampling)",
-                    },
+                io.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=0xFFFFFFFFFFFFFFFF,
+                    tooltip="The seed to use for training (used in generator for LoRA weight initialization and noise sampling)",
                 ),
-                "training_dtype": (
-                    ["bf16",  "fp32"],
-                    {"default": "bf16", "tooltip": "The dtype to use for training."},
+                io.Combo.Input(
+                    "training_dtype",
+                    options=["bf16", "fp32"],
+                    default="bf16",
+                    tooltip="The dtype to use for training.",
                 ),
-                "lora_dtype": (
-                    ["bf16", "fp32"],
-                    {"default": "bf16", "tooltip": "The dtype to use for lora."},
+                io.Combo.Input(
+                    "lora_dtype",
+                    options=["bf16", "fp32"],
+                    default="bf16",
+                    tooltip="The dtype to use for lora.",
                 ),
-                "algorithm": (
-                    list(adapter_maps.keys()),
-                    {"default": list(adapter_maps.keys())[0], "tooltip": "The algorithm to use for training."},
+                io.Combo.Input(
+                    "algorithm",
+                    options=list(adapter_maps.keys()),
+                    default=list(adapter_maps.keys())[0],
+                    tooltip="The algorithm to use for training.",
                 ),
-                "gradient_checkpointing": (
-                    IO.BOOLEAN,
-                    {
-                        "default": True,
-                        "tooltip": "Use gradient checkpointing for training.",
-                    }
+                io.Boolean.Input(
+                    "gradient_checkpointing",
+                    default=True,
+                    tooltip="Use gradient checkpointing for training.",
                 ),
-                "existing_lora": (
-                    folder_paths.get_filename_list("loras") + ["[None]"],
-                    {
-                        "default": "[None]",
-                        "tooltip": "The existing LoRA to append to. Set to None for new LoRA.",
-                    },
+                io.Combo.Input(
+                    "existing_lora",
+                    options=folder_paths.get_filename_list("loras") + ["[None]"],
+                    default="[None]",
+                    tooltip="The existing LoRA to append to. Set to None for new LoRA.",
                 ),
-            },
-        }
+            ],
+            outputs=[
+                io.Model.Output(
+                    display_name="model", tooltip="Model with LoRA applied"
+                ),
+                io.Custom("LORA_MODEL").Output(
+                    display_name="lora", tooltip="LoRA weights"
+                ),
+                io.Custom("LOSS_MAP").Output(
+                    display_name="loss_map", tooltip="Loss history"
+                ),
+                io.Int.Output(display_name="steps", tooltip="Total training steps"),
+            ],
+        )
 
-    RETURN_TYPES = (IO.MODEL, IO.LORA_MODEL, IO.LOSS_MAP, IO.INT)
-    RETURN_NAMES = ("model_with_lora", "lora", "loss", "steps")
-    FUNCTION = "train"
-    CATEGORY = "training"
-    EXPERIMENTAL = True
-
-    def train(
-        self,
+    @classmethod
+    def execute(
+        cls,
         model,
         latents,
         positive,
@@ -553,13 +420,74 @@ class TrainLoraNode:
         gradient_checkpointing,
         existing_lora,
     ):
+        # Extract scalars from lists (due to is_input_list=True)
+        model = model[0]
+        batch_size = batch_size[0]
+        steps = steps[0]
+        grad_accumulation_steps = grad_accumulation_steps[0]
+        learning_rate = learning_rate[0]
+        rank = rank[0]
+        optimizer = optimizer[0]
+        loss_function = loss_function[0]
+        seed = seed[0]
+        training_dtype = training_dtype[0]
+        lora_dtype = lora_dtype[0]
+        algorithm = algorithm[0]
+        gradient_checkpointing = gradient_checkpointing[0]
+        existing_lora = existing_lora[0]
+
+        # Handle latents - either single dict or list of dicts
+        if len(latents) == 1:
+            latents = latents[0]["samples"]  # Single latent dict
+        else:
+            latent_list = []
+            for latent in latents:
+                latent = latent["samples"]
+                bs = latent.shape[0]
+                if bs != 1:
+                    for sub_latent in latent:
+                        latent_list.append(sub_latent[None])
+                else:
+                    latent_list.append(latent)
+            latents = latent_list
+
+        # Handle conditioning - either single list or list of lists
+        if len(positive) == 1:
+            positive = positive[0]  # Single conditioning list
+        else:
+            # Multiple conditioning lists - flatten
+            flat_positive = []
+            for cond in positive:
+                if isinstance(cond, list):
+                    flat_positive.extend(cond)
+                else:
+                    flat_positive.append(cond)
+            positive = flat_positive
+
         mp = model.clone()
         dtype = node_helpers.string_to_torch_dtype(training_dtype)
         lora_dtype = node_helpers.string_to_torch_dtype(lora_dtype)
         mp.set_model_compute_dtype(dtype)
 
-        latents = latents["samples"].to(dtype)
-        num_images = latents.shape[0]
+        # latents is either a list of per-image tensors (possibly of different
+        # sizes) or a single pre-batched tensor.
+        if isinstance(latents, list):
+            latents = [t.to(dtype) for t in latents]
+            all_shapes = {t.shape for t in latents}
+            logging.info(f"Latent shapes: {all_shapes}")
+            # Mixed shapes cannot be concatenated; keep the list for the sampler.
+            multi_res = len(all_shapes) > 1
+            if not multi_res:
+                latents = torch.cat(latents, dim=0)
+            num_images = len(latents)
+        elif isinstance(latents, torch.Tensor):
+            # A single batched tensor has one shape; multi_res is read later.
+            multi_res = False
+            latents = latents.to(dtype)
+            num_images = latents.shape[0]
+        else:
+            raise TypeError(f"Invalid latents type: {type(latents)}")
+
         logging.info(f"Total Images: {num_images}, Total Captions: {len(positive)}")
         if len(positive) == 1 and num_images > 1:
             positive = positive * num_images
@@ -591,9 +519,7 @@ class TrainLoraNode:
                         shape = m.weight.shape
                         if len(shape) >= 2:
                             alpha = float(existing_weights.get(f"{key}.alpha", 1.0))
-                            dora_scale = existing_weights.get(
-                                f"{key}.dora_scale", None
-                            )
+                            dora_scale = existing_weights.get(f"{key}.dora_scale", None)
                             for adapter_cls in adapters:
                                 existing_adapter = adapter_cls.load(
                                     n, existing_weights, alpha, dora_scale
@@ -605,7 +531,9 @@ class TrainLoraNode:
                                 adapter_cls = adapter_maps[algorithm]
 
                             if existing_adapter is not None:
-                                train_adapter = existing_adapter.to_train().to(lora_dtype)
+                                train_adapter = existing_adapter.to_train().to(
+                                    lora_dtype
+                                )
                             else:
                                 # Use LoRA with alpha=1.0 by default
                                 train_adapter = adapter_cls.create_train(
@@ -629,7 +557,9 @@ class TrainLoraNode:
                     if hasattr(m, "bias") and m.bias is not None:
                         key = "{}.bias".format(n)
                         bias = torch.nn.Parameter(
-                            torch.zeros(m.bias.shape, dtype=lora_dtype, requires_grad=True)
+                            torch.zeros(
+                                m.bias.shape, dtype=lora_dtype, requires_grad=True
+                            )
                         )
                         bias_module = BiasDiff(bias)
                         lora_sd["{}.diff_b".format(n)] = bias
@@ -657,24 +587,31 @@ class TrainLoraNode:
 
             # setup models
             if gradient_checkpointing:
-                for m in find_all_highest_child_module_with_forward(mp.model.diffusion_model):
+                for m in find_all_highest_child_module_with_forward(
+                    mp.model.diffusion_model
+                ):
                     patch(m)
             mp.model.requires_grad_(False)
-            comfy.model_management.load_models_gpu([mp], memory_required=1e20, force_full_load=True)
+            comfy.model_management.load_models_gpu(
+                [mp], memory_required=1e20, force_full_load=True
+            )
 
             # Setup sampler and guider like in test script
             loss_map = {"loss": []}
+
             def loss_callback(loss):
                 loss_map["loss"].append(loss)
+
             train_sampler = TrainSampler(
                 criterion,
                 optimizer,
                 loss_callback=loss_callback,
                 batch_size=batch_size,
                 grad_acc=grad_accumulation_steps,
-                total_steps=steps*grad_accumulation_steps,
+                total_steps=steps * grad_accumulation_steps,
                 seed=seed,
-                training_dtype=dtype
+                training_dtype=dtype,
+                real_dataset=latents if multi_res else None,
             )
             guider = comfy_extras.nodes_custom_sampler.Guider_Basic(mp)
             guider.set_conds(positive)  # Set conditioning from input
@@ -684,12 +621,15 @@ class TrainLoraNode:
                 # Generate dummy sigmas and noise
                 sigmas = torch.tensor(range(num_images))
                 noise = comfy_extras.nodes_custom_sampler.Noise_RandomNoise(seed)
+                if multi_res:
+                    # use first latent as dummy latent if multi_res
+                    latents = latents[0].repeat(num_images, 1, 1, 1)
                 guider.sample(
                     noise.generate_noise({"samples": latents}),
                     latents,
                     train_sampler,
                     sigmas,
-                    seed=noise.seed
+                    seed=noise.seed,
                 )
             finally:
                 for m in mp.model.modules():
@@ -702,111 +642,118 @@ class TrainLoraNode:
             for param in lora_sd:
                 lora_sd[param] = lora_sd[param].to(lora_dtype)
 
-            return (mp, lora_sd, loss_map, steps + existing_steps)
+            return io.NodeOutput(mp, lora_sd, loss_map, steps + existing_steps)
 
 
-class LoraModelLoader:
-    def __init__(self):
-        self.loaded_lora = None
+class LoraModelLoader(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="LoraModelLoader",
+            display_name="Load LoRA Model",
+            category="loaders",
+            is_experimental=True,
+            inputs=[
+                io.Model.Input(
+                    "model", tooltip="The diffusion model the LoRA will be applied to."
+                ),
+                io.Custom("LORA_MODEL").Input(
+                    "lora", tooltip="The LoRA model to apply to the diffusion model."
+                ),
+                io.Float.Input(
+                    "strength_model",
+                    default=1.0,
+                    min=-100.0,
+                    max=100.0,
+                    tooltip="How strongly to modify the diffusion model. This value can be negative.",
+                ),
+            ],
+            outputs=[
+                io.Model.Output(
+                    display_name="model", tooltip="The modified diffusion model."
+                ),
+            ],
+        )
 
     @classmethod
-    def INPUT_TYPES(s):
-        return {
-            "required": {
-                "model": ("MODEL", {"tooltip": "The diffusion model the LoRA will be applied to."}),
-                "lora": (IO.LORA_MODEL, {"tooltip": "The LoRA model to apply to the diffusion model."}),
-                "strength_model": ("FLOAT", {"default": 1.0, "min": -100.0, "max": 100.0, "step": 0.01, "tooltip": "How strongly to modify the diffusion model. This value can be negative."}),
-            }
-        }
-
-    RETURN_TYPES = ("MODEL",)
-    OUTPUT_TOOLTIPS = ("The modified diffusion model.",)
-    FUNCTION = "load_lora_model"
-
-    CATEGORY = "loaders"
-    DESCRIPTION = "Load Trained LoRA weights from Train LoRA node."
-    EXPERIMENTAL = True
-
-    def load_lora_model(self, model, lora, strength_model):
+    def execute(cls, model, lora, strength_model):
         if strength_model == 0:
-            return (model, )
+            return io.NodeOutput(model)
 
-        model_lora, _ = comfy.sd.load_lora_for_models(model, None, lora, strength_model, 0)
-        return (model_lora, )
+        model_lora, _ = comfy.sd.load_lora_for_models(
+            model, None, lora, strength_model, 0
+        )
+        return io.NodeOutput(model_lora)
 
 
-class SaveLoRA:
-    def __init__(self):
-        self.output_dir = folder_paths.get_output_directory()
+class SaveLoRA(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="SaveLoRA",
+            display_name="Save LoRA Weights",
+            category="loaders",
+            is_experimental=True,
+            is_output_node=True,
+            inputs=[
+                io.Custom("LORA_MODEL").Input(
+                    "lora",
+                    tooltip="The LoRA model to save. Do not use the model with LoRA layers.",
+                ),
+                io.String.Input(
+                    "prefix",
+                    default="loras/ComfyUI_trained_lora",
+                    tooltip="The prefix to use for the saved LoRA file.",
+                ),
+                io.Int.Input(
+                    "steps",
+                    optional=True,
+                    tooltip="Optional: The number of steps to LoRA has been trained for, used to name the saved file.",
+                ),
+            ],
+            outputs=[],
+        )
 
     @classmethod
-    def INPUT_TYPES(s):
-        return {
-            "required": {
-                "lora": (
-                    IO.LORA_MODEL,
-                    {
-                        "tooltip": "The LoRA model to save. Do not use the model with LoRA layers."
-                    },
-                ),
-                "prefix": (
-                    "STRING",
-                    {
-                        "default": "loras/ComfyUI_trained_lora",
-                        "tooltip": "The prefix to use for the saved LoRA file.",
-                    },
-                ),
-            },
-            "optional": {
-                "steps": (
-                    IO.INT,
-                    {
-                        "forceInput": True,
-                        "tooltip": "Optional: The number of steps to LoRA has been trained for, used to name the saved file.",
-                    },
-                ),
-            },
-        }
-
-    RETURN_TYPES = ()
-    FUNCTION = "save"
-    CATEGORY = "loaders"
-    EXPERIMENTAL = True
-    OUTPUT_NODE = True
-
-    def save(self, lora, prefix, steps=None):
-        full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(prefix, self.output_dir)
+    def execute(cls, lora, prefix, steps=None):
+        output_dir = folder_paths.get_output_directory()
+        full_output_folder, filename, counter, subfolder, filename_prefix = (
+            folder_paths.get_save_image_path(prefix, output_dir)
+        )
         if steps is None:
             output_checkpoint = f"{filename}_{counter:05}_.safetensors"
         else:
             output_checkpoint = f"{filename}_{steps}_steps_{counter:05}_.safetensors"
         output_checkpoint = os.path.join(full_output_folder, output_checkpoint)
         safetensors.torch.save_file(lora, output_checkpoint)
-        return {}
+        return io.NodeOutput()
 
 
-class LossGraphNode:
-    def __init__(self):
-        self.output_dir = folder_paths.get_temp_directory()
+class LossGraphNode(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="LossGraphNode",
+            display_name="Plot Loss Graph",
+            category="training",
+            is_experimental=True,
+            is_output_node=True,
+            inputs=[
+                io.Custom("LOSS_MAP").Input(
+                    "loss", tooltip="Loss map from training node."
+                ),
+                io.String.Input(
+                    "filename_prefix",
+                    default="loss_graph",
+                    tooltip="Prefix for the saved loss graph image.",
+                ),
+            ],
+            outputs=[],
+            hidden=[io.Hidden.prompt, io.Hidden.extra_pnginfo],
+        )
 
     @classmethod
-    def INPUT_TYPES(s):
-        return {
-            "required": {
-                "loss": (IO.LOSS_MAP, {"default": {}}),
-                "filename_prefix": (IO.STRING, {"default": "loss_graph"}),
-            },
-            "hidden": {"prompt": "PROMPT", "extra_pnginfo": "EXTRA_PNGINFO"},
-        }
-
-    RETURN_TYPES = ()
-    FUNCTION = "plot_loss"
-    OUTPUT_NODE = True
-    CATEGORY = "training"
-    EXPERIMENTAL = True
-    DESCRIPTION = "Plots the loss graph and saves it to the output directory."
-
-    def plot_loss(self, loss, filename_prefix, prompt=None, extra_pnginfo=None):
+    def execute(cls, loss, filename_prefix, prompt=None, extra_pnginfo=None):
         loss_values = loss["loss"]
         width, height = 800, 480
         margin = 40
@@ -849,47 +796,27 @@ class LossGraphNode:
             (margin - 30, height - 10), f"{min_loss:.2f}", font=font, fill="black"
         )
 
-        metadata = None
-        if not args.disable_metadata:
-            metadata = PngInfo()
-            if prompt is not None:
-                metadata.add_text("prompt", json.dumps(prompt))
-            if extra_pnginfo is not None:
-                for x in extra_pnginfo:
-                    metadata.add_text(x, json.dumps(extra_pnginfo[x]))
+        # Convert PIL image to tensor for PreviewImage
+        img_array = np.array(img).astype(np.float32) / 255.0
+        img_tensor = torch.from_numpy(img_array)[None,]  # [1, H, W, 3]
 
-        date = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
-        img.save(
-            os.path.join(self.output_dir, f"{filename_prefix}_{date}.png"),
-            pnginfo=metadata,
-        )
-        return {
-            "ui": {
-                "images": [
-                    {
-                        "filename": f"{filename_prefix}_{date}.png",
-                        "subfolder": "",
-                        "type": "temp",
-                    }
-                ]
-            }
-        }
+        # Return preview UI
+        return io.NodeOutput(ui=ui.PreviewImage(img_tensor, cls=cls))
 
 
-NODE_CLASS_MAPPINGS = {
-    "TrainLoraNode": TrainLoraNode,
-    "SaveLoRANode": SaveLoRA,
-    "LoraModelLoader": LoraModelLoader,
-    "LoadImageSetFromFolderNode": LoadImageSetFromFolderNode,
-    "LoadImageTextSetFromFolderNode": LoadImageTextSetFromFolderNode,
-    "LossGraphNode": LossGraphNode,
-}
+# ========== Extension Setup ==========
 
-NODE_DISPLAY_NAME_MAPPINGS = {
-    "TrainLoraNode": "Train LoRA",
-    "SaveLoRANode": "Save LoRA Weights",
-    "LoraModelLoader": "Load LoRA Model",
-    "LoadImageSetFromFolderNode": "Load Image Dataset from Folder",
-    "LoadImageTextSetFromFolderNode": "Load Image and Text Dataset from Folder",
-    "LossGraphNode": "Plot Loss Graph",
-}
+
+class TrainingExtension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [
+            TrainLoraNode,
+            LoraModelLoader,
+            SaveLoRA,
+            LossGraphNode,
+        ]
+
+
+async def comfy_entrypoint() -> TrainingExtension:
+    return TrainingExtension()
diff --git a/comfy_extras/nodes_video.py b/comfy_extras/nodes_video.py
index 69fabb12e..6cf6e39bf 100644
--- a/comfy_extras/nodes_video.py
+++ b/comfy_extras/nodes_video.py
@@ -88,7 +88,7 @@ class SaveVideo(io.ComfyNode):
         )
 
     @classmethod
-    def execute(cls, video: VideoInput, filename_prefix, format, codec) -> io.NodeOutput:
+    def execute(cls, video: VideoInput, filename_prefix, format: str, codec) -> io.NodeOutput:
         width, height = video.get_dimensions()
         full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(
             filename_prefix,
@@ -108,7 +108,7 @@ class SaveVideo(io.ComfyNode):
         file = f"{filename}_{counter:05}_.{VideoContainer.get_extension(format)}"
         video.save_to(
             os.path.join(full_output_folder, file),
-            format=format,
+            format=VideoContainer(format),
             codec=codec,
             metadata=saved_metadata
         )
diff --git a/comfyui_version.py b/comfyui_version.py
index 25d1a4157..4b039356e 100644
--- a/comfyui_version.py
+++ b/comfyui_version.py
@@ -1,3 +1,3 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.3.68"
+__version__ = "0.3.76"
diff --git a/execution.py b/execution.py
index 17c77beab..c2186ac98 100644
--- a/execution.py
+++ b/execution.py
@@ -34,7 +34,7 @@ from comfy_execution.validation import validate_node_input
 from comfy_execution.progress import get_progress_state, reset_progress_state, add_progress_handler, WebUIProgressHandler
 from comfy_execution.utils import CurrentNodeContext
 from comfy_api.internal import _ComfyNodeInternal, _NodeOutputInternal, first_real_override, is_class, make_locked_method_func
-from comfy_api.latest import io
+from comfy_api.latest import io, _io
 
 
 class ExecutionResult(Enum):
@@ -76,7 +76,7 @@ class IsChangedCache:
             return self.is_changed[node_id]
 
         # Intentionally do not use cached outputs here. We only want constants in IS_CHANGED
-        input_data_all, _, hidden_inputs = get_input_data(node["inputs"], class_def, node_id, None)
+        input_data_all, _, v3_data = get_input_data(node["inputs"], class_def, node_id, None)
         try:
             is_changed = await _async_map_node_over_list(self.prompt_id, node_id, class_def, input_data_all, is_changed_name)
             is_changed = await resolve_map_node_over_list_results(is_changed)
@@ -146,8 +146,9 @@ SENSITIVE_EXTRA_DATA_KEYS = ("auth_token_comfy_org", "api_key_comfy_org")
 
 def get_input_data(inputs, class_def, unique_id, execution_list=None, dynprompt=None, extra_data={}):
     is_v3 = issubclass(class_def, _ComfyNodeInternal)
+    v3_data: io.V3Data = {}
     if is_v3:
-        valid_inputs, schema = class_def.INPUT_TYPES(include_hidden=False, return_schema=True)
+        valid_inputs, schema, v3_data = class_def.INPUT_TYPES(include_hidden=False, return_schema=True, live_inputs=inputs)
     else:
         valid_inputs = class_def.INPUT_TYPES()
     input_data_all = {}
@@ -207,7 +208,8 @@ def get_input_data(inputs, class_def, unique_id, execution_list=None, dynprompt=
                     input_data_all[x] = [extra_data.get("auth_token_comfy_org", None)]
                 if h[x] == "API_KEY_COMFY_ORG":
                     input_data_all[x] = [extra_data.get("api_key_comfy_org", None)]
-    return input_data_all, missing_keys, hidden_inputs_v3
+    v3_data["hidden_inputs"] = hidden_inputs_v3
+    return input_data_all, missing_keys, v3_data
 
 map_node_over_list = None #Don't hook this please
 
@@ -223,7 +225,7 @@ async def resolve_map_node_over_list_results(results):
                 raise exc
         return [x.result() if isinstance(x, asyncio.Task) else x for x in results]
 
-async def _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, func, allow_interrupt=False, execution_block_cb=None, pre_execute_cb=None, hidden_inputs=None):
+async def _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, func, allow_interrupt=False, execution_block_cb=None, pre_execute_cb=None, v3_data=None):
     # check if node wants the lists
     input_is_list = getattr(obj, "INPUT_IS_LIST", False)
 
@@ -259,13 +261,16 @@ async def _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, f
                 if is_class(obj):
                     type_obj = obj
                     obj.VALIDATE_CLASS()
-                    class_clone = obj.PREPARE_CLASS_CLONE(hidden_inputs)
+                    class_clone = obj.PREPARE_CLASS_CLONE(v3_data)
                 # otherwise, use class instance to populate/reuse some fields
                 else:
                     type_obj = type(obj)
                     type_obj.VALIDATE_CLASS()
-                    class_clone = type_obj.PREPARE_CLASS_CLONE(hidden_inputs)
+                    class_clone = type_obj.PREPARE_CLASS_CLONE(v3_data)
                 f = make_locked_method_func(type_obj, func, class_clone)
+                # in case of dynamic inputs, restructure inputs to expected nested dict
+                if v3_data is not None:
+                    inputs = _io.build_nested_inputs(inputs, v3_data)
             # V1
             else:
                 f = getattr(obj, func)
@@ -320,8 +325,8 @@ def merge_result_data(results, obj):
             output.append([o[i] for o in results])
     return output
 
-async def get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=None, pre_execute_cb=None, hidden_inputs=None):
-    return_values = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, obj.FUNCTION, allow_interrupt=True, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb, hidden_inputs=hidden_inputs)
+async def get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=None, pre_execute_cb=None, v3_data=None):
+    return_values = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, obj.FUNCTION, allow_interrupt=True, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb, v3_data=v3_data)
     has_pending_task = any(isinstance(r, asyncio.Task) and not r.done() for r in return_values)
     if has_pending_task:
         return return_values, {}, False, has_pending_task
@@ -460,7 +465,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed,
             has_subgraph = False
         else:
             get_progress_state().start_progress(unique_id)
-            input_data_all, missing_keys, hidden_inputs = get_input_data(inputs, class_def, unique_id, execution_list, dynprompt, extra_data)
+            input_data_all, missing_keys, v3_data = get_input_data(inputs, class_def, unique_id, execution_list, dynprompt, extra_data)
             if server.client_id is not None:
                 server.last_node_id = display_node_id
                 server.send_sync("executing", { "node": unique_id, "display_node": display_node_id, "prompt_id": prompt_id }, server.client_id)
@@ -475,7 +480,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed,
             else:
                 lazy_status_present = getattr(obj, "check_lazy_status", None) is not None
             if lazy_status_present:
-                required_inputs = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, "check_lazy_status", allow_interrupt=True, hidden_inputs=hidden_inputs)
+                required_inputs = await _async_map_node_over_list(prompt_id, unique_id, obj, input_data_all, "check_lazy_status", allow_interrupt=True, v3_data=v3_data)
                 required_inputs = await resolve_map_node_over_list_results(required_inputs)
                 required_inputs = set(sum([r for r in required_inputs if isinstance(r,list)], []))
                 required_inputs = [x for x in required_inputs if isinstance(x,str) and (
@@ -507,7 +512,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed,
             def pre_execute_cb(call_index):
                 # TODO - How to handle this with async functions without contextvars (which requires Python 3.12)?
                 GraphBuilder.set_default_prefix(unique_id, call_index, 0)
-            output_data, output_ui, has_subgraph, has_pending_tasks = await get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb, hidden_inputs=hidden_inputs)
+            output_data, output_ui, has_subgraph, has_pending_tasks = await get_output_data(prompt_id, unique_id, obj, input_data_all, execution_block_cb=execution_block_cb, pre_execute_cb=pre_execute_cb, v3_data=v3_data)
             if has_pending_tasks:
                 pending_async_nodes[unique_id] = output_data
                 unblock = execution_list.add_external_block(unique_id)
@@ -745,18 +750,17 @@ async def validate_inputs(prompt_id, prompt, item, validated):
     class_type = prompt[unique_id]['class_type']
     obj_class = nodes.NODE_CLASS_MAPPINGS[class_type]
 
-    class_inputs = obj_class.INPUT_TYPES()
-    valid_inputs = set(class_inputs.get('required',{})).union(set(class_inputs.get('optional',{})))
-
     errors = []
     valid = True
 
     validate_function_inputs = []
     validate_has_kwargs = False
     if issubclass(obj_class, _ComfyNodeInternal):
+        class_inputs, _, _ = obj_class.INPUT_TYPES(include_hidden=False, return_schema=True, live_inputs=inputs)
         validate_function_name = "validate_inputs"
         validate_function = first_real_override(obj_class, validate_function_name)
     else:
+        class_inputs = obj_class.INPUT_TYPES()
         validate_function_name = "VALIDATE_INPUTS"
         validate_function = getattr(obj_class, validate_function_name, None)
     if validate_function is not None:
@@ -765,6 +769,8 @@ async def validate_inputs(prompt_id, prompt, item, validated):
         validate_has_kwargs = argspec.varkw is not None
     received_types = {}
 
+    valid_inputs = set(class_inputs.get('required',{})).union(set(class_inputs.get('optional',{})))
+
     for x in valid_inputs:
         input_type, input_category, extra_info = get_input_info(obj_class, x, class_inputs)
         assert extra_info is not None
@@ -935,7 +941,7 @@ async def validate_inputs(prompt_id, prompt, item, validated):
                         continue
 
     if len(validate_function_inputs) > 0 or validate_has_kwargs:
-        input_data_all, _, hidden_inputs = get_input_data(inputs, obj_class, unique_id)
+        input_data_all, _, v3_data = get_input_data(inputs, obj_class, unique_id)
         input_filtered = {}
         for x in input_data_all:
             if x in validate_function_inputs or validate_has_kwargs:
@@ -943,7 +949,7 @@ async def validate_inputs(prompt_id, prompt, item, validated):
         if 'input_types' in validate_function_inputs:
             input_filtered['input_types'] = [received_types]
 
-        ret = await _async_map_node_over_list(prompt_id, unique_id, obj_class, input_filtered, validate_function_name, hidden_inputs=hidden_inputs)
+        ret = await _async_map_node_over_list(prompt_id, unique_id, obj_class, input_filtered, validate_function_name, v3_data=v3_data)
         ret = await resolve_map_node_over_list_results(ret)
         for x in input_filtered:
             for i, r in enumerate(ret):
diff --git a/folder_paths.py b/folder_paths.py
index f110d832b..9c96540e3 100644
--- a/folder_paths.py
+++ b/folder_paths.py
@@ -38,6 +38,8 @@ folder_names_and_paths["gligen"] = ([os.path.join(models_dir, "gligen")], suppor
 
 folder_names_and_paths["upscale_models"] = ([os.path.join(models_dir, "upscale_models")], supported_pt_extensions)
 
+folder_names_and_paths["latent_upscale_models"] = ([os.path.join(models_dir, "latent_upscale_models")], supported_pt_extensions)
+
 folder_names_and_paths["custom_nodes"] = ([os.path.join(base_path, "custom_nodes")], set())
 
 folder_names_and_paths["hypernetworks"] = ([os.path.join(models_dir, "hypernetworks")], supported_pt_extensions)
@@ -135,6 +137,71 @@ def set_user_directory(user_dir: str) -> None:
     user_directory = user_dir
 
 
+# System User Protection - Protects system directories from HTTP endpoint access
+# System Users are internal-only users that cannot be accessed via HTTP endpoints.
+# They use the '__' prefix convention (similar to Python's private member convention).
+SYSTEM_USER_PREFIX = "__"
+
+
+def get_system_user_directory(name: str = "system") -> str:
+    """
+    Get the path to a System User directory.
+
+    System User directories (prefixed with '__') are only accessible via internal API,
+    not through HTTP endpoints. Use this for storing system-internal data that
+    should not be exposed to users.
+
+    Args:
+        name: System user name (e.g., "system", "cache"). Must be alphanumeric
+              with underscores allowed, but cannot start with underscore.
+
+    Returns:
+        Absolute path to the system user directory.
+
+    Raises:
+        ValueError: If name is empty or not a string, contains characters other than alphanumerics and underscores, or starts with an underscore.
+
+    Example:
+        >>> get_system_user_directory("cache")
+        '/path/to/user/__cache'
+    """
+    if not name or not isinstance(name, str):
+        raise ValueError("System user name cannot be empty")
+    if not name.replace("_", "").isalnum():
+        raise ValueError(f"Invalid system user name: '{name}'")
+    if name.startswith("_"):
+        raise ValueError("System user name should not start with underscore")
+    return os.path.join(get_user_directory(), f"{SYSTEM_USER_PREFIX}{name}")
+
+
+def get_public_user_directory(user_id: str) -> str | None:
+    """
+    Get the path to a Public User directory for HTTP endpoint access.
+
+    This function provides structural security by returning None for any
+    System User (prefixed with '__'). All HTTP endpoints should use this
+    function instead of directly constructing user paths.
+
+    Args:
+        user_id: User identifier from HTTP request.
+
+    Returns:
+        Absolute path to the user directory, or None if user_id is invalid
+        or refers to a System User.
+
+    Example:
+        >>> get_public_user_directory("default")
+        '/path/to/user/default'
+        >>> get_public_user_directory("__system") is None
+        True
+    """
+    if not user_id or not isinstance(user_id, str):
+        return None
+    if user_id.startswith(SYSTEM_USER_PREFIX):
+        return None
+    return os.path.join(get_user_directory(), user_id)
+
+
 #NOTE: used in http server so don't put folders that should not be accessed remotely
 def get_directory_by_type(type_name: str) -> str | None:
     if type_name == "output":
diff --git a/latent_preview.py b/latent_preview.py
index 95d3cb733..66bded4b9 100644
--- a/latent_preview.py
+++ b/latent_preview.py
@@ -2,17 +2,24 @@ import torch
 from PIL import Image
 from comfy.cli_args import args, LatentPreviewMethod
 from comfy.taesd.taesd import TAESD
+from comfy.sd import VAE
 import comfy.model_management
 import folder_paths
 import comfy.utils
 import logging
 
 MAX_PREVIEW_RESOLUTION = args.preview_size
+VIDEO_TAES = ["taehv", "lighttaew2_2", "lighttaew2_1", "lighttaehy1_5"]
 
-def preview_to_image(latent_image):
-        latents_ubyte = (((latent_image + 1.0) / 2.0).clamp(0, 1)  # change scale from -1..1 to 0..1
-                            .mul(0xFF)  # to 0..255
-                            )
+def preview_to_image(latent_image, do_scale=True):
+        if do_scale:
+            latents_ubyte = (((latent_image + 1.0) / 2.0).clamp(0, 1)  # change scale from -1..1 to 0..1
+                                .mul(0xFF)  # to 0..255
+                                )
+        else:
+            latents_ubyte = (latent_image.clamp(0, 1)
+                                .mul(0xFF)  # to 0..255
+                                )
         if comfy.model_management.directml_enabled:
                 latents_ubyte = latents_ubyte.to(dtype=torch.uint8)
         latents_ubyte = latents_ubyte.to(device="cpu", dtype=torch.uint8, non_blocking=comfy.model_management.device_supports_non_blocking(latent_image.device))
@@ -35,15 +42,22 @@ class TAESDPreviewerImpl(LatentPreviewer):
         x_sample = self.taesd.decode(x0[:1])[0].movedim(0, 2)
         return preview_to_image(x_sample)
 
+class TAEHVPreviewerImpl(TAESDPreviewerImpl):
+    def decode_latent_to_preview(self, x0):
+        x_sample = self.taesd.decode(x0[:1, :, :1])[0][0]
+        return preview_to_image(x_sample, do_scale=False)
 
 class Latent2RGBPreviewer(LatentPreviewer):
-    def __init__(self, latent_rgb_factors, latent_rgb_factors_bias=None):
+    def __init__(self, latent_rgb_factors, latent_rgb_factors_bias=None, latent_rgb_factors_reshape=None):
         self.latent_rgb_factors = torch.tensor(latent_rgb_factors, device="cpu").transpose(0, 1)
         self.latent_rgb_factors_bias = None
         if latent_rgb_factors_bias is not None:
             self.latent_rgb_factors_bias = torch.tensor(latent_rgb_factors_bias, device="cpu")
+        self.latent_rgb_factors_reshape = latent_rgb_factors_reshape
 
     def decode_latent_to_preview(self, x0):
+        if self.latent_rgb_factors_reshape is not None:
+            x0 = self.latent_rgb_factors_reshape(x0)
         self.latent_rgb_factors = self.latent_rgb_factors.to(dtype=x0.dtype, device=x0.device)
         if self.latent_rgb_factors_bias is not None:
             self.latent_rgb_factors_bias = self.latent_rgb_factors_bias.to(dtype=x0.dtype, device=x0.device)
@@ -78,14 +92,19 @@ def get_previewer(device, latent_format):
 
         if method == LatentPreviewMethod.TAESD:
             if taesd_decoder_path:
-                taesd = TAESD(None, taesd_decoder_path, latent_channels=latent_format.latent_channels).to(device)
-                previewer = TAESDPreviewerImpl(taesd)
+                if latent_format.taesd_decoder_name in VIDEO_TAES:
+                    taesd = VAE(comfy.utils.load_torch_file(taesd_decoder_path))
+                    taesd.first_stage_model.show_progress_bar = False
+                    previewer = TAEHVPreviewerImpl(taesd)
+                else:
+                    taesd = TAESD(None, taesd_decoder_path, latent_channels=latent_format.latent_channels).to(device)
+                    previewer = TAESDPreviewerImpl(taesd)
             else:
                 logging.warning("Warning: TAESD previews enabled, but could not find models/vae_approx/{}".format(latent_format.taesd_decoder_name))
 
         if previewer is None:
             if latent_format.latent_rgb_factors is not None:
-                previewer = Latent2RGBPreviewer(latent_format.latent_rgb_factors, latent_format.latent_rgb_factors_bias)
+                previewer = Latent2RGBPreviewer(latent_format.latent_rgb_factors, latent_format.latent_rgb_factors_bias, latent_format.latent_rgb_factors_reshape)
     return previewer
 
 def prepare_callback(model, steps, x0_output_dict=None):
diff --git a/main.py b/main.py
index e1b0f1620..0cd815d9e 100644
--- a/main.py
+++ b/main.py
@@ -15,6 +15,7 @@ from comfy_execution.progress import get_progress_state
 from comfy_execution.utils import get_executing_context
 from comfy_api import feature_flags
 
+
 if __name__ == "__main__":
     #NOTE: These do not do anything on core ComfyUI, they are for custom nodes.
     os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
@@ -22,6 +23,23 @@ if __name__ == "__main__":
 
 setup_logger(log_level=args.verbose, use_stdout=args.log_stdout)
 
+
+def handle_comfyui_manager_unavailable():
+    if not args.windows_standalone_build:
+        logging.warning(f"\n\nYou appear to be running comfyui-manager from source, this is not recommended. Please install comfyui-manager using the following command:\ncommand:\n\t{sys.executable} -m pip install --pre comfyui_manager\n")
+    args.enable_manager = False
+
+
+if args.enable_manager:
+    if importlib.util.find_spec("comfyui_manager"):
+        import comfyui_manager
+
+        if not comfyui_manager.__file__ or not comfyui_manager.__file__.endswith('__init__.py'):
+            handle_comfyui_manager_unavailable()
+    else:
+        handle_comfyui_manager_unavailable()
+
+
 def apply_custom_paths():
     # extra model paths
     extra_model_paths_config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "extra_model_paths.yaml")
@@ -79,6 +97,11 @@ def execute_prestartup_script():
 
         for possible_module in possible_modules:
             module_path = os.path.join(custom_node_path, possible_module)
+
+            if args.enable_manager:
+                if comfyui_manager.should_be_disabled(module_path):
+                    continue
+
             if os.path.isfile(module_path) or module_path.endswith(".disabled") or module_path == "__pycache__":
                 continue
 
@@ -101,6 +124,10 @@ def execute_prestartup_script():
         logging.info("")
 
 apply_custom_paths()
+
+if args.enable_manager:
+    comfyui_manager.prestartup()
+
 execute_prestartup_script()
 
 
@@ -323,6 +350,9 @@ def start_comfyui(asyncio_loop=None):
         asyncio.set_event_loop(asyncio_loop)
     prompt_server = server.PromptServer(asyncio_loop)
 
+    if args.enable_manager and not args.disable_manager_ui:
+        comfyui_manager.start()
+
     hook_breaker_ac10a0.save_functions()
     asyncio_loop.run_until_complete(nodes.init_extra_nodes(
         init_custom_nodes=(not args.disable_all_custom_nodes) or len(args.whitelist_custom_nodes) > 0,
diff --git a/manager_requirements.txt b/manager_requirements.txt
new file mode 100644
index 000000000..52cc5389c
--- /dev/null
+++ b/manager_requirements.txt
@@ -0,0 +1 @@
+comfyui_manager==4.0.3b3
diff --git a/models/latent_upscale_models/put_latent_upscale_models_here b/models/latent_upscale_models/put_latent_upscale_models_here
new file mode 100644
index 000000000..e69de29bb
diff --git a/nodes.py b/nodes.py
index 28a136240..3b8ca55e8 100644
--- a/nodes.py
+++ b/nodes.py
@@ -43,6 +43,9 @@ import folder_paths
 import latent_preview
 import node_helpers
 
+if args.enable_manager:
+    import comfyui_manager
+
 def before_node_execution():
     comfy.model_management.throw_exception_if_processing_interrupted()
 
@@ -692,8 +695,10 @@ class LoraLoaderModelOnly(LoraLoader):
         return (self.load_lora(model, None, lora_name, strength_model, 0)[0],)
 
 class VAELoader:
+    video_taes = ["taehv", "lighttaew2_2", "lighttaew2_1", "lighttaehy1_5"]
+    image_taes = ["taesd", "taesdxl", "taesd3", "taef1"]
     @staticmethod
-    def vae_list():
+    def vae_list(s):
         vaes = folder_paths.get_filename_list("vae")
         approx_vaes = folder_paths.get_filename_list("vae_approx")
         sdxl_taesd_enc = False
@@ -722,6 +727,11 @@ class VAELoader:
                 f1_taesd_dec = True
             elif v.startswith("taef1_decoder."):
                 f1_taesd_enc = True
+            else:
+                for tae in s.video_taes:
+                    if v.startswith(tae):
+                        vaes.append(v)
+
         if sd1_taesd_dec and sd1_taesd_enc:
             vaes.append("taesd")
         if sdxl_taesd_dec and sdxl_taesd_enc:
@@ -765,7 +775,7 @@ class VAELoader:
 
     @classmethod
     def INPUT_TYPES(s):
-        return {"required": { "vae_name": (s.vae_list(), )}}
+        return {"required": { "vae_name": (s.vae_list(s), )}}
     RETURN_TYPES = ("VAE",)
     FUNCTION = "load_vae"
 
@@ -776,10 +786,13 @@ class VAELoader:
         if vae_name == "pixel_space":
             sd = {}
             sd["pixel_space_vae"] = torch.tensor(1.0)
-        elif vae_name in ["taesd", "taesdxl", "taesd3", "taef1"]:
+        elif vae_name in self.image_taes:
             sd = self.load_taesd(vae_name)
         else:
-            vae_path = folder_paths.get_full_path_or_raise("vae", vae_name)
+            if os.path.splitext(vae_name)[0] in self.video_taes:
+                vae_path = folder_paths.get_full_path_or_raise("vae_approx", vae_name)
+            else:
+                vae_path = folder_paths.get_full_path_or_raise("vae", vae_name)
             sd = comfy.utils.load_torch_file(vae_path)
         vae = comfy.sd.VAE(sd=sd)
         vae.throw_exception_if_invalid()
@@ -929,7 +942,7 @@ class CLIPLoader:
     @classmethod
     def INPUT_TYPES(s):
         return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ),
-                              "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image"], ),
+                              "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis"], ),
                               },
                 "optional": {
                               "device": (["default", "cpu"], {"advanced": True}),
@@ -957,7 +970,7 @@ class DualCLIPLoader:
     def INPUT_TYPES(s):
         return {"required": { "clip_name1": (folder_paths.get_filename_list("text_encoders"), ),
                               "clip_name2": (folder_paths.get_filename_list("text_encoders"), ),
-                              "type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream", "hunyuan_image"], ),
+                              "type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream", "hunyuan_image", "hunyuan_video_15"], ),
                               },
                 "optional": {
                               "device": (["default", "cpu"], {"advanced": True}),
@@ -1852,6 +1865,11 @@ class ImageBatch:
     CATEGORY = "image"
 
     def batch(self, image1, image2):
+        if image1.shape[-1] != image2.shape[-1]:
+            if image1.shape[-1] > image2.shape[-1]:
+                image2 = torch.nn.functional.pad(image2, (0,1), mode='constant', value=1.0)
+            else:
+                image1 = torch.nn.functional.pad(image1, (0,1), mode='constant', value=1.0)
         if image1.shape[1:] != image2.shape[1:]:
             image2 = comfy.utils.common_upscale(image2.movedim(-1,1), image1.shape[2], image1.shape[1], "bilinear", "center").movedim(1,-1)
         s = torch.cat((image1, image2), dim=0)
@@ -2228,6 +2246,12 @@ async def init_external_custom_nodes():
             if args.disable_all_custom_nodes and possible_module not in args.whitelist_custom_nodes:
                 logging.info(f"Skipping {possible_module} due to disable_all_custom_nodes and whitelist_custom_nodes")
                 continue
+
+            if args.enable_manager:
+                if comfyui_manager.should_be_disabled(module_path):
+                    logging.info(f"Blocked by policy: {module_path}")
+                    continue
+
             time_before = time.perf_counter()
             success = await load_custom_node(module_path, base_node_names, module_parent="custom_nodes")
             node_import_times.append((time.perf_counter() - time_before, module_path, success))
@@ -2273,6 +2297,7 @@ async def init_builtin_extra_nodes():
         "nodes_images.py",
         "nodes_video_model.py",
         "nodes_train.py",
+        "nodes_dataset.py",
         "nodes_sag.py",
         "nodes_perpneg.py",
         "nodes_stable3d.py",
@@ -2331,6 +2356,8 @@ async def init_builtin_extra_nodes():
         "nodes_audio_encoder.py",
         "nodes_autoregressive.py",
         "nodes_rope.py",
+        "nodes_logic.py",
+        "nodes_nop.py",
     ]
 
     import_failed = []
@@ -2359,6 +2386,7 @@ async def init_builtin_api_nodes():
         "nodes_pika.py",
         "nodes_runway.py",
         "nodes_sora.py",
+        "nodes_topaz.py",
         "nodes_tripo.py",
         "nodes_moonvalley.py",
         "nodes_rodin.py",
diff --git a/pyproject.toml b/pyproject.toml
index 79ff3f74a..02b94a0ce 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "ComfyUI"
-version = "0.3.68"
+version = "0.3.76"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.9"
@@ -24,7 +24,7 @@ lint.select = [
 exclude = ["*.ipynb", "**/generated/*.pyi"]
 
 [tool.pylint]
-master.py-version = "3.9"
+master.py-version = "3.10"
 master.extension-pkg-allow-list = [
   "pydantic",
 ]
diff --git a/requirements.txt b/requirements.txt
index 249c36dee..f98848e20 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-comfyui-frontend-package==1.28.8
-comfyui-workflow-templates==0.2.11
+comfyui-frontend-package==1.33.10
+comfyui-workflow-templates==0.7.25
 comfyui-embedded-docs==0.3.1
 torch
 torchsde
@@ -7,7 +7,7 @@ torchvision
 torchaudio
 numpy>=1.25.0
 einops
-transformers>=4.37.2
+transformers>=4.50.3
 tokenizers>=0.13.3
 sentencepiece
 safetensors>=0.4.2
diff --git a/server.py b/server.py
index d059d3dc9..ac4f42222 100644
--- a/server.py
+++ b/server.py
@@ -30,7 +30,7 @@ import comfy.model_management
 from comfy_api import feature_flags
 import node_helpers
 from comfyui_version import __version__
-from app.frontend_management import FrontendManager
+from app.frontend_management import FrontendManager, parse_version
 from comfy_api.internal import _ComfyNodeInternal
 
 from app.user_manager import UserManager
@@ -44,6 +44,9 @@ from protocol import BinaryEventTypes
 # Import cache control middleware
 from middleware.cache_middleware import cache_control
 
+if args.enable_manager:
+    import comfyui_manager
+
 async def send_socket_catch_exception(function, message):
     try:
         await function(message)
@@ -95,7 +98,7 @@ def create_cors_middleware(allowed_origin: str):
             response = await handler(request)
 
         response.headers['Access-Control-Allow-Origin'] = allowed_origin
-        response.headers['Access-Control-Allow-Methods'] = 'POST, GET, DELETE, PUT, OPTIONS'
+        response.headers['Access-Control-Allow-Methods'] = 'POST, GET, DELETE, PUT, OPTIONS, PATCH'
         response.headers['Access-Control-Allow-Headers'] = 'Content-Type, Authorization'
         response.headers['Access-Control-Allow-Credentials'] = 'true'
         return response
@@ -164,6 +167,22 @@ def create_origin_only_middleware():
 
     return origin_only_middleware
 
+
+def create_block_external_middleware():
+    @web.middleware
+    async def block_external_middleware(request: web.Request, handler):
+        if request.method == "OPTIONS":
+            # Pre-flight request. Reply successfully:
+            response = web.Response()
+        else:
+            response = await handler(request)
+
+        response.headers['Content-Security-Policy'] = "default-src 'self'; script-src 'self' 'unsafe-inline' 'unsafe-eval' blob:; style-src 'self' 'unsafe-inline'; img-src 'self' data: blob:; font-src 'self'; connect-src 'self'; frame-src 'self'; object-src 'self';"
+        return response
+
+    return block_external_middleware
+
+
 class PromptServer():
     def __init__(self, loop):
         PromptServer.instance = self
@@ -193,6 +212,12 @@ class PromptServer():
         else:
             middlewares.append(create_origin_only_middleware())
 
+        if args.disable_api_nodes:
+            middlewares.append(create_block_external_middleware())
+
+        if args.enable_manager:
+            middlewares.append(comfyui_manager.create_middleware())
+
         max_upload_size = round(args.max_upload_size * 1024 * 1024)
         self.app = web.Application(client_max_size=max_upload_size, middlewares=middlewares)
         self.sockets = dict()
@@ -580,7 +605,7 @@ class PromptServer():
 
             system_stats = {
                 "system": {
-                    "os": os.name,
+                    "os": sys.platform,
                     "ram_total": ram_total,
                     "ram_free": ram_free,
                     "comfyui_version": __version__,
@@ -849,11 +874,31 @@ class PromptServer():
         for name, dir in nodes.EXTENSION_WEB_DIRS.items():
             self.app.add_routes([web.static('/extensions/' + name, dir)])
 
-        workflow_templates_path = FrontendManager.templates_path()
-        if workflow_templates_path:
-            self.app.add_routes([
-                web.static('/templates', workflow_templates_path)
-            ])
+        installed_templates_version = FrontendManager.get_installed_templates_version()
+        use_legacy_templates = True
+        if installed_templates_version:
+            try:
+                use_legacy_templates = (
+                    parse_version(installed_templates_version)
+                    < parse_version("0.3.0")
+                )
+            except Exception as exc:
+                logging.warning(
+                    "Unable to parse templates version '%s': %s",
+                    installed_templates_version,
+                    exc,
+                )
+
+        if use_legacy_templates:
+            workflow_templates_path = FrontendManager.legacy_templates_path()
+            if workflow_templates_path:
+                self.app.add_routes([
+                    web.static('/templates', workflow_templates_path)
+                ])
+        else:
+            handler = FrontendManager.template_asset_handler()
+            if handler:
+                self.app.router.add_get("/templates/{path:.*}", handler)
 
         # Serve embedded documentation from the package
         embedded_docs_path = FrontendManager.embedded_docs_path()
diff --git a/tests-unit/app_test/user_manager_system_user_test.py b/tests-unit/app_test/user_manager_system_user_test.py
new file mode 100644
index 000000000..63b1ac5e5
--- /dev/null
+++ b/tests-unit/app_test/user_manager_system_user_test.py
@@ -0,0 +1,193 @@
+"""Tests for System User Protection in user_manager.py
+
+Tests cover:
+- get_request_user_id(): 1st defense layer - blocks System Users from HTTP headers
+- get_request_user_filepath(): 2nd defense layer - structural blocking via get_public_user_directory()
+- add_user(): 3rd defense layer - prevents creation of System User names
+- Defense layers integration tests
+"""
+
+import pytest
+from unittest.mock import MagicMock, patch
+import tempfile
+
+import folder_paths
+from app.user_manager import UserManager
+
+
+@pytest.fixture
+def mock_user_directory():
+    """Create a temporary user directory."""
+    with tempfile.TemporaryDirectory() as temp_dir:
+        original_dir = folder_paths.get_user_directory()
+        folder_paths.set_user_directory(temp_dir)
+        yield temp_dir
+        folder_paths.set_user_directory(original_dir)
+
+
+@pytest.fixture
+def user_manager(mock_user_directory):
+    """Create a UserManager instance for testing."""
+    with patch('app.user_manager.args') as mock_args:
+        mock_args.multi_user = True
+        manager = UserManager()
+        # Add a default user for testing
+        manager.users = {"default": "default", "test_user_123": "Test User"}
+        yield manager
+
+
+@pytest.fixture
+def mock_request():
+    """Create a mock request object."""
+    request = MagicMock()
+    request.headers = {}
+    return request
+
+
+class TestGetRequestUserId:
+    """Tests for get_request_user_id() - 1st defense layer.
+
+    Verifies:
+    - System Users (__ prefix) in HTTP header are rejected with KeyError
+    - Public Users pass through successfully
+    """
+
+    def test_system_user_raises_error(self, user_manager, mock_request):
+        """Test System User in header raises KeyError."""
+        mock_request.headers = {"comfy-user": "__system"}
+
+        with patch('app.user_manager.args') as mock_args:
+            mock_args.multi_user = True
+            with pytest.raises(KeyError, match="Unknown user"):
+                user_manager.get_request_user_id(mock_request)
+
+    def test_system_user_cache_raises_error(self, user_manager, mock_request):
+        """Test System User cache raises KeyError."""
+        mock_request.headers = {"comfy-user": "__cache"}
+
+        with patch('app.user_manager.args') as mock_args:
+            mock_args.multi_user = True
+            with pytest.raises(KeyError, match="Unknown user"):
+                user_manager.get_request_user_id(mock_request)
+
+    def test_normal_user_works(self, user_manager, mock_request):
+        """Test normal user access works."""
+        mock_request.headers = {"comfy-user": "default"}
+
+        with patch('app.user_manager.args') as mock_args:
+            mock_args.multi_user = True
+            user_id = user_manager.get_request_user_id(mock_request)
+            assert user_id == "default"
+
+    def test_unknown_user_raises_error(self, user_manager, mock_request):
+        """Test unknown user raises KeyError."""
+        mock_request.headers = {"comfy-user": "unknown_user"}
+
+        with patch('app.user_manager.args') as mock_args:
+            mock_args.multi_user = True
+            with pytest.raises(KeyError, match="Unknown user"):
+                user_manager.get_request_user_id(mock_request)
+
+
+class TestGetRequestUserFilepath:
+    """Tests for get_request_user_filepath() - 2nd defense layer.
+
+    Verifies:
+    - Returns None when get_public_user_directory() returns None (System User)
+    - Acts as backup defense if 1st layer is bypassed
+    """
+
+    def test_system_user_returns_none(self, user_manager, mock_request, mock_user_directory):
+        """Test System User returns None (structural blocking)."""
+        # First, we need to mock get_request_user_id to return System User
+        # But actually, get_request_user_id will raise KeyError first
+        # So we test via get_public_user_directory returning None
+        mock_request.headers = {"comfy-user": "default"}
+
+        with patch('app.user_manager.args') as mock_args:
+            mock_args.multi_user = True
+            # Patch get_public_user_directory to return None for testing
+            with patch.object(folder_paths, 'get_public_user_directory', return_value=None):
+                result = user_manager.get_request_user_filepath(mock_request, "test.txt")
+                assert result is None
+
+    def test_normal_user_gets_path(self, user_manager, mock_request, mock_user_directory):
+        """Test normal user gets valid filepath."""
+        mock_request.headers = {"comfy-user": "default"}
+
+        with patch('app.user_manager.args') as mock_args:
+            mock_args.multi_user = True
+            path = user_manager.get_request_user_filepath(mock_request, "test.txt")
+            assert path is not None
+            assert "default" in path
+            assert path.endswith("test.txt")
+
+
+class TestAddUser:
+    """Tests for add_user() - 3rd defense layer (creation-time blocking).
+
+    Verifies:
+    - System User name (__ prefix) creation is rejected with ValueError
+    - Sanitized usernames that become System User are also rejected
+    """
+
+    def test_system_user_prefix_name_raises(self, user_manager):
+        """Test System User prefix in name raises ValueError."""
+        with pytest.raises(ValueError, match="System User prefix not allowed"):
+            user_manager.add_user("__system")
+
+    def test_system_user_prefix_cache_raises(self, user_manager):
+        """Test System User cache prefix raises ValueError."""
+        with pytest.raises(ValueError, match="System User prefix not allowed"):
+            user_manager.add_user("__cache")
+
+    def test_sanitized_system_user_prefix_raises(self, user_manager):
+        """Test another "__"-prefixed name raises ValueError (bypass prevention)."""
+        # NOTE(review): "__test" already starts with the prefix, so sanitization is not exercised here - TODO add such a case
+        with pytest.raises(ValueError, match="System User prefix not allowed"):
+            user_manager.add_user("__test")
+
+    def test_normal_user_creation(self, user_manager, mock_user_directory):
+        """Test normal user creation works."""
+        user_id = user_manager.add_user("Normal User")
+        assert user_id is not None
+        assert not user_id.startswith("__")
+        assert "Normal-User" in user_id or "Normal_User" in user_id
+
+    def test_empty_name_raises(self, user_manager):
+        """Test empty name raises ValueError."""
+        with pytest.raises(ValueError, match="username not provided"):
+            user_manager.add_user("")
+
+    def test_whitespace_only_raises(self, user_manager):
+        """Test whitespace-only name raises ValueError."""
+        with pytest.raises(ValueError, match="username not provided"):
+            user_manager.add_user("   ")
+
+
+class TestDefenseLayers:
+    """Integration tests for all three defense layers.
+
+    Verifies:
+    - Each defense layer blocks System Users independently
+    - System User bypass is impossible through any layer
+    """
+
+    def test_layer1_get_request_user_id(self, user_manager, mock_request):
+        """Test 1st defense layer blocks System Users."""
+        mock_request.headers = {"comfy-user": "__system"}
+
+        with patch('app.user_manager.args') as mock_args:
+            mock_args.multi_user = True
+            with pytest.raises(KeyError):
+                user_manager.get_request_user_id(mock_request)
+
+    def test_layer2_get_public_user_directory(self):
+        """Test 2nd defense layer blocks System Users."""
+        result = folder_paths.get_public_user_directory("__system")
+        assert result is None
+
+    def test_layer3_add_user(self, user_manager):
+        """Test 3rd defense layer blocks System User creation."""
+        with pytest.raises(ValueError):
+            user_manager.add_user("__system")
diff --git a/tests-unit/comfy_quant/test_mixed_precision.py b/tests-unit/comfy_quant/test_mixed_precision.py
index f8d1fd04e..63361309f 100644
--- a/tests-unit/comfy_quant/test_mixed_precision.py
+++ b/tests-unit/comfy_quant/test_mixed_precision.py
@@ -37,11 +37,8 @@ class TestMixedPrecisionOps(unittest.TestCase):
 
     def test_all_layers_standard(self):
         """Test that model with no quantization works normally"""
-        # Configure no quantization
-        ops.MixedPrecisionOps._layer_quant_config = {}
-
         # Create model
-        model = SimpleModel(operations=ops.MixedPrecisionOps)
+        model = SimpleModel(operations=ops.mixed_precision_ops({}))
 
         # Initialize weights manually
         model.layer1.weight = torch.nn.Parameter(torch.randn(20, 10, dtype=torch.bfloat16))
@@ -76,7 +73,6 @@ class TestMixedPrecisionOps(unittest.TestCase):
                 "params": {}
             }
         }
-        ops.MixedPrecisionOps._layer_quant_config = layer_quant_config
 
         # Create state dict with mixed precision
         fp8_weight1 = torch.randn(20, 10, dtype=torch.float32).to(torch.float8_e4m3fn)
@@ -99,7 +95,7 @@ class TestMixedPrecisionOps(unittest.TestCase):
         }
 
         # Create model and load state dict (strict=False because custom loading pops keys)
-        model = SimpleModel(operations=ops.MixedPrecisionOps)
+        model = SimpleModel(operations=ops.mixed_precision_ops(layer_quant_config))
         model.load_state_dict(state_dict, strict=False)
 
         # Verify weights are wrapped in QuantizedTensor
@@ -132,7 +128,6 @@ class TestMixedPrecisionOps(unittest.TestCase):
                 "params": {}
             }
         }
-        ops.MixedPrecisionOps._layer_quant_config = layer_quant_config
 
         # Create and load model
         fp8_weight = torch.randn(20, 10, dtype=torch.float32).to(torch.float8_e4m3fn)
@@ -146,7 +141,7 @@ class TestMixedPrecisionOps(unittest.TestCase):
             "layer3.bias": torch.randn(40, dtype=torch.bfloat16),
         }
 
-        model = SimpleModel(operations=ops.MixedPrecisionOps)
+        model = SimpleModel(operations=ops.mixed_precision_ops(layer_quant_config))
         model.load_state_dict(state_dict1, strict=False)
 
         # Save state dict
@@ -170,7 +165,6 @@ class TestMixedPrecisionOps(unittest.TestCase):
                 "params": {}
             }
         }
-        ops.MixedPrecisionOps._layer_quant_config = layer_quant_config
 
         # Create and load model
         fp8_weight = torch.randn(20, 10, dtype=torch.float32).to(torch.float8_e4m3fn)
@@ -184,7 +178,7 @@ class TestMixedPrecisionOps(unittest.TestCase):
             "layer3.bias": torch.randn(40, dtype=torch.bfloat16),
         }
 
-        model = SimpleModel(operations=ops.MixedPrecisionOps)
+        model = SimpleModel(operations=ops.mixed_precision_ops(layer_quant_config))
         model.load_state_dict(state_dict, strict=False)
 
         # Add a weight function (simulating LoRA)
@@ -210,7 +204,6 @@ class TestMixedPrecisionOps(unittest.TestCase):
                 "params": {}
             }
         }
-        ops.MixedPrecisionOps._layer_quant_config = layer_quant_config
 
         # Create state dict
         state_dict = {
@@ -223,7 +216,7 @@ class TestMixedPrecisionOps(unittest.TestCase):
         }
 
         # Load should raise KeyError for unknown format in QUANT_FORMAT_MIXINS
-        model = SimpleModel(operations=ops.MixedPrecisionOps)
+        model = SimpleModel(operations=ops.mixed_precision_ops(layer_quant_config))
         with self.assertRaises(KeyError):
             model.load_state_dict(state_dict, strict=False)
 
diff --git a/tests-unit/folder_paths_test/system_user_test.py b/tests-unit/folder_paths_test/system_user_test.py
new file mode 100644
index 000000000..cd46459f1
--- /dev/null
+++ b/tests-unit/folder_paths_test/system_user_test.py
@@ -0,0 +1,206 @@
+"""Tests for System User Protection in folder_paths.py
+
+Tests cover:
+- get_system_user_directory(): Internal API for custom nodes to access System User directories
+- get_public_user_directory(): HTTP endpoint access with System User blocking
+- Backward compatibility: Existing APIs unchanged
+- Security: Path traversal and injection prevention
+"""
+
+import pytest
+import os
+import tempfile
+
+from folder_paths import (
+    get_system_user_directory,
+    get_public_user_directory,
+    get_user_directory,
+    set_user_directory,
+)
+
+
+@pytest.fixture(scope="module")
+def mock_user_directory():
+    """Create a temporary user directory for testing."""
+    with tempfile.TemporaryDirectory() as temp_dir:
+        original_dir = get_user_directory()
+        set_user_directory(temp_dir)
+        yield temp_dir
+        set_user_directory(original_dir)
+
+
+class TestGetSystemUserDirectory:
+    """Tests for get_system_user_directory() - internal API for System User directories.
+
+    Verifies:
+    - Custom nodes can access System User directories via internal API
+    - Input validation prevents path traversal attacks
+    """
+
+    def test_default_name(self, mock_user_directory):
+        """Test default 'system' name."""
+        path = get_system_user_directory()
+        assert path.endswith("__system")
+        assert mock_user_directory in path
+
+    def test_custom_name(self, mock_user_directory):
+        """Test custom system user name."""
+        path = get_system_user_directory("cache")
+        assert path.endswith("__cache")
+        assert "__cache" in path
+
+    def test_name_with_underscore(self, mock_user_directory):
+        """Test name with underscore in middle."""
+        path = get_system_user_directory("my_cache")
+        assert "__my_cache" in path
+
+    def test_empty_name_raises(self):
+        """Test empty name raises ValueError."""
+        with pytest.raises(ValueError, match="cannot be empty"):
+            get_system_user_directory("")
+
+    def test_none_name_raises(self):
+        """Test None name raises ValueError."""
+        with pytest.raises(ValueError, match="cannot be empty"):
+            get_system_user_directory(None)
+
+    def test_name_starting_with_underscore_raises(self):
+        """Test name starting with underscore raises ValueError."""
+        with pytest.raises(ValueError, match="should not start with underscore"):
+            get_system_user_directory("_system")
+
+    def test_path_traversal_raises(self):
+        """Test path traversal attempt raises ValueError (security)."""
+        with pytest.raises(ValueError, match="Invalid system user name"):
+            get_system_user_directory("../escape")
+
+    def test_path_traversal_middle_raises(self):
+        """Test path traversal in middle raises ValueError (security)."""
+        with pytest.raises(ValueError, match="Invalid system user name"):
+            get_system_user_directory("system/../other")
+
+    def test_special_chars_raise(self):
+        """Test special characters raise ValueError (security)."""
+        with pytest.raises(ValueError, match="Invalid system user name"):
+            get_system_user_directory("system!")
+
+    def test_returns_absolute_path(self, mock_user_directory):
+        """Test returned path is absolute."""
+        path = get_system_user_directory("test")
+        assert os.path.isabs(path)
+
+
+class TestGetPublicUserDirectory:
+    """Tests for get_public_user_directory() - HTTP endpoint access with System User blocking.
+
+    Verifies:
+    - System Users (__ prefix) return None, blocking HTTP access
+    - Public Users get valid paths
+    - New endpoints using this function are automatically protected
+    """
+
+    def test_normal_user(self, mock_user_directory):
+        """Test normal user returns valid path."""
+        path = get_public_user_directory("default")
+        assert path is not None
+        assert "default" in path
+        assert mock_user_directory in path
+
+    def test_system_user_returns_none(self):
+        """Test System User (__ prefix) returns None - blocks HTTP access."""
+        assert get_public_user_directory("__system") is None
+
+    def test_system_user_cache_returns_none(self):
+        """Test System User cache returns None."""
+        assert get_public_user_directory("__cache") is None
+
+    def test_empty_user_returns_none(self):
+        """Test empty user returns None."""
+        assert get_public_user_directory("") is None
+
+    def test_none_user_returns_none(self):
+        """Test None user returns None."""
+        assert get_public_user_directory(None) is None
+
+    def test_header_injection_returns_none(self):
+        """Test header injection attempt returns None (security)."""
+        assert get_public_user_directory("__system\r\nX-Injected: true") is None
+
+    def test_null_byte_injection_returns_none(self):
+        """Test a null byte cannot smuggle a System User name past the prefix check (security)."""
+        result = get_public_user_directory("user\x00__system")
+        # Whether the name is rejected (None) or accepted as a Public User is left to
+        # the implementation; either way the final path component must never carry
+        # the reserved "__" prefix. (The previous assertion could never fail.)
+        assert result is None or not os.path.basename(result).startswith("__")
+
+    def test_path_traversal_attempt(self, mock_user_directory):
+        """Test a path traversal string is not mistaken for a System User."""
+        # This function doesn't validate paths, only the reserved prefix;
+        # path traversal is expected to be handled by the caller (user_manager).
+        path = get_public_user_directory("../../../etc/passwd")
+        # The name doesn't start with "__", so a path (not None) must come back.
+        # Asserted directly so a regression that returns None here is caught.
+        assert path is not None
+
+    def test_returns_absolute_path(self, mock_user_directory):
+        """Test returned path is absolute."""
+        path = get_public_user_directory("testuser")
+        assert path is not None
+        assert os.path.isabs(path)
+
+
+class TestBackwardCompatibility:
+    """Tests for backward compatibility with existing APIs.
+
+    Verifies:
+    - get_user_directory() API unchanged
+    - Existing user data remains accessible
+    """
+
+    def test_get_user_directory_unchanged(self, mock_user_directory):
+        """Test get_user_directory() still works as before."""
+        user_dir = get_user_directory()
+        assert user_dir is not None
+        assert os.path.isabs(user_dir)
+        assert user_dir == mock_user_directory
+
+    def test_existing_user_accessible(self, mock_user_directory):
+        """Test existing users can access their directories."""
+        path = get_public_user_directory("default")
+        assert path is not None
+        assert "default" in path
+
+
+class TestEdgeCases:
+    """Tests for edge cases in System User detection.
+
+    Verifies:
+    - Only __ prefix is blocked (not _, not middle __)
+    - Bypass attempts are prevented
+    """
+
+    def test_prefix_only(self):
+        """Test prefix-only string is blocked."""
+        assert get_public_user_directory("__") is None
+
+    def test_single_underscore_allowed(self):
+        """Test single underscore prefix is allowed (not System User)."""
+        path = get_public_user_directory("_system")
+        assert path is not None
+        assert "_system" in path
+
+    def test_triple_underscore_blocked(self):
+        """Test triple underscore is blocked (starts with __)."""
+        assert get_public_user_directory("___system") is None
+
+    def test_underscore_in_middle_allowed(self):
+        """Test underscore in middle is allowed."""
+        path = get_public_user_directory("my__system")
+        assert path is not None
+        assert "my__system" in path
+
+    def test_leading_space_allowed(self):
+        """Test leading space + prefix is allowed (doesn't start with __)."""
+        path = get_public_user_directory(" __system")
+        assert path is not None
diff --git a/tests-unit/prompt_server_test/system_user_endpoint_test.py b/tests-unit/prompt_server_test/system_user_endpoint_test.py
new file mode 100644
index 000000000..22ac00af9
--- /dev/null
+++ b/tests-unit/prompt_server_test/system_user_endpoint_test.py
@@ -0,0 +1,375 @@
+"""E2E Tests for System User Protection HTTP Endpoints
+
+Tests cover:
+- HTTP endpoint blocking: System Users cannot access /userdata (GET, POST, DELETE, move)
+- User creation blocking: System User names cannot be created via POST /users
+- Backward compatibility: Public Users work as before
+- Custom node scenario: Internal API works while HTTP is blocked
+- Structural security: get_public_user_directory() provides automatic protection
+"""
+
+import pytest
+import os
+from aiohttp import web
+from app.user_manager import UserManager
+from unittest.mock import patch
+import folder_paths
+
+
+@pytest.fixture
+def mock_user_directory(tmp_path):
+    """Create a temporary user directory."""
+    original_dir = folder_paths.get_user_directory()
+    folder_paths.set_user_directory(str(tmp_path))
+    yield tmp_path
+    folder_paths.set_user_directory(original_dir)
+
+
+@pytest.fixture
+def user_manager_multi_user(mock_user_directory):
+    """Create UserManager in multi-user mode."""
+    with patch('app.user_manager.args') as mock_args:
+        mock_args.multi_user = True
+        um = UserManager()
+        # Add test users
+        um.users = {"default": "default", "test_user_123": "Test User"}
+        yield um
+
+
+@pytest.fixture
+def app_multi_user(user_manager_multi_user):
+    """Create app with multi-user mode enabled."""
+    app = web.Application()
+    routes = web.RouteTableDef()
+    user_manager_multi_user.add_routes(routes)
+    app.add_routes(routes)
+    return app
+
+
+class TestSystemUserEndpointBlocking:
+    """E2E tests for System User blocking on all HTTP endpoints.
+
+    Verifies:
+    - GET /userdata blocked for System Users
+    - POST /userdata blocked for System Users
+    - DELETE /userdata blocked for System Users
+    - POST /userdata/.../move/... blocked for System Users
+    """
+
+    @pytest.mark.asyncio
+    async def test_userdata_get_blocks_system_user(
+        self, aiohttp_client, app_multi_user, mock_user_directory
+    ):
+        """
+        GET /userdata with System User header should be blocked.
+        """
+        # Create test directory for System User (simulating internal creation)
+        system_user_dir = mock_user_directory / "__system"
+        system_user_dir.mkdir()
+        (system_user_dir / "secret.txt").write_text("sensitive data")
+
+        client = await aiohttp_client(app_multi_user)
+
+        with patch('app.user_manager.args') as mock_args:
+            mock_args.multi_user = True
+            # Attempt to access System User's data via HTTP
+            resp = await client.get(
+                "/userdata?dir=.",
+                headers={"comfy-user": "__system"}
+            )
+
+        # Should be blocked (403 Forbidden or similar error)
+        assert resp.status in [400, 403, 500], \
+            f"System User access should be blocked, got {resp.status}"
+
+    @pytest.mark.asyncio
+    async def test_userdata_post_blocks_system_user(
+        self, aiohttp_client, app_multi_user, mock_user_directory
+    ):
+        """
+        POST /userdata with System User header should be blocked.
+        """
+        client = await aiohttp_client(app_multi_user)
+
+        with patch('app.user_manager.args') as mock_args:
+            mock_args.multi_user = True
+            resp = await client.post(
+                "/userdata/test.txt",
+                headers={"comfy-user": "__system"},
+                data=b"malicious content"
+            )
+
+        assert resp.status in [400, 403, 500], \
+            f"System User write should be blocked, got {resp.status}"
+
+        # Verify no file was created
+        assert not (mock_user_directory / "__system" / "test.txt").exists()
+
+    @pytest.mark.asyncio
+    async def test_userdata_delete_blocks_system_user(
+        self, aiohttp_client, app_multi_user, mock_user_directory
+    ):
+        """
+        DELETE /userdata with System User header should be blocked.
+        """
+        # Create a file in System User directory
+        system_user_dir = mock_user_directory / "__system"
+        system_user_dir.mkdir()
+        secret_file = system_user_dir / "secret.txt"
+        secret_file.write_text("do not delete")
+
+        client = await aiohttp_client(app_multi_user)
+
+        with patch('app.user_manager.args') as mock_args:
+            mock_args.multi_user = True
+            resp = await client.delete(
+                "/userdata/secret.txt",
+                headers={"comfy-user": "__system"}
+            )
+
+        assert resp.status in [400, 403, 500], \
+            f"System User delete should be blocked, got {resp.status}"
+
+        # Verify file still exists
+        assert secret_file.exists()
+
+    @pytest.mark.asyncio
+    async def test_v2_userdata_blocks_system_user(
+        self, aiohttp_client, app_multi_user, mock_user_directory
+    ):
+        """
+        GET /v2/userdata with System User header should be blocked.
+        """
+        client = await aiohttp_client(app_multi_user)
+
+        with patch('app.user_manager.args') as mock_args:
+            mock_args.multi_user = True
+            resp = await client.get(
+                "/v2/userdata",
+                headers={"comfy-user": "__system"}
+            )
+
+        assert resp.status in [400, 403, 500], \
+            f"System User v2 access should be blocked, got {resp.status}"
+
+    @pytest.mark.asyncio
+    async def test_move_userdata_blocks_system_user(
+        self, aiohttp_client, app_multi_user, mock_user_directory
+    ):
+        """
+        POST /userdata/{file}/move/{dest} with System User header should be blocked.
+        """
+        system_user_dir = mock_user_directory / "__system"
+        system_user_dir.mkdir()
+        (system_user_dir / "source.txt").write_text("sensitive data")
+
+        client = await aiohttp_client(app_multi_user)
+
+        with patch('app.user_manager.args') as mock_args:
+            mock_args.multi_user = True
+            resp = await client.post(
+                "/userdata/source.txt/move/dest.txt",
+                headers={"comfy-user": "__system"}
+            )
+
+        assert resp.status in [400, 403, 500], \
+            f"System User move should be blocked, got {resp.status}"
+
+        # Verify source file still exists (move was blocked)
+        assert (system_user_dir / "source.txt").exists()
+
+
+class TestSystemUserCreationBlocking:
+    """E2E tests for blocking System User name creation via POST /users.
+
+    Verifies:
+    - POST /users returns 400 for System User name (not 500)
+    """
+
+    @pytest.mark.asyncio
+    async def test_post_users_blocks_system_user_name(
+        self, aiohttp_client, app_multi_user
+    ):
+        """POST /users with System User name should return 400 Bad Request."""
+        client = await aiohttp_client(app_multi_user)
+
+        resp = await client.post(
+            "/users",
+            json={"username": "__system"}
+        )
+
+        assert resp.status == 400, \
+            f"System User creation should return 400, got {resp.status}"
+
+    @pytest.mark.asyncio
+    async def test_post_users_blocks_system_user_prefix_variations(
+        self, aiohttp_client, app_multi_user
+    ):
+        """POST /users with any System User prefix variation should return 400 Bad Request."""
+        client = await aiohttp_client(app_multi_user)
+
+        system_user_names = ["__system", "__cache", "__config", "__anything"]
+
+        for name in system_user_names:
+            resp = await client.post("/users", json={"username": name})
+            assert resp.status == 400, \
+                f"System User name '{name}' should return 400, got {resp.status}"
+
+
+class TestPublicUserStillWorks:
+    """E2E tests for backward compatibility - Public Users should work as before.
+
+    Verifies:
+    - Public Users can access their data via HTTP
+    - Public Users can create files via HTTP
+    """
+
+    @pytest.mark.asyncio
+    async def test_public_user_can_access_userdata(
+        self, aiohttp_client, app_multi_user, mock_user_directory
+    ):
+        """
+        Public Users should still be able to access their data.
+        """
+        # Create test directory for Public User
+        user_dir = mock_user_directory / "default"
+        user_dir.mkdir()
+        test_dir = user_dir / "workflows"
+        test_dir.mkdir()
+        (test_dir / "test.json").write_text('{"test": true}')
+
+        client = await aiohttp_client(app_multi_user)
+
+        with patch('app.user_manager.args') as mock_args:
+            mock_args.multi_user = True
+            resp = await client.get(
+                "/userdata?dir=workflows",
+                headers={"comfy-user": "default"}
+            )
+
+        assert resp.status == 200
+        data = await resp.json()
+        assert "test.json" in data
+
+    @pytest.mark.asyncio
+    async def test_public_user_can_create_files(
+        self, aiohttp_client, app_multi_user, mock_user_directory
+    ):
+        """
+        Public Users should still be able to create files.
+        """
+        # Create user directory
+        user_dir = mock_user_directory / "default"
+        user_dir.mkdir()
+
+        client = await aiohttp_client(app_multi_user)
+
+        with patch('app.user_manager.args') as mock_args:
+            mock_args.multi_user = True
+            resp = await client.post(
+                "/userdata/newfile.txt",
+                headers={"comfy-user": "default"},
+                data=b"user content"
+            )
+
+        assert resp.status == 200
+        assert (user_dir / "newfile.txt").exists()
+
+
+class TestCustomNodeScenario:
+    """Tests for custom node use case: internal API access vs HTTP blocking.
+
+    Verifies:
+    - Internal API (get_system_user_directory) works for custom nodes
+    - HTTP endpoint cannot access data created via internal API
+    """
+
+    def test_internal_api_can_access_system_user(self, mock_user_directory):
+        """
+        Internal API (get_system_user_directory) should work for custom nodes.
+        """
+        # Custom node uses internal API
+        system_path = folder_paths.get_system_user_directory("mynode_config")
+
+        assert system_path is not None
+        assert "__mynode_config" in system_path
+
+        # Can create and write to System User directory
+        os.makedirs(system_path, exist_ok=True)
+        config_file = os.path.join(system_path, "settings.json")
+        with open(config_file, "w") as f:
+            f.write('{"api_key": "secret"}')
+
+        assert os.path.exists(config_file)
+
+    @pytest.mark.asyncio
+    async def test_http_cannot_access_internal_data(
+        self, aiohttp_client, app_multi_user, mock_user_directory
+    ):
+        """
+        HTTP endpoint cannot access data created via internal API.
+        """
+        # Custom node creates data via internal API
+        system_path = folder_paths.get_system_user_directory("mynode_config")
+        os.makedirs(system_path, exist_ok=True)
+        with open(os.path.join(system_path, "secret.json"), "w") as f:
+            f.write('{"api_key": "secret"}')
+
+        client = await aiohttp_client(app_multi_user)
+
+        # Attacker tries to access via HTTP
+        with patch('app.user_manager.args') as mock_args:
+            mock_args.multi_user = True
+            resp = await client.get(
+                "/userdata/secret.json",
+                headers={"comfy-user": "__mynode_config"}
+            )
+
+        # Should be blocked
+        assert resp.status in [400, 403, 500]
+
+
+class TestStructuralSecurity:
+    """Tests for structural security pattern.
+
+    Verifies:
+    - get_public_user_directory() automatically blocks System Users
+    - New endpoints using this function are automatically protected
+    """
+
+    def test_get_public_user_directory_blocks_system_user(self):
+        """
+        Any code using get_public_user_directory() is automatically protected.
+        """
+        # This is the structural security - any new endpoint using this function
+        # will automatically block System Users
+        assert folder_paths.get_public_user_directory("__system") is None
+        assert folder_paths.get_public_user_directory("__cache") is None
+        assert folder_paths.get_public_user_directory("__anything") is None
+
+        # Public Users work
+        assert folder_paths.get_public_user_directory("default") is not None
+        assert folder_paths.get_public_user_directory("user123") is not None
+
+    def test_structural_security_pattern(self, mock_user_directory):
+        """
+        Demonstrate the structural security pattern for new endpoints.
+
+        Any new endpoint should follow this pattern:
+        1. Get user from request
+        2. Use get_public_user_directory() - automatically blocks System Users
+        3. If None, return error
+        """
+        def new_endpoint_handler(user_id: str) -> str | None:
+            """Example of how new endpoints should be implemented."""
+            user_path = folder_paths.get_public_user_directory(user_id)
+            if user_path is None:
+                return None  # Blocked
+            return user_path
+
+        # System Users are automatically blocked
+        assert new_endpoint_handler("__system") is None
+        assert new_endpoint_handler("__secret") is None
+
+        # Public Users work
+        assert new_endpoint_handler("default") is not None
diff --git a/tests/execution/test_public_api.py b/tests/execution/test_public_api.py
new file mode 100644
index 000000000..52bc2fcd8
--- /dev/null
+++ b/tests/execution/test_public_api.py
@@ -0,0 +1,153 @@
+"""
+Tests for public ComfyAPI and ComfyAPISync functions.
+
+These tests verify that the public API methods work correctly in both sync and async contexts,
+ensuring that the sync wrapper generation (via get_type_hints() in async_to_sync.py) correctly
+handles string annotations from 'from __future__ import annotations'.
+"""
+
+import pytest
+import time
+import subprocess
+import torch
+from pytest import fixture
+from comfy_execution.graph_utils import GraphBuilder
+from tests.execution.test_execution import ComfyClient
+
+
+@pytest.mark.execution
+class TestPublicAPI:
+    """Test suite for public ComfyAPI and ComfyAPISync methods."""
+
+    @fixture(scope="class", autouse=True)
+    def _server(self, args_pytest):
+        """Start a ComfyUI server for the class; kill and reap it on teardown."""
+        p = subprocess.Popen([
+            'python', 'main.py',
+            '--output-directory', args_pytest["output_dir"],
+            '--listen', args_pytest["listen"],
+            '--port', str(args_pytest["port"]),
+            '--extra-model-paths-config', 'tests/execution/extra_model_paths.yaml',
+            '--cpu',
+        ])
+        yield
+        p.kill()
+        p.wait()  # reap the killed child so it is not left behind as an unwaited process
+        torch.cuda.empty_cache()
+
+    @fixture(scope="class", autouse=True)
+    def shared_client(self, args_pytest, _server):
+        """Connect one ComfyClient for the whole class, retrying refused connections."""
+        client = ComfyClient()
+        n_tries = 5
+        for i in range(n_tries):
+            time.sleep(4)  # pause before every attempt, including the first
+            try:
+                client.connect(listen=args_pytest["listen"], port=args_pytest["port"])
+                break
+            except ConnectionRefusedError:
+                if i == n_tries - 1:
+                    raise  # out of attempts: surface the refusal
+        yield client
+        del client
+        torch.cuda.empty_cache()
+
+    @fixture
+    def client(self, shared_client, request):
+        """Tag the shared client with the current test's name and hand it to the test."""
+        shared_client.set_test_name(f"public_api[{request.node.name}]")
+        yield shared_client
+
+    @fixture
+    def builder(self, request):
+        """Provide a GraphBuilder prefixed with the current test's name."""
+        yield GraphBuilder(prefix=request.node.name)
+
+    def test_sync_progress_update_executes(self, client: ComfyClient, builder: GraphBuilder):
+        """Test that TestSyncProgressUpdate executes without errors.
+
+        This test validates that api_sync.execution.set_progress() works correctly,
+        which is the primary code path fixed by adding get_type_hints() to async_to_sync.py.
+        """
+        g = builder
+        image = g.node("StubImage", content="BLACK", height=256, width=256, batch_size=1)
+
+        # Feed the stub image through the sync progress node (short sleep)
+        progress_node = g.node("TestSyncProgressUpdate",
+                              value=image.out(0),
+                              sleep_seconds=0.5)
+        output = g.node("SaveImage", images=progress_node.out(0))
+
+        # Execute workflow
+        result = client.run(g)
+
+        # Both the progress node and the save node must have run
+        assert result.did_run(progress_node), "Progress node should have executed"
+        assert result.did_run(output), "Output node should have executed"
+
+        # Exactly one image is expected from the save node
+        images = result.get_images(output)
+        assert len(images) == 1, "Should have produced 1 image"
+
+    def test_async_progress_update_executes(self, client: ComfyClient, builder: GraphBuilder):
+        """Test that TestAsyncProgressUpdate executes without errors.
+
+        This test validates that await api.execution.set_progress() works correctly
+        in async contexts.
+        """
+        g = builder
+        image = g.node("StubImage", content="WHITE", height=256, width=256, batch_size=1)
+
+        # Feed the stub image through the async progress node (short sleep)
+        progress_node = g.node("TestAsyncProgressUpdate",
+                              value=image.out(0),
+                              sleep_seconds=0.5)
+        output = g.node("SaveImage", images=progress_node.out(0))
+
+        # Execute workflow
+        result = client.run(g)
+
+        # Both the progress node and the save node must have run
+        assert result.did_run(progress_node), "Async progress node should have executed"
+        assert result.did_run(output), "Output node should have executed"
+
+        # Exactly one image is expected from the save node
+        images = result.get_images(output)
+        assert len(images) == 1, "Should have produced 1 image"
+
+    def test_sync_and_async_progress_together(self, client: ComfyClient, builder: GraphBuilder):
+        """Test both sync and async progress updates in same workflow.
+
+        This test ensures that both ComfyAPISync and ComfyAPI can coexist and work
+        correctly in the same workflow execution.
+        """
+        g = builder
+        image1 = g.node("StubImage", content="BLACK", height=256, width=256, batch_size=1)
+        image2 = g.node("StubImage", content="WHITE", height=256, width=256, batch_size=1)
+
+        # One branch per progress node type, each fed by its own stub image
+        sync_progress = g.node("TestSyncProgressUpdate",
+                              value=image1.out(0),
+                              sleep_seconds=0.3)
+        async_progress = g.node("TestAsyncProgressUpdate",
+                               value=image2.out(0),
+                               sleep_seconds=0.3)
+
+        # Each branch ends in its own SaveImage output
+        output1 = g.node("SaveImage", images=sync_progress.out(0))
+        output2 = g.node("SaveImage", images=async_progress.out(0))
+
+        # Execute workflow
+        result = client.run(g)
+
+        # Both should execute successfully
+        assert result.did_run(sync_progress), "Sync progress node should have executed"
+        assert result.did_run(async_progress), "Async progress node should have executed"
+        assert result.did_run(output1), "First output node should have executed"
+        assert result.did_run(output2), "Second output node should have executed"
+
+        # Each output must have produced exactly one image
+        images1 = result.get_images(output1)
+        images2 = result.get_images(output2)
+        assert len(images1) == 1, "Should have produced 1 image from sync node"
+        assert len(images2) == 1, "Should have produced 1 image from async node"