Mirror of https://github.com/comfyanonymous/ComfyUI.git (synced 2026-01-09 22:00:49 +08:00)

Commit 5823497d55: Merge branch 'master' of github.com:comfyanonymous/ComfyUI

CODEOWNERS (26 lines changed)
@ -5,20 +5,20 @@
# Inlined the team members for now.

# Maintainers
*.md @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
/tests/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
/tests-unit/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
/notebooks/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
/script_examples/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
/.github/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
/requirements.txt @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
/pyproject.toml @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink
*.md @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
/tests/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
/tests-unit/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
/notebooks/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
/script_examples/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
/.github/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
/requirements.txt @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne
/pyproject.toml @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @Kosinkadink @christian-byrne

# Python web server
/api_server/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata
/app/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata
/utils/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata
/api_server/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @christian-byrne
/app/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @christian-byrne
/utils/ @yoland68 @robinjhuang @huchenlei @webfiltered @pythongosssss @ltdrdata @christian-byrne

# Node developers
/comfy_extras/ @yoland68 @robinjhuang @huchenlei @pythongosssss @ltdrdata @Kosinkadink @webfiltered
/comfy/comfy_types/ @yoland68 @robinjhuang @huchenlei @pythongosssss @ltdrdata @Kosinkadink @webfiltered
/comfy_extras/ @yoland68 @robinjhuang @huchenlei @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne
/comfy/comfy_types/ @yoland68 @robinjhuang @huchenlei @pythongosssss @ltdrdata @Kosinkadink @webfiltered @christian-byrne

@ -51,6 +51,7 @@ See what ComfyUI can do with the [example workflows](https://comfyanonymous.gith
- [HunyuanDiT](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_dit/)
- [Flux](https://comfyanonymous.github.io/ComfyUI_examples/flux/)
- [Lumina Image 2.0](https://comfyanonymous.github.io/ComfyUI_examples/lumina2/)
- [HiDream](https://comfyanonymous.github.io/ComfyUI_examples/hidream/)
- Video Models
- [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/)
- [Mochi](https://comfyanonymous.github.io/ComfyUI_examples/mochi/)

@ -1 +1 @@
__version__ = "0.3.27"
__version__ = "0.3.29"

@ -10,8 +10,14 @@ class AppSettings():
self.user_manager = user_manager

def get_settings(self, request):
file = self.user_manager.get_request_user_filepath(
request, "comfy.settings.json")
try:
file = self.user_manager.get_request_user_filepath(
request,
"comfy.settings.json"
)
except KeyError as e:
logging.error("User settings not found.")
raise web.HTTPUnauthorized() from e
if os.path.isfile(file):
try:
with open(file) as f:

@ -126,6 +126,27 @@ class FrontendManager:
logging.error(f"""comfyui-frontend-package is not installed.""".strip())
return ""

@classmethod
def templates_path(cls) -> str:
try:
import comfyui_workflow_templates

return str(
importlib.resources.files(comfyui_workflow_templates) / "templates"
)
except ImportError:
logging.error(
f"""
********** ERROR ***********

comfyui-workflow-templates is not installed.

{frontend_install_warning_message()}

********** ERROR ***********
""".strip()
)

@classmethod
def parse_version_string(cls, value: str) -> tuple[str, str, str]:
"""

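For context, templates_path() resolves the directory bundled inside the comfyui-workflow-templates package via importlib.resources. A minimal standalone sketch of the same lookup, assuming the package is installed; returning an empty string on failure is purely for illustration:

import importlib.resources
import logging

def find_templates_dir() -> str:
    # Locate the 'templates' folder shipped inside comfyui_workflow_templates,
    # mirroring the FrontendManager.templates_path() lookup above.
    try:
        import comfyui_workflow_templates
        return str(importlib.resources.files(comfyui_workflow_templates) / "templates")
    except ImportError:
        logging.error("comfyui-workflow-templates is not installed.")
        return ""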
comfy/caching.py (154 lines changed)
@ -322,3 +322,157 @@ class LRUCache(BasicCache):
|
||||
self._mark_used(child_id)
|
||||
self.children[cache_key].append(self.cache_key_set.get_data_key(child_id))
|
||||
return self
|
||||
|
||||
|
||||
class DependencyAwareCache(BasicCache):
|
||||
"""
|
||||
A cache implementation that tracks dependencies between nodes and manages
|
||||
their execution and caching accordingly. It extends the BasicCache class.
|
||||
Nodes are removed from this cache once all of their descendants have been
|
||||
executed.
|
||||
"""
|
||||
|
||||
def __init__(self, key_class):
|
||||
"""
|
||||
Initialize the DependencyAwareCache.
|
||||
|
||||
Args:
|
||||
key_class: The class used for generating cache keys.
|
||||
"""
|
||||
super().__init__(key_class)
|
||||
self.descendants = {} # Maps node_id -> set of descendant node_ids
|
||||
self.ancestors = {} # Maps node_id -> set of ancestor node_ids
|
||||
self.executed_nodes = set() # Tracks nodes that have been executed
|
||||
|
||||
def set_prompt(self, dynprompt, node_ids, is_changed_cache):
|
||||
"""
|
||||
Clear the entire cache and rebuild the dependency graph.
|
||||
|
||||
Args:
|
||||
dynprompt: The dynamic prompt object containing node information.
|
||||
node_ids: List of node IDs to initialize the cache for.
|
||||
is_changed_cache: Flag indicating if the cache has changed.
|
||||
"""
|
||||
# Clear all existing cache data
|
||||
self.cache.clear()
|
||||
self.subcaches.clear()
|
||||
self.descendants.clear()
|
||||
self.ancestors.clear()
|
||||
self.executed_nodes.clear()
|
||||
|
||||
# Call the parent method to initialize the cache with the new prompt
|
||||
super().set_prompt(dynprompt, node_ids, is_changed_cache)
|
||||
|
||||
# Rebuild the dependency graph
|
||||
self._build_dependency_graph(dynprompt, node_ids)
|
||||
|
||||
def _build_dependency_graph(self, dynprompt, node_ids):
|
||||
"""
|
||||
Build the dependency graph for all nodes.
|
||||
|
||||
Args:
|
||||
dynprompt: The dynamic prompt object containing node information.
|
||||
node_ids: List of node IDs to build the graph for.
|
||||
"""
|
||||
self.descendants.clear()
|
||||
self.ancestors.clear()
|
||||
for node_id in node_ids:
|
||||
self.descendants[node_id] = set()
|
||||
self.ancestors[node_id] = set()
|
||||
|
||||
for node_id in node_ids:
|
||||
inputs = dynprompt.get_node(node_id)["inputs"]
|
||||
for input_data in inputs.values():
|
||||
if is_link(input_data): # Check if the input is a link to another node
|
||||
ancestor_id = input_data[0]
|
||||
self.descendants[ancestor_id].add(node_id)
|
||||
self.ancestors[node_id].add(ancestor_id)
|
||||
|
||||
def set(self, node_id, value):
|
||||
"""
|
||||
Mark a node as executed and store its value in the cache.
|
||||
|
||||
Args:
|
||||
node_id: The ID of the node to store.
|
||||
value: The value to store for the node.
|
||||
"""
|
||||
self._set_immediate(node_id, value)
|
||||
self.executed_nodes.add(node_id)
|
||||
self._cleanup_ancestors(node_id)
|
||||
|
||||
def get(self, node_id):
|
||||
"""
|
||||
Retrieve the cached value for a node.
|
||||
|
||||
Args:
|
||||
node_id: The ID of the node to retrieve.
|
||||
|
||||
Returns:
|
||||
The cached value for the node.
|
||||
"""
|
||||
return self._get_immediate(node_id)
|
||||
|
||||
def ensure_subcache_for(self, node_id, children_ids):
|
||||
"""
|
||||
Ensure a subcache exists for a node and update dependencies.
|
||||
|
||||
Args:
|
||||
node_id: The ID of the parent node.
|
||||
children_ids: List of child node IDs to associate with the parent node.
|
||||
|
||||
Returns:
|
||||
The subcache object for the node.
|
||||
"""
|
||||
subcache = super()._ensure_subcache(node_id, children_ids)
|
||||
for child_id in children_ids:
|
||||
self.descendants[node_id].add(child_id)
|
||||
self.ancestors[child_id].add(node_id)
|
||||
return subcache
|
||||
|
||||
def _cleanup_ancestors(self, node_id):
|
||||
"""
|
||||
Check if ancestors of a node can be removed from the cache.
|
||||
|
||||
Args:
|
||||
node_id: The ID of the node whose ancestors are to be checked.
|
||||
"""
|
||||
for ancestor_id in self.ancestors.get(node_id, []):
|
||||
if ancestor_id in self.executed_nodes:
|
||||
# Remove ancestor if all its descendants have been executed
|
||||
if all(descendant in self.executed_nodes for descendant in self.descendants[ancestor_id]):
|
||||
self._remove_node(ancestor_id)
|
||||
|
||||
def _remove_node(self, node_id):
|
||||
"""
|
||||
Remove a node from the cache.
|
||||
|
||||
Args:
|
||||
node_id: The ID of the node to remove.
|
||||
"""
|
||||
cache_key = self.cache_key_set.get_data_key(node_id)
|
||||
if cache_key in self.cache:
|
||||
del self.cache[cache_key]
|
||||
subcache_key = self.cache_key_set.get_subcache_key(node_id)
|
||||
if subcache_key in self.subcaches:
|
||||
del self.subcaches[subcache_key]
|
||||
|
||||
def clean_unused(self):
|
||||
"""
|
||||
Clean up unused nodes. This is a no-op for this cache implementation.
|
||||
"""
|
||||
pass
|
||||
|
||||
def recursive_debug_dump(self):
|
||||
"""
|
||||
Dump the cache and dependency graph for debugging.
|
||||
|
||||
Returns:
|
||||
A list containing the cache state and dependency graph.
|
||||
"""
|
||||
result = super().recursive_debug_dump()
|
||||
result.append({
|
||||
"descendants": self.descendants,
|
||||
"ancestors": self.ancestors,
|
||||
"executed_nodes": list(self.executed_nodes),
|
||||
})
|
||||
return result
|
||||
|
||||
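The class above keeps a node's result only until every node that consumes it has run. A tiny self-contained sketch of that eviction rule, using a hypothetical three-node graph rather than the real BasicCache machinery:

# Minimal illustration of dependency-aware eviction: keep a node's result only
# until all of its direct consumers (descendants) have executed.
def simulate(edges, execution_order):
    # edges: dict mapping node_id -> list of consumer node_ids (hypothetical graph)
    executed, cache = set(), {}
    for node in execution_order:
        cache[node] = f"output of {node}"   # stand-in for the real node result
        executed.add(node)
        # Evict any producer whose consumers have all finished.
        for producer, consumers in edges.items():
            if producer in cache and consumers and all(c in executed for c in consumers):
                del cache[producer]
        print(node, "->", sorted(cache))

simulate({"a": ["b", "c"], "b": ["c"], "c": []}, ["a", "b", "c"])
# after "c" runs, only "c" remains cached; "a" and "b" were evicted once consumed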
@ -81,6 +81,7 @@ def _create_parser() -> EnhancedConfigArgParser:
|
||||
help="Store text encoder weights in fp8 (e5m2 variant).")
|
||||
fpte_group.add_argument("--fp16-text-enc", action="store_true", help="Store text encoder weights in fp16.")
|
||||
fpte_group.add_argument("--fp32-text-enc", action="store_true", help="Store text encoder weights in fp32.")
|
||||
fpte_group.add_argument("--bf16-text-enc", action="store_true", help="Store text encoder weights in bf16.")
|
||||
|
||||
parser.add_argument("--directml", type=int, nargs="?", metavar="DIRECTML_DEVICE", const=-1,
|
||||
help="Use torch-directml.")
|
||||
@ -96,6 +97,7 @@ def _create_parser() -> EnhancedConfigArgParser:
|
||||
cache_group = parser.add_mutually_exclusive_group()
|
||||
cache_group.add_argument("--cache-classic", action="store_true", help="WARNING: Unused. Use the old style (aggressive) caching.")
|
||||
cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
|
||||
cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.")
|
||||
attn_group = parser.add_mutually_exclusive_group()
|
||||
attn_group.add_argument("--use-split-cross-attention", action="store_true",
|
||||
help="Use the split cross attention optimization. Ignored when xformers is used.")
|
||||
@ -129,7 +131,7 @@ def _create_parser() -> EnhancedConfigArgParser:
|
||||
parser.add_argument("--deterministic", action="store_true",
|
||||
help="Make pytorch use slower deterministic algorithms when it can. Note that this might not make images deterministic in all cases.")
|
||||
|
||||
parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. Pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult")
|
||||
parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. Pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult cublas_ops")
|
||||
|
||||
parser.add_argument("--dont-print-server", action="store_true", help="Don't print server output.")
|
||||
parser.add_argument("--quick-test-for-ci", action="store_true", help="Quick test for CI. Raises an error if nodes cannot be imported,")
|
||||
|
||||
@ -45,6 +45,7 @@ def is_valid_directory(path: str) -> str:
|
||||
class PerformanceFeature(enum.Enum):
|
||||
Fp16Accumulation = "fp16_accumulation"
|
||||
Fp8MatrixMultiplication = "fp8_matrix_mult"
|
||||
CublasOps = "cublas_ops"
|
||||
|
||||
|
||||
class Configuration(dict):
|
||||
@ -105,7 +106,7 @@ class Configuration(dict):
|
||||
lowvram (bool): Reduce UNet's VRAM usage.
|
||||
novram (bool): Minimize VRAM usage.
|
||||
cpu (bool): Use CPU for processing.
|
||||
fast (set[PerformanceFeature]): Enable some untested and potentially quality deteriorating optimizations. Pass a list of specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult
|
||||
fast (set[PerformanceFeature]): Enable some untested and potentially quality deteriorating optimizations. Pass a list of specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult cublas_ops
|
||||
reserve_vram (Optional[float]): Set the amount of vram in GB you want to reserve for use by your OS/other software. By default some amount is reserved depending on your OS
|
||||
disable_smart_memory (bool): Disable smart memory management.
|
||||
deterministic (bool): Use deterministic algorithms where possible.
|
||||
|
||||
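Since --fast is declared with nargs="*" and type=PerformanceFeature, the flag accepts zero or more of the enum's string values. A small standalone sketch of that argparse pattern (the parser and arguments below are illustrative, not the real ComfyUI parser):

import argparse
import enum

class PerformanceFeature(enum.Enum):
    Fp16Accumulation = "fp16_accumulation"
    Fp8MatrixMultiplication = "fp8_matrix_mult"
    CublasOps = "cublas_ops"

parser = argparse.ArgumentParser()
# nargs="*" means the flag can be omitted (None), passed bare (empty list),
# or given specific features, e.g. --fast fp16_accumulation cublas_ops
parser.add_argument("--fast", nargs="*", type=PerformanceFeature, default=None)

args = parser.parse_args(["--fast", "fp16_accumulation", "cublas_ops"])
print(args.fast)  # two PerformanceFeature members parsed from their string values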
@ -124,9 +124,13 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
|
||||
elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd:
|
||||
json_config = files.get_path_as_dict(None, "clip_vision_config_h.json")
|
||||
elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
|
||||
embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
|
||||
if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
|
||||
json_config = files.get_path_as_dict(None, "clip_vision_siglip_384.json")
|
||||
elif sd["vision_model.embeddings.position_embedding.weight"].shape[0] == 577:
|
||||
if embed_shape == 729:
|
||||
json_config = files.get_path_as_dict(None, "clip_vision_siglip_384.json")
|
||||
elif embed_shape == 1024:
|
||||
json_config = files.get_path_as_dict(None, "clip_vision_siglip_512.json")
|
||||
elif embed_shape == 577:
|
||||
if "multi_modal_projector.linear_1.bias" in sd:
|
||||
json_config = files.get_path_as_dict(None, "clip_vision_config_vitl_336_llava.json")
|
||||
else:
|
||||
|
||||
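The hunk above replaces the single 577-token check with a dispatch on the position-embedding count: 729 selects the 384px SigLIP config, 1024 the new 512px config, and 577 the ViT-L/336 variants. A hedged sketch of that dispatch as a hypothetical helper; only the branches visible in this hunk are covered:

def select_clip_vision_config(sd: dict) -> str:
    # Hypothetical helper mirroring the dispatch in the hunk above.
    embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
    if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
        if embed_shape == 729:      # 27*27 patches (384 // 14)
            return "clip_vision_siglip_384.json"
        if embed_shape == 1024:     # 32*32 patches (512 // 16)
            return "clip_vision_siglip_512.json"
    if embed_shape == 577:          # 24*24 patches + 1 class token (336 // 14)
        if "multi_modal_projector.linear_1.bias" in sd:
            return "clip_vision_config_vitl_336_llava.json"
    # remaining branches of the real function fall outside this hunk
    raise NotImplementedError("checkpoint variant not covered by this sketch")

class _Shape:          # tiny stand-in so the sketch runs without real tensors
    def __init__(self, *dims): self.shape = dims

fake_sd = {
    "vision_model.embeddings.position_embedding.weight": _Shape(1024, 1152),
    "vision_model.encoder.layers.0.layer_norm1.weight": _Shape(1152),
}
assert select_clip_vision_config(fake_sd) == "clip_vision_siglip_512.json"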
comfy/clip_vision_siglip_512.json (new file, 13 lines)
@ -0,0 +1,13 @@
{
"num_channels": 3,
"hidden_act": "gelu_pytorch_tanh",
"hidden_size": 1152,
"image_size": 512,
"intermediate_size": 4304,
"model_type": "siglip_vision_model",
"num_attention_heads": 16,
"num_hidden_layers": 27,
"patch_size": 16,
"image_mean": [0.5, 0.5, 0.5],
"image_std": [0.5, 0.5, 0.5]
}

@ -11,8 +11,9 @@ import time
|
||||
import traceback
|
||||
import typing
|
||||
from contextlib import nullcontext
|
||||
from enum import Enum
|
||||
from os import PathLike
|
||||
from typing import List, Optional, Tuple
|
||||
from typing import List, Optional, Tuple, Literal
|
||||
|
||||
import torch
|
||||
from opentelemetry.trace import get_current_span, StatusCode, Status
|
||||
@ -20,7 +21,7 @@ from opentelemetry.trace import get_current_span, StatusCode, Status
|
||||
from .main_pre import tracer
|
||||
from .. import interruption
|
||||
from .. import model_management
|
||||
from ..caching import HierarchicalCache, LRUCache, CacheKeySetInputSignature, CacheKeySetID
|
||||
from ..caching import HierarchicalCache, LRUCache, CacheKeySetInputSignature, CacheKeySetID, DependencyAwareCache
|
||||
from ..cli_args import args
|
||||
from ..component_model.abstract_prompt_queue import AbstractPromptQueue
|
||||
from ..component_model.executor_types import ExecutorToClientProgress, ValidationTuple, ValidateInputsTuple, \
|
||||
@ -80,22 +81,44 @@ class IsChangedCache:
|
||||
return self.is_changed[node_id]
|
||||
|
||||
|
||||
class CacheSet:
|
||||
def __init__(self, lru_size=None):
|
||||
if lru_size is None or lru_size == 0:
|
||||
# Performs like the old cache -- dump data ASAP
|
||||
class CacheType(Enum):
|
||||
CLASSIC = 0
|
||||
LRU = 1
|
||||
DEPENDENCY_AWARE = 2
|
||||
|
||||
self.outputs = HierarchicalCache(CacheKeySetInputSignature)
|
||||
self.ui = HierarchicalCache(CacheKeySetInputSignature)
|
||||
self.objects = HierarchicalCache(CacheKeySetID)
|
||||
|
||||
class CacheSet:
|
||||
def __init__(self, cache_type=None, cache_size=None):
|
||||
if cache_type == CacheType.DEPENDENCY_AWARE:
|
||||
self.init_dependency_aware_cache()
|
||||
logging.info("Disabling intermediate node cache.")
|
||||
elif cache_type == CacheType.LRU:
|
||||
if cache_size is None:
|
||||
cache_size = 0
|
||||
self.init_lru_cache(cache_size)
|
||||
logging.info("Using LRU cache")
|
||||
else:
|
||||
# Useful for those with ample RAM/VRAM -- allows experimenting without
|
||||
# blowing away the cache every time
|
||||
self.outputs = LRUCache(CacheKeySetInputSignature, max_size=lru_size)
|
||||
self.ui = LRUCache(CacheKeySetInputSignature, max_size=lru_size)
|
||||
self.objects = HierarchicalCache(CacheKeySetID)
|
||||
self.init_classic_cache()
|
||||
|
||||
self.all = [self.outputs, self.ui, self.objects]
|
||||
|
||||
# Performs like the old cache -- dump data ASAP
|
||||
def init_classic_cache(self):
|
||||
self.outputs = HierarchicalCache(CacheKeySetInputSignature)
|
||||
self.ui = HierarchicalCache(CacheKeySetInputSignature)
|
||||
self.objects = HierarchicalCache(CacheKeySetID)
|
||||
|
||||
def init_lru_cache(self, cache_size):
|
||||
self.outputs = LRUCache(CacheKeySetInputSignature, max_size=cache_size)
|
||||
self.ui = LRUCache(CacheKeySetInputSignature, max_size=cache_size)
|
||||
self.objects = HierarchicalCache(CacheKeySetID)
|
||||
|
||||
# only hold cached items while the descendants have not executed
|
||||
def init_dependency_aware_cache(self):
|
||||
self.outputs = DependencyAwareCache(CacheKeySetInputSignature)
|
||||
self.ui = DependencyAwareCache(CacheKeySetInputSignature)
|
||||
self.objects = DependencyAwareCache(CacheKeySetID)
|
||||
|
||||
def recursive_debug_dump(self):
|
||||
result = {
|
||||
"outputs": self.outputs.recursive_debug_dump(),
|
||||
@ -114,7 +137,7 @@ def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, e
|
||||
missing_keys = {}
|
||||
for x in inputs:
|
||||
input_data = inputs[x]
|
||||
input_type, input_category, input_info = get_input_info(class_def, x, valid_inputs)
|
||||
_, input_category, input_info = get_input_info(class_def, x, valid_inputs)
|
||||
|
||||
def mark_missing():
|
||||
missing_keys[x] = True
|
||||
@ -150,6 +173,8 @@ def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, e
|
||||
input_data_all[x] = [extra_data.get('extra_pnginfo', None)]
|
||||
if h[x] == "UNIQUE_ID":
|
||||
input_data_all[x] = [unique_id]
|
||||
if h[x] == "AUTH_TOKEN_COMFY_ORG":
|
||||
input_data_all[x] = [extra_data.get("auth_token_comfy_org", None)]
|
||||
return input_data_all, missing_keys
|
||||
|
||||
|
||||
@ -502,9 +527,10 @@ def _execute(server, dynprompt, caches: CacheSet, current_item: str, extra_data,
|
||||
|
||||
|
||||
class PromptExecutor:
|
||||
def __init__(self, server: ExecutorToClientProgress, lru_size=None):
|
||||
def __init__(self, server: ExecutorToClientProgress, cache_type: CacheType | Literal[False] = False, cache_size: int | None = None):
|
||||
self.success = None
|
||||
self.lru_size = lru_size
|
||||
self.cache_size = cache_size
|
||||
self.cache_type = cache_type
|
||||
self.server = server
|
||||
self.raise_exceptions = False
|
||||
self.reset()
|
||||
@ -512,7 +538,7 @@ class PromptExecutor:
|
||||
|
||||
def reset(self):
|
||||
self.success = True
|
||||
self.caches = CacheSet(self.lru_size)
|
||||
self.caches = CacheSet(cache_type=self.cache_type, cache_size=self.cache_size)
|
||||
self.status_messages = []
|
||||
|
||||
def add_message(self, event, data: dict, broadcast: bool):
|
||||
@ -682,7 +708,7 @@ def validate_inputs(prompt, item, validated: typing.Dict[str, ValidateInputsTupl
|
||||
received_types = {}
|
||||
|
||||
for x in valid_inputs:
|
||||
type_input, input_category, extra_info = get_input_info(obj_class, x, class_inputs)
|
||||
input_type, input_category, extra_info = get_input_info(obj_class, x, class_inputs)
|
||||
assert extra_info is not None
|
||||
if x not in inputs:
|
||||
if input_category == "required":
|
||||
@ -698,7 +724,7 @@ def validate_inputs(prompt, item, validated: typing.Dict[str, ValidateInputsTupl
|
||||
continue
|
||||
|
||||
val = inputs[x]
|
||||
info: InputTypeSpec = (type_input, extra_info)
|
||||
info: InputTypeSpec = (input_type, extra_info)
|
||||
if isinstance(val, list):
|
||||
if len(val) != 2:
|
||||
error = {
|
||||
@ -721,8 +747,8 @@ def validate_inputs(prompt, item, validated: typing.Dict[str, ValidateInputsTupl
|
||||
received_types[x] = received_type
|
||||
any_enum = received_type == [] and (isinstance(type_input, list) or isinstance(type_input, tuple))
|
||||
|
||||
if 'input_types' not in validate_function_inputs and not validate_node_input(received_type, type_input) and not any_enum:
|
||||
details = f"{x}, {received_type} != {type_input}"
|
||||
if 'input_types' not in validate_function_inputs and not validate_node_input(received_type, input_type) and not any_enum:
|
||||
details = f"{x}, {received_type} != {input_type}"
|
||||
error = {
|
||||
"type": "return_type_mismatch",
|
||||
"message": "Return type mismatch between linked nodes",
|
||||
@ -770,22 +796,22 @@ def validate_inputs(prompt, item, validated: typing.Dict[str, ValidateInputsTupl
|
||||
val = val["__value__"]
|
||||
inputs[x] = val
|
||||
|
||||
if type_input == "INT":
|
||||
if input_type == "INT":
|
||||
val = int(val)
|
||||
inputs[x] = val
|
||||
if type_input == "FLOAT":
|
||||
if input_type == "FLOAT":
|
||||
val = float(val)
|
||||
inputs[x] = val
|
||||
if type_input == "STRING":
|
||||
if input_type == "STRING":
|
||||
val = str(val)
|
||||
inputs[x] = val
|
||||
if type_input == "BOOLEAN":
|
||||
if input_type == "BOOLEAN":
|
||||
val = bool(val)
|
||||
inputs[x] = val
|
||||
except Exception as ex:
|
||||
error = {
|
||||
"type": "invalid_input_type",
|
||||
"message": f"Failed to convert an input value to a {type_input} value",
|
||||
"message": f"Failed to convert an input value to a {input_type} value",
|
||||
"details": f"{x}, {val}, {ex}",
|
||||
"extra_info": {
|
||||
"input_name": x,
|
||||
@ -826,23 +852,24 @@ def validate_inputs(prompt, item, validated: typing.Dict[str, ValidateInputsTupl
|
||||
errors.append(error)
|
||||
continue
|
||||
|
||||
if isinstance(type_input, list):
|
||||
if isinstance(input_type, list):
|
||||
combo_options = input_type
|
||||
if "\\" in val:
|
||||
# try to normalize paths for comparison purposes
|
||||
val = canonicalize_path(val)
|
||||
if all(isinstance(item, (str, PathLike)) for item in type_input):
|
||||
type_input = [canonicalize_path(item) for item in type_input]
|
||||
if val not in type_input:
|
||||
if val not in combo_options:
|
||||
input_config = info
|
||||
list_info = ""
|
||||
|
||||
# Don't send back gigantic lists like if they're lots of
|
||||
# scanned model filepaths
|
||||
if len(type_input) > 20:
|
||||
list_info = f"(list of length {len(type_input)})"
|
||||
if len(combo_options) > 20:
|
||||
list_info = f"(list of length {len(combo_options)})"
|
||||
input_config = None
|
||||
else:
|
||||
list_info = str(type_input)
|
||||
list_info = str(combo_options)
|
||||
|
||||
error = {
|
||||
"type": "value_not_in_list",
|
||||
@ -935,7 +962,7 @@ def _validate_prompt(prompt: typing.Mapping[str, typing.Any]) -> ValidationTuple
|
||||
"details": f"Node ID '#{x}'",
|
||||
"extra_info": {}
|
||||
}
|
||||
return ValidationTuple(False, error, [], [])
|
||||
return ValidationTuple(False, error, [], {})
|
||||
|
||||
class_type = prompt[x]['class_type']
|
||||
class_ = get_nodes().NODE_CLASS_MAPPINGS.get(class_type, None)
|
||||
@ -946,7 +973,7 @@ def _validate_prompt(prompt: typing.Mapping[str, typing.Any]) -> ValidationTuple
|
||||
"details": f"Node ID '#{x}'",
|
||||
"extra_info": {}
|
||||
}
|
||||
return ValidationTuple(False, error, [], [])
|
||||
return ValidationTuple(False, error, [], {})
|
||||
|
||||
if hasattr(class_, 'OUTPUT_NODE') and class_.OUTPUT_NODE is True:
|
||||
outputs.add(x)
|
||||
@ -958,7 +985,7 @@ def _validate_prompt(prompt: typing.Mapping[str, typing.Any]) -> ValidationTuple
|
||||
"details": "",
|
||||
"extra_info": {}
|
||||
}
|
||||
return ValidationTuple(False, error, [], [])
|
||||
return ValidationTuple(False, error, [], {})
|
||||
|
||||
good_outputs = set()
|
||||
errors = []
|
||||
|
||||
@ -423,7 +423,7 @@ def invalidate_cache(folder_name):
|
||||
pass
|
||||
|
||||
|
||||
def filter_files_content_types(files: list[str], content_types: list[Literal["image", "video", "audio"]]) -> list[str]:
|
||||
def filter_files_content_types(files: list[str], content_types: list[Literal["image", "video", "audio", "model"]]) -> list[str]:
|
||||
"""
|
||||
Example:
|
||||
files = os.listdir(folder_paths.get_input_directory())
|
||||
|
||||
@ -103,4 +103,4 @@ def create_directories(paths: Optional[FolderNames] = ...) -> None: ...
|
||||
def invalidate_cache(folder_name: str) -> None: ...
|
||||
|
||||
|
||||
def filter_files_content_types(files: List[str], content_types: List[Literal["image", "video", "audio"]]) -> List[str]: ...
|
||||
def filter_files_content_types(files: List[str], content_types: List[Literal["image", "video", "audio", "model"]]) -> List[str]: ...
|
||||
|
||||
@ -28,9 +28,15 @@ logger = logging.getLogger(__name__)


def prompt_worker(q: AbstractPromptQueue, _server: server_module.PromptServer):
from ..cmd.execution import PromptExecutor
from ..cmd.execution import PromptExecutor, CacheType
cache_type = CacheType.CLASSIC
if args.cache_lru > 0:
cache_type = CacheType.LRU
elif args.cache_none:
cache_type = CacheType.DEPENDENCY_AWARE

e = PromptExecutor(_server)

e = PromptExecutor(_server, cache_type=cache_type, cache_size=args.cache_lru)
last_gc_collect = 0
need_gc = False
gc_collect_interval = 10.0

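The flag-to-cache mapping introduced here is: default gives CacheType.CLASSIC, --cache-lru N gives CacheType.LRU with N entries, and --cache-none gives CacheType.DEPENDENCY_AWARE. The same selection, written as a small pure function for illustration (names are not part of the real code):

from enum import Enum

class CacheType(Enum):
    CLASSIC = 0
    LRU = 1
    DEPENDENCY_AWARE = 2

def pick_cache_type(cache_lru: int, cache_none: bool) -> CacheType:
    # Mirrors the branch in prompt_worker above.
    if cache_lru > 0:
        return CacheType.LRU
    if cache_none:
        return CacheType.DEPENDENCY_AWARE
    return CacheType.CLASSIC

assert pick_cache_type(16, False) is CacheType.LRU
assert pick_cache_type(0, True) is CacheType.DEPENDENCY_AWARE
assert pick_cache_type(0, False) is CacheType.CLASSIC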
@ -75,7 +75,7 @@ def get_comfyui_version():
|
||||
@web.middleware
|
||||
async def cache_control(request: web.Request, handler):
|
||||
response: web.Response = await handler(request)
|
||||
if request.path.endswith('.js') or request.path.endswith('.css'):
|
||||
if request.path.endswith('.js') or request.path.endswith('.css') or request.path.endswith('index.json'):
|
||||
response.headers.setdefault('Cache-Control', 'no-cache')
|
||||
return response
|
||||
|
||||
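The middleware now also marks index.json as no-cache so clients re-validate it after upgrades. A minimal aiohttp wiring of such a middleware, as a standalone sketch; the static root and port below are illustrative, not ComfyUI's real setup:

from aiohttp import web

@web.middleware
async def cache_control(request: web.Request, handler):
    response: web.Response = await handler(request)
    # no-cache for assets whose names stay stable but whose contents change between releases
    if request.path.endswith(('.js', '.css', 'index.json')):
        response.headers.setdefault('Cache-Control', 'no-cache')
    return response

app = web.Application(middlewares=[cache_control])
app.router.add_static('/', './web')   # illustrative static root
# web.run_app(app, port=8188)         # left commented; port is illustrative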
@ -692,7 +692,13 @@ class PromptServer(ExecutorToClientProgress):
|
||||
logger.warning("invalid prompt: {}".format(valid[1]))
|
||||
return web.json_response({"error": valid[1], "node_errors": valid[3]}, status=400)
|
||||
else:
|
||||
return web.json_response({"error": "no prompt", "node_errors": []}, status=400)
|
||||
error = {
|
||||
"type": "no_prompt",
|
||||
"message": "No prompt provided",
|
||||
"details": "No prompt provided",
|
||||
"extra_info": {}
|
||||
}
|
||||
return web.json_response({"error": error, "node_errors": {}}, status=400)
|
||||
|
||||
@routes.post("/queue")
|
||||
async def post_queue(request):
|
||||
@ -978,6 +984,12 @@ class PromptServer(ExecutorToClientProgress):
|
||||
for name, dir in self.nodes.EXTENSION_WEB_DIRS.items():
|
||||
self.app.add_routes([web.static('/extensions/' + name, dir, follow_symlinks=True)])
|
||||
|
||||
workflow_templates_path = FrontendManager.templates_path()
|
||||
if workflow_templates_path:
|
||||
self.app.add_routes([
|
||||
web.static('/templates', workflow_templates_path)
|
||||
])
|
||||
|
||||
self.app.add_routes([
|
||||
web.static('/', self.web_root, follow_symlinks=True),
|
||||
])
|
||||
|
||||
@ -99,55 +99,59 @@ class InputTypeOptions(TypedDict):
|
||||
Comfy Docs: https://docs.comfy.org/custom-nodes/backend/datatypes
|
||||
"""
|
||||
|
||||
default: bool | str | float | int | list | tuple
|
||||
default: NotRequired[bool | str | float | int | list | tuple]
|
||||
"""The default value of the widget"""
|
||||
defaultInput: bool
|
||||
"""Defaults to an input slot rather than a widget"""
|
||||
forceInput: bool
|
||||
"""`defaultInput` and also don't allow converting to a widget"""
|
||||
lazy: bool
|
||||
defaultInput: NotRequired[bool]
|
||||
"""@deprecated in v1.16 frontend. v1.16 frontend allows input socket and widget to co-exist.
|
||||
- defaultInput on required inputs should be dropped.
|
||||
- defaultInput on optional inputs should be replaced with forceInput.
|
||||
Ref: https://github.com/Comfy-Org/ComfyUI_frontend/pull/3364
|
||||
"""
|
||||
forceInput: NotRequired[bool]
|
||||
"""Forces the input to be an input slot rather than a widget even a widget is available for the input type."""
|
||||
lazy: NotRequired[bool]
|
||||
"""Declares that this input uses lazy evaluation"""
|
||||
rawLink: bool
|
||||
rawLink: NotRequired[bool]
|
||||
"""When a link exists, rather than receiving the evaluated value, you will receive the link (i.e. `["nodeId", <outputIndex>]`). Designed for node expansion."""
|
||||
tooltip: str
|
||||
tooltip: NotRequired[str]
|
||||
"""Tooltip for the input (or widget), shown on pointer hover"""
|
||||
# class InputTypeNumber(InputTypeOptions):
|
||||
# default: float | int
|
||||
min: float
|
||||
min: NotRequired[float]
|
||||
"""The minimum value of a number (``FLOAT`` | ``INT``)"""
|
||||
max: float
|
||||
max: NotRequired[float]
|
||||
"""The maximum value of a number (``FLOAT`` | ``INT``)"""
|
||||
step: float
|
||||
step: NotRequired[float]
|
||||
"""The amount to increment or decrement a widget by when stepping up/down (``FLOAT`` | ``INT``)"""
|
||||
round: float
|
||||
round: NotRequired[float]
|
||||
"""Floats are rounded by this value (``FLOAT``)"""
|
||||
# class InputTypeBoolean(InputTypeOptions):
|
||||
# default: bool
|
||||
label_on: str
|
||||
label_on: NotRequired[str]
|
||||
"""The label to use in the UI when the bool is True (``BOOLEAN``)"""
|
||||
label_off: str
|
||||
label_off: NotRequired[str]
|
||||
"""The label to use in the UI when the bool is False (``BOOLEAN``)"""
|
||||
# class InputTypeString(InputTypeOptions):
|
||||
# default: str
|
||||
multiline: bool
|
||||
multiline: NotRequired[bool]
|
||||
"""Use a multiline text box (``STRING``)"""
|
||||
placeholder: str
|
||||
placeholder: NotRequired[str]
|
||||
"""Placeholder text to display in the UI when empty (``STRING``)"""
|
||||
# Deprecated:
|
||||
# defaultVal: str
|
||||
dynamicPrompts: bool
|
||||
dynamicPrompts: NotRequired[bool]
|
||||
"""Causes the front-end to evaluate dynamic prompts (``STRING``)"""
|
||||
# class InputTypeCombo(InputTypeOptions):
|
||||
image_upload: bool
|
||||
image_upload: NotRequired[bool]
|
||||
"""Specifies whether the input should have an image upload button and image preview attached to it. Requires that the input's name is `image`."""
|
||||
image_folder: Literal["input", "output", "temp"]
|
||||
image_folder: NotRequired[Literal["input", "output", "temp"]]
|
||||
"""Specifies which folder to get preview images from if the input has the ``image_upload`` flag.
|
||||
"""
|
||||
remote: RemoteInputOptions
|
||||
remote: NotRequired[RemoteInputOptions]
|
||||
"""Specifies the configuration for a remote input.
|
||||
Available after ComfyUI frontend v1.9.7
|
||||
https://github.com/Comfy-Org/ComfyUI_frontend/pull/2422"""
|
||||
control_after_generate: bool
|
||||
control_after_generate: NotRequired[bool]
|
||||
"""Specifies whether a control widget should be added to the input, adding options to automatically change the value after each prompt is queued. Currently only used for INT and COMBO types."""
|
||||
options: NotRequired[list[str | int | float]]
|
||||
"""COMBO type only. Specifies the selectable options for the combo widget.
|
||||
@ -165,15 +169,15 @@ class InputTypeOptions(TypedDict):
|
||||
class HiddenInputTypeDict(TypedDict):
|
||||
"""Provides type hinting for the hidden entry of node INPUT_TYPES."""
|
||||
|
||||
node_id: Literal["UNIQUE_ID"]
|
||||
node_id: NotRequired[Literal["UNIQUE_ID"]]
|
||||
"""UNIQUE_ID is the unique identifier of the node, and matches the id property of the node on the client side. It is commonly used in client-server communications (see messages)."""
|
||||
unique_id: Literal["UNIQUE_ID"]
|
||||
unique_id: NotRequired[Literal["UNIQUE_ID"]]
|
||||
"""UNIQUE_ID is the unique identifier of the node, and matches the id property of the node on the client side. It is commonly used in client-server communications (see messages)."""
|
||||
prompt: Literal["PROMPT"]
|
||||
prompt: NotRequired[Literal["PROMPT"]]
|
||||
"""PROMPT is the complete prompt sent by the client to the server. See the prompt object for a full description."""
|
||||
extra_pnginfo: Literal["EXTRA_PNGINFO"]
|
||||
extra_pnginfo: NotRequired[Literal["EXTRA_PNGINFO"]]
|
||||
"""EXTRA_PNGINFO is a dictionary that will be copied into the metadata of any .png files saved. Custom nodes can store additional information in this dictionary for saving (or as a way to communicate with a downstream node)."""
|
||||
dynprompt: Literal["DYNPROMPT"]
|
||||
dynprompt: NotRequired[Literal["DYNPROMPT"]]
|
||||
"""DYNPROMPT is an instance of comfy_execution.graph.DynamicPrompt. It differs from PROMPT in that it may mutate during the course of execution in response to Node Expansion."""
|
||||
|
||||
|
||||
@ -183,11 +187,11 @@ class InputTypeDict(TypedDict):
|
||||
Comfy Docs: https://docs.comfy.org/custom-nodes/backend/more_on_inputs
|
||||
"""
|
||||
|
||||
required: dict[str, tuple[IO, InputTypeOptions]]
|
||||
required: NotRequired[dict[str, tuple[IO, InputTypeOptions]]]
|
||||
"""Describes all inputs that must be connected for the node to execute."""
|
||||
optional: dict[str, tuple[IO, InputTypeOptions]]
|
||||
optional: NotRequired[dict[str, tuple[IO, InputTypeOptions]]]
|
||||
"""Describes inputs which do not need to be connected."""
|
||||
hidden: HiddenInputTypeDict
|
||||
hidden: NotRequired[HiddenInputTypeDict]
|
||||
"""Offers advanced functionality and server-client communication.
|
||||
|
||||
Comfy Docs: https://docs.comfy.org/custom-nodes/backend/more_on_inputs#hidden-inputs
|
||||
|
||||
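With every option now NotRequired, a node's INPUT_TYPES dict only has to spell out the options it actually uses. A hypothetical custom node exercising a few of the documented fields; the node itself is invented for illustration and does no real work:

class ScaleImageExample:
    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "image": ("IMAGE", {}),
                "factor": ("FLOAT", {"default": 1.0, "min": 0.1, "max": 8.0, "step": 0.1,
                                     "tooltip": "Multiplier applied to both dimensions"}),
            },
            "optional": {
                "note": ("STRING", {"multiline": True, "placeholder": "optional comment"}),
            },
            "hidden": {"unique_id": "UNIQUE_ID"},
        }

    RETURN_TYPES = ("IMAGE",)
    FUNCTION = "scale"
    CATEGORY = "image/transform"

    def scale(self, image, factor, note=None, unique_id=None):
        # a real implementation would resize the tensor; omitted in this sketch
        return (image,)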
@ -16,6 +16,7 @@ from .platform_path import construct_path
|
||||
supported_pt_extensions = frozenset(['.ckpt', '.pt', '.pt2', '.bin', '.pth', '.safetensors', '.pkl', '.sft', '.index.json'])
|
||||
extension_mimetypes_cache = {
|
||||
"webp": "image",
|
||||
"fbx": "model",
|
||||
}
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@ -122,6 +123,7 @@ class PathsList:
|
||||
p: FolderNames = self.parent()
|
||||
return len(list(p.directory_paths(self.folder_name)))
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class SupportedExtensions:
|
||||
folder_name: str
|
||||
|
||||
@ -817,6 +817,8 @@ def load_controlnet_state_dict(state_dict, model=None, model_options=None, ckpt_
|
||||
def load_controlnet(ckpt_path, model=None, model_options=None):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
model_options = model_options.copy()
|
||||
|
||||
if "global_average_pooling" not in model_options:
|
||||
filename = os.path.splitext(ckpt_path)[0]
|
||||
if filename.endswith("_shuffle") or filename.endswith("_shuffle_fp16"): # TODO: smarter way of enabling global_average_pooling
|
||||
|
||||
@ -1,5 +1,8 @@
|
||||
from typing import Optional
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional, Type, Literal
|
||||
|
||||
from .comfy_types.node_typing import ComfyNodeABC, InputTypeDict, InputTypeOptions
|
||||
from .component_model.executor_types import DependencyCycleError, NodeInputError, NodeNotFoundError, \
|
||||
DependencyExecutionErrorMessage
|
||||
from .graph_utils import is_link
|
||||
@ -50,7 +53,21 @@ class DynamicPrompt:
|
||||
return self.original_prompt
|
||||
|
||||
|
||||
def get_input_info(class_def, input_name, valid_inputs=None):
|
||||
def get_input_info(
|
||||
class_def: Type[ComfyNodeABC],
|
||||
input_name: str,
|
||||
valid_inputs: InputTypeDict | None = None
|
||||
) -> tuple[str, Literal["required", "optional", "hidden"], InputTypeOptions] | tuple[None, None, None]:
|
||||
"""Get the input type, category, and extra info for a given input name.
|
||||
|
||||
Arguments:
|
||||
class_def: The class definition of the node.
|
||||
input_name: The name of the input to get info for.
|
||||
valid_inputs: The valid inputs for the node, or None to use the class_def.INPUT_TYPES().
|
||||
|
||||
Returns:
|
||||
tuple[str, str, dict] | tuple[None, None, None]: The input type, category, and extra info for the input name.
|
||||
"""
|
||||
valid_inputs = valid_inputs or class_def.INPUT_TYPES()
|
||||
input_info = None
|
||||
input_category = None
|
||||
@ -123,7 +140,7 @@ class TopologicalSort:
|
||||
from_node_id, from_socket = value
|
||||
if subgraph_nodes is not None and from_node_id not in subgraph_nodes:
|
||||
continue
|
||||
input_type, input_category, input_info = self.get_input_info(unique_id, input_name)
|
||||
_, _, input_info = self.get_input_info(unique_id, input_name)
|
||||
is_lazy = input_info is not None and "lazy" in input_info and input_info["lazy"]
|
||||
if (include_lazy or not is_lazy) and not self.is_cached(from_node_id):
|
||||
node_ids.append(from_node_id)
|
||||
|
||||
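Callers that only need part of the returned triple now discard the rest with underscores, as in the TopologicalSort hunk above. A self-contained sketch of the return shape, using a toy node class and a simplified stand-in for get_input_info:

class ToyNode:
    @classmethod
    def INPUT_TYPES(cls):
        return {"required": {"image": ("IMAGE", {"lazy": True})}}

def get_input_info_sketch(class_def, input_name, valid_inputs=None):
    # Simplified stand-in with the same (type, category, extra_info) return shape.
    valid_inputs = valid_inputs or class_def.INPUT_TYPES()
    for category in ("required", "optional", "hidden"):
        entry = valid_inputs.get(category, {}).get(input_name)
        if entry is not None:
            extra_info = entry[1] if len(entry) > 1 else {}
            return entry[0], category, extra_info
    return None, None, None

input_type, category, extra_info = get_input_info_sketch(ToyNode, "image")
assert (input_type, category) == ("IMAGE", "required")
_, _, info_only = get_input_info_sketch(ToyNode, "image")   # callers that need just the options
assert info_only.get("lazy") is True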
@ -1466,3 +1466,101 @@ def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None
|
||||
x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * (sigmas[i + 1] ** 2 - sigmas[i] ** 2 * r ** 2).sqrt().nan_to_num(nan=0.0)
|
||||
old_denoised = denoised
|
||||
return x
|
||||
|
||||
@torch.no_grad()
|
||||
def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=0.5):
|
||||
'''
|
||||
SEEDS-2 - Stochastic Explicit Exponential Derivative-free Solvers (VE Data Prediction) stage 2
|
||||
Arxiv: https://arxiv.org/abs/2305.14267
|
||||
'''
|
||||
extra_args = {} if extra_args is None else extra_args
|
||||
seed = extra_args.get("seed", None)
|
||||
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
|
||||
s_in = x.new_ones([x.shape[0]])
|
||||
|
||||
inject_noise = eta > 0 and s_noise > 0
|
||||
|
||||
for i in trange(len(sigmas) - 1, disable=disable):
|
||||
denoised = model(x, sigmas[i] * s_in, **extra_args)
|
||||
if callback is not None:
|
||||
callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
|
||||
if sigmas[i + 1] == 0:
|
||||
x = denoised
|
||||
else:
|
||||
t, t_next = -sigmas[i].log(), -sigmas[i + 1].log()
|
||||
h = t_next - t
|
||||
h_eta = h * (eta + 1)
|
||||
s = t + r * h
|
||||
fac = 1 / (2 * r)
|
||||
sigma_s = s.neg().exp()
|
||||
|
||||
coeff_1, coeff_2 = (-r * h_eta).expm1(), (-h_eta).expm1()
|
||||
if inject_noise:
|
||||
noise_coeff_1 = (-2 * r * h * eta).expm1().neg().sqrt()
|
||||
noise_coeff_2 = ((-2 * r * h * eta).expm1() - (-2 * h * eta).expm1()).sqrt()
|
||||
noise_1, noise_2 = noise_sampler(sigmas[i], sigma_s), noise_sampler(sigma_s, sigmas[i + 1])
|
||||
|
||||
# Step 1
|
||||
x_2 = (coeff_1 + 1) * x - coeff_1 * denoised
|
||||
if inject_noise:
|
||||
x_2 = x_2 + sigma_s * (noise_coeff_1 * noise_1) * s_noise
|
||||
denoised_2 = model(x_2, sigma_s * s_in, **extra_args)
|
||||
|
||||
# Step 2
|
||||
denoised_d = (1 - fac) * denoised + fac * denoised_2
|
||||
x = (coeff_2 + 1) * x - coeff_2 * denoised_d
|
||||
if inject_noise:
|
||||
x = x + sigmas[i + 1] * (noise_coeff_2 * noise_1 + noise_coeff_1 * noise_2) * s_noise
|
||||
return x
|
||||
|
||||
@torch.no_grad()
|
||||
def sample_seeds_3(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r_1=1./3, r_2=2./3):
|
||||
'''
|
||||
SEEDS-3 - Stochastic Explicit Exponential Derivative-free Solvers (VE Data Prediction) stage 3
|
||||
Arxiv: https://arxiv.org/abs/2305.14267
|
||||
'''
|
||||
extra_args = {} if extra_args is None else extra_args
|
||||
seed = extra_args.get("seed", None)
|
||||
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
|
||||
s_in = x.new_ones([x.shape[0]])
|
||||
|
||||
inject_noise = eta > 0 and s_noise > 0
|
||||
|
||||
for i in trange(len(sigmas) - 1, disable=disable):
|
||||
denoised = model(x, sigmas[i] * s_in, **extra_args)
|
||||
if callback is not None:
|
||||
callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
|
||||
if sigmas[i + 1] == 0:
|
||||
x = denoised
|
||||
else:
|
||||
t, t_next = -sigmas[i].log(), -sigmas[i + 1].log()
|
||||
h = t_next - t
|
||||
h_eta = h * (eta + 1)
|
||||
s_1 = t + r_1 * h
|
||||
s_2 = t + r_2 * h
|
||||
sigma_s_1, sigma_s_2 = s_1.neg().exp(), s_2.neg().exp()
|
||||
|
||||
coeff_1, coeff_2, coeff_3 = (-r_1 * h_eta).expm1(), (-r_2 * h_eta).expm1(), (-h_eta).expm1()
|
||||
if inject_noise:
|
||||
noise_coeff_1 = (-2 * r_1 * h * eta).expm1().neg().sqrt()
|
||||
noise_coeff_2 = ((-2 * r_1 * h * eta).expm1() - (-2 * r_2 * h * eta).expm1()).sqrt()
|
||||
noise_coeff_3 = ((-2 * r_2 * h * eta).expm1() - (-2 * h * eta).expm1()).sqrt()
|
||||
noise_1, noise_2, noise_3 = noise_sampler(sigmas[i], sigma_s_1), noise_sampler(sigma_s_1, sigma_s_2), noise_sampler(sigma_s_2, sigmas[i + 1])
|
||||
|
||||
# Step 1
|
||||
x_2 = (coeff_1 + 1) * x - coeff_1 * denoised
|
||||
if inject_noise:
|
||||
x_2 = x_2 + sigma_s_1 * (noise_coeff_1 * noise_1) * s_noise
|
||||
denoised_2 = model(x_2, sigma_s_1 * s_in, **extra_args)
|
||||
|
||||
# Step 2
|
||||
x_3 = (coeff_2 + 1) * x - coeff_2 * denoised + (r_2 / r_1) * (coeff_2 / (r_2 * h_eta) + 1) * (denoised_2 - denoised)
|
||||
if inject_noise:
|
||||
x_3 = x_3 + sigma_s_2 * (noise_coeff_2 * noise_1 + noise_coeff_1 * noise_2) * s_noise
|
||||
denoised_3 = model(x_3, sigma_s_2 * s_in, **extra_args)
|
||||
|
||||
# Step 3
|
||||
x = (coeff_3 + 1) * x - coeff_3 * denoised + (1. / r_2) * (coeff_3 / h_eta + 1) * (denoised_3 - denoised)
|
||||
if inject_noise:
|
||||
x = x + sigmas[i + 1] * (noise_coeff_3 * noise_1 + noise_coeff_2 * noise_2 + noise_coeff_1 * noise_3) * s_noise
|
||||
return x
|
||||
|
||||
@ -1,6 +1,5 @@
import torch

from comfy import ops
from .. import rmsnorm


def pad_to_patch_size(img, patch_size=(2, 2), padding_mode="circular"):
@ -13,20 +12,5 @@ def pad_to_patch_size(img, patch_size=(2, 2), padding_mode="circular"):

return torch.nn.functional.pad(img, pad, mode=padding_mode)

try:
rms_norm_torch = torch.nn.functional.rms_norm # pylint: disable=no-member
except:
rms_norm_torch = None

def rms_norm(x, weight=None, eps=1e-6):
if rms_norm_torch is not None and not (torch.jit.is_tracing() or torch.jit.is_scripting()):
if weight is None:
return rms_norm_torch(x, (x.shape[-1],), eps=eps)
else:
return rms_norm_torch(x, weight.shape, weight=ops.cast_to(weight, dtype=x.dtype, device=x.device), eps=eps)
else:
r = x * torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps)
if weight is None:
return r
else:
return r * ops.cast_to(weight, dtype=x.dtype, device=x.device)
rms_norm = rmsnorm.rms_norm

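The local fallback is replaced by the shared comfy rmsnorm helper; for reference, RMSNorm scales x by the reciprocal root-mean-square over its last dimension. A minimal pure-PyTorch version equivalent to the removed fallback path:

from typing import Optional

import torch

def rms_norm_reference(x: torch.Tensor, weight: Optional[torch.Tensor] = None, eps: float = 1e-6) -> torch.Tensor:
    # RMSNorm over the last dimension: x / sqrt(mean(x^2) + eps), optionally scaled by weight.
    r = x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + eps)
    return r if weight is None else r * weight.to(dtype=x.dtype, device=x.device)

out = rms_norm_reference(torch.randn(2, 8), torch.ones(8))
assert out.shape == (2, 8)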
comfy/ldm/hidream/model.py (new file, 799 lines)
@ -0,0 +1,799 @@
|
||||
from typing import Optional, Tuple, List
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import einops
|
||||
from einops import repeat
|
||||
|
||||
from comfy.ldm.lightricks.model import TimestepEmbedding, Timesteps
|
||||
import torch.nn.functional as F
|
||||
|
||||
from comfy.ldm.flux.math import apply_rope, rope
|
||||
from comfy.ldm.flux.layers import LastLayer
|
||||
|
||||
from comfy.ldm.modules.attention import optimized_attention
|
||||
import comfy.model_management
|
||||
import comfy.ldm.common_dit
|
||||
|
||||
|
||||
# Copied from https://github.com/black-forest-labs/flux/blob/main/src/flux/modules/layers.py
|
||||
class EmbedND(nn.Module):
|
||||
def __init__(self, theta: int, axes_dim: List[int]):
|
||||
super().__init__()
|
||||
self.theta = theta
|
||||
self.axes_dim = axes_dim
|
||||
|
||||
def forward(self, ids: torch.Tensor) -> torch.Tensor:
|
||||
n_axes = ids.shape[-1]
|
||||
emb = torch.cat(
|
||||
[rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
|
||||
dim=-3,
|
||||
)
|
||||
return emb.unsqueeze(2)
|
||||
|
||||
|
||||
class PatchEmbed(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
patch_size=2,
|
||||
in_channels=4,
|
||||
out_channels=1024,
|
||||
dtype=None, device=None, operations=None
|
||||
):
|
||||
super().__init__()
|
||||
self.patch_size = patch_size
|
||||
self.out_channels = out_channels
|
||||
self.proj = operations.Linear(in_channels * patch_size * patch_size, out_channels, bias=True, dtype=dtype, device=device)
|
||||
|
||||
def forward(self, latent):
|
||||
latent = self.proj(latent)
|
||||
return latent
|
||||
|
||||
|
||||
class PooledEmbed(nn.Module):
|
||||
def __init__(self, text_emb_dim, hidden_size, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
self.pooled_embedder = TimestepEmbedding(in_channels=text_emb_dim, time_embed_dim=hidden_size, dtype=dtype, device=device, operations=operations)
|
||||
|
||||
def forward(self, pooled_embed):
|
||||
return self.pooled_embedder(pooled_embed)
|
||||
|
||||
|
||||
class TimestepEmbed(nn.Module):
|
||||
def __init__(self, hidden_size, frequency_embedding_size=256, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
self.time_proj = Timesteps(num_channels=frequency_embedding_size, flip_sin_to_cos=True, downscale_freq_shift=0)
|
||||
self.timestep_embedder = TimestepEmbedding(in_channels=frequency_embedding_size, time_embed_dim=hidden_size, dtype=dtype, device=device, operations=operations)
|
||||
|
||||
def forward(self, timesteps, wdtype):
|
||||
t_emb = self.time_proj(timesteps).to(dtype=wdtype)
|
||||
t_emb = self.timestep_embedder(t_emb)
|
||||
return t_emb
|
||||
|
||||
|
||||
def attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor):
|
||||
return optimized_attention(query.view(query.shape[0], -1, query.shape[-1] * query.shape[-2]), key.view(key.shape[0], -1, key.shape[-1] * key.shape[-2]), value.view(value.shape[0], -1, value.shape[-1] * value.shape[-2]), query.shape[2])
|
||||
|
||||
|
||||
class HiDreamAttnProcessor_flashattn:
|
||||
"""Attention processor used typically in processing the SD3-like self-attention projections."""
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
attn,
|
||||
image_tokens: torch.FloatTensor,
|
||||
image_tokens_masks: Optional[torch.FloatTensor] = None,
|
||||
text_tokens: Optional[torch.FloatTensor] = None,
|
||||
rope: torch.FloatTensor = None,
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> torch.FloatTensor:
|
||||
dtype = image_tokens.dtype
|
||||
batch_size = image_tokens.shape[0]
|
||||
|
||||
query_i = attn.q_rms_norm(attn.to_q(image_tokens)).to(dtype=dtype)
|
||||
key_i = attn.k_rms_norm(attn.to_k(image_tokens)).to(dtype=dtype)
|
||||
value_i = attn.to_v(image_tokens)
|
||||
|
||||
inner_dim = key_i.shape[-1]
|
||||
head_dim = inner_dim // attn.heads
|
||||
|
||||
query_i = query_i.view(batch_size, -1, attn.heads, head_dim)
|
||||
key_i = key_i.view(batch_size, -1, attn.heads, head_dim)
|
||||
value_i = value_i.view(batch_size, -1, attn.heads, head_dim)
|
||||
if image_tokens_masks is not None:
|
||||
key_i = key_i * image_tokens_masks.view(batch_size, -1, 1, 1)
|
||||
|
||||
if not attn.single:
|
||||
query_t = attn.q_rms_norm_t(attn.to_q_t(text_tokens)).to(dtype=dtype)
|
||||
key_t = attn.k_rms_norm_t(attn.to_k_t(text_tokens)).to(dtype=dtype)
|
||||
value_t = attn.to_v_t(text_tokens)
|
||||
|
||||
query_t = query_t.view(batch_size, -1, attn.heads, head_dim)
|
||||
key_t = key_t.view(batch_size, -1, attn.heads, head_dim)
|
||||
value_t = value_t.view(batch_size, -1, attn.heads, head_dim)
|
||||
|
||||
num_image_tokens = query_i.shape[1]
|
||||
num_text_tokens = query_t.shape[1]
|
||||
query = torch.cat([query_i, query_t], dim=1)
|
||||
key = torch.cat([key_i, key_t], dim=1)
|
||||
value = torch.cat([value_i, value_t], dim=1)
|
||||
else:
|
||||
query = query_i
|
||||
key = key_i
|
||||
value = value_i
|
||||
|
||||
if query.shape[-1] == rope.shape[-3] * 2:
|
||||
query, key = apply_rope(query, key, rope)
|
||||
else:
|
||||
query_1, query_2 = query.chunk(2, dim=-1)
|
||||
key_1, key_2 = key.chunk(2, dim=-1)
|
||||
query_1, key_1 = apply_rope(query_1, key_1, rope)
|
||||
query = torch.cat([query_1, query_2], dim=-1)
|
||||
key = torch.cat([key_1, key_2], dim=-1)
|
||||
|
||||
hidden_states = attention(query, key, value)
|
||||
|
||||
if not attn.single:
|
||||
hidden_states_i, hidden_states_t = torch.split(hidden_states, [num_image_tokens, num_text_tokens], dim=1)
|
||||
hidden_states_i = attn.to_out(hidden_states_i)
|
||||
hidden_states_t = attn.to_out_t(hidden_states_t)
|
||||
return hidden_states_i, hidden_states_t
|
||||
else:
|
||||
hidden_states = attn.to_out(hidden_states)
|
||||
return hidden_states
|
||||
|
||||
class HiDreamAttention(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
query_dim: int,
|
||||
heads: int = 8,
|
||||
dim_head: int = 64,
|
||||
upcast_attention: bool = False,
|
||||
upcast_softmax: bool = False,
|
||||
scale_qk: bool = True,
|
||||
eps: float = 1e-5,
|
||||
processor = None,
|
||||
out_dim: int = None,
|
||||
single: bool = False,
|
||||
dtype=None, device=None, operations=None
|
||||
):
|
||||
# super(Attention, self).__init__()
|
||||
super().__init__()
|
||||
self.inner_dim = out_dim if out_dim is not None else dim_head * heads
|
||||
self.query_dim = query_dim
|
||||
self.upcast_attention = upcast_attention
|
||||
self.upcast_softmax = upcast_softmax
|
||||
self.out_dim = out_dim if out_dim is not None else query_dim
|
||||
|
||||
self.scale_qk = scale_qk
|
||||
self.scale = dim_head**-0.5 if self.scale_qk else 1.0
|
||||
|
||||
self.heads = out_dim // dim_head if out_dim is not None else heads
|
||||
self.sliceable_head_dim = heads
|
||||
self.single = single
|
||||
|
||||
linear_cls = operations.Linear
|
||||
self.linear_cls = linear_cls
|
||||
self.to_q = linear_cls(query_dim, self.inner_dim, dtype=dtype, device=device)
|
||||
self.to_k = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
|
||||
self.to_v = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
|
||||
self.to_out = linear_cls(self.inner_dim, self.out_dim, dtype=dtype, device=device)
|
||||
self.q_rms_norm = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
|
||||
self.k_rms_norm = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
|
||||
|
||||
if not single:
|
||||
self.to_q_t = linear_cls(query_dim, self.inner_dim, dtype=dtype, device=device)
|
||||
self.to_k_t = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
|
||||
self.to_v_t = linear_cls(self.inner_dim, self.inner_dim, dtype=dtype, device=device)
|
||||
self.to_out_t = linear_cls(self.inner_dim, self.out_dim, dtype=dtype, device=device)
|
||||
self.q_rms_norm_t = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
|
||||
self.k_rms_norm_t = operations.RMSNorm(self.inner_dim, eps, dtype=dtype, device=device)
|
||||
|
||||
self.processor = processor
|
||||
|
||||
def forward(
|
||||
self,
|
||||
norm_image_tokens: torch.FloatTensor,
|
||||
image_tokens_masks: torch.FloatTensor = None,
|
||||
norm_text_tokens: torch.FloatTensor = None,
|
||||
rope: torch.FloatTensor = None,
|
||||
) -> torch.Tensor:
|
||||
return self.processor(
|
||||
self,
|
||||
image_tokens = norm_image_tokens,
|
||||
image_tokens_masks = image_tokens_masks,
|
||||
text_tokens = norm_text_tokens,
|
||||
rope = rope,
|
||||
)
|
||||
|
||||
|
||||
class FeedForwardSwiGLU(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
dim: int,
|
||||
hidden_dim: int,
|
||||
multiple_of: int = 256,
|
||||
ffn_dim_multiplier: Optional[float] = None,
|
||||
dtype=None, device=None, operations=None
|
||||
):
|
||||
super().__init__()
|
||||
hidden_dim = int(2 * hidden_dim / 3)
|
||||
# custom dim factor multiplier
|
||||
if ffn_dim_multiplier is not None:
|
||||
hidden_dim = int(ffn_dim_multiplier * hidden_dim)
|
||||
hidden_dim = multiple_of * (
|
||||
(hidden_dim + multiple_of - 1) // multiple_of
|
||||
)
|
||||
|
||||
self.w1 = operations.Linear(dim, hidden_dim, bias=False, dtype=dtype, device=device)
|
||||
self.w2 = operations.Linear(hidden_dim, dim, bias=False, dtype=dtype, device=device)
|
||||
self.w3 = operations.Linear(dim, hidden_dim, bias=False, dtype=dtype, device=device)
|
||||
|
||||
def forward(self, x):
|
||||
return self.w2(torch.nn.functional.silu(self.w1(x)) * self.w3(x))
|
||||
|
||||
|
||||
# Modified from https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/model.py
|
||||
class MoEGate(nn.Module):
|
||||
def __init__(self, embed_dim, num_routed_experts=4, num_activated_experts=2, aux_loss_alpha=0.01, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
self.top_k = num_activated_experts
|
||||
self.n_routed_experts = num_routed_experts
|
||||
|
||||
self.scoring_func = 'softmax'
|
||||
self.alpha = aux_loss_alpha
|
||||
self.seq_aux = False
|
||||
|
||||
# topk selection algorithm
|
||||
self.norm_topk_prob = False
|
||||
self.gating_dim = embed_dim
|
||||
self.weight = nn.Parameter(torch.empty((self.n_routed_experts, self.gating_dim), dtype=dtype, device=device))
|
||||
self.reset_parameters()
|
||||
|
||||
def reset_parameters(self) -> None:
|
||||
pass
|
||||
# import torch.nn.init as init
|
||||
# init.kaiming_uniform_(self.weight, a=math.sqrt(5))
|
||||
|
||||
def forward(self, hidden_states):
|
||||
bsz, seq_len, h = hidden_states.shape
|
||||
|
||||
### compute gating score
|
||||
hidden_states = hidden_states.view(-1, h)
|
||||
logits = F.linear(hidden_states, comfy.model_management.cast_to(self.weight, dtype=hidden_states.dtype, device=hidden_states.device), None)
|
||||
if self.scoring_func == 'softmax':
|
||||
scores = logits.softmax(dim=-1)
|
||||
else:
|
||||
raise NotImplementedError(f'unsupported scoring function for MoE gating: {self.scoring_func}')
|
||||
|
||||
### select top-k experts
|
||||
topk_weight, topk_idx = torch.topk(scores, k=self.top_k, dim=-1, sorted=False)
|
||||
|
||||
### norm gate to sum 1
|
||||
if self.top_k > 1 and self.norm_topk_prob:
|
||||
denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
|
||||
topk_weight = topk_weight / denominator
|
||||
|
||||
aux_loss = None
|
||||
return topk_idx, topk_weight, aux_loss
|
||||
|
||||
|
||||
# Modified from https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/model.py
|
||||
class MOEFeedForwardSwiGLU(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
dim: int,
|
||||
hidden_dim: int,
|
||||
num_routed_experts: int,
|
||||
num_activated_experts: int,
|
||||
dtype=None, device=None, operations=None
|
||||
):
|
||||
super().__init__()
|
||||
self.shared_experts = FeedForwardSwiGLU(dim, hidden_dim // 2, dtype=dtype, device=device, operations=operations)
|
||||
self.experts = nn.ModuleList([FeedForwardSwiGLU(dim, hidden_dim, dtype=dtype, device=device, operations=operations) for i in range(num_routed_experts)])
|
||||
self.gate = MoEGate(
|
||||
embed_dim = dim,
|
||||
num_routed_experts = num_routed_experts,
|
||||
num_activated_experts = num_activated_experts,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
self.num_activated_experts = num_activated_experts
|
||||
|
||||
def forward(self, x):
|
||||
wtype = x.dtype
|
||||
identity = x
|
||||
orig_shape = x.shape
|
||||
topk_idx, topk_weight, aux_loss = self.gate(x)
|
||||
x = x.view(-1, x.shape[-1])
|
||||
flat_topk_idx = topk_idx.view(-1)
|
||||
if True: # self.training: # TODO: check which branch performs faster
|
||||
x = x.repeat_interleave(self.num_activated_experts, dim=0)
|
||||
y = torch.empty_like(x, dtype=wtype)
|
||||
for i, expert in enumerate(self.experts):
|
||||
y[flat_topk_idx == i] = expert(x[flat_topk_idx == i]).to(dtype=wtype)
|
||||
y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
|
||||
y = y.view(*orig_shape).to(dtype=wtype)
|
||||
#y = AddAuxiliaryLoss.apply(y, aux_loss)
|
||||
else:
|
||||
y = self.moe_infer(x, flat_topk_idx, topk_weight.view(-1, 1)).view(*orig_shape)
|
||||
y = y + self.shared_experts(identity)
|
||||
return y
|
||||
|
||||
@torch.no_grad()
|
||||
def moe_infer(self, x, flat_expert_indices, flat_expert_weights):
|
||||
expert_cache = torch.zeros_like(x)
|
||||
idxs = flat_expert_indices.argsort()
|
||||
tokens_per_expert = flat_expert_indices.bincount().cpu().numpy().cumsum(0)
|
||||
token_idxs = idxs // self.num_activated_experts
|
||||
for i, end_idx in enumerate(tokens_per_expert):
|
||||
start_idx = 0 if i == 0 else tokens_per_expert[i-1]
|
||||
if start_idx == end_idx:
|
||||
continue
|
||||
expert = self.experts[i]
|
||||
exp_token_idx = token_idxs[start_idx:end_idx]
|
||||
expert_tokens = x[exp_token_idx]
|
||||
expert_out = expert(expert_tokens)
|
||||
expert_out.mul_(flat_expert_weights[idxs[start_idx:end_idx]])
|
||||
|
||||
# for fp16 and other dtype
|
||||
expert_cache = expert_cache.to(expert_out.dtype)
|
||||
expert_cache.scatter_reduce_(0, exp_token_idx.view(-1, 1).repeat(1, x.shape[-1]), expert_out, reduce='sum')
|
||||
return expert_cache
|
||||
|
||||
|
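The routing above works in two steps: MoEGate picks the top-k experts per token, then MOEFeedForwardSwiGLU dispatches each token to its chosen experts and sums the weighted outputs (plus a shared expert). A minimal, hypothetical standalone sketch of the same idea, with plain nn.Linear standing in for the `operations` wrapper and all names and sizes purely illustrative:

# Illustrative top-k MoE routing sketch (not taken from the patch above)
import torch
import torch.nn as nn

class TinyMoE(nn.Module):
    def __init__(self, dim=64, hidden=128, n_experts=4, top_k=2):
        super().__init__()
        self.top_k = top_k
        self.gate = nn.Linear(dim, n_experts, bias=False)   # gating scores per expert
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(dim, hidden), nn.SiLU(), nn.Linear(hidden, dim))
            for _ in range(n_experts)
        )

    def forward(self, x):                                    # x: (batch, seq, dim)
        b, s, d = x.shape
        flat = x.reshape(-1, d)                              # route each token independently
        scores = self.gate(flat).softmax(dim=-1)
        topk_w, topk_idx = torch.topk(scores, k=self.top_k, dim=-1)
        out = torch.zeros_like(flat)
        for e, expert in enumerate(self.experts):
            for slot in range(self.top_k):
                hit = topk_idx[:, slot] == e                 # tokens whose slot routes to expert e
                if hit.any():
                    out[hit] += topk_w[hit, slot, None] * expert(flat[hit])
        return out.reshape(b, s, d)                          # a shared expert would be added here

print(TinyMoE()(torch.randn(2, 5, 64)).shape)                # torch.Size([2, 5, 64])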
||||
class TextProjection(nn.Module):
|
||||
def __init__(self, in_features, hidden_size, dtype=None, device=None, operations=None):
|
||||
super().__init__()
|
||||
self.linear = operations.Linear(in_features=in_features, out_features=hidden_size, bias=False, dtype=dtype, device=device)
|
||||
|
||||
def forward(self, caption):
|
||||
hidden_states = self.linear(caption)
|
||||
return hidden_states
|
||||
|
||||
|
||||
class BlockType:
|
||||
TransformerBlock = 1
|
||||
SingleTransformerBlock = 2
|
||||
|
||||
|
||||
class HiDreamImageSingleTransformerBlock(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
dim: int,
|
||||
num_attention_heads: int,
|
||||
attention_head_dim: int,
|
||||
num_routed_experts: int = 4,
|
||||
num_activated_experts: int = 2,
|
||||
dtype=None, device=None, operations=None
|
||||
):
|
||||
super().__init__()
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.adaLN_modulation = nn.Sequential(
|
||||
nn.SiLU(),
|
||||
operations.Linear(dim, 6 * dim, bias=True, dtype=dtype, device=device)
|
||||
)
|
||||
|
||||
# 1. Attention
|
||||
self.norm1_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
|
||||
self.attn1 = HiDreamAttention(
|
||||
query_dim=dim,
|
||||
heads=num_attention_heads,
|
||||
dim_head=attention_head_dim,
|
||||
processor = HiDreamAttnProcessor_flashattn(),
|
||||
single = True,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
|
||||
# 3. Feed-forward
|
||||
self.norm3_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
|
||||
if num_routed_experts > 0:
|
||||
self.ff_i = MOEFeedForwardSwiGLU(
|
||||
dim = dim,
|
||||
hidden_dim = 4 * dim,
|
||||
num_routed_experts = num_routed_experts,
|
||||
num_activated_experts = num_activated_experts,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
else:
|
||||
self.ff_i = FeedForwardSwiGLU(dim = dim, hidden_dim = 4 * dim, dtype=dtype, device=device, operations=operations)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
image_tokens: torch.FloatTensor,
|
||||
image_tokens_masks: Optional[torch.FloatTensor] = None,
|
||||
text_tokens: Optional[torch.FloatTensor] = None,
|
||||
adaln_input: Optional[torch.FloatTensor] = None,
|
||||
rope: torch.FloatTensor = None,
|
||||
|
||||
) -> torch.FloatTensor:
|
||||
wtype = image_tokens.dtype
|
||||
shift_msa_i, scale_msa_i, gate_msa_i, shift_mlp_i, scale_mlp_i, gate_mlp_i = \
|
||||
self.adaLN_modulation(adaln_input)[:,None].chunk(6, dim=-1)
|
||||
|
||||
# 1. MM-Attention
|
||||
norm_image_tokens = self.norm1_i(image_tokens).to(dtype=wtype)
|
||||
norm_image_tokens = norm_image_tokens * (1 + scale_msa_i) + shift_msa_i
|
||||
attn_output_i = self.attn1(
|
||||
norm_image_tokens,
|
||||
image_tokens_masks,
|
||||
rope = rope,
|
||||
)
|
||||
image_tokens = gate_msa_i * attn_output_i + image_tokens
|
||||
|
||||
# 2. Feed-forward
|
||||
norm_image_tokens = self.norm3_i(image_tokens).to(dtype=wtype)
|
||||
norm_image_tokens = norm_image_tokens * (1 + scale_mlp_i) + shift_mlp_i
|
||||
ff_output_i = gate_mlp_i * self.ff_i(norm_image_tokens.to(dtype=wtype))
|
||||
image_tokens = ff_output_i + image_tokens
|
||||
return image_tokens
|
||||
|
||||
|
||||
class HiDreamImageTransformerBlock(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
dim: int,
|
||||
num_attention_heads: int,
|
||||
attention_head_dim: int,
|
||||
num_routed_experts: int = 4,
|
||||
num_activated_experts: int = 2,
|
||||
dtype=None, device=None, operations=None
|
||||
):
|
||||
super().__init__()
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.adaLN_modulation = nn.Sequential(
|
||||
nn.SiLU(),
|
||||
operations.Linear(dim, 12 * dim, bias=True, dtype=dtype, device=device)
|
||||
)
|
||||
# nn.init.zeros_(self.adaLN_modulation[1].weight)
|
||||
# nn.init.zeros_(self.adaLN_modulation[1].bias)
|
||||
|
||||
# 1. Attention
|
||||
self.norm1_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
|
||||
self.norm1_t = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
|
||||
self.attn1 = HiDreamAttention(
|
||||
query_dim=dim,
|
||||
heads=num_attention_heads,
|
||||
dim_head=attention_head_dim,
|
||||
processor = HiDreamAttnProcessor_flashattn(),
|
||||
single = False,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
|
||||
# 3. Feed-forward
|
||||
self.norm3_i = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False, dtype=dtype, device=device)
|
||||
if num_routed_experts > 0:
|
||||
self.ff_i = MOEFeedForwardSwiGLU(
|
||||
dim = dim,
|
||||
hidden_dim = 4 * dim,
|
||||
num_routed_experts = num_routed_experts,
|
||||
num_activated_experts = num_activated_experts,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
else:
|
||||
self.ff_i = FeedForwardSwiGLU(dim = dim, hidden_dim = 4 * dim, dtype=dtype, device=device, operations=operations)
|
||||
self.norm3_t = operations.LayerNorm(dim, eps = 1e-06, elementwise_affine = False)
|
||||
self.ff_t = FeedForwardSwiGLU(dim = dim, hidden_dim = 4 * dim, dtype=dtype, device=device, operations=operations)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
image_tokens: torch.FloatTensor,
|
||||
image_tokens_masks: Optional[torch.FloatTensor] = None,
|
||||
text_tokens: Optional[torch.FloatTensor] = None,
|
||||
adaln_input: Optional[torch.FloatTensor] = None,
|
||||
rope: torch.FloatTensor = None,
|
||||
) -> torch.FloatTensor:
|
||||
wtype = image_tokens.dtype
|
||||
shift_msa_i, scale_msa_i, gate_msa_i, shift_mlp_i, scale_mlp_i, gate_mlp_i, \
|
||||
shift_msa_t, scale_msa_t, gate_msa_t, shift_mlp_t, scale_mlp_t, gate_mlp_t = \
|
||||
self.adaLN_modulation(adaln_input)[:,None].chunk(12, dim=-1)
|
||||
|
||||
# 1. MM-Attention
|
||||
norm_image_tokens = self.norm1_i(image_tokens).to(dtype=wtype)
|
||||
norm_image_tokens = norm_image_tokens * (1 + scale_msa_i) + shift_msa_i
|
||||
norm_text_tokens = self.norm1_t(text_tokens).to(dtype=wtype)
|
||||
norm_text_tokens = norm_text_tokens * (1 + scale_msa_t) + shift_msa_t
|
||||
|
||||
attn_output_i, attn_output_t = self.attn1(
|
||||
norm_image_tokens,
|
||||
image_tokens_masks,
|
||||
norm_text_tokens,
|
||||
rope = rope,
|
||||
)
|
||||
|
||||
image_tokens = gate_msa_i * attn_output_i + image_tokens
|
||||
text_tokens = gate_msa_t * attn_output_t + text_tokens
|
||||
|
||||
# 2. Feed-forward
|
||||
norm_image_tokens = self.norm3_i(image_tokens).to(dtype=wtype)
|
||||
norm_image_tokens = norm_image_tokens * (1 + scale_mlp_i) + shift_mlp_i
|
||||
norm_text_tokens = self.norm3_t(text_tokens).to(dtype=wtype)
|
||||
norm_text_tokens = norm_text_tokens * (1 + scale_mlp_t) + shift_mlp_t
|
||||
|
||||
ff_output_i = gate_mlp_i * self.ff_i(norm_image_tokens)
|
||||
ff_output_t = gate_mlp_t * self.ff_t(norm_text_tokens)
|
||||
image_tokens = ff_output_i + image_tokens
|
||||
text_tokens = ff_output_t + text_tokens
|
||||
return image_tokens, text_tokens
|
||||
|
||||
|
||||
class HiDreamImageBlock(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
dim: int,
|
||||
num_attention_heads: int,
|
||||
attention_head_dim: int,
|
||||
num_routed_experts: int = 4,
|
||||
num_activated_experts: int = 2,
|
||||
block_type: BlockType = BlockType.TransformerBlock,
|
||||
dtype=None, device=None, operations=None
|
||||
):
|
||||
super().__init__()
|
||||
block_classes = {
|
||||
BlockType.TransformerBlock: HiDreamImageTransformerBlock,
|
||||
BlockType.SingleTransformerBlock: HiDreamImageSingleTransformerBlock,
|
||||
}
|
||||
self.block = block_classes[block_type](
|
||||
dim,
|
||||
num_attention_heads,
|
||||
attention_head_dim,
|
||||
num_routed_experts,
|
||||
num_activated_experts,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
image_tokens: torch.FloatTensor,
|
||||
image_tokens_masks: Optional[torch.FloatTensor] = None,
|
||||
text_tokens: Optional[torch.FloatTensor] = None,
|
||||
adaln_input: torch.FloatTensor = None,
|
||||
rope: torch.FloatTensor = None,
|
||||
) -> torch.FloatTensor:
|
||||
return self.block(
|
||||
image_tokens,
|
||||
image_tokens_masks,
|
||||
text_tokens,
|
||||
adaln_input,
|
||||
rope,
|
||||
)
|
||||
|
||||
|
||||
class HiDreamImageTransformer2DModel(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
patch_size: Optional[int] = None,
|
||||
in_channels: int = 64,
|
||||
out_channels: Optional[int] = None,
|
||||
num_layers: int = 16,
|
||||
num_single_layers: int = 32,
|
||||
attention_head_dim: int = 128,
|
||||
num_attention_heads: int = 20,
|
||||
caption_channels: List[int] = None,
|
||||
text_emb_dim: int = 2048,
|
||||
num_routed_experts: int = 4,
|
||||
num_activated_experts: int = 2,
|
||||
axes_dims_rope: Tuple[int, int] = (32, 32),
|
||||
max_resolution: Tuple[int, int] = (128, 128),
|
||||
llama_layers: List[int] = None,
|
||||
image_model=None,
|
||||
dtype=None, device=None, operations=None
|
||||
):
|
||||
self.patch_size = patch_size
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.attention_head_dim = attention_head_dim
|
||||
self.num_layers = num_layers
|
||||
self.num_single_layers = num_single_layers
|
||||
|
||||
self.gradient_checkpointing = False
|
||||
|
||||
super().__init__()
|
||||
self.dtype = dtype
|
||||
self.out_channels = out_channels or in_channels
|
||||
self.inner_dim = self.num_attention_heads * self.attention_head_dim
|
||||
self.llama_layers = llama_layers
|
||||
|
||||
self.t_embedder = TimestepEmbed(self.inner_dim, dtype=dtype, device=device, operations=operations)
|
||||
self.p_embedder = PooledEmbed(text_emb_dim, self.inner_dim, dtype=dtype, device=device, operations=operations)
|
||||
self.x_embedder = PatchEmbed(
|
||||
patch_size = patch_size,
|
||||
in_channels = in_channels,
|
||||
out_channels = self.inner_dim,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
self.pe_embedder = EmbedND(theta=10000, axes_dim=axes_dims_rope)
|
||||
|
||||
self.double_stream_blocks = nn.ModuleList(
|
||||
[
|
||||
HiDreamImageBlock(
|
||||
dim = self.inner_dim,
|
||||
num_attention_heads = self.num_attention_heads,
|
||||
attention_head_dim = self.attention_head_dim,
|
||||
num_routed_experts = num_routed_experts,
|
||||
num_activated_experts = num_activated_experts,
|
||||
block_type = BlockType.TransformerBlock,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
for i in range(self.num_layers)
|
||||
]
|
||||
)
|
||||
|
||||
self.single_stream_blocks = nn.ModuleList(
|
||||
[
|
||||
HiDreamImageBlock(
|
||||
dim = self.inner_dim,
|
||||
num_attention_heads = self.num_attention_heads,
|
||||
attention_head_dim = self.attention_head_dim,
|
||||
num_routed_experts = num_routed_experts,
|
||||
num_activated_experts = num_activated_experts,
|
||||
block_type = BlockType.SingleTransformerBlock,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
for i in range(self.num_single_layers)
|
||||
]
|
||||
)
|
||||
|
||||
self.final_layer = LastLayer(self.inner_dim, patch_size, self.out_channels, dtype=dtype, device=device, operations=operations)
|
||||
|
||||
caption_channels = [caption_channels[1], ] * (num_layers + num_single_layers) + [caption_channels[0], ]
|
||||
caption_projection = []
|
||||
for caption_channel in caption_channels:
|
||||
caption_projection.append(TextProjection(in_features=caption_channel, hidden_size=self.inner_dim, dtype=dtype, device=device, operations=operations))
|
||||
self.caption_projection = nn.ModuleList(caption_projection)
|
||||
self.max_seq = max_resolution[0] * max_resolution[1] // (patch_size * patch_size)
|
||||
|
||||
def expand_timesteps(self, timesteps, batch_size, device):
|
||||
if not torch.is_tensor(timesteps):
|
||||
is_mps = device.type == "mps"
|
||||
if isinstance(timesteps, float):
|
||||
dtype = torch.float32 if is_mps else torch.float64
|
||||
else:
|
||||
dtype = torch.int32 if is_mps else torch.int64
|
||||
timesteps = torch.tensor([timesteps], dtype=dtype, device=device)
|
||||
elif len(timesteps.shape) == 0:
|
||||
timesteps = timesteps[None].to(device)
|
||||
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
|
||||
timesteps = timesteps.expand(batch_size)
|
||||
return timesteps
|
||||
|
||||
def unpatchify(self, x: torch.Tensor, img_sizes: List[Tuple[int, int]]) -> List[torch.Tensor]:
|
||||
x_arr = []
|
||||
for i, img_size in enumerate(img_sizes):
|
||||
pH, pW = img_size
|
||||
x_arr.append(
|
||||
einops.rearrange(x[i, :pH*pW].reshape(1, pH, pW, -1), 'B H W (p1 p2 C) -> B C (H p1) (W p2)',
|
||||
p1=self.patch_size, p2=self.patch_size)
|
||||
)
|
||||
x = torch.cat(x_arr, dim=0)
|
||||
return x
|
||||
|
||||
def patchify(self, x, max_seq, img_sizes=None):
|
||||
pz2 = self.patch_size * self.patch_size
|
||||
if isinstance(x, torch.Tensor):
|
||||
B = x.shape[0]
|
||||
device = x.device
|
||||
dtype = x.dtype
|
||||
else:
|
||||
B = len(x)
|
||||
device = x[0].device
|
||||
dtype = x[0].dtype
|
||||
x_masks = torch.zeros((B, max_seq), dtype=dtype, device=device)
|
||||
|
||||
if img_sizes is not None:
|
||||
for i, img_size in enumerate(img_sizes):
|
||||
x_masks[i, 0:img_size[0] * img_size[1]] = 1
|
||||
x = einops.rearrange(x, 'B C S p -> B S (p C)', p=pz2)
|
||||
elif isinstance(x, torch.Tensor):
|
||||
pH, pW = x.shape[-2] // self.patch_size, x.shape[-1] // self.patch_size
|
||||
x = einops.rearrange(x, 'B C (H p1) (W p2) -> B (H W) (p1 p2 C)', p1=self.patch_size, p2=self.patch_size)
|
||||
img_sizes = [[pH, pW]] * B
|
||||
x_masks = None
|
||||
else:
|
||||
raise NotImplementedError
|
||||
return x, x_masks, img_sizes
|
||||
|
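The tensor branch of patchify and the unpatchify method above are inverse einops rearranges. A small hedged round-trip check, standalone and with an assumed patch size of 2:

# Round-trip sketch of the patchify/unpatchify rearranges (illustrative shapes only)
import torch
import einops

patch = 2
img = torch.randn(1, 16, 8, 8)   # B C H W, spatial dims divisible by patch
tokens = einops.rearrange(img, 'B C (H p1) (W p2) -> B (H W) (p1 p2 C)', p1=patch, p2=patch)
# 4x4 patch grid -> put each token row back at its (H, W) position
restored = einops.rearrange(tokens.reshape(1, 4, 4, -1), 'B H W (p1 p2 C) -> B C (H p1) (W p2)', p1=patch, p2=patch)
print(torch.equal(img, restored))  # True: the two rearranges are exact inverses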
||||
def forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
t: torch.Tensor,
|
||||
y: Optional[torch.Tensor] = None,
|
||||
context: Optional[torch.Tensor] = None,
|
||||
encoder_hidden_states_llama3=None,
|
||||
control = None,
|
||||
transformer_options = {},
|
||||
) -> torch.Tensor:
|
||||
bs, c, h, w = x.shape
|
||||
hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))
|
||||
timesteps = t
|
||||
pooled_embeds = y
|
||||
T5_encoder_hidden_states = context
|
||||
|
||||
img_sizes = None
|
||||
|
||||
# spatial forward
|
||||
batch_size = hidden_states.shape[0]
|
||||
hidden_states_type = hidden_states.dtype
|
||||
|
||||
# 0. time
|
||||
timesteps = self.expand_timesteps(timesteps, batch_size, hidden_states.device)
|
||||
timesteps = self.t_embedder(timesteps, hidden_states_type)
|
||||
p_embedder = self.p_embedder(pooled_embeds)
|
||||
adaln_input = timesteps + p_embedder
|
||||
|
||||
hidden_states, image_tokens_masks, img_sizes = self.patchify(hidden_states, self.max_seq, img_sizes)
|
||||
if image_tokens_masks is None:
|
||||
pH, pW = img_sizes[0]
|
||||
img_ids = torch.zeros(pH, pW, 3, device=hidden_states.device)
|
||||
img_ids[..., 1] = img_ids[..., 1] + torch.arange(pH, device=hidden_states.device)[:, None]
|
||||
img_ids[..., 2] = img_ids[..., 2] + torch.arange(pW, device=hidden_states.device)[None, :]
|
||||
img_ids = repeat(img_ids, "h w c -> b (h w) c", b=batch_size)
|
||||
hidden_states = self.x_embedder(hidden_states)
|
||||
|
||||
# T5_encoder_hidden_states = encoder_hidden_states[0]
|
||||
encoder_hidden_states = encoder_hidden_states_llama3.movedim(1, 0)
|
||||
encoder_hidden_states = [encoder_hidden_states[k] for k in self.llama_layers]
|
||||
|
||||
if self.caption_projection is not None:
|
||||
new_encoder_hidden_states = []
|
||||
for i, enc_hidden_state in enumerate(encoder_hidden_states):
|
||||
enc_hidden_state = self.caption_projection[i](enc_hidden_state)
|
||||
enc_hidden_state = enc_hidden_state.view(batch_size, -1, hidden_states.shape[-1])
|
||||
new_encoder_hidden_states.append(enc_hidden_state)
|
||||
encoder_hidden_states = new_encoder_hidden_states
|
||||
T5_encoder_hidden_states = self.caption_projection[-1](T5_encoder_hidden_states)
|
||||
T5_encoder_hidden_states = T5_encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
|
||||
encoder_hidden_states.append(T5_encoder_hidden_states)
|
||||
|
||||
txt_ids = torch.zeros(
|
||||
batch_size,
|
||||
encoder_hidden_states[-1].shape[1] + encoder_hidden_states[-2].shape[1] + encoder_hidden_states[0].shape[1],
|
||||
3,
|
||||
device=img_ids.device, dtype=img_ids.dtype
|
||||
)
|
||||
ids = torch.cat((img_ids, txt_ids), dim=1)
|
||||
rope = self.pe_embedder(ids)
|
||||
|
||||
# 2. Blocks
|
||||
block_id = 0
|
||||
initial_encoder_hidden_states = torch.cat([encoder_hidden_states[-1], encoder_hidden_states[-2]], dim=1)
|
||||
initial_encoder_hidden_states_seq_len = initial_encoder_hidden_states.shape[1]
|
||||
for bid, block in enumerate(self.double_stream_blocks):
|
||||
cur_llama31_encoder_hidden_states = encoder_hidden_states[block_id]
|
||||
cur_encoder_hidden_states = torch.cat([initial_encoder_hidden_states, cur_llama31_encoder_hidden_states], dim=1)
|
||||
hidden_states, initial_encoder_hidden_states = block(
|
||||
image_tokens = hidden_states,
|
||||
image_tokens_masks = image_tokens_masks,
|
||||
text_tokens = cur_encoder_hidden_states,
|
||||
adaln_input = adaln_input,
|
||||
rope = rope,
|
||||
)
|
||||
initial_encoder_hidden_states = initial_encoder_hidden_states[:, :initial_encoder_hidden_states_seq_len]
|
||||
block_id += 1
|
||||
|
||||
image_tokens_seq_len = hidden_states.shape[1]
|
||||
hidden_states = torch.cat([hidden_states, initial_encoder_hidden_states], dim=1)
|
||||
hidden_states_seq_len = hidden_states.shape[1]
|
||||
if image_tokens_masks is not None:
|
||||
encoder_attention_mask_ones = torch.ones(
|
||||
(batch_size, initial_encoder_hidden_states.shape[1] + cur_llama31_encoder_hidden_states.shape[1]),
|
||||
device=image_tokens_masks.device, dtype=image_tokens_masks.dtype
|
||||
)
|
||||
image_tokens_masks = torch.cat([image_tokens_masks, encoder_attention_mask_ones], dim=1)
|
||||
|
||||
for bid, block in enumerate(self.single_stream_blocks):
|
||||
cur_llama31_encoder_hidden_states = encoder_hidden_states[block_id]
|
||||
hidden_states = torch.cat([hidden_states, cur_llama31_encoder_hidden_states], dim=1)
|
||||
hidden_states = block(
|
||||
image_tokens=hidden_states,
|
||||
image_tokens_masks=image_tokens_masks,
|
||||
text_tokens=None,
|
||||
adaln_input=adaln_input,
|
||||
rope=rope,
|
||||
)
|
||||
hidden_states = hidden_states[:, :hidden_states_seq_len]
|
||||
block_id += 1
|
||||
|
||||
hidden_states = hidden_states[:, :image_tokens_seq_len, ...]
|
||||
output = self.final_layer(hidden_states, adaln_input)
|
||||
output = self.unpatchify(output, img_sizes)
|
||||
return -output[:, :, :h, :w]
|
||||
@ -876,6 +876,7 @@ class SpatialTransformer(nn.Module):
|
||||
if not isinstance(context, list):
|
||||
context = [context] * len(self.transformer_blocks)
|
||||
b, c, h, w = x.shape
|
||||
transformer_options["activations_shape"] = list(x.shape)
|
||||
x_in = x
|
||||
x = self.norm(x)
|
||||
if not self.use_linear:
|
||||
@ -991,6 +992,7 @@ class SpatialVideoTransformer(SpatialTransformer):
|
||||
transformer_options={}
|
||||
) -> torch.Tensor:
|
||||
_, _, h, w = x.shape
|
||||
transformer_options["activations_shape"] = list(x.shape)
|
||||
x_in = x
|
||||
spatial_context = None
|
||||
if exists(context):
|
||||
|
||||
@ -83,7 +83,7 @@ class WanSelfAttention(nn.Module):
|
||||
|
||||
class WanT2VCrossAttention(WanSelfAttention):
|
||||
|
||||
def forward(self, x, context):
|
||||
def forward(self, x, context, **kwargs):
|
||||
r"""
|
||||
Args:
|
||||
x(Tensor): Shape [B, L1, C]
|
||||
@ -116,14 +116,14 @@ class WanI2VCrossAttention(WanSelfAttention):
|
||||
# self.alpha = nn.Parameter(torch.zeros((1, )))
|
||||
self.norm_k_img = RMSNorm(dim, eps=eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) if qk_norm else nn.Identity()
|
||||
|
||||
def forward(self, x, context):
|
||||
def forward(self, x, context, context_img_len):
|
||||
r"""
|
||||
Args:
|
||||
x(Tensor): Shape [B, L1, C]
|
||||
context(Tensor): Shape [B, L2, C]
|
||||
"""
|
||||
context_img = context[:, :257]
|
||||
context = context[:, 257:]
|
||||
context_img = context[:, :context_img_len]
|
||||
context = context[:, context_img_len:]
|
||||
|
||||
# compute query, key, value
|
||||
q = self.norm_q(self.q(x))
|
||||
@ -193,6 +193,7 @@ class WanAttentionBlock(nn.Module):
|
||||
e,
|
||||
freqs,
|
||||
context,
|
||||
context_img_len=257,
|
||||
):
|
||||
r"""
|
||||
Args:
|
||||
@ -213,12 +214,40 @@ class WanAttentionBlock(nn.Module):
|
||||
x = x + y * e[2]
|
||||
|
||||
# cross-attention & ffn
|
||||
x = x + self.cross_attn(self.norm3(x), context)
|
||||
x = x + self.cross_attn(self.norm3(x), context, context_img_len=context_img_len)
|
||||
y = self.ffn(self.norm2(x) * (1 + e[4]) + e[3])
|
||||
x = x + y * e[5]
|
||||
return x
|
||||
|
||||
|
||||
class VaceWanAttentionBlock(WanAttentionBlock):
|
||||
def __init__(
|
||||
self,
|
||||
cross_attn_type,
|
||||
dim,
|
||||
ffn_dim,
|
||||
num_heads,
|
||||
window_size=(-1, -1),
|
||||
qk_norm=True,
|
||||
cross_attn_norm=False,
|
||||
eps=1e-6,
|
||||
block_id=0,
|
||||
operation_settings={}
|
||||
):
|
||||
super().__init__(cross_attn_type, dim, ffn_dim, num_heads, window_size, qk_norm, cross_attn_norm, eps, operation_settings=operation_settings)
|
||||
self.block_id = block_id
|
||||
if block_id == 0:
|
||||
self.before_proj = operation_settings.get("operations").Linear(self.dim, self.dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
self.after_proj = operation_settings.get("operations").Linear(self.dim, self.dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype"))
|
||||
|
||||
def forward(self, c, x, **kwargs):
|
||||
if self.block_id == 0:
|
||||
c = self.before_proj(c) + x
|
||||
c = super().forward(c, **kwargs)
|
||||
c_skip = self.after_proj(c)
|
||||
return c_skip, c
|
||||
|
||||
|
||||
class Head(nn.Module):
|
||||
|
||||
def __init__(self, dim, out_dim, patch_size, eps=1e-6, operation_settings={}):
|
||||
@ -250,7 +279,7 @@ class Head(nn.Module):
|
||||
|
||||
class MLPProj(torch.nn.Module):
|
||||
|
||||
def __init__(self, in_dim, out_dim, operation_settings={}):
|
||||
def __init__(self, in_dim, out_dim, flf_pos_embed_token_number=None, operation_settings={}):
|
||||
super().__init__()
|
||||
|
||||
self.proj = torch.nn.Sequential(
|
||||
@ -258,7 +287,15 @@ class MLPProj(torch.nn.Module):
|
||||
torch.nn.GELU(), operation_settings.get("operations").Linear(in_dim, out_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")),
|
||||
operation_settings.get("operations").LayerNorm(out_dim, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")))
|
||||
|
||||
if flf_pos_embed_token_number is not None:
|
||||
self.emb_pos = nn.Parameter(torch.empty((1, flf_pos_embed_token_number, in_dim), device=operation_settings.get("device"), dtype=operation_settings.get("dtype")))
|
||||
else:
|
||||
self.emb_pos = None
|
||||
|
||||
def forward(self, image_embeds):
|
||||
if self.emb_pos is not None:
|
||||
image_embeds = image_embeds[:, :self.emb_pos.shape[1]] + comfy.model_management.cast_to(self.emb_pos[:, :image_embeds.shape[1]], dtype=image_embeds.dtype, device=image_embeds.device)
|
||||
|
||||
clip_extra_context_tokens = self.proj(image_embeds)
|
||||
return clip_extra_context_tokens
|
||||
|
||||
@ -284,6 +321,7 @@ class WanModel(torch.nn.Module):
|
||||
qk_norm=True,
|
||||
cross_attn_norm=True,
|
||||
eps=1e-6,
|
||||
flf_pos_embed_token_number=None,
|
||||
image_model=None,
|
||||
device=None,
|
||||
dtype=None,
|
||||
@ -373,7 +411,7 @@ class WanModel(torch.nn.Module):
|
||||
self.rope_embedder = EmbedND(dim=d, theta=10000.0, axes_dim=[d - 4 * (d // 6), 2 * (d // 6), 2 * (d // 6)])
|
||||
|
||||
if model_type == 'i2v':
|
||||
self.img_emb = MLPProj(1280, dim, operation_settings=operation_settings)
|
||||
self.img_emb = MLPProj(1280, dim, flf_pos_embed_token_number=flf_pos_embed_token_number, operation_settings=operation_settings)
|
||||
else:
|
||||
self.img_emb = None
|
||||
|
||||
@ -385,6 +423,7 @@ class WanModel(torch.nn.Module):
|
||||
clip_fea=None,
|
||||
freqs=None,
|
||||
transformer_options={},
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Forward pass through the diffusion model
|
||||
@ -420,9 +459,12 @@ class WanModel(torch.nn.Module):
|
||||
# context
|
||||
context = self.text_embedding(context)
|
||||
|
||||
if clip_fea is not None and self.img_emb is not None:
|
||||
context_clip = self.img_emb(clip_fea) # bs x 257 x dim
|
||||
context = torch.concat([context_clip, context], dim=1)
|
||||
context_img_len = None
|
||||
if clip_fea is not None:
|
||||
if self.img_emb is not None:
|
||||
context_clip = self.img_emb(clip_fea) # bs x 257 x dim
|
||||
context = torch.concat([context_clip, context], dim=1)
|
||||
context_img_len = clip_fea.shape[-2]
|
||||
|
||||
patches_replace = transformer_options.get("patches_replace", {})
|
||||
blocks_replace = patches_replace.get("dit", {})
|
||||
@ -430,12 +472,12 @@ class WanModel(torch.nn.Module):
|
||||
if ("double_block", i) in blocks_replace:
|
||||
def block_wrap(args):
|
||||
out = {}
|
||||
out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"])
|
||||
out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len)
|
||||
return out
|
||||
out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs}, {"original_block": block_wrap})
|
||||
x = out["img"]
|
||||
else:
|
||||
x = block(x, e=e0, freqs=freqs, context=context)
|
||||
x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len)
|
||||
|
||||
# head
|
||||
x = self.head(x, e)
|
||||
@ -444,7 +486,7 @@ class WanModel(torch.nn.Module):
|
||||
x = self.unpatchify(x, grid_sizes)
|
||||
return x
|
||||
|
||||
def forward(self, x, timestep, context, clip_fea=None, transformer_options={},**kwargs):
|
||||
def forward(self, x, timestep, context, clip_fea=None, transformer_options={}, **kwargs):
|
||||
bs, c, t, h, w = x.shape
|
||||
x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size)
|
||||
patch_size = self.patch_size
|
||||
@ -458,7 +500,7 @@ class WanModel(torch.nn.Module):
|
||||
img_ids = repeat(img_ids, "t h w c -> b (t h w) c", b=bs)
|
||||
|
||||
freqs = self.rope_embedder(img_ids).movedim(1, 2)
|
||||
return self.forward_orig(x, timestep, context, clip_fea=clip_fea, freqs=freqs, transformer_options=transformer_options)[:, :, :t, :h, :w]
|
||||
return self.forward_orig(x, timestep, context, clip_fea=clip_fea, freqs=freqs, transformer_options=transformer_options, **kwargs)[:, :, :t, :h, :w]
|
||||
|
||||
def unpatchify(self, x, grid_sizes):
|
||||
r"""
|
||||
@ -483,3 +525,114 @@ class WanModel(torch.nn.Module):
|
||||
u = torch.einsum('bfhwpqrc->bcfphqwr', u)
|
||||
u = u.reshape(b, c, *[i * j for i, j in zip(grid_sizes, self.patch_size)])
|
||||
return u
|
||||
|
||||
|
||||
class VaceWanModel(WanModel):
|
||||
r"""
|
||||
Wan diffusion backbone supporting both text-to-video and image-to-video.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
model_type='vace',
|
||||
patch_size=(1, 2, 2),
|
||||
text_len=512,
|
||||
in_dim=16,
|
||||
dim=2048,
|
||||
ffn_dim=8192,
|
||||
freq_dim=256,
|
||||
text_dim=4096,
|
||||
out_dim=16,
|
||||
num_heads=16,
|
||||
num_layers=32,
|
||||
window_size=(-1, -1),
|
||||
qk_norm=True,
|
||||
cross_attn_norm=True,
|
||||
eps=1e-6,
|
||||
flf_pos_embed_token_number=None,
|
||||
image_model=None,
|
||||
vace_layers=None,
|
||||
vace_in_dim=None,
|
||||
device=None,
|
||||
dtype=None,
|
||||
operations=None,
|
||||
):
|
||||
|
||||
super().__init__(model_type='t2v', patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim, num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, flf_pos_embed_token_number=flf_pos_embed_token_number, image_model=image_model, device=device, dtype=dtype, operations=operations)
|
||||
operation_settings = {"operations": operations, "device": device, "dtype": dtype}
|
||||
|
||||
# Vace
|
||||
if vace_layers is not None:
|
||||
self.vace_layers = vace_layers
|
||||
self.vace_in_dim = vace_in_dim
|
||||
# vace blocks
|
||||
self.vace_blocks = nn.ModuleList([
|
||||
VaceWanAttentionBlock('t2v_cross_attn', self.dim, self.ffn_dim, self.num_heads, self.window_size, self.qk_norm, self.cross_attn_norm, self.eps, block_id=i, operation_settings=operation_settings)
|
||||
for i in range(self.vace_layers)
|
||||
])
|
||||
|
||||
self.vace_layers_mapping = {i: n for n, i in enumerate(range(0, self.num_layers, self.num_layers // self.vace_layers))}
|
||||
# vace patch embeddings
|
||||
self.vace_patch_embedding = operations.Conv3d(
|
||||
self.vace_in_dim, self.dim, kernel_size=self.patch_size, stride=self.patch_size, device=device, dtype=torch.float32
|
||||
)
|
||||
|
||||
def forward_orig(
|
||||
self,
|
||||
x,
|
||||
t,
|
||||
context,
|
||||
vace_context,
|
||||
clip_fea=None,
|
||||
freqs=None,
|
||||
transformer_options={},
|
||||
**kwargs,
|
||||
):
|
||||
# embeddings
|
||||
x = self.patch_embedding(x.float()).to(x.dtype)
|
||||
grid_sizes = x.shape[2:]
|
||||
x = x.flatten(2).transpose(1, 2)
|
||||
|
||||
# time embeddings
|
||||
e = self.time_embedding(
|
||||
sinusoidal_embedding_1d(self.freq_dim, t).to(dtype=x[0].dtype))
|
||||
e0 = self.time_projection(e).unflatten(1, (6, self.dim))
|
||||
|
||||
# context
|
||||
context = self.text_embedding(context)
|
||||
|
||||
context_img_len = None
|
||||
if clip_fea is not None:
|
||||
if self.img_emb is not None:
|
||||
context_clip = self.img_emb(clip_fea) # bs x 257 x dim
|
||||
context = torch.concat([context_clip, context], dim=1)
|
||||
context_img_len = clip_fea.shape[-2]
|
||||
|
||||
c = self.vace_patch_embedding(vace_context.float()).to(vace_context.dtype)
|
||||
c = c.flatten(2).transpose(1, 2)
|
||||
|
||||
# arguments
|
||||
x_orig = x
|
||||
|
||||
patches_replace = transformer_options.get("patches_replace", {})
|
||||
blocks_replace = patches_replace.get("dit", {})
|
||||
for i, block in enumerate(self.blocks):
|
||||
if ("double_block", i) in blocks_replace:
|
||||
def block_wrap(args):
|
||||
out = {}
|
||||
out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len)
|
||||
return out
|
||||
out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs}, {"original_block": block_wrap})
|
||||
x = out["img"]
|
||||
else:
|
||||
x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len)
|
||||
|
||||
ii = self.vace_layers_mapping.get(i, None)
|
||||
if ii is not None:
|
||||
c_skip, c = self.vace_blocks[ii](c, x=x_orig, e=e0, freqs=freqs, context=context, context_img_len=context_img_len)
|
||||
x += c_skip
|
||||
# head
|
||||
x = self.head(x, e)
|
||||
|
||||
# unpatchify
|
||||
x = self.unpatchify(x, grid_sizes)
|
||||
return x
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
import torch
|
||||
import comfy.utils
|
||||
|
||||
|
||||
def convert_lora_bfl_control(sd): #BFL loras for Flux
|
||||
@ -11,7 +12,13 @@ def convert_lora_bfl_control(sd): #BFL loras for Flux
|
||||
return sd_out
|
||||
|
||||
|
||||
def convert_lora_wan_fun(sd): #Wan Fun loras
|
||||
return comfy.utils.state_dict_prefix_replace(sd, {"lora_unet__": "lora_unet_"})
|
||||
|
||||
|
||||
def convert_lora(sd):
|
||||
if "img_in.lora_A.weight" in sd and "single_blocks.0.norm.key_norm.scale" in sd:
|
||||
return convert_lora_bfl_control(sd)
|
||||
if "lora_unet__blocks_0_cross_attn_k.lora_down.weight" in sd:
|
||||
return convert_lora_wan_fun(sd)
|
||||
return sd
|
||||
|
||||
@ -36,17 +36,18 @@ from .ldm.cascade.stage_c import StageC
|
||||
from .ldm.cosmos.model import GeneralDIT
|
||||
from .ldm.flux import model as flux_model
|
||||
from .ldm.genmo.joint_model.asymm_models_joint import AsymmDiTJoint
|
||||
from .ldm.hidream.model import HiDreamImageTransformer2DModel
|
||||
from .ldm.hunyuan3d.model import Hunyuan3Dv2 as Hunyuan3Dv2Model
|
||||
from .ldm.hunyuan_video.model import HunyuanVideo as HunyuanVideoModel
|
||||
from .ldm.hydit.models import HunYuanDiT
|
||||
from .ldm.lightricks.model import LTXVModel
|
||||
from .ldm.lumina.model import NextDiT
|
||||
from .ldm.hunyuan3d.model import Hunyuan3Dv2 as Hunyuan3Dv2Model
|
||||
from .ldm.modules.diffusionmodules.mmdit import OpenAISignatureMMDITWrapper
|
||||
from .ldm.modules.diffusionmodules.openaimodel import UNetModel, Timestep
|
||||
from .ldm.modules.diffusionmodules.upscaling import ImageConcatWithNoiseAugmentation
|
||||
from .ldm.modules.encoders.noise_aug_modules import CLIPEmbeddingNoiseAugmentation
|
||||
from .ldm.pixart.pixartms import PixArtMS
|
||||
from .ldm.wan.model import WanModel
|
||||
from .ldm.wan.model import WanModel, VaceWanModel
|
||||
from .model_management_types import ModelManageable
|
||||
from .ops import Operations
|
||||
from .patcher_extension import WrapperExecutor, WrappersMP, get_all_wrappers
|
||||
@ -635,6 +636,7 @@ class SDXL_instructpix2pix(IP2P, SDXL):
|
||||
else:
|
||||
self.process_ip2p_image_in = lambda image: image # diffusers ip2p
|
||||
|
||||
|
||||
class Lotus(BaseModel):
|
||||
def extra_conds(self, **kwargs):
|
||||
out = {}
|
||||
@ -649,6 +651,7 @@ class Lotus(BaseModel):
|
||||
def __init__(self, model_config, model_type=ModelType.IMG_TO_IMG, device=None):
|
||||
super().__init__(model_config, model_type, device=device)
|
||||
|
||||
|
||||
class StableCascade_C(BaseModel):
|
||||
def __init__(self, model_config, model_type=ModelType.STABLE_CASCADE, device=None):
|
||||
super().__init__(model_config, model_type, device=device, unet_model=StageC)
|
||||
@ -960,6 +963,7 @@ class HunyuanVideo(BaseModel):
|
||||
def scale_latent_inpaint(self, latent_image, **kwargs):
|
||||
return latent_image
|
||||
|
||||
|
||||
class HunyuanVideoI2V(HunyuanVideo):
|
||||
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
|
||||
super().__init__(model_config, model_type, device=device)
|
||||
@ -968,6 +972,7 @@ class HunyuanVideoI2V(HunyuanVideo):
|
||||
def scale_latent_inpaint(self, latent_image, **kwargs):
|
||||
return super().scale_latent_inpaint(latent_image=latent_image, **kwargs)
|
||||
|
||||
|
||||
class HunyuanVideoSkyreelsI2V(HunyuanVideo):
|
||||
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
|
||||
super().__init__(model_config, model_type, device=device)
|
||||
@ -976,6 +981,7 @@ class HunyuanVideoSkyreelsI2V(HunyuanVideo):
|
||||
def scale_latent_inpaint(self, latent_image, **kwargs):
|
||||
return super().scale_latent_inpaint(latent_image=latent_image, **kwargs)
|
||||
|
||||
|
||||
class CosmosVideo(BaseModel):
|
||||
def __init__(self, model_config, model_type=ModelType.EDM, image_to_video=False, device=None):
|
||||
super().__init__(model_config, model_type, device=device, unet_model=GeneralDIT)
|
||||
@ -1036,7 +1042,6 @@ class WAN21(BaseModel):
|
||||
device = kwargs["device"]
|
||||
|
||||
if image is None:
|
||||
image = torch.zeros_like(noise)
|
||||
shape_image = list(noise.shape)
|
||||
shape_image[1] = extra_channels
|
||||
image = torch.zeros(shape_image, dtype=noise.dtype, layout=noise.layout, device=noise.device)
|
||||
@ -1079,6 +1084,34 @@ class WAN21(BaseModel):
|
||||
out['clip_fea'] = conds.CONDRegular(clip_vision_output.penultimate_hidden_states)
|
||||
return out
|
||||
|
||||
|
||||
class WAN21_Vace(WAN21):
|
||||
def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None):
|
||||
super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=VaceWanModel)
|
||||
self.image_to_video = image_to_video
|
||||
|
||||
def extra_conds(self, **kwargs):
|
||||
out = super().extra_conds(**kwargs)
|
||||
noise = kwargs.get("noise", None)
|
||||
noise_shape = list(noise.shape)
|
||||
vace_frames = kwargs.get("vace_frames", None)
|
||||
if vace_frames is None:
|
||||
noise_shape[1] = 32
|
||||
vace_frames = torch.zeros(noise_shape, device=noise.device, dtype=noise.dtype)
|
||||
|
||||
for i in range(0, vace_frames.shape[1], 16):
|
||||
vace_frames = vace_frames.clone()
|
||||
vace_frames[:, i:i + 16] = self.process_latent_in(vace_frames[:, i:i + 16])
|
||||
|
||||
mask = kwargs.get("vace_mask", None)
|
||||
if mask is None:
|
||||
noise_shape[1] = 64
|
||||
mask = torch.ones(noise_shape, device=noise.device, dtype=noise.dtype)
|
||||
|
||||
out['vace_context'] = conds.CONDRegular(torch.cat([vace_frames.to(noise), mask.to(noise)], dim=1))
|
||||
return out
|
||||
|
||||
|
||||
class Hunyuan3Dv2(BaseModel):
|
||||
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
|
||||
super().__init__(model_config, model_type, device=device, unet_model=Hunyuan3Dv2Model)
|
||||
@ -1093,3 +1126,21 @@ class Hunyuan3Dv2(BaseModel):
|
||||
if guidance is not None:
|
||||
out['guidance'] = conds.CONDRegular(torch.FloatTensor([guidance]))
|
||||
return out
|
||||
|
||||
|
||||
class HiDream(BaseModel):
|
||||
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
|
||||
super().__init__(model_config, model_type, device=device, unet_model=HiDreamImageTransformer2DModel)
|
||||
|
||||
def encode_adm(self, **kwargs):
|
||||
return kwargs["pooled_output"]
|
||||
|
||||
def extra_conds(self, **kwargs):
|
||||
out = super().extra_conds(**kwargs)
|
||||
cross_attn = kwargs.get("cross_attn", None)
|
||||
if cross_attn is not None:
|
||||
out['c_crossattn'] = conds.CONDRegular(cross_attn)
|
||||
conditioning_llama3 = kwargs.get("conditioning_llama3", None)
|
||||
if conditioning_llama3 is not None:
|
||||
out['encoder_hidden_states_llama3'] = conds.CONDRegular(conditioning_llama3)
|
||||
return out
|
||||
|
||||
@ -321,10 +321,18 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
||||
dit_config["cross_attn_norm"] = True
|
||||
dit_config["eps"] = 1e-6
|
||||
dit_config["in_dim"] = state_dict['{}patch_embedding.weight'.format(key_prefix)].shape[1]
|
||||
if '{}img_emb.proj.0.bias'.format(key_prefix) in state_dict_keys:
|
||||
dit_config["model_type"] = "i2v"
|
||||
if '{}vace_patch_embedding.weight'.format(key_prefix) in state_dict_keys:
|
||||
dit_config["model_type"] = "vace"
|
||||
dit_config["vace_in_dim"] = state_dict['{}vace_patch_embedding.weight'.format(key_prefix)].shape[1]
|
||||
dit_config["vace_layers"] = count_blocks(state_dict_keys, '{}vace_blocks.'.format(key_prefix) + '{}.')
|
||||
else:
|
||||
dit_config["model_type"] = "t2v"
|
||||
if '{}img_emb.proj.0.bias'.format(key_prefix) in state_dict_keys:
|
||||
dit_config["model_type"] = "i2v"
|
||||
else:
|
||||
dit_config["model_type"] = "t2v"
|
||||
flf_weight = state_dict.get('{}img_emb.emb_pos'.format(key_prefix))
|
||||
if flf_weight is not None:
|
||||
dit_config["flf_pos_embed_token_number"] = flf_weight.shape[1]
|
||||
return dit_config
|
||||
|
||||
if '{}latent_in.weight'.format(key_prefix) in state_dict_keys: # Hunyuan 3D
|
||||
@ -342,6 +350,25 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
||||
dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys
|
||||
return dit_config
|
||||
|
||||
if '{}caption_projection.0.linear.weight'.format(key_prefix) in state_dict_keys: # HiDream
|
||||
dit_config = {}
|
||||
dit_config["image_model"] = "hidream"
|
||||
dit_config["attention_head_dim"] = 128
|
||||
dit_config["axes_dims_rope"] = [64, 32, 32]
|
||||
dit_config["caption_channels"] = [4096, 4096]
|
||||
dit_config["max_resolution"] = [128, 128]
|
||||
dit_config["in_channels"] = 16
|
||||
dit_config["llama_layers"] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31]
|
||||
dit_config["num_attention_heads"] = 20
|
||||
dit_config["num_routed_experts"] = 4
|
||||
dit_config["num_activated_experts"] = 2
|
||||
dit_config["num_layers"] = 16
|
||||
dit_config["num_single_layers"] = 32
|
||||
dit_config["out_channels"] = 16
|
||||
dit_config["patch_size"] = 2
|
||||
dit_config["text_emb_dim"] = 2048
|
||||
return dit_config
|
||||
|
||||
if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys:
|
||||
return None
|
||||
|
||||
|
||||
@ -928,6 +928,8 @@ def text_encoder_dtype(device=None):
|
||||
return torch.float8_e5m2
|
||||
elif args.fp16_text_enc:
|
||||
return torch.float16
|
||||
elif args.bf16_text_enc:
|
||||
return torch.bfloat16
|
||||
elif args.fp32_text_enc:
|
||||
return torch.float32
|
||||
|
||||
@ -1405,6 +1407,8 @@ def _soft_empty_cache(force=False):
|
||||
torch.xpu.empty_cache() # pylint: disable=no-member
|
||||
elif is_ascend_npu():
|
||||
torch.npu.empty_cache() # pylint: disable=no-member
|
||||
elif is_mlu():
|
||||
torch.mlu.empty_cache() # pylint: disable=no-member
|
||||
elif torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.ipc_collect()
|
||||
|
||||
@ -76,7 +76,8 @@ def export_custom_nodes():
|
||||
|
||||
return custom_nodes
|
||||
|
||||
def export_package_as_web_directory(package:str):
|
||||
|
||||
def export_package_as_web_directory(package: str):
|
||||
import inspect
|
||||
|
||||
# Get the calling module
|
||||
@ -89,6 +90,7 @@ def export_package_as_web_directory(package:str):
|
||||
# Clean up circular reference
|
||||
del frame
|
||||
|
||||
|
||||
def string_to_torch_dtype(string):
|
||||
import torch
|
||||
if string == "fp32":
|
||||
@ -97,3 +99,12 @@ def string_to_torch_dtype(string):
|
||||
return torch.float16
|
||||
if string == "bf16":
|
||||
return torch.bfloat16
|
||||
|
||||
|
||||
def image_alpha_fix(destination, source):
|
||||
if destination.shape[-1] < source.shape[-1]:
|
||||
source = source[..., :destination.shape[-1]]
|
||||
elif destination.shape[-1] > source.shape[-1]:
|
||||
destination = torch.nn.functional.pad(destination, (0, 1))
|
||||
destination[..., -1] = 1.0
|
||||
return destination, source
|
||||
|
||||
@ -790,6 +790,8 @@ class ControlNetLoader:
|
||||
def load_controlnet(self, control_net_name):
|
||||
controlnet_path = get_or_download("controlnet", control_net_name, KNOWN_CONTROLNETS)
|
||||
controlnet_ = controlnet.load_controlnet(controlnet_path)
|
||||
if controlnet is None:
|
||||
raise RuntimeError("ERROR: controlnet file is invalid and does not contain a valid controlnet model.")
|
||||
return (controlnet_,)
|
||||
|
||||
|
||||
@ -947,7 +949,7 @@ class CLIPLoader:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "clip_name": (get_filename_list_with_downloadable("text_encoders", KNOWN_CLIP_MODELS),),
|
||||
"type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan"], ),
|
||||
"type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream"], ),
|
||||
},
|
||||
"optional": {
|
||||
"device": (["default", "cpu"], {"advanced": True}),
|
||||
@ -957,30 +959,10 @@ class CLIPLoader:
|
||||
|
||||
CATEGORY = "advanced/loaders"
|
||||
|
||||
DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl"
|
||||
DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\n hidream: llama-3.1 (Recommend) or t5"
|
||||
|
||||
def load_clip(self, clip_name, type="stable_diffusion", device="default"):
|
||||
clip_type = sd.CLIPType.STABLE_DIFFUSION
|
||||
if type == "stable_cascade":
|
||||
clip_type = sd.CLIPType.STABLE_CASCADE
|
||||
elif type == "sd3":
|
||||
clip_type = sd.CLIPType.SD3
|
||||
elif type == "stable_audio":
|
||||
clip_type = sd.CLIPType.STABLE_AUDIO
|
||||
elif type == "mochi":
|
||||
clip_type = sd.CLIPType.MOCHI
|
||||
elif type == "ltxv":
|
||||
clip_type = sd.CLIPType.LTXV
|
||||
elif type == "pixart":
|
||||
clip_type = sd.CLIPType.PIXART
|
||||
elif type == "cosmos":
|
||||
clip_type = sd.CLIPType.COSMOS
|
||||
elif type == "lumina2":
|
||||
clip_type = sd.CLIPType.LUMINA2
|
||||
elif type == "wan":
|
||||
clip_type = sd.CLIPType.WAN
|
||||
else:
|
||||
logging.warning(f"Unknown clip type argument passed: {type} for model {clip_name}")
|
||||
clip_type = getattr(sd.CLIPType, type.upper(), sd.CLIPType.STABLE_DIFFUSION)
|
||||
|
||||
model_options = {}
|
||||
if device == "cpu":
|
||||
@ -995,7 +977,7 @@ class DualCLIPLoader:
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "clip_name1": (get_filename_list_with_downloadable("text_encoders"),), "clip_name2": (
|
||||
get_filename_list_with_downloadable("text_encoders"),),
|
||||
"type": (["sdxl", "sd3", "flux", "hunyuan_video"], ),
|
||||
"type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream"], ),
|
||||
},
|
||||
"optional": {
|
||||
"device": (["default", "cpu"], {"advanced": True}),
|
||||
@ -1005,21 +987,12 @@ class DualCLIPLoader:
|
||||
|
||||
CATEGORY = "advanced/loaders"
|
||||
|
||||
DESCRIPTION = "[Recipes]\n\nsdxl: clip-l, clip-g\nsd3: clip-l, clip-g / clip-l, t5 / clip-g, t5\nflux: clip-l, t5"
|
||||
DESCRIPTION = "[Recipes]\n\nsdxl: clip-l, clip-g\nsd3: clip-l, clip-g / clip-l, t5 / clip-g, t5\nflux: clip-l, t5\nhidream: at least one of t5 or llama, recommended t5 and llama"
|
||||
|
||||
def load_clip(self, clip_name1, clip_name2, type, device="default"):
|
||||
clip_type = getattr(sd.CLIPType, type.upper(), sd.CLIPType.STABLE_DIFFUSION)
|
||||
clip_path1 = get_or_download("text_encoders", clip_name1)
|
||||
clip_path2 = get_or_download("text_encoders", clip_name2)
|
||||
if type == "sdxl":
|
||||
clip_type = sd.CLIPType.STABLE_DIFFUSION
|
||||
elif type == "sd3":
|
||||
clip_type = sd.CLIPType.SD3
|
||||
elif type == "flux":
|
||||
clip_type = sd.CLIPType.FLUX
|
||||
elif type == "hunyuan_video":
|
||||
clip_type = sd.CLIPType.HUNYUAN_VIDEO
|
||||
else:
|
||||
raise ValueError(f"Unknown clip type argument passed: {type} for model {clip_name1} and {clip_name2}")
|
||||
|
||||
model_options = {}
|
||||
if device == "cpu":
|
||||
@ -1041,6 +1014,8 @@ class CLIPVisionLoader:
|
||||
def load_clip(self, clip_name):
|
||||
clip_path = get_or_download("clip_vision", clip_name, KNOWN_CLIP_VISION_MODELS)
|
||||
clip_vision = clip_vision_module.load(clip_path)
|
||||
if clip_vision is None:
|
||||
raise RuntimeError("ERROR: clip vision file is invalid and does not contain a valid vision model.")
|
||||
return (clip_vision,)
|
||||
|
||||
class CLIPVisionEncode:
|
||||
@ -1692,6 +1667,7 @@ class LoadImage:
|
||||
def INPUT_TYPES(s):
|
||||
input_dir = folder_paths.get_input_directory()
|
||||
files = [f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f))]
|
||||
files = folder_paths.filter_files_content_types(files, ["image"])
|
||||
return {
|
||||
"required": {
|
||||
"image": (natsorted(files), {"image_upload": True}),
|
||||
@ -1737,7 +1713,9 @@ class LoadImage:
|
||||
if 'A' in i.getbands():
|
||||
mask = np.array(i.getchannel('A')).astype(np.float32) / 255.0
|
||||
mask = 1. - torch.from_numpy(mask)
|
||||
else:
|
||||
elif i.mode == 'P' and 'transparency' in i.info:
|
||||
mask = np.array(i.convert('RGBA').getchannel('A')).astype(np.float32) / 255.0
|
||||
mask = 1. - torch.from_numpy(mask)
else:
|
||||
mask = torch.zeros((64,64), dtype=torch.float32, device="cpu")
|
||||
output_images.append(image)
|
||||
output_masks.append(mask.unsqueeze(0))
|
||||
|
||||
comfy/ops.py
@ -21,7 +21,7 @@ from typing import Optional, Type, Union
|
||||
import torch
|
||||
from torch import Tensor
|
||||
|
||||
from . import model_management
|
||||
from . import model_management, rmsnorm
|
||||
from .cli_args import args, PerformanceFeature
|
||||
from .execution_context import current_execution_context
|
||||
from .float import stochastic_rounding
|
||||
@ -30,6 +30,7 @@ cast_to = model_management.cast_to # TODO: remove once no more references
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def cast_to_input(weight, input, non_blocking=False, copy=True):
|
||||
return model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy)
|
||||
|
||||
@ -201,6 +202,25 @@ class disable_weight_init:
|
||||
else:
|
||||
return super().forward(*args, **kwargs)
|
||||
|
||||
class RMSNorm(rmsnorm.RMSNorm, CastWeightBiasOp):
|
||||
def reset_parameters(self):
|
||||
self.bias = None
|
||||
return None
|
||||
|
||||
def forward_comfy_cast_weights(self, input):
|
||||
if self.weight is not None:
|
||||
weight, bias = cast_bias_weight(self, input)
|
||||
else:
|
||||
weight = None
|
||||
return rmsnorm.rms_norm(input, weight, self.eps) # TODO: switch to commented out line when old torch is deprecated
|
||||
# return torch.nn.functional.rms_norm(input, self.normalized_shape, weight, self.eps)
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
|
||||
return self.forward_comfy_cast_weights(*args, **kwargs)
|
||||
else:
|
||||
return super().forward(*args, **kwargs)
|
||||
|
||||
class ConvTranspose2d(torch.nn.ConvTranspose2d, CastWeightBiasOp):
|
||||
def reset_parameters(self):
|
||||
return None
|
||||
@ -298,6 +318,9 @@ class manual_cast(disable_weight_init):
|
||||
class ConvTranspose1d(disable_weight_init.ConvTranspose1d):
|
||||
comfy_cast_weights = True
|
||||
|
||||
class RMSNorm(disable_weight_init.RMSNorm):
|
||||
comfy_cast_weights = True
|
||||
|
||||
class Embedding(disable_weight_init.Embedding):
|
||||
comfy_cast_weights = True
|
||||
|
||||
@ -371,6 +394,7 @@ class scaled_fp8_op_base(manual_cast):
|
||||
|
||||
def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None):
|
||||
logger.info("Using scaled fp8: fp8 matrix mult: {}, scale input: {}".format(fp8_matrix_mult, scale_input))
|
||||
|
||||
class scaled_fp8_op(scaled_fp8_op_base):
|
||||
class Linear(manual_cast.Linear):
|
||||
def __init__(self, *args, **kwargs):
|
||||
@ -419,6 +443,29 @@ def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None
|
||||
return scaled_fp8_op
|
||||
|
||||
|
||||
CUBLAS_IS_AVAILABLE = False
|
||||
try:
|
||||
from cublas_ops import CublasLinear
|
||||
|
||||
CUBLAS_IS_AVAILABLE = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
if CUBLAS_IS_AVAILABLE:
|
||||
class cublas_ops(disable_weight_init):
|
||||
class Linear(CublasLinear, disable_weight_init.Linear):
|
||||
def reset_parameters(self):
|
||||
return None
|
||||
|
||||
def forward_comfy_cast_weights(self, input):
|
||||
return super().forward(input)
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
return super().forward(*args, **kwargs)
|
||||
else:
|
||||
class cublas_ops(disable_weight_init):
|
||||
pass
|
||||
|
||||
Operations = Type[Union[manual_cast, fp8_ops, disable_weight_init, skip_init, scaled_fp8_op_base]]
|
||||
|
||||
|
||||
@ -431,12 +478,21 @@ def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_
|
||||
return scaled_fp8_ops(fp8_matrix_mult=fp8_compute and fp8_optimizations, scale_input=fp8_optimizations, override_dtype=scaled_fp8)
|
||||
|
||||
if (
|
||||
fp8_compute and
|
||||
(fp8_optimizations or PerformanceFeature.Fp8MatrixMultiplication in args.fast) and
|
||||
not disable_fast_fp8
|
||||
fp8_compute and
|
||||
(fp8_optimizations or PerformanceFeature.Fp8MatrixMultiplication in args.fast) and
|
||||
not disable_fast_fp8
|
||||
):
|
||||
return fp8_ops
|
||||
|
||||
if (
|
||||
PerformanceFeature.CublasOps in args.fast and
|
||||
CUBLAS_IS_AVAILABLE and
|
||||
weight_dtype == torch.float16 and
|
||||
(compute_dtype == torch.float16 or compute_dtype is None)
|
||||
):
|
||||
logging.info("Using cublas ops")
|
||||
return cublas_ops
|
||||
|
||||
if compute_dtype is None or weight_dtype == compute_dtype:
|
||||
# disable_weight_init seems to interact poorly with some other optimization code
|
||||
return disable_weight_init if inference_mode else skip_init
|
||||
|
||||
@ -48,6 +48,7 @@ def get_all_callbacks(call_type: str, transformer_options: dict, is_model_option
|
||||
|
||||
class WrappersMP:
|
||||
OUTER_SAMPLE = "outer_sample"
|
||||
PREPARE_SAMPLING = "prepare_sampling"
|
||||
SAMPLER_SAMPLE = "sampler_sample"
|
||||
CALC_COND_BATCH = "calc_cond_batch"
|
||||
APPLY_MODEL = "apply_model"
|
||||
|
||||
comfy/rmsnorm.py
@ -0,0 +1,55 @@
import torch
from .model_management import cast_to
import numbers

RMSNorm = None

try:
    rms_norm_torch = torch.nn.functional.rms_norm
    RMSNorm = torch.nn.RMSNorm
except:
    rms_norm_torch = None


def rms_norm(x, weight=None, eps=1e-6):
    if rms_norm_torch is not None and not (torch.jit.is_tracing() or torch.jit.is_scripting()):
        if weight is None:
            return rms_norm_torch(x, (x.shape[-1],), eps=eps)
        else:
            return rms_norm_torch(x, weight.shape, weight=cast_to(weight, dtype=x.dtype, device=x.device), eps=eps)
    else:
        r = x * torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps)
        if weight is None:
            return r
        else:
            return r * cast_to(weight, dtype=x.dtype, device=x.device)


if RMSNorm is None:
    class RMSNorm(torch.nn.Module):
        def __init__(
            self,
            normalized_shape,
            eps=None,
            elementwise_affine=True,
            device=None,
            dtype=None,
        ):
            factory_kwargs = {"device": device, "dtype": dtype}
            super().__init__()
            if isinstance(normalized_shape, numbers.Integral):
                # mypy error: incompatible types in assignment
                normalized_shape = (normalized_shape,)  # type: ignore[assignment]
            self.normalized_shape = tuple(normalized_shape)  # type: ignore[arg-type]
            self.eps = eps
            self.elementwise_affine = elementwise_affine
            if self.elementwise_affine:
                self.weight = torch.nn.Parameter(
                    torch.empty(self.normalized_shape, **factory_kwargs)
                )
            else:
                self.register_parameter("weight", None)
            self.bias = None

        def forward(self, x):
            return rms_norm(x, self.weight, self.eps)
||||
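For reviewers, a minimal sketch (not part of the commit) of what the new rms_norm fallback computes; it assumes a PyTorch build that already exposes torch.nn.functional.rms_norm for comparison, and the tolerance is illustrative:

# Illustrative check only: the fallback branch of rms_norm should match
# torch.nn.functional.rms_norm when the native op is available (PyTorch >= 2.4).
import torch

def rms_norm_reference(x, weight=None, eps=1e-6):
    # same math as the fallback path in comfy/rmsnorm.py above
    r = x * torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps)
    return r if weight is None else r * weight.to(dtype=x.dtype, device=x.device)

x = torch.randn(2, 8, 16)
w = torch.randn(16)
if hasattr(torch.nn.functional, "rms_norm"):
    expected = torch.nn.functional.rms_norm(x, w.shape, weight=w, eps=1e-6)
    assert torch.allclose(rms_norm_reference(x, w), expected, atol=1e-5)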
@ -113,6 +113,13 @@ def cleanup_additional_models(models):
def prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None):
executor = comfy.patcher_extension.WrapperExecutor.new_executor(
_prepare_sampling,
comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.PREPARE_SAMPLING, model_options, is_model_options=True)
)
return executor.execute(model, noise_shape, conds, model_options=model_options)
def _prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None):
real_model: BaseModel = None
models, inference_memory = get_additional_models(conds, model.model_dtype())
models += get_additional_models_from_model_options(model_options)
@ -2,7 +2,7 @@ KSAMPLER_NAMES = ["euler", "euler_cfg_pp", "euler_ancestral", "euler_ancestral_c
"lms", "dpm_fast", "dpm_adaptive", "dpmpp_2s_ancestral", "dpmpp_2s_ancestral_cfg_pp", "dpmpp_sde", "dpmpp_sde_gpu",
"dpmpp_2m", "dpmpp_2m_cfg_pp", "dpmpp_2m_sde", "dpmpp_2m_sde_gpu", "dpmpp_3m_sde", "dpmpp_3m_sde_gpu", "ddpm", "lcm",
"ipndm", "ipndm_v", "deis", "res_multistep", "res_multistep_cfg_pp", "res_multistep_ancestral", "res_multistep_ancestral_cfg_pp",
"gradient_estimation", "er_sde"]
"gradient_estimation", "er_sde", "seeds_2", "seeds_3"]
SCHEDULER_NAMES = ["normal", "karras", "exponential", "sgm_uniform", "simple", "ddim_uniform", "beta", "linear_quadratic", "kl_optimal"]
SAMPLER_NAMES = KSAMPLER_NAMES + ["ddim", "uni_pc", "uni_pc_bh2"]
48
comfy/sd.py
@ -38,6 +38,7 @@ from .model_management import load_models_gpu
|
||||
from .t2i_adapter import adapter
|
||||
from .taesd import taesd
|
||||
from .text_encoders import aura_t5
|
||||
from .text_encoders import hidream
|
||||
from .text_encoders import cosmos
|
||||
from .text_encoders import flux
|
||||
from .text_encoders import genmo
|
||||
@ -277,6 +278,7 @@ class VAE:
|
||||
self.process_input = lambda image: image * 2.0 - 1.0
|
||||
self.process_output = lambda image: torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0)
|
||||
self.working_dtypes = [torch.bfloat16, torch.float32]
|
||||
self.disable_offload = False
|
||||
|
||||
self.downscale_index_formula = None
|
||||
self.upscale_index_formula = None
|
||||
@ -349,6 +351,7 @@ class VAE:
|
||||
self.process_output = lambda audio: audio
|
||||
self.process_input = lambda audio: audio
|
||||
self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32]
|
||||
self.disable_offload = True
|
||||
elif "blocks.2.blocks.3.stack.5.weight" in sd or "decoder.blocks.2.blocks.3.stack.5.weight" in sd or "layers.4.layers.1.attn_block.attn.qkv.weight" in sd or "encoder.layers.4.layers.1.attn_block.attn.qkv.weight" in sd: # genmo mochi vae
|
||||
if "blocks.2.blocks.3.stack.5.weight" in sd:
|
||||
sd = utils.state_dict_prefix_replace(sd, {"": "decoder."})
|
||||
@ -527,7 +530,7 @@ class VAE:
|
||||
pixel_samples = None
|
||||
try:
|
||||
memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype)
|
||||
model_management.load_models_gpu([self.patcher], memory_required=memory_used)
|
||||
model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
|
||||
free_memory = model_management.get_free_memory(self.device)
|
||||
batch_number = int(free_memory / memory_used)
|
||||
batch_number = max(1, batch_number)
|
||||
@ -556,7 +559,7 @@ class VAE:
|
||||
def decode_tiled(self, samples, tile_x=None, tile_y=None, overlap=None, tile_t=None, overlap_t=None):
|
||||
self.throw_exception_if_invalid()
|
||||
memory_used = self.memory_used_decode(samples.shape, self.vae_dtype) # TODO: calculate mem required for tile
|
||||
load_models_gpu([self.patcher], memory_required=memory_used)
|
||||
load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
|
||||
dims = samples.ndim - 2
|
||||
args = {}
|
||||
if tile_x is not None:
|
||||
@ -592,7 +595,7 @@ class VAE:
|
||||
pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0)
|
||||
try:
|
||||
memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype)
|
||||
model_management.load_models_gpu([self.patcher], memory_required=memory_used)
|
||||
model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
|
||||
free_memory = model_management.get_free_memory(self.device)
|
||||
batch_number = int(free_memory / max(1, memory_used))
|
||||
batch_number = max(1, batch_number)
|
||||
@ -626,7 +629,7 @@ class VAE:
|
||||
pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0)
|
||||
|
||||
memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype) # TODO: calculate mem required for tile
|
||||
load_models_gpu([self.patcher], memory_required=memory_used)
|
||||
load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
|
||||
|
||||
args = {}
|
||||
if tile_x is not None:
|
||||
@ -719,6 +722,7 @@ class CLIPType(Enum):
|
||||
COSMOS = 11
|
||||
LUMINA2 = 12
|
||||
WAN = 13
|
||||
HIDREAM = 14
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
@ -820,6 +824,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
||||
elif clip_type == CLIPType.SD3:
|
||||
clip_target.clip = sd3_clip.sd3_clip(clip_l=False, clip_g=True, t5=False)
|
||||
clip_target.tokenizer = sd3_clip.SD3Tokenizer
|
||||
elif clip_type == CLIPType.HIDREAM:
|
||||
clip_target.clip = hidream.hidream_clip(clip_l=False, clip_g=True, t5=False, llama=False, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None)
|
||||
clip_target.tokenizer = hidream.HiDreamTokenizer
|
||||
else:
|
||||
clip_target.clip = sdxl_clip.SDXLRefinerClipModel
|
||||
clip_target.tokenizer = sdxl_clip.SDXLTokenizer
|
||||
@ -840,6 +847,10 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
||||
clip_target.clip = wan.te(**t5xxl_detect(clip_data))
|
||||
clip_target.tokenizer = wan.WanT5Tokenizer
|
||||
tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
|
||||
elif clip_type == CLIPType.HIDREAM:
|
||||
clip_target.clip = hidream.hidream_clip(**t5xxl_detect(clip_data),
|
||||
clip_l=False, clip_g=False, t5=True, llama=False, dtype_llama=None, llama_scaled_fp8=None)
|
||||
clip_target.tokenizer = hidream.HiDreamTokenizer
|
||||
else: # CLIPType.MOCHI
|
||||
clip_target.clip = genmo.mochi_te(**t5xxl_detect(clip_data))
|
||||
clip_target.tokenizer = genmo.MochiT5Tokenizer
|
||||
@ -856,10 +867,18 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
||||
clip_target.clip = lumina2.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = lumina2.LuminaTokenizer
|
||||
tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
|
||||
elif te_model == TEModel.LLAMA3_8:
|
||||
clip_target.clip = hidream.hidream_clip(**llama_detect(clip_data),
|
||||
clip_l=False, clip_g=False, t5=False, llama=True, dtype_t5=None, t5xxl_scaled_fp8=None)
|
||||
clip_target.tokenizer = hidream.HiDreamTokenizer
|
||||
else:
|
||||
# clip_l
|
||||
if clip_type == CLIPType.SD3:
|
||||
clip_target.clip = sd3_clip.sd3_clip(clip_l=True, clip_g=False, t5=False)
|
||||
clip_target.tokenizer = sd3_clip.SD3Tokenizer
|
||||
elif clip_type == CLIPType.HIDREAM:
|
||||
clip_target.clip = hidream.hidream_clip(clip_l=True, clip_g=False, t5=False, llama=False, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None)
|
||||
clip_target.tokenizer = hidream.HiDreamTokenizer
|
||||
else:
|
||||
clip_target.clip = sd1_clip.SD1ClipModel
|
||||
clip_target.tokenizer = sd1_clip.SD1Tokenizer
|
||||
@ -877,12 +896,33 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
||||
elif clip_type == CLIPType.HUNYUAN_VIDEO:
|
||||
clip_target.clip = hunyuan_video.hunyuan_video_clip(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = hunyuan_video.HunyuanVideoTokenizer
|
||||
elif clip_type == CLIPType.HIDREAM:
|
||||
# Detect
|
||||
hidream_dualclip_classes = []
|
||||
for hidream_te in clip_data:
|
||||
te_model = detect_te_model(hidream_te)
|
||||
hidream_dualclip_classes.append(te_model)
|
||||
|
||||
clip_l = TEModel.CLIP_L in hidream_dualclip_classes
|
||||
clip_g = TEModel.CLIP_G in hidream_dualclip_classes
|
||||
t5 = TEModel.T5_XXL in hidream_dualclip_classes
|
||||
llama = TEModel.LLAMA3_8 in hidream_dualclip_classes
|
||||
|
||||
# Initialize t5xxl_detect and llama_detect kwargs if needed
|
||||
t5_kwargs = t5xxl_detect(clip_data) if t5 else {}
|
||||
llama_kwargs = llama_detect(clip_data) if llama else {}
|
||||
|
||||
clip_target.clip = hidream.hidream_clip(clip_l=clip_l, clip_g=clip_g, t5=t5, llama=llama, **t5_kwargs, **llama_kwargs)
|
||||
clip_target.tokenizer = hidream.HiDreamTokenizer
|
||||
else:
|
||||
clip_target.clip = sdxl_clip.SDXLClipModel
|
||||
clip_target.tokenizer = sdxl_clip.SDXLTokenizer
|
||||
elif len(clip_data) == 3:
|
||||
clip_target.clip = sd3_clip.sd3_clip(**t5xxl_detect(clip_data))
|
||||
clip_target.tokenizer = sd3_clip.SD3Tokenizer
|
||||
elif len(clip_data) == 4:
|
||||
clip_target.clip = hidream.hidream_clip(**t5xxl_detect(clip_data), **llama_detect(clip_data))
|
||||
clip_target.tokenizer = hidream.HiDreamTokenizer
|
||||
|
||||
parameters = 0
|
||||
for c in clip_data:
|
||||
|
||||
@ -101,7 +101,8 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
|
||||
LAYERS = [
|
||||
"last",
|
||||
"pooled",
|
||||
"hidden"
|
||||
"hidden",
|
||||
"all"
|
||||
]
|
||||
|
||||
def __init__(self, device="cpu", max_length=77,
|
||||
@ -115,8 +116,15 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
|
||||
special_tokens = {"start": 49406, "end": 49407, "pad": 49407}
|
||||
assert layer in self.LAYERS
|
||||
|
||||
if textmodel_json_config is None and "model_name" not in model_options:
|
||||
model_options = {**model_options, "model_name": "clip_l"}
|
||||
|
||||
config = get_path_as_dict(textmodel_json_config, "sd1_clip_config.json", package=__package__)
|
||||
|
||||
te_model_options = model_options.get("{}_model_config".format(model_options.get("model_name", "")), {})
|
||||
for k, v in te_model_options.items():
|
||||
config[k] = v
|
||||
|
||||
operations = model_options.get("custom_operations", None)
|
||||
scaled_fp8 = None
|
||||
|
||||
@ -164,7 +172,9 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
|
||||
def set_clip_options(self, options):
|
||||
layer_idx = options.get("layer", self.layer_idx)
|
||||
self.return_projected_pooled = options.get("projected_pooled", self.return_projected_pooled)
|
||||
if layer_idx is None or abs(layer_idx) > self.num_layers:
|
||||
if self.layer == "all":
|
||||
pass
|
||||
elif layer_idx is None or abs(layer_idx) > self.num_layers:
|
||||
self.layer = "last"
|
||||
else:
|
||||
self.layer = "hidden"
|
||||
@ -261,7 +271,12 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
|
||||
if self.enable_attention_masks:
|
||||
attention_mask_model = attention_mask
|
||||
|
||||
outputs = self.transformer(None, attention_mask_model, embeds=embeds, num_tokens=num_tokens, intermediate_output=self.layer_idx, final_layer_norm_intermediate=self.layer_norm_hidden_state, dtype=torch.float32)
|
||||
if self.layer == "all":
|
||||
intermediate_output = "all"
|
||||
else:
|
||||
intermediate_output = self.layer_idx
|
||||
|
||||
outputs = self.transformer(None, attention_mask_model, embeds=embeds, num_tokens=num_tokens, intermediate_output=intermediate_output, final_layer_norm_intermediate=self.layer_norm_hidden_state, dtype=torch.float32)
|
||||
|
||||
if self.layer == "last":
|
||||
z = outputs[0].float()
|
||||
@ -540,7 +555,7 @@ class SDTokenizer:
|
||||
self.tokenizer_class = tokenizer_class
|
||||
self.tokenizer_path = tokenizer_path
|
||||
self.tokenizer: PreTrainedTokenizerBase | SPieceTokenizer = tokenizer_class.from_pretrained(tokenizer_path, **tokenizer_args)
|
||||
self.max_length = max_length
|
||||
self.max_length = tokenizer_data.get("{}_max_length".format(embedding_key), max_length)
|
||||
self.min_length = min_length
|
||||
self.end_token = None
|
||||
|
||||
@ -780,6 +795,7 @@ class SD1ClipModel(torch.nn.Module):
|
||||
self.clip = "clip_{}".format(self.clip_name)
|
||||
|
||||
clip_model = model_options.get("{}_class".format(self.clip), clip_model)
|
||||
model_options = {**model_options, "model_name": self.clip}
|
||||
setattr(self, self.clip, clip_model(device=device, dtype=dtype, model_options=model_options, textmodel_json_config=textmodel_json_config, **kwargs))
|
||||
|
||||
self.dtypes = set()
|
||||
|
||||
@ -13,6 +13,7 @@ class SDXLClipG(sd1_clip.SDClipModel):
|
||||
layer_idx = -2
|
||||
|
||||
textmodel_json_config = get_path_as_dict(textmodel_json_config, "clip_config_bigg.json")
|
||||
model_options = {**model_options, "model_name": "clip_g"}
|
||||
super().__init__(device=device, freeze=freeze, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype,
|
||||
special_tokens={"start": 49406, "end": 49407, "pad": 0}, layer_norm_hidden_state=False, return_projected_pooled=True, model_options=model_options)
|
||||
|
||||
@ -22,16 +23,16 @@ class SDXLClipG(sd1_clip.SDClipModel):
|
||||
|
||||
class SDXLClipGTokenizer(sd1_clip.SDTokenizer):
|
||||
def __init__(self, tokenizer_path=None, embedding_directory=None, **kwargs):
|
||||
super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory, embedding_size=1280, embedding_key='clip_g')
|
||||
tokenizer_data = kwargs.pop("tokenizer_data", {})
|
||||
super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory, embedding_size=1280, embedding_key='clip_g', tokenizer_data=tokenizer_data)
|
||||
|
||||
|
||||
class SDXLTokenizer:
|
||||
def __init__(self, embedding_directory=None, tokenizer_data=None, **kwargs):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
clip_l_tokenizer_class = tokenizer_data.get("clip_l_tokenizer_class", sd1_clip.SDTokenizer)
|
||||
self.clip_l = clip_l_tokenizer_class(embedding_directory=embedding_directory)
|
||||
self.clip_g = SDXLClipGTokenizer(embedding_directory=embedding_directory)
|
||||
self.clip_l = sd1_clip.SDTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
|
||||
self.clip_g = SDXLClipGTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
|
||||
|
||||
def tokenize_with_weights(self, text: str, return_word_ids=False, **kwargs):
|
||||
out = {}
|
||||
@ -55,11 +56,8 @@ class SDXLTokenizer:
|
||||
class SDXLClipModel(torch.nn.Module):
|
||||
def __init__(self, device="cpu", dtype=None, model_options=None, textmodel_json_config=None):
|
||||
super().__init__()
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
clip_l_class = model_options.get("clip_l_class", sd1_clip.SDClipModel)
|
||||
self.clip_l = clip_l_class(layer="hidden", layer_idx=-2, device=device, dtype=dtype, layer_norm_hidden_state=False, model_options=model_options, textmodel_json_config=textmodel_json_config)
|
||||
self.clip_g = SDXLClipG(device=device, dtype=dtype, model_options=model_options, textmodel_json_config=textmodel_json_config)
|
||||
self.clip_l = sd1_clip.SDClipModel(layer="hidden", layer_idx=-2, device=device, dtype=dtype, layer_norm_hidden_state=False, model_options=model_options)
|
||||
self.clip_g = SDXLClipG(device=device, dtype=dtype, model_options=model_options)
|
||||
self.dtypes = {dtype}
|
||||
|
||||
def set_clip_options(self, options):
|
||||
@ -97,7 +95,7 @@ class StableCascadeClipGTokenizer(sd1_clip.SDTokenizer):
|
||||
def __init__(self, tokenizer_path=None, embedding_directory=None, tokenizer_data=None):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
super().__init__(tokenizer_path, pad_with_end=True, embedding_directory=embedding_directory, embedding_size=1280, embedding_key='clip_g')
|
||||
super().__init__(tokenizer_path, pad_with_end=True, embedding_directory=embedding_directory, embedding_size=1280, embedding_key='clip_g', tokenizer_data=tokenizer_data)
|
||||
|
||||
|
||||
class StableCascadeTokenizer(sd1_clip.SD1Tokenizer):
|
||||
@ -110,6 +108,7 @@ class StableCascadeTokenizer(sd1_clip.SD1Tokenizer):
|
||||
class StableCascadeClipG(sd1_clip.SDClipModel):
|
||||
def __init__(self, device="cpu", max_length=77, freeze=True, layer="hidden", layer_idx=-1, dtype=None, textmodel_json_config=None, model_options={}):
|
||||
textmodel_json_config = get_path_as_dict(textmodel_json_config, "clip_config_bigg.json")
|
||||
model_options = {**model_options, "model_name": "clip_g"}
|
||||
super().__init__(device=device, freeze=freeze, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype,
|
||||
special_tokens={"start": 49406, "end": 49407, "pad": 49407}, layer_norm_hidden_state=False, enable_attention_masks=True, return_projected_pooled=True, model_options=model_options)
|
||||
|
||||
|
||||
@ -1059,6 +1059,16 @@ class WAN21_FunControl2V(WAN21_T2V):
|
||||
out = model_base.WAN21(self, image_to_video=False, device=device)
|
||||
return out
|
||||
|
||||
class WAN21_Vace(WAN21_T2V):
|
||||
unet_config = {
|
||||
"image_model": "wan2.1",
|
||||
"model_type": "vace",
|
||||
}
|
||||
|
||||
def get_model(self, state_dict, prefix="", device=None):
|
||||
out = model_base.WAN21_Vace(self, image_to_video=False, device=device)
|
||||
return out
|
||||
|
||||
class Hunyuan3Dv2(supported_models_base.BASE):
|
||||
unet_config = {
|
||||
"image_model": "hunyuan3d2",
|
||||
@ -1097,6 +1107,36 @@ class Hunyuan3Dv2mini(Hunyuan3Dv2):
|
||||
|
||||
latent_format = latent_formats.Hunyuan3Dv2mini
|
||||
|
||||
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, Lumina2, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, Hunyuan3Dv2mini, Hunyuan3Dv2]
|
||||
class HiDream(supported_models_base.BASE):
|
||||
unet_config = {
|
||||
"image_model": "hidream",
|
||||
}
|
||||
|
||||
sampling_settings = {
|
||||
"shift": 3.0,
|
||||
}
|
||||
|
||||
sampling_settings = {
|
||||
}
|
||||
|
||||
# memory_usage_factor = 1.2 # TODO
|
||||
|
||||
unet_extra_config = {}
|
||||
latent_format = latent_formats.Flux
|
||||
|
||||
supported_inference_dtypes = [torch.bfloat16, torch.float32]
|
||||
|
||||
vae_key_prefix = ["vae."]
|
||||
text_encoder_key_prefix = ["text_encoders."]
|
||||
|
||||
def get_model(self, state_dict, prefix="", device=None):
|
||||
out = model_base.HiDream(self, device=device)
|
||||
return out
|
||||
|
||||
def clip_target(self, state_dict={}):
|
||||
return None # TODO
|
||||
|
||||
|
||||
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, Lumina2, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream]
|
||||
|
||||
models += [SVD_img2vid]
|
||||
|
||||
@ -16,7 +16,7 @@ class PT5XlModel(sd1_clip.SDClipModel):
|
||||
class PT5XlTokenizer(sd1_clip.SDTokenizer):
|
||||
def __init__(self, embedding_directory=None, **kwargs):
|
||||
tokenizer_path = resources.files("comfy.text_encoders.t5_pile_tokenizer") / "tokenizer.model"
|
||||
super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2048, embedding_key='pile_t5xl', tokenizer_class=SPieceTokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256, pad_token=1)
|
||||
super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2048, embedding_key='pile_t5xl', tokenizer_class=SPieceTokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256, pad_token=1, tokenizer_data=tokenizer_data)
|
||||
|
||||
|
||||
class AuraT5Tokenizer(sd1_clip.SD1Tokenizer):
|
||||
@ -27,5 +27,7 @@ class AuraT5Tokenizer(sd1_clip.SD1Tokenizer):
|
||||
|
||||
|
||||
class AuraT5Model(sd1_clip.SD1ClipModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
|
||||
def __init__(self, device="cpu", dtype=None, model_options=None, **kwargs):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
super().__init__(device=device, dtype=dtype, model_options=model_options, name="pile_t5xl", clip_model=PT5XlModel, **kwargs)
|
||||
|
||||
@ -32,7 +32,7 @@ class T5XXLTokenizer(sd1_clip.SDTokenizer):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
tokenizer_path = files.get_package_as_path("comfy.text_encoders.t5_tokenizer")
|
||||
super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=1024, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=512)
|
||||
super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=1024, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=512, tokenizer_data=tokenizer_data)
|
||||
|
||||
|
||||
class CosmosT5Tokenizer(sd1_clip.SD1Tokenizer):
|
||||
|
||||
@ -13,16 +13,15 @@ class T5XXLTokenizer(sd1_clip.SDTokenizer):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = dict()
|
||||
tokenizer_path = files.get_package_as_path("comfy.text_encoders.t5_tokenizer")
|
||||
super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256)
|
||||
super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256, tokenizer_data=tokenizer_data)
|
||||
|
||||
|
||||
class FluxTokenizer:
|
||||
def __init__(self, embedding_directory=None, tokenizer_data=None):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = dict()
|
||||
clip_l_tokenizer_class = tokenizer_data.get("clip_l_tokenizer_class", sd1_clip.SDTokenizer)
|
||||
self.clip_l = clip_l_tokenizer_class(embedding_directory=embedding_directory)
|
||||
self.t5xxl = T5XXLTokenizer(embedding_directory=embedding_directory)
|
||||
self.clip_l = sd1_clip.SDTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
|
||||
self.t5xxl = T5XXLTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
|
||||
|
||||
def tokenize_with_weights(self, text: str, return_word_ids=False, **kwargs):
|
||||
out = {
|
||||
@ -47,8 +46,7 @@ class FluxClipModel(torch.nn.Module):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
dtype_t5 = model_management.pick_weight_dtype(dtype_t5, dtype, device)
|
||||
clip_l_class = model_options.get("clip_l_class", sd1_clip.SDClipModel)
|
||||
self.clip_l = clip_l_class(device=device, dtype=dtype, return_projected_pooled=False, model_options=model_options)
|
||||
self.clip_l = sd1_clip.SDClipModel(device=device, dtype=dtype, return_projected_pooled=False, model_options=model_options)
|
||||
self.t5xxl = T5XXLModel(device=device, dtype=dtype_t5, model_options=model_options)
|
||||
self.dtypes = {dtype, dtype_t5}
|
||||
|
||||
|
||||
@ -23,7 +23,7 @@ class T5XXLTokenizer(sd1_clip.SDTokenizer):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
tokenizer_path = files.get_package_as_path("comfy.text_encoders.t5_tokenizer")
|
||||
super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256)
|
||||
super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256, tokenizer_data=tokenizer_data)
|
||||
|
||||
|
||||
class MochiT5Tokenizer(sd1_clip.SD1Tokenizer):
|
||||
|
||||
155
comfy/text_encoders/hidream.py
Normal file
@ -0,0 +1,155 @@
|
||||
from . import hunyuan_video
|
||||
from . import sd3_clip
|
||||
from comfy import sd1_clip
|
||||
from comfy import sdxl_clip
|
||||
import comfy.model_management
|
||||
import torch
|
||||
import logging
|
||||
|
||||
|
||||
class HiDreamTokenizer:
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
self.clip_l = sd1_clip.SDTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
|
||||
self.clip_g = sdxl_clip.SDXLClipGTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
|
||||
self.t5xxl = sd3_clip.T5XXLTokenizer(embedding_directory=embedding_directory, min_length=128, max_length=128, tokenizer_data=tokenizer_data)
|
||||
self.llama = hunyuan_video.LLAMA3Tokenizer(embedding_directory=embedding_directory, min_length=128, pad_token=128009, tokenizer_data=tokenizer_data)
|
||||
|
||||
def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
|
||||
out = {}
|
||||
out["g"] = self.clip_g.tokenize_with_weights(text, return_word_ids)
|
||||
out["l"] = self.clip_l.tokenize_with_weights(text, return_word_ids)
|
||||
t5xxl = self.t5xxl.tokenize_with_weights(text, return_word_ids)
|
||||
out["t5xxl"] = [t5xxl[0]] # Use only first 128 tokens
|
||||
out["llama"] = self.llama.tokenize_with_weights(text, return_word_ids)
|
||||
return out
|
||||
|
||||
def untokenize(self, token_weight_pair):
|
||||
return self.clip_g.untokenize(token_weight_pair)
|
||||
|
||||
def state_dict(self):
|
||||
return {}
|
||||
|
||||
|
||||
class HiDreamTEModel(torch.nn.Module):
|
||||
def __init__(self, clip_l=True, clip_g=True, t5=True, llama=True, dtype_t5=None, dtype_llama=None, device="cpu", dtype=None, model_options={}):
|
||||
super().__init__()
|
||||
self.dtypes = set()
|
||||
if clip_l:
|
||||
self.clip_l = sd1_clip.SDClipModel(device=device, dtype=dtype, return_projected_pooled=True, model_options=model_options)
|
||||
self.dtypes.add(dtype)
|
||||
else:
|
||||
self.clip_l = None
|
||||
|
||||
if clip_g:
|
||||
self.clip_g = sdxl_clip.SDXLClipG(device=device, dtype=dtype, model_options=model_options)
|
||||
self.dtypes.add(dtype)
|
||||
else:
|
||||
self.clip_g = None
|
||||
|
||||
if t5:
|
||||
dtype_t5 = comfy.model_management.pick_weight_dtype(dtype_t5, dtype, device)
|
||||
self.t5xxl = sd3_clip.T5XXLModel(device=device, dtype=dtype_t5, model_options=model_options, attention_mask=True)
|
||||
self.dtypes.add(dtype_t5)
|
||||
else:
|
||||
self.t5xxl = None
|
||||
|
||||
if llama:
|
||||
dtype_llama = comfy.model_management.pick_weight_dtype(dtype_llama, dtype, device)
|
||||
if "vocab_size" not in model_options:
|
||||
model_options["vocab_size"] = 128256
|
||||
self.llama = hunyuan_video.LLAMAModel(device=device, dtype=dtype_llama, model_options=model_options, layer="all", layer_idx=None, special_tokens={"start": 128000, "pad": 128009})
|
||||
self.dtypes.add(dtype_llama)
|
||||
else:
|
||||
self.llama = None
|
||||
|
||||
logging.debug("Created HiDream text encoder with: clip_l {}, clip_g {}, t5xxl {}:{}, llama {}:{}".format(clip_l, clip_g, t5, dtype_t5, llama, dtype_llama))
|
||||
|
||||
def set_clip_options(self, options):
|
||||
if self.clip_l is not None:
|
||||
self.clip_l.set_clip_options(options)
|
||||
if self.clip_g is not None:
|
||||
self.clip_g.set_clip_options(options)
|
||||
if self.t5xxl is not None:
|
||||
self.t5xxl.set_clip_options(options)
|
||||
if self.llama is not None:
|
||||
self.llama.set_clip_options(options)
|
||||
|
||||
def reset_clip_options(self):
|
||||
if self.clip_l is not None:
|
||||
self.clip_l.reset_clip_options()
|
||||
if self.clip_g is not None:
|
||||
self.clip_g.reset_clip_options()
|
||||
if self.t5xxl is not None:
|
||||
self.t5xxl.reset_clip_options()
|
||||
if self.llama is not None:
|
||||
self.llama.reset_clip_options()
|
||||
|
||||
def encode_token_weights(self, token_weight_pairs):
|
||||
token_weight_pairs_l = token_weight_pairs["l"]
|
||||
token_weight_pairs_g = token_weight_pairs["g"]
|
||||
token_weight_pairs_t5 = token_weight_pairs["t5xxl"]
|
||||
token_weight_pairs_llama = token_weight_pairs["llama"]
|
||||
lg_out = None
|
||||
pooled = None
|
||||
extra = {}
|
||||
|
||||
if len(token_weight_pairs_g) > 0 or len(token_weight_pairs_l) > 0:
|
||||
if self.clip_l is not None:
|
||||
lg_out, l_pooled = self.clip_l.encode_token_weights(token_weight_pairs_l)
|
||||
else:
|
||||
l_pooled = torch.zeros((1, 768), device=comfy.model_management.intermediate_device())
|
||||
|
||||
if self.clip_g is not None:
|
||||
g_out, g_pooled = self.clip_g.encode_token_weights(token_weight_pairs_g)
|
||||
else:
|
||||
g_pooled = torch.zeros((1, 1280), device=comfy.model_management.intermediate_device())
|
||||
|
||||
pooled = torch.cat((l_pooled, g_pooled), dim=-1)
|
||||
|
||||
if self.t5xxl is not None:
|
||||
t5_output = self.t5xxl.encode_token_weights(token_weight_pairs_t5)
|
||||
t5_out, t5_pooled = t5_output[:2]
|
||||
else:
|
||||
t5_out = None
|
||||
|
||||
if self.llama is not None:
|
||||
ll_output = self.llama.encode_token_weights(token_weight_pairs_llama)
|
||||
ll_out, ll_pooled = ll_output[:2]
|
||||
ll_out = ll_out[:, 1:]
|
||||
else:
|
||||
ll_out = None
|
||||
|
||||
if t5_out is None:
|
||||
t5_out = torch.zeros((1, 128, 4096), device=comfy.model_management.intermediate_device())
|
||||
|
||||
if ll_out is None:
|
||||
ll_out = torch.zeros((1, 32, 1, 4096), device=comfy.model_management.intermediate_device())
|
||||
|
||||
if pooled is None:
|
||||
pooled = torch.zeros((1, 768 + 1280), device=comfy.model_management.intermediate_device())
|
||||
|
||||
extra["conditioning_llama3"] = ll_out
|
||||
return t5_out, pooled, extra
|
||||
|
||||
def load_sd(self, sd):
|
||||
if "text_model.encoder.layers.30.mlp.fc1.weight" in sd:
|
||||
return self.clip_g.load_sd(sd)
|
||||
elif "text_model.encoder.layers.1.mlp.fc1.weight" in sd:
|
||||
return self.clip_l.load_sd(sd)
|
||||
elif "encoder.block.23.layer.1.DenseReluDense.wi_1.weight" in sd:
|
||||
return self.t5xxl.load_sd(sd)
|
||||
else:
|
||||
return self.llama.load_sd(sd)
|
||||
|
||||
|
||||
def hidream_clip(clip_l=True, clip_g=True, t5=True, llama=True, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None):
class HiDreamTEModel_(HiDreamTEModel):
def __init__(self, device="cpu", dtype=None, model_options={}):
if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options:
model_options = model_options.copy()
model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
if llama_scaled_fp8 is not None and "llama_scaled_fp8" not in model_options:
model_options = model_options.copy()
model_options["llama_scaled_fp8"] = llama_scaled_fp8
super().__init__(clip_l=clip_l, clip_g=clip_g, t5=t5, llama=llama, dtype_t5=dtype_t5, dtype_llama=dtype_llama, device=device, dtype=dtype, model_options=model_options)
return HiDreamTEModel_
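A minimal usage sketch of the factory above (illustrative, not part of the diff): hidream_clip returns a class whose constructor matches the other text-encoder factories, so a dual-CLIP-only encoder can be built and instantiated like this; the import path is an assumption based on the new file's location comfy/text_encoders/hidream.py:

# Illustrative only: build a HiDream text-encoder class with just CLIP-L + CLIP-G
# (no T5, no Llama), then instantiate it the way the loader code does.
import torch
from comfy.text_encoders import hidream  # assumed import path for the new module

te_class = hidream.hidream_clip(clip_l=True, clip_g=True, t5=False, llama=False,
                                dtype_t5=None, dtype_llama=None,
                                t5xxl_scaled_fp8=None, llama_scaled_fp8=None)
te = te_class(device="cpu", dtype=torch.float32)  # same constructor shape as the other te() factories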
@ -22,15 +22,17 @@ def llama_detect(state_dict, prefix=""):
|
||||
|
||||
|
||||
class LLAMA3Tokenizer(sd1_clip.SDTokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data=None, min_length=256):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data=None, min_length=256, pad_token=128258):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
tokenizer_path = files.get_package_as_path("comfy.text_encoders.llama_tokenizer")
|
||||
super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='llama', tokenizer_class=LlamaTokenizerFast, has_start_token=True, has_end_token=False, pad_to_max_length=False, max_length=99999999, pad_token=128258, min_length=min_length)
|
||||
super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='llama', tokenizer_class=LlamaTokenizerFast, has_start_token=True, has_end_token=False, pad_to_max_length=False, max_length=99999999, pad_token=pad_token, min_length=min_length, tokenizer_data=tokenizer_data)
|
||||
|
||||
|
||||
class LLAMAModel(sd1_clip.SDClipModel):
|
||||
def __init__(self, device="cpu", layer="hidden", layer_idx=-3, dtype=None, attention_mask=True, model_options=None):
|
||||
def __init__(self, device="cpu", layer="hidden", layer_idx=-3, dtype=None, attention_mask=True, model_options=None, special_tokens=None):
|
||||
if special_tokens is None:
|
||||
special_tokens = {"start": 128000, "pad": 128258}
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
llama_scaled_fp8 = model_options.get("llama_scaled_fp8", None)
|
||||
@ -38,17 +40,22 @@ class LLAMAModel(sd1_clip.SDClipModel):
|
||||
model_options = model_options.copy()
|
||||
model_options["scaled_fp8"] = llama_scaled_fp8
|
||||
|
||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"start": 128000, "pad": 128258}, layer_norm_hidden_state=False, model_class=Llama2, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
|
||||
textmodel_json_config = {}
|
||||
vocab_size = model_options.get("vocab_size", None)
|
||||
if vocab_size is not None:
|
||||
textmodel_json_config["vocab_size"] = vocab_size
|
||||
|
||||
model_options = {**model_options, "model_name": "llama"}
|
||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens=special_tokens, layer_norm_hidden_state=False, model_class=Llama2, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
|
||||
|
||||
|
||||
class HunyuanVideoTokenizer:
|
||||
def __init__(self, embedding_directory=None, tokenizer_data=None):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
clip_l_tokenizer_class = tokenizer_data.get("clip_l_tokenizer_class", sd1_clip.SDTokenizer)
|
||||
self.clip_l = clip_l_tokenizer_class(embedding_directory=embedding_directory)
|
||||
self.clip_l = sd1_clip.SDTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
|
||||
self.llama_template = """<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: 1. The main content and theme of the video.2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.4. background environment, light, style and atmosphere.5. camera angles, movements, and transitions used in the video:<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>""" # 95 tokens
|
||||
self.llama = LLAMA3Tokenizer(embedding_directory=embedding_directory, min_length=1)
|
||||
self.llama = LLAMA3Tokenizer(embedding_directory=embedding_directory, min_length=1, tokenizer_data=tokenizer_data)
|
||||
|
||||
def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, image_embeds=None, image_interleave=1, **kwargs):
|
||||
out = {}
|
||||
@ -82,8 +89,7 @@ class HunyuanVideoClipModel(torch.nn.Module):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
dtype_llama = pick_weight_dtype(dtype_llama, dtype, device)
|
||||
clip_l_class = model_options.get("clip_l_class", sd1_clip.SDClipModel)
|
||||
self.clip_l = clip_l_class(device=device, dtype=dtype, return_projected_pooled=False, model_options=model_options)
|
||||
self.clip_l = sd1_clip.SDClipModel(device=device, dtype=dtype, return_projected_pooled=False, model_options=model_options)
|
||||
self.llama = LLAMAModel(device=device, dtype=dtype_llama, model_options=model_options)
|
||||
self.dtypes = {dtype, dtype_llama}
|
||||
|
||||
|
||||
@ -15,13 +15,13 @@ class HyditBertModel(sd1_clip.SDClipModel):
|
||||
if model_options is None:
|
||||
model_options = dict()
|
||||
textmodel_json_config = get_path_as_dict(textmodel_json_config, "hydit_clip.json", package=__package__)
|
||||
model_options = {**model_options, "model_name": "hydit_clip"}
|
||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"start": 101, "end": 102, "pad": 0}, model_class=BertModel, enable_attention_masks=True, return_attention_masks=True)
|
||||
|
||||
|
||||
class HyditBertTokenizer(sd1_clip.SDTokenizer):
|
||||
def __init__(self, **kwargs):
|
||||
tokenizer_path = get_package_as_path(f"{__package__}.hydit_clip_tokenizer")
|
||||
super().__init__(tokenizer_path, pad_with_end=False, embedding_size=1024, embedding_key='chinese_roberta', tokenizer_class=BertTokenizer, pad_to_max_length=False, max_length=512, min_length=77)
|
||||
super().__init__(tokenizer_path, pad_with_end=False, embedding_size=1024, embedding_key='chinese_roberta', tokenizer_class=BertTokenizer, pad_to_max_length=False, max_length=512, min_length=77, tokenizer_data=tokenizer_data)
|
||||
|
||||
|
||||
class MT5XLModel(sd1_clip.SDClipModel):
|
||||
@ -29,9 +29,9 @@ class MT5XLModel(sd1_clip.SDClipModel):
|
||||
if model_options is None:
|
||||
model_options = dict()
|
||||
textmodel_json_config = get_path_as_dict(textmodel_json_config, "mt5_config_xl.json", package=__package__)
|
||||
model_options = {**model_options, "model_name": "mt5xl"}
|
||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=T5, enable_attention_masks=True, return_attention_masks=True)
|
||||
|
||||
|
||||
class MT5XLTokenizer(sd1_clip.SDTokenizer):
|
||||
def __init__(self, tokenizer_data=None, **kwargs):
|
||||
if tokenizer_data is None:
|
||||
@ -39,7 +39,7 @@ class MT5XLTokenizer(sd1_clip.SDTokenizer):
|
||||
if not "spiece_model" in tokenizer_data:
|
||||
raise FileNotFoundError("expected a checkpoint that contains the mt5 tokenizer's sentencepiece model")
|
||||
tokenizer = tokenizer_data.get("spiece_model", None)
|
||||
super().__init__(tokenizer, pad_with_end=False, embedding_size=2048, embedding_key='mt5xl', tokenizer_class=SPieceTokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256)
|
||||
super().__init__(tokenizer, pad_with_end=False, embedding_size=2048, embedding_key='mt5xl', tokenizer_class=SPieceTokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256, tokenizer_data=tokenizer_data)
|
||||
|
||||
def state_dict(self):
|
||||
return {"spiece_model": self.tokenizer.serialize_model()}
|
||||
@ -51,7 +51,7 @@ class HyditTokenizer:
|
||||
raise FileNotFoundError("expected mt5xl tokenizer data in the checkpoint")
|
||||
mt5_tokenizer_data = tokenizer_data.get("mt5xl.spiece_model", None)
|
||||
self.hydit_clip = HyditBertTokenizer(embedding_directory=embedding_directory)
|
||||
self.mt5xl = MT5XLTokenizer(tokenizer_data={"spiece_model": mt5_tokenizer_data}, embedding_directory=embedding_directory)
|
||||
self.mt5xl = MT5XLTokenizer(tokenizer_data={**tokenizer_data, "spiece_model": mt5_tokenizer_data}, embedding_directory=embedding_directory)
|
||||
|
||||
def tokenize_with_weights(self, text: str, return_word_ids=False, **kwargs):
|
||||
out = {}
|
||||
|
||||
@ -272,11 +272,17 @@ class Llama2_(nn.Module):
|
||||
optimized_attention = optimized_attention_for_device(x.device, mask=mask is not None, small_input=True)
|
||||
|
||||
intermediate = None
|
||||
all_intermediate = None
|
||||
if intermediate_output is not None:
|
||||
if intermediate_output < 0:
|
||||
if intermediate_output == "all":
|
||||
all_intermediate = []
|
||||
intermediate_output = None
|
||||
elif intermediate_output < 0:
|
||||
intermediate_output = len(self.layers) + intermediate_output
|
||||
|
||||
for i, layer in enumerate(self.layers):
|
||||
if all_intermediate is not None:
|
||||
all_intermediate.append(x.unsqueeze(1).clone())
|
||||
x = layer(
|
||||
x=x,
|
||||
attention_mask=mask,
|
||||
@ -287,6 +293,12 @@ class Llama2_(nn.Module):
|
||||
intermediate = x.clone()
|
||||
|
||||
x = self.norm(x)
|
||||
if all_intermediate is not None:
|
||||
all_intermediate.append(x.unsqueeze(1).clone())
|
||||
|
||||
if all_intermediate is not None:
|
||||
intermediate = torch.cat(all_intermediate, dim=1)
|
||||
|
||||
if intermediate is not None and final_layer_norm_intermediate:
|
||||
intermediate = self.norm(intermediate)
|
||||
|
||||
|
||||
@ -1,44 +1,26 @@
|
||||
from comfy import sd1_clip
|
||||
|
||||
from ..component_model.files import get_path_as_dict
|
||||
|
||||
|
||||
class LongClipTokenizer_(sd1_clip.SDTokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data=None):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
super().__init__(max_length=248, embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
|
||||
|
||||
|
||||
class LongClipModel_(sd1_clip.SDClipModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
kwargs = kwargs or {}
|
||||
textmodel_json_config = kwargs.get("textmodel_json_config", None)
|
||||
textmodel_json_config = get_path_as_dict(textmodel_json_config, "long_clipl.json", package=__package__)
|
||||
super().__init__(*args, textmodel_json_config=textmodel_json_config, **kwargs)
|
||||
|
||||
|
||||
class LongClipTokenizer(sd1_clip.SD1Tokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data=None):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, tokenizer=LongClipTokenizer_)
|
||||
|
||||
|
||||
class LongClipModel(sd1_clip.SD1ClipModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options=None, **kwargs):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
super().__init__(device=device, dtype=dtype, model_options=model_options, clip_model=LongClipModel_, **kwargs)
|
||||
|
||||
|
||||
def model_options_long_clip(sd, tokenizer_data, model_options):
|
||||
def model_options_long_clip(sd, tokenizer_data, model_options) -> tuple[dict, dict]:
|
||||
model_name = ""
|
||||
w = sd.get("clip_l.text_model.embeddings.position_embedding.weight", None)
|
||||
if w is None:
|
||||
w = sd.get("clip_g.text_model.embeddings.position_embedding.weight", None)
|
||||
else:
|
||||
model_name = "clip_g"
|
||||
|
||||
if w is None:
|
||||
w = sd.get("text_model.embeddings.position_embedding.weight", None)
|
||||
if w is not None and w.shape[0] == 248:
|
||||
if w is not None:
|
||||
if "text_model.encoder.layers.30.mlp.fc1.weight" in sd:
|
||||
model_name = "clip_g"
|
||||
elif "text_model.encoder.layers.1.mlp.fc1.weight" in sd:
|
||||
model_name = "clip_l"
|
||||
else:
|
||||
model_name = "clip_l"
|
||||
|
||||
if w is not None:
|
||||
tokenizer_data = tokenizer_data.copy()
|
||||
model_options = model_options.copy()
|
||||
tokenizer_data["clip_l_tokenizer_class"] = LongClipTokenizer_
|
||||
model_options["clip_l_class"] = LongClipModel_
|
||||
model_config = model_options.get("model_config", {})
|
||||
model_config["max_position_embeddings"] = w.shape[0]
|
||||
model_options["{}_model_config".format(model_name)] = model_config
|
||||
tokenizer_data["{}_max_length".format(model_name)] = w.shape[0]
|
||||
return tokenizer_data, model_options
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
from transformers import T5TokenizerFast
|
||||
|
||||
from comfy import sd1_clip
|
||||
from .genmo import mochi_te
|
||||
from .. import sd1_clip
|
||||
from ..component_model import files
|
||||
|
||||
|
||||
@ -10,7 +10,7 @@ class T5XXLTokenizer(sd1_clip.SDTokenizer):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
tokenizer_path = files.get_package_as_path("comfy.text_encoders.t5_tokenizer")
|
||||
super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=128) # pad to 128?
|
||||
super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=128, tokenizer_data=tokenizer_data) # pad to 128?
|
||||
|
||||
|
||||
class LTXVT5Tokenizer(sd1_clip.SD1Tokenizer):
|
||||
|
||||
@ -8,7 +8,7 @@ class Gemma2BTokenizer(sd1_clip.SDTokenizer):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
tokenizer = tokenizer_data.get("spiece_model", None)
|
||||
super().__init__(tokenizer, pad_with_end=False, embedding_size=2304, embedding_key='gemma2_2b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_args={"add_bos": True, "add_eos": False})
|
||||
super().__init__(tokenizer, pad_with_end=False, embedding_size=2304, embedding_key='gemma2_2b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data)
|
||||
|
||||
def state_dict(self):
|
||||
return {"spiece_model": self.tokenizer.serialize_model()}
|
||||
|
||||
@ -30,7 +30,7 @@ class T5XXLTokenizer(sd1_clip.SDTokenizer):
|
||||
tokenizer_data = {}
|
||||
tokenizer_path = files.get_package_as_path("comfy.text_encoders.t5_tokenizer")
|
||||
|
||||
super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=1) # no padding
|
||||
super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_data=tokenizer_data) # no padding
|
||||
|
||||
|
||||
class PixArtTokenizer(sd1_clip.SD1Tokenizer):
|
||||
|
||||
@ -16,7 +16,8 @@ class T5BaseModel(sd1_clip.SDClipModel):
|
||||
class T5BaseTokenizer(sd1_clip.SDTokenizer):
|
||||
def __init__(self, *args, **kwargs):
|
||||
tokenizer_path = files.get_package_as_path("comfy.text_encoders.t5_tokenizer")
|
||||
super().__init__(tokenizer_path, pad_with_end=False, embedding_size=768, embedding_key='t5base', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=128)
|
||||
tokenizer_data = kwargs.pop("tokenizer_data", {})
|
||||
super().__init__(tokenizer_path, pad_with_end=False, embedding_size=768, embedding_key='t5base', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=128, tokenizer_data=tokenizer_data)
|
||||
|
||||
|
||||
class SAT5Tokenizer(sd1_clip.SD1Tokenizer):
|
||||
@ -27,5 +28,7 @@ class SAT5Tokenizer(sd1_clip.SD1Tokenizer):
|
||||
|
||||
|
||||
class SAT5Model(sd1_clip.SD1ClipModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
|
||||
def __init__(self, device="cpu", dtype=None, model_options=None, **kwargs):
|
||||
if model_options is None:
|
||||
model_options = {}
|
||||
super().__init__(device=device, dtype=dtype, model_options=model_options, name="t5base", clip_model=T5BaseModel, **kwargs)
|
||||
|
||||
@ -14,8 +14,10 @@ class SD2ClipHModel(sd1_clip.SDClipModel):
|
||||
|
||||
|
||||
class SD2ClipHTokenizer(sd1_clip.SDTokenizer):
|
||||
def __init__(self, tokenizer_path=None, embedding_directory=None, **kwargs):
|
||||
super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory, embedding_size=1024)
|
||||
def __init__(self, tokenizer_path=None, embedding_directory=None, tokenizer_data=None, **kwargs):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory, embedding_size=1024, embedding_key='clip_h', tokenizer_data=tokenizer_data)
|
||||
|
||||
|
||||
class SD2Tokenizer(sd1_clip.SD1Tokenizer):
|
||||
|
||||
@ -20,6 +20,7 @@ class T5XXLModel(sd1_clip.SDClipModel):
|
||||
model_options = model_options.copy()
|
||||
model_options["scaled_fp8"] = t5xxl_scaled_fp8
|
||||
|
||||
model_options = {**model_options, "model_name": "t5xxl"}
|
||||
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=T5, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
|
||||
|
||||
|
||||
@ -37,21 +38,20 @@ def t5_xxl_detect(state_dict, prefix=""):
|
||||
|
||||
|
||||
class T5XXLTokenizer(sd1_clip.SDTokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data=None):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data=None, min_length=77, max_length=99999999):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
tokenizer_path = files.get_package_as_path("comfy.text_encoders.t5_tokenizer")
|
||||
super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=77)
|
||||
super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=max_length, min_length=min_length, tokenizer_data=tokenizer_data)
|
||||
|
||||
|
||||
class SD3Tokenizer:
|
||||
def __init__(self, embedding_directory=None, tokenizer_data=None):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
clip_l_tokenizer_class = tokenizer_data.get("clip_l_tokenizer_class", sd1_clip.SDTokenizer)
|
||||
self.clip_l = clip_l_tokenizer_class(embedding_directory=embedding_directory)
|
||||
self.clip_g = sdxl_clip.SDXLClipGTokenizer(embedding_directory=embedding_directory)
|
||||
self.t5xxl = T5XXLTokenizer(embedding_directory=embedding_directory)
|
||||
self.clip_l = sd1_clip.SDTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
|
||||
self.clip_g = sdxl_clip.SDXLClipGTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
|
||||
self.t5xxl = T5XXLTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
|
||||
|
||||
def tokenize_with_weights(self, text: str, return_word_ids=False, **kwargs):
|
||||
out = {}
|
||||
@ -77,8 +77,7 @@ class SD3ClipModel(torch.nn.Module):
|
||||
model_options = {}
|
||||
self.dtypes = set()
|
||||
if clip_l:
|
||||
clip_l_class = model_options.get("clip_l_class", sd1_clip.SDClipModel)
|
||||
self.clip_l = clip_l_class(layer="hidden", layer_idx=-2, device=device, dtype=dtype, layer_norm_hidden_state=False, return_projected_pooled=False, model_options=model_options)
|
||||
self.clip_l = sd1_clip.SDClipModel(layer="hidden", layer_idx=-2, device=device, dtype=dtype, layer_norm_hidden_state=False, return_projected_pooled=False, model_options=model_options)
|
||||
self.dtypes.add(dtype)
|
||||
else:
|
||||
self.clip_l = None
|
||||
|
||||
@ -24,7 +24,10 @@ class SPieceTokenizer:
|
||||
if isinstance(tokenizer_path, bytes):
|
||||
construction_args["model_proto"] = tokenizer_path
|
||||
else:
|
||||
if not Path(tokenizer_path).is_file():
|
||||
raise ValueError(f"invalid tokenizer {tokenizer_path}")
|
||||
construction_args["model_file"] = tokenizer_path
|
||||
|
||||
self.tokenizer = sentencepiece.SentencePieceProcessor(**construction_args) # pylint: disable=unexpected-keyword-arg
|
||||
|
||||
self.end = self.tokenizer.eos_id()
|
||||
|
||||
@ -19,7 +19,7 @@ class UMT5XXlTokenizer(sd1_clip.SDTokenizer):
|
||||
if tokenizer_data is None:
|
||||
tokenizer_data = {}
|
||||
tokenizer = tokenizer_data.get("spiece_model", None)
|
||||
super().__init__(tokenizer, pad_with_end=False, embedding_size=4096, embedding_key='umt5xxl', tokenizer_class=SPieceTokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=512, pad_token=0)
|
||||
super().__init__(tokenizer, pad_with_end=False, embedding_size=4096, embedding_key='umt5xxl', tokenizer_class=SPieceTokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=512, pad_token=0, tokenizer_data=tokenizer_data)
|
||||
|
||||
def state_dict(self):
|
||||
return {"spiece_model": self.tokenizer.serialize_model()}
|
||||
|
||||
@ -209,6 +209,196 @@ def voxel_to_mesh(voxels, threshold=0.5, device=None):
|
||||
vertices = torch.fliplr(vertices)
|
||||
return vertices, faces
|
||||
|
||||
def voxel_to_mesh_surfnet(voxels, threshold=0.5, device=None):
|
||||
if device is None:
|
||||
device = torch.device("cpu")
|
||||
voxels = voxels.to(device)
|
||||
|
||||
D, H, W = voxels.shape
|
||||
|
||||
padded = torch.nn.functional.pad(voxels, (1, 1, 1, 1, 1, 1), 'constant', 0)
|
||||
z, y, x = torch.meshgrid(
|
||||
torch.arange(D, device=device),
|
||||
torch.arange(H, device=device),
|
||||
torch.arange(W, device=device),
|
||||
indexing='ij'
|
||||
)
|
||||
cell_positions = torch.stack([z.flatten(), y.flatten(), x.flatten()], dim=1)
|
||||
|
||||
corner_offsets = torch.tensor([
|
||||
[0, 0, 0], [1, 0, 0], [0, 1, 0], [1, 1, 0],
|
||||
[0, 0, 1], [1, 0, 1], [0, 1, 1], [1, 1, 1]
|
||||
], device=device)
|
||||
|
||||
corner_values = torch.zeros((cell_positions.shape[0], 8), device=device)
|
||||
for c, (dz, dy, dx) in enumerate(corner_offsets):
|
||||
corner_values[:, c] = padded[
|
||||
cell_positions[:, 0] + dz,
|
||||
cell_positions[:, 1] + dy,
|
||||
cell_positions[:, 2] + dx
|
||||
]
|
||||
|
||||
corner_signs = corner_values > threshold
|
||||
has_inside = torch.any(corner_signs, dim=1)
|
||||
has_outside = torch.any(~corner_signs, dim=1)
|
||||
contains_surface = has_inside & has_outside
|
||||
|
||||
active_cells = cell_positions[contains_surface]
|
||||
active_signs = corner_signs[contains_surface]
|
||||
active_values = corner_values[contains_surface]
|
||||
|
||||
if active_cells.shape[0] == 0:
|
||||
return torch.zeros((0, 3), device=device), torch.zeros((0, 3), dtype=torch.long, device=device)
|
||||
|
||||
edges = torch.tensor([
|
||||
[0, 1], [0, 2], [0, 4], [1, 3],
|
||||
[1, 5], [2, 3], [2, 6], [3, 7],
|
||||
[4, 5], [4, 6], [5, 7], [6, 7]
|
||||
], device=device)
|
||||
|
||||
cell_vertices = {}
|
||||
progress = comfy.utils.ProgressBar(100)
|
||||
|
||||
for edge_idx, (e1, e2) in enumerate(edges):
|
||||
progress.update(1)
|
||||
crossing = active_signs[:, e1] != active_signs[:, e2]
|
||||
if not crossing.any():
|
||||
continue
|
||||
|
||||
cell_indices = torch.nonzero(crossing, as_tuple=True)[0]
|
||||
|
||||
v1 = active_values[cell_indices, e1]
|
||||
v2 = active_values[cell_indices, e2]
|
||||
|
||||
t = torch.zeros_like(v1, device=device)
|
||||
denom = v2 - v1
|
||||
valid = denom != 0
|
||||
t[valid] = (threshold - v1[valid]) / denom[valid]
|
||||
t[~valid] = 0.5
|
||||
|
||||
p1 = corner_offsets[e1].float()
|
||||
p2 = corner_offsets[e2].float()
|
||||
|
||||
intersection = p1.unsqueeze(0) + t.unsqueeze(1) * (p2.unsqueeze(0) - p1.unsqueeze(0))
|
||||
|
||||
for i, point in zip(cell_indices.tolist(), intersection):
|
||||
if i not in cell_vertices:
|
||||
cell_vertices[i] = []
|
||||
cell_vertices[i].append(point)
|
||||
|
||||
# Calculate the final vertices as the average of intersection points for each cell
|
||||
vertices = []
|
||||
vertex_lookup = {}
|
||||
|
||||
vert_progress_mod = round(len(cell_vertices)/50)
|
||||
|
||||
for i, points in cell_vertices.items():
|
||||
if not i % vert_progress_mod:
|
||||
progress.update(1)
|
||||
|
||||
if points:
|
||||
vertex = torch.stack(points).mean(dim=0)
|
||||
vertex = vertex + active_cells[i].float()
|
||||
vertex_lookup[tuple(active_cells[i].tolist())] = len(vertices)
|
||||
vertices.append(vertex)
|
||||
|
||||
if not vertices:
|
||||
return torch.zeros((0, 3), device=device), torch.zeros((0, 3), dtype=torch.long, device=device)
|
||||
|
||||
final_vertices = torch.stack(vertices)
|
||||
|
||||
inside_corners_mask = active_signs
|
||||
outside_corners_mask = ~active_signs
|
||||
|
||||
inside_counts = inside_corners_mask.sum(dim=1, keepdim=True).float()
|
||||
outside_counts = outside_corners_mask.sum(dim=1, keepdim=True).float()
|
||||
|
||||
inside_pos = torch.zeros((active_cells.shape[0], 3), device=device)
|
||||
outside_pos = torch.zeros((active_cells.shape[0], 3), device=device)
|
||||
|
||||
for i in range(8):
|
||||
mask_inside = inside_corners_mask[:, i].unsqueeze(1)
|
||||
mask_outside = outside_corners_mask[:, i].unsqueeze(1)
|
||||
inside_pos += corner_offsets[i].float().unsqueeze(0) * mask_inside
|
||||
outside_pos += corner_offsets[i].float().unsqueeze(0) * mask_outside
|
||||
|
||||
inside_pos /= inside_counts
|
||||
outside_pos /= outside_counts
|
||||
gradients = inside_pos - outside_pos
|
||||
|
||||
pos_dirs = torch.tensor([
|
||||
[1, 0, 0],
|
||||
[0, 1, 0],
|
||||
[0, 0, 1]
|
||||
], device=device)
|
||||
|
||||
cross_products = [
|
||||
torch.linalg.cross(pos_dirs[i].float(), pos_dirs[j].float())
|
||||
for i in range(3) for j in range(i+1, 3)
|
||||
]
|
||||
|
||||
faces = []
|
||||
all_keys = set(vertex_lookup.keys())
|
||||
|
||||
face_progress_mod = round(len(active_cells)/38*3)
|
||||
|
||||
for pair_idx, (i, j) in enumerate([(0,1), (0,2), (1,2)]):
|
||||
dir_i = pos_dirs[i]
|
||||
dir_j = pos_dirs[j]
|
||||
cross_product = cross_products[pair_idx]
|
||||
|
||||
ni_positions = active_cells + dir_i
|
||||
nj_positions = active_cells + dir_j
|
||||
diag_positions = active_cells + dir_i + dir_j
|
||||
|
||||
alignments = torch.matmul(gradients, cross_product)
|
||||
|
||||
valid_quads = []
|
||||
quad_indices = []
|
||||
|
||||
for idx, active_cell in enumerate(active_cells):
|
||||
if not idx % face_progress_mod:
|
||||
progress.update(1)
|
||||
cell_key = tuple(active_cell.tolist())
|
||||
ni_key = tuple(ni_positions[idx].tolist())
|
||||
nj_key = tuple(nj_positions[idx].tolist())
|
||||
diag_key = tuple(diag_positions[idx].tolist())
|
||||
|
||||
if cell_key in all_keys and ni_key in all_keys and nj_key in all_keys and diag_key in all_keys:
|
||||
v0 = vertex_lookup[cell_key]
|
||||
v1 = vertex_lookup[ni_key]
|
||||
v2 = vertex_lookup[nj_key]
|
||||
v3 = vertex_lookup[diag_key]
|
||||
|
||||
valid_quads.append((v0, v1, v2, v3))
|
||||
quad_indices.append(idx)
|
||||
|
||||
for q_idx, (v0, v1, v2, v3) in enumerate(valid_quads):
|
||||
cell_idx = quad_indices[q_idx]
|
||||
if alignments[cell_idx] > 0:
|
||||
faces.append(torch.tensor([v0, v1, v3], device=device, dtype=torch.long))
|
||||
faces.append(torch.tensor([v0, v3, v2], device=device, dtype=torch.long))
|
||||
else:
|
||||
faces.append(torch.tensor([v0, v3, v1], device=device, dtype=torch.long))
|
||||
faces.append(torch.tensor([v0, v2, v3], device=device, dtype=torch.long))
|
||||
|
||||
if faces:
|
||||
faces = torch.stack(faces)
|
||||
else:
|
||||
faces = torch.zeros((0, 3), dtype=torch.long, device=device)
|
||||
|
||||
v_min = 0
|
||||
v_max = max(D, H, W)
|
||||
|
||||
final_vertices = final_vertices - (v_min + v_max) / 2
|
||||
|
||||
scale = (v_max - v_min) / 2
|
||||
if scale > 0:
|
||||
final_vertices = final_vertices / scale
|
||||
|
||||
final_vertices = torch.fliplr(final_vertices)
|
||||
|
||||
return final_vertices, faces
|
||||
|
||||
class MESH:
|
||||
def __init__(self, vertices, faces):
|
||||
@ -237,6 +427,34 @@ class VoxelToMeshBasic:
|
||||
|
||||
return (MESH(torch.stack(vertices), torch.stack(faces)), )
|
||||
|
||||
class VoxelToMesh:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": {"voxel": ("VOXEL", ),
|
||||
"algorithm": (["surface net", "basic"], ),
|
||||
"threshold": ("FLOAT", {"default": 0.6, "min": -1.0, "max": 1.0, "step": 0.01}),
|
||||
}}
|
||||
RETURN_TYPES = ("MESH",)
|
||||
FUNCTION = "decode"
|
||||
|
||||
CATEGORY = "3d"
|
||||
|
||||
def decode(self, voxel, algorithm, threshold):
|
||||
vertices = []
|
||||
faces = []
|
||||
|
||||
if algorithm == "basic":
|
||||
mesh_function = voxel_to_mesh
|
||||
elif algorithm == "surface net":
|
||||
mesh_function = voxel_to_mesh_surfnet
|
||||
|
||||
for x in voxel.data:
|
||||
v, f = mesh_function(x, threshold=threshold, device=None)
|
||||
vertices.append(v)
|
||||
faces.append(f)
|
||||
|
||||
return (MESH(torch.stack(vertices), torch.stack(faces)), )
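A minimal sketch of how voxel_to_mesh_surfnet above can be exercised on its own, assuming torch and the surrounding comfy environment (the function reports progress through comfy.utils.ProgressBar); the random occupancy grid is a hypothetical stand-in for a real VOXEL tensor:

# Hedged sketch: calling the surface-net mesher directly on a (D, H, W) occupancy grid.
import torch

voxels = torch.rand(32, 32, 32)  # hypothetical occupancy values in [0, 1]
vertices, faces = voxel_to_mesh_surfnet(voxels, threshold=0.6, device=torch.device("cpu"))
print(vertices.shape, faces.shape)  # (N, 3) float vertices, (M, 3) long triangle indices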
|
||||
|
||||
|
||||
def save_glb(vertices, faces, filepath, metadata=None):
|
||||
"""
|
||||
@ -244,7 +462,7 @@ def save_glb(vertices, faces, filepath, metadata=None):
|
||||
|
||||
Parameters:
|
||||
vertices: torch.Tensor of shape (N, 3) - The vertex coordinates
|
||||
faces: torch.Tensor of shape (M, 4) or (M, 3) - The face indices (quad or triangle faces)
|
||||
faces: torch.Tensor of shape (M, 3) - The face indices (triangle faces)
|
||||
filepath: str - Output filepath (should end with .glb)
|
||||
"""
|
||||
|
||||
@ -411,5 +629,6 @@ NODE_CLASS_MAPPINGS = {
|
||||
"Hunyuan3Dv2ConditioningMultiView": Hunyuan3Dv2ConditioningMultiView,
|
||||
"VAEDecodeHunyuan3D": VAEDecodeHunyuan3D,
|
||||
"VoxelToMeshBasic": VoxelToMeshBasic,
|
||||
"VoxelToMesh": VoxelToMesh,
|
||||
"SaveGLB": SaveGLB,
|
||||
}
|
||||
|
||||
@ -24,8 +24,8 @@ class Load3D():
|
||||
"height": ("INT", {"default": 1024, "min": 1, "max": 4096, "step": 1}),
|
||||
}}
|
||||
|
||||
RETURN_TYPES = ("IMAGE", "MASK", "STRING", "IMAGE", "IMAGE")
|
||||
RETURN_NAMES = ("image", "mask", "mesh_path", "normal", "lineart")
|
||||
RETURN_TYPES = ("IMAGE", "MASK", "STRING", "IMAGE", "IMAGE", "LOAD3D_CAMERA")
|
||||
RETURN_NAMES = ("image", "mask", "mesh_path", "normal", "lineart", "camera_info")
|
||||
|
||||
FUNCTION = "process"
|
||||
EXPERIMENTAL = True
|
||||
@ -44,7 +44,7 @@ class Load3D():
|
||||
normal_image, ignore_mask2 = load_image_node.load_image(image=normal_path)
|
||||
lineart_image, ignore_mask3 = load_image_node.load_image(image=lineart_path)
|
||||
|
||||
return output_image, output_mask, model_file, normal_image, lineart_image
|
||||
return output_image, output_mask, model_file, normal_image, lineart_image, image['camera_info']
|
||||
|
||||
|
||||
class Load3DAnimation():
|
||||
@ -63,8 +63,8 @@ class Load3DAnimation():
|
||||
"height": ("INT", {"default": 1024, "min": 1, "max": 4096, "step": 1}),
|
||||
}}
|
||||
|
||||
RETURN_TYPES = ("IMAGE", "MASK", "STRING", "IMAGE")
|
||||
RETURN_NAMES = ("image", "mask", "mesh_path", "normal")
|
||||
RETURN_TYPES = ("IMAGE", "MASK", "STRING", "IMAGE", "LOAD3D_CAMERA")
|
||||
RETURN_NAMES = ("image", "mask", "mesh_path", "normal", "camera_info")
|
||||
|
||||
FUNCTION = "process"
|
||||
EXPERIMENTAL = True
|
||||
@ -81,7 +81,7 @@ class Load3DAnimation():
|
||||
ignore_image, output_mask = load_image_node.load_image(image=mask_path)
|
||||
normal_image, ignore_mask2 = load_image_node.load_image(image=normal_path)
|
||||
|
||||
return output_image, output_mask, model_file, normal_image
|
||||
return output_image, output_mask, model_file, normal_image, image['camera_info']
|
||||
|
||||
|
||||
class Preview3D():
|
||||
@ -89,6 +89,9 @@ class Preview3D():
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": {
|
||||
"model_file": ("STRING", {"default": "", "multiline": False}),
|
||||
},
|
||||
"optional": {
|
||||
"camera_info": ("LOAD3D_CAMERA", {})
|
||||
}}
|
||||
|
||||
OUTPUT_NODE = True
|
||||
@ -100,13 +103,22 @@ class Preview3D():
|
||||
EXPERIMENTAL = True
|
||||
|
||||
def process(self, model_file, **kwargs):
|
||||
return {"ui": {"model_file": [model_file]}, "result": ()}
|
||||
camera_info = kwargs.get("camera_info", None)
|
||||
|
||||
return {
|
||||
"ui": {
|
||||
"result": [model_file, camera_info]
|
||||
}
|
||||
}
|
||||
|
||||
class Preview3DAnimation():
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": {
|
||||
"model_file": ("STRING", {"default": "", "multiline": False}),
|
||||
},
|
||||
"optional": {
|
||||
"camera_info": ("LOAD3D_CAMERA", {})
|
||||
}}
|
||||
|
||||
OUTPUT_NODE = True
|
||||
@ -118,7 +130,13 @@ class Preview3DAnimation():
|
||||
EXPERIMENTAL = True
|
||||
|
||||
def process(self, model_file, **kwargs):
|
||||
return {"ui": {"model_file": [model_file]}, "result": ()}
|
||||
camera_info = kwargs.get("camera_info", None)
|
||||
|
||||
return {
|
||||
"ui": {
|
||||
"result": [model_file, camera_info]
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
NODE_CLASS_MAPPINGS = {
|
||||
|
||||
@ -454,10 +454,9 @@ class LTXVPreprocess:
    CATEGORY = "image"

    def preprocess(self, image, img_compression):
        if img_compression > 0:
            output_images = []
            for i in range(image.shape[0]):
                output_images.append(preprocess(image[i], img_compression))
        output_images = []
        for i in range(image.shape[0]):
            output_images.append(preprocess(image[i], img_compression))
        return (torch.stack(output_images),)
|
||||
|
||||
|
||||
|
||||
@ -1,13 +1,14 @@
|
||||
import numpy as np
|
||||
import scipy.ndimage
|
||||
import torch
|
||||
|
||||
from comfy import node_helpers
|
||||
from comfy import utils
|
||||
from comfy.component_model.tensor_types import MaskBatch, RGBImageBatch
|
||||
|
||||
from comfy.nodes.common import MAX_RESOLUTION
|
||||
|
||||
|
||||
def composite(destination, source, x, y, mask = None, multiplier = 8, resize_source = False):
|
||||
def composite(destination, source, x, y, mask=None, multiplier=8, resize_source=False):
|
||||
source = source.to(destination.device)
|
||||
if resize_source:
|
||||
source = torch.nn.functional.interpolate(source, size=(destination.shape[2], destination.shape[3]), mode="bilinear")
|
||||
@ -36,11 +37,12 @@ def composite(destination, source, x, y, mask = None, multiplier = 8, resize_sou
|
||||
inverse_mask = torch.ones_like(mask) - mask
|
||||
|
||||
source_portion = mask * source[:, :, :visible_height, :visible_width]
|
||||
destination_portion = inverse_mask * destination[:, :, top:bottom, left:right]
|
||||
destination_portion = inverse_mask * destination[:, :, top:bottom, left:right]
|
||||
|
||||
destination[:, :, top:bottom, left:right] = source_portion + destination_portion
|
||||
return destination
|
||||
|
||||
|
||||
class LatentCompositeMasked:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -56,18 +58,20 @@ class LatentCompositeMasked:
|
||||
"mask": ("MASK",),
|
||||
}
|
||||
}
|
||||
|
||||
RETURN_TYPES = ("LATENT",)
|
||||
FUNCTION = "composite"
|
||||
|
||||
CATEGORY = "latent"
|
||||
|
||||
def composite(self, destination, source, x, y, resize_source, mask = None):
|
||||
def composite(self, destination, source, x, y, resize_source, mask=None):
|
||||
output = destination.copy()
|
||||
destination = destination["samples"].clone()
|
||||
source = source["samples"]
|
||||
output["samples"] = composite(destination, source, x, y, mask, 8, resize_source)
|
||||
return (output,)
|
||||
|
||||
|
||||
class ImageCompositeMasked:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -83,23 +87,26 @@ class ImageCompositeMasked:
|
||||
"mask": ("MASK",),
|
||||
}
|
||||
}
|
||||
|
||||
RETURN_TYPES = ("IMAGE",)
|
||||
FUNCTION = "composite"
|
||||
|
||||
CATEGORY = "image"
|
||||
|
||||
def composite(self, destination, source, x, y, resize_source, mask = None):
|
||||
def composite(self, destination, source, x, y, resize_source, mask=None):
|
||||
destination, source = node_helpers.image_alpha_fix(destination, source)
|
||||
destination = destination.clone().movedim(-1, 1)
|
||||
output = composite(destination, source.movedim(-1, 1), x, y, mask, 1, resize_source).movedim(1, -1)
|
||||
return (output,)
|
||||
|
||||
|
||||
class MaskToImage:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {
|
||||
"required": {
|
||||
"mask": ("MASK",),
|
||||
}
|
||||
"required": {
|
||||
"mask": ("MASK",),
|
||||
}
|
||||
}
|
||||
|
||||
CATEGORY = "mask"
|
||||
@ -111,14 +118,15 @@ class MaskToImage:
|
||||
result = mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])).movedim(1, -1).expand(-1, -1, -1, 3)
|
||||
return (result,)
|
||||
|
||||
|
||||
class ImageToMask:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {
|
||||
"required": {
|
||||
"image": ("IMAGE",),
|
||||
"channel": (["red", "green", "blue", "alpha"],),
|
||||
}
|
||||
"required": {
|
||||
"image": ("IMAGE",),
|
||||
"channel": (["red", "green", "blue", "alpha"],),
|
||||
}
|
||||
}
|
||||
|
||||
CATEGORY = "mask"
|
||||
@ -131,14 +139,15 @@ class ImageToMask:
|
||||
mask = image[:, :, :, channels.index(channel)]
|
||||
return (mask,)
|
||||
|
||||
|
||||
class ImageColorToMask:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {
|
||||
"required": {
|
||||
"image": ("IMAGE",),
|
||||
"color": ("INT", {"default": 0, "min": 0, "max": 0xFFFFFF, "step": 1, "display": "color"}),
|
||||
}
|
||||
"required": {
|
||||
"image": ("IMAGE",),
|
||||
"color": ("INT", {"default": 0, "min": 0, "max": 0xFFFFFF, "step": 1, "display": "color"}),
|
||||
}
|
||||
}
|
||||
|
||||
CATEGORY = "mask"
|
||||
@ -148,10 +157,11 @@ class ImageColorToMask:
|
||||
|
||||
def image_to_mask(self, image, color):
|
||||
temp = (torch.clamp(image, 0, 1.0) * 255.0).round().to(torch.int)
|
||||
temp = torch.bitwise_left_shift(temp[:,:,:,0], 16) + torch.bitwise_left_shift(temp[:,:,:,1], 8) + temp[:,:,:,2]
|
||||
temp = torch.bitwise_left_shift(temp[:, :, :, 0], 16) + torch.bitwise_left_shift(temp[:, :, :, 1], 8) + temp[:, :, :, 2]
|
||||
mask = torch.where(temp == color, 255, 0).float()
|
||||
return (mask,)
|
||||
|
||||
|
||||
class SolidMask:
|
||||
@classmethod
|
||||
def INPUT_TYPES(cls):
|
||||
@ -173,6 +183,7 @@ class SolidMask:
|
||||
out = torch.full((1, height, width), value, dtype=torch.float32, device="cpu")
|
||||
return (out,)
|
||||
|
||||
|
||||
class InvertMask:
|
||||
@classmethod
|
||||
def INPUT_TYPES(cls):
|
||||
@ -192,6 +203,7 @@ class InvertMask:
|
||||
out = 1.0 - mask
|
||||
return (out,)
|
||||
|
||||
|
||||
class CropMask:
|
||||
@classmethod
|
||||
def INPUT_TYPES(cls):
|
||||
@ -216,6 +228,7 @@ class CropMask:
|
||||
out = mask[:, y:y + height, x:x + width]
|
||||
return (out,)
|
||||
|
||||
|
||||
class MaskComposite:
|
||||
@classmethod
|
||||
def INPUT_TYPES(cls):
|
||||
@ -263,6 +276,7 @@ class MaskComposite:
|
||||
|
||||
return (output,)
|
||||
|
||||
|
||||
class FeatherMask:
|
||||
@classmethod
|
||||
def INPUT_TYPES(cls):
|
||||
@ -308,6 +322,7 @@ class FeatherMask:
|
||||
|
||||
return (output,)
|
||||
|
||||
|
||||
class GrowMask:
|
||||
@classmethod
|
||||
def INPUT_TYPES(cls):
|
||||
@ -343,14 +358,15 @@ class GrowMask:
|
||||
out.append(output)
|
||||
return (torch.stack(out, dim=0),)
|
||||
|
||||
|
||||
class ThresholdMask:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {
|
||||
"required": {
|
||||
"mask": ("MASK",),
|
||||
"value": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 1.0, "step": 0.01}),
|
||||
}
|
||||
"required": {
|
||||
"mask": ("MASK",),
|
||||
"value": ("FLOAT", {"default": 0.5, "min": 0.0, "max": 1.0, "step": 0.01}),
|
||||
}
|
||||
}
|
||||
|
||||
CATEGORY = "mask"
|
||||
|
||||
@ -6,7 +6,7 @@ import math
|
||||
|
||||
from comfy import utils
|
||||
from comfy import model_management
|
||||
|
||||
from comfy import node_helpers
|
||||
|
||||
class Blend:
|
||||
def __init__(self):
|
||||
@ -34,6 +34,7 @@ class Blend:
|
||||
CATEGORY = "image/postprocessing"
|
||||
|
||||
def blend_images(self, image1: torch.Tensor, image2: torch.Tensor, blend_factor: float, blend_mode: str):
|
||||
image1, image2 = node_helpers.image_alpha_fix(image1, image2)
|
||||
image2 = image2.to(image1.device)
|
||||
if image1.shape != image2.shape:
|
||||
image2 = image2.permute(0, 3, 1, 2)
|
||||
|
||||
@ -4,6 +4,7 @@ import torch
|
||||
import comfy.model_management
|
||||
import comfy.utils
|
||||
import comfy.latent_formats
|
||||
import comfy.clip_vision
|
||||
|
||||
|
||||
class WanImageToVideo:
|
||||
@ -99,6 +100,72 @@ class WanFunControlToVideo:
|
||||
out_latent["samples"] = latent
|
||||
return (positive, negative, out_latent)
|
||||
|
||||
class WanFirstLastFrameToVideo:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": {"positive": ("CONDITIONING", ),
|
||||
"negative": ("CONDITIONING", ),
|
||||
"vae": ("VAE", ),
|
||||
"width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
|
||||
"height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
|
||||
"length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
|
||||
"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
|
||||
},
|
||||
"optional": {"clip_vision_start_image": ("CLIP_VISION_OUTPUT", ),
|
||||
"clip_vision_end_image": ("CLIP_VISION_OUTPUT", ),
|
||||
"start_image": ("IMAGE", ),
|
||||
"end_image": ("IMAGE", ),
|
||||
}}
|
||||
|
||||
RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
|
||||
RETURN_NAMES = ("positive", "negative", "latent")
|
||||
FUNCTION = "encode"
|
||||
|
||||
CATEGORY = "conditioning/video_models"
|
||||
|
||||
def encode(self, positive, negative, vae, width, height, length, batch_size, start_image=None, end_image=None, clip_vision_start_image=None, clip_vision_end_image=None):
|
||||
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
|
||||
if start_image is not None:
|
||||
start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
|
||||
if end_image is not None:
|
||||
end_image = comfy.utils.common_upscale(end_image[-length:].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
|
||||
|
||||
image = torch.ones((length, height, width, 3)) * 0.5
|
||||
mask = torch.ones((1, 1, latent.shape[2] * 4, latent.shape[-2], latent.shape[-1]))
|
||||
|
||||
if start_image is not None:
|
||||
image[:start_image.shape[0]] = start_image
|
||||
mask[:, :, :start_image.shape[0] + 3] = 0.0
|
||||
|
||||
if end_image is not None:
|
||||
image[-end_image.shape[0]:] = end_image
|
||||
mask[:, :, -end_image.shape[0]:] = 0.0
|
||||
|
||||
concat_latent_image = vae.encode(image[:, :, :, :3])
|
||||
mask = mask.view(1, mask.shape[2] // 4, 4, mask.shape[3], mask.shape[4]).transpose(1, 2)
|
||||
positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": mask})
|
||||
negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent_image, "concat_mask": mask})
|
||||
|
||||
if clip_vision_start_image is not None:
|
||||
clip_vision_output = clip_vision_start_image
|
||||
|
||||
if clip_vision_end_image is not None:
|
||||
if clip_vision_output is not None:
|
||||
states = torch.cat([clip_vision_output.penultimate_hidden_states, clip_vision_end_image.penultimate_hidden_states], dim=-2)
|
||||
clip_vision_output = comfy.clip_vision.Output()
|
||||
clip_vision_output.penultimate_hidden_states = states
|
||||
else:
|
||||
clip_vision_output = clip_vision_end_image
|
||||
|
||||
if clip_vision_output is not None:
|
||||
positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
|
||||
negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})
|
||||
|
||||
out_latent = {}
|
||||
out_latent["samples"] = latent
|
||||
return (positive, negative, out_latent)
|
||||
|
||||
|
||||
class WanFunInpaintToVideo:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -122,38 +189,119 @@ class WanFunInpaintToVideo:
|
||||
CATEGORY = "conditioning/video_models"
|
||||
|
||||
def encode(self, positive, negative, vae, width, height, length, batch_size, start_image=None, end_image=None, clip_vision_output=None):
|
||||
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
|
||||
if start_image is not None:
|
||||
start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
|
||||
if end_image is not None:
|
||||
end_image = comfy.utils.common_upscale(end_image[-length:].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
|
||||
flfv = WanFirstLastFrameToVideo()
|
||||
return flfv.encode(positive, negative, vae, width, height, length, batch_size, start_image=start_image, end_image=end_image, clip_vision_start_image=clip_vision_output)
|
||||
|
||||
image = torch.ones((length, height, width, 3)) * 0.5
|
||||
mask = torch.ones((1, 1, latent.shape[2] * 4, latent.shape[-2], latent.shape[-1]))
|
||||
|
||||
if start_image is not None:
|
||||
image[:start_image.shape[0]] = start_image
|
||||
mask[:, :, :start_image.shape[0] + 3] = 0.0
|
||||
class WanVaceToVideo:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": {"positive": ("CONDITIONING", ),
|
||||
"negative": ("CONDITIONING", ),
|
||||
"vae": ("VAE", ),
|
||||
"width": ("INT", {"default": 832, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
|
||||
"height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
|
||||
"length": ("INT", {"default": 81, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
|
||||
"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
|
||||
},
|
||||
"optional": {"control_video": ("IMAGE", ),
|
||||
"control_masks": ("MASK", ),
|
||||
"reference_image": ("IMAGE", ),
|
||||
}}
|
||||
|
||||
if end_image is not None:
|
||||
image[-end_image.shape[0]:] = end_image
|
||||
mask[:, :, -end_image.shape[0]:] = 0.0
|
||||
RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT", "INT")
|
||||
RETURN_NAMES = ("positive", "negative", "latent", "trim_latent")
|
||||
FUNCTION = "encode"
|
||||
|
||||
concat_latent_image = vae.encode(image[:, :, :, :3])
|
||||
mask = mask.view(1, mask.shape[2] // 4, 4, mask.shape[3], mask.shape[4]).transpose(1, 2)
|
||||
positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": mask})
|
||||
negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent_image, "concat_mask": mask})
|
||||
CATEGORY = "conditioning/video_models"
|
||||
|
||||
if clip_vision_output is not None:
|
||||
positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
|
||||
negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})
|
||||
EXPERIMENTAL = True
|
||||
|
||||
def encode(self, positive, negative, vae, width, height, length, batch_size, control_video=None, control_masks=None, reference_image=None):
|
||||
latent_length = ((length - 1) // 4) + 1
|
||||
if control_video is not None:
|
||||
control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
|
||||
if control_video.shape[0] < length:
|
||||
control_video = torch.nn.functional.pad(control_video, (0, 0, 0, 0, 0, 0, 0, length - control_video.shape[0]), value=0.5)
|
||||
else:
|
||||
control_video = torch.ones((length, height, width, 3)) * 0.5
|
||||
|
||||
if reference_image is not None:
|
||||
reference_image = comfy.utils.common_upscale(reference_image[:1].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
|
||||
reference_image = vae.encode(reference_image[:, :, :, :3])
|
||||
reference_image = torch.cat([reference_image, comfy.latent_formats.Wan21().process_out(torch.zeros_like(reference_image))], dim=1)
|
||||
|
||||
if control_masks is None:
|
||||
mask = torch.ones((length, height, width, 1))
|
||||
else:
|
||||
mask = control_masks
|
||||
if mask.ndim == 3:
|
||||
mask = mask.unsqueeze(1)
|
||||
mask = comfy.utils.common_upscale(mask[:length], width, height, "bilinear", "center").movedim(1, -1)
|
||||
if mask.shape[0] < length:
|
||||
mask = torch.nn.functional.pad(mask, (0, 0, 0, 0, 0, 0, 0, length - mask.shape[0]), value=1.0)
|
||||
|
||||
control_video = control_video - 0.5
|
||||
inactive = (control_video * (1 - mask)) + 0.5
|
||||
reactive = (control_video * mask) + 0.5
|
||||
|
||||
inactive = vae.encode(inactive[:, :, :, :3])
|
||||
reactive = vae.encode(reactive[:, :, :, :3])
|
||||
control_video_latent = torch.cat((inactive, reactive), dim=1)
|
||||
if reference_image is not None:
|
||||
control_video_latent = torch.cat((reference_image, control_video_latent), dim=2)
|
||||
|
||||
vae_stride = 8
|
||||
height_mask = height // vae_stride
|
||||
width_mask = width // vae_stride
|
||||
mask = mask.view(length, height_mask, vae_stride, width_mask, vae_stride)
|
||||
mask = mask.permute(2, 4, 0, 1, 3)
|
||||
mask = mask.reshape(vae_stride * vae_stride, length, height_mask, width_mask)
|
||||
mask = torch.nn.functional.interpolate(mask.unsqueeze(0), size=(latent_length, height_mask, width_mask), mode='nearest-exact').squeeze(0)
|
||||
|
||||
trim_latent = 0
|
||||
if reference_image is not None:
|
||||
mask_pad = torch.zeros_like(mask[:, :reference_image.shape[2], :, :])
|
||||
mask = torch.cat((mask_pad, mask), dim=1)
|
||||
latent_length += reference_image.shape[2]
|
||||
trim_latent = reference_image.shape[2]
|
||||
|
||||
mask = mask.unsqueeze(0)
|
||||
positive = node_helpers.conditioning_set_values(positive, {"vace_frames": control_video_latent, "vace_mask": mask})
|
||||
negative = node_helpers.conditioning_set_values(negative, {"vace_frames": control_video_latent, "vace_mask": mask})
|
||||
|
||||
latent = torch.zeros([batch_size, 16, latent_length, height // 8, width // 8], device=comfy.model_management.intermediate_device())
|
||||
out_latent = {}
|
||||
out_latent["samples"] = latent
|
||||
return (positive, negative, out_latent)
|
||||
return (positive, negative, out_latent, trim_latent)
|
||||
|
||||
class TrimVideoLatent:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "samples": ("LATENT",),
|
||||
"trim_amount": ("INT", {"default": 0, "min": 0, "max": 99999}),
|
||||
}}
|
||||
|
||||
RETURN_TYPES = ("LATENT",)
|
||||
FUNCTION = "op"
|
||||
|
||||
CATEGORY = "latent/video"
|
||||
|
||||
EXPERIMENTAL = True
|
||||
|
||||
def op(self, samples, trim_amount):
|
||||
samples_out = samples.copy()
|
||||
|
||||
s1 = samples["samples"]
|
||||
samples_out["samples"] = s1[:, :, trim_amount:]
|
||||
return (samples_out,)
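A minimal wiring sketch, assuming hypothetical inputs (positive, negative, vae, control_video, ref_image) and a sampled latent; it only illustrates how the trim_latent output of WanVaceToVideo above is meant to feed TrimVideoLatent after sampling:

# Hypothetical wiring: WanVaceToVideo prepends reference latents, TrimVideoLatent drops them again.
positive_out, negative_out, latent, trim_latent = WanVaceToVideo().encode(
    positive, negative, vae, 832, 480, 81, 1,
    control_video=control_video, reference_image=ref_image)
# ... run a sampler on `latent` to obtain `sampled_latent` ...
trimmed = TrimVideoLatent().op(sampled_latent, trim_amount=trim_latent)[0]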
|
||||
|
||||
|
||||
NODE_CLASS_MAPPINGS = {
|
||||
"WanImageToVideo": WanImageToVideo,
|
||||
"WanFunControlToVideo": WanFunControlToVideo,
|
||||
"WanFunInpaintToVideo": WanFunInpaintToVideo,
|
||||
"WanFirstLastFrameToVideo": WanFirstLastFrameToVideo,
|
||||
"WanVaceToVideo": WanVaceToVideo,
|
||||
"TrimVideoLatent": TrimVideoLatent,
|
||||
}
|
||||
|
||||
100
comfy_extras/nodes_fresca.py
Normal file
100
comfy_extras/nodes_fresca.py
Normal file
@ -0,0 +1,100 @@
|
||||
# Code based on https://github.com/WikiChao/FreSca (MIT License)
|
||||
import torch
|
||||
import torch.fft as fft
|
||||
|
||||
|
||||
def Fourier_filter(x, scale_low=1.0, scale_high=1.5, freq_cutoff=20):
|
||||
"""
|
||||
Apply frequency-dependent scaling to an image tensor using Fourier transforms.
|
||||
|
||||
Parameters:
|
||||
x: Input tensor of shape (B, C, H, W)
|
||||
scale_low: Scaling factor for low-frequency components (default: 1.0)
|
||||
scale_high: Scaling factor for high-frequency components (default: 1.5)
|
||||
freq_cutoff: Number of frequency indices around center to consider as low-frequency (default: 20)
|
||||
|
||||
Returns:
|
||||
x_filtered: Filtered version of x in spatial domain with frequency-specific scaling applied.
|
||||
"""
|
||||
# Preserve input dtype and device
|
||||
dtype, device = x.dtype, x.device
|
||||
|
||||
# Convert to float32 for FFT computations
|
||||
x = x.to(torch.float32)
|
||||
|
||||
# 1) Apply FFT and shift low frequencies to center
|
||||
x_freq = fft.fftn(x, dim=(-2, -1))
|
||||
x_freq = fft.fftshift(x_freq, dim=(-2, -1))
|
||||
|
||||
# Initialize mask with high-frequency scaling factor
|
||||
mask = torch.ones(x_freq.shape, device=device) * scale_high
|
||||
m = mask
|
||||
for d in range(len(x_freq.shape) - 2):
|
||||
dim = d + 2
|
||||
cc = x_freq.shape[dim] // 2
|
||||
f_c = min(freq_cutoff, cc)
|
||||
m = m.narrow(dim, cc - f_c, f_c * 2)
|
||||
|
||||
# Apply low-frequency scaling factor to center region
|
||||
m[:] = scale_low
|
||||
|
||||
# 3) Apply frequency-specific scaling
|
||||
x_freq = x_freq * mask
|
||||
|
||||
# 4) Convert back to spatial domain
|
||||
x_freq = fft.ifftshift(x_freq, dim=(-2, -1))
|
||||
x_filtered = fft.ifftn(x_freq, dim=(-2, -1)).real
|
||||
|
||||
# 5) Restore original dtype
|
||||
x_filtered = x_filtered.to(dtype)
|
||||
|
||||
return x_filtered
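A minimal sketch, assuming only torch, of what Fourier_filter above does to a random guidance-shaped tensor:

# Hedged example: low frequencies keep scale_low, high frequencies get scale_high.
import torch

x = torch.randn(1, 4, 64, 64)  # hypothetical (B, C, H, W) guidance tensor
y = Fourier_filter(x, scale_low=1.0, scale_high=1.5, freq_cutoff=20)
print(y.shape, y.dtype)  # same shape and dtype as the input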
|
||||
|
||||
|
||||
class FreSca:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {
|
||||
"required": {
|
||||
"model": ("MODEL",),
|
||||
"scale_low": ("FLOAT", {"default": 1.0, "min": 0, "max": 10, "step": 0.01,
|
||||
"tooltip": "Scaling factor for low-frequency components"}),
|
||||
"scale_high": ("FLOAT", {"default": 1.25, "min": 0, "max": 10, "step": 0.01,
|
||||
"tooltip": "Scaling factor for high-frequency components"}),
|
||||
"freq_cutoff": ("INT", {"default": 20, "min": 1, "max": 10000, "step": 1,
|
||||
"tooltip": "Number of frequency indices around center to consider as low-frequency"}),
|
||||
}
|
||||
}
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "patch"
|
||||
CATEGORY = "_for_testing"
|
||||
DESCRIPTION = "Applies frequency-dependent scaling to the guidance"
|
||||
def patch(self, model, scale_low, scale_high, freq_cutoff):
|
||||
def custom_cfg_function(args):
|
||||
cond = args["conds_out"][0]
|
||||
uncond = args["conds_out"][1]
|
||||
|
||||
guidance = cond - uncond
|
||||
filtered_guidance = Fourier_filter(
|
||||
guidance,
|
||||
scale_low=scale_low,
|
||||
scale_high=scale_high,
|
||||
freq_cutoff=freq_cutoff,
|
||||
)
|
||||
filtered_cond = filtered_guidance + uncond
|
||||
|
||||
return [filtered_cond, uncond]
|
||||
|
||||
m = model.clone()
|
||||
m.set_model_sampler_pre_cfg_function(custom_cfg_function)
|
||||
|
||||
return (m,)
|
||||
|
||||
|
||||
NODE_CLASS_MAPPINGS = {
|
||||
"FreSca": FreSca,
|
||||
}
|
||||
|
||||
NODE_DISPLAY_NAME_MAPPINGS = {
|
||||
"FreSca": "FreSca",
|
||||
}
|
||||
55
comfy_extras/nodes_hidream.py
Normal file
55
comfy_extras/nodes_hidream.py
Normal file
@ -0,0 +1,55 @@
|
||||
import folder_paths
|
||||
import comfy.sd
|
||||
import comfy.model_management
|
||||
|
||||
|
||||
class QuadrupleCLIPLoader:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "clip_name1": (folder_paths.get_filename_list("text_encoders"), ),
|
||||
"clip_name2": (folder_paths.get_filename_list("text_encoders"), ),
|
||||
"clip_name3": (folder_paths.get_filename_list("text_encoders"), ),
|
||||
"clip_name4": (folder_paths.get_filename_list("text_encoders"), )
|
||||
}}
|
||||
RETURN_TYPES = ("CLIP",)
|
||||
FUNCTION = "load_clip"
|
||||
|
||||
CATEGORY = "advanced/loaders"
|
||||
|
||||
DESCRIPTION = "[Recipes]\n\nhidream: long clip-l, long clip-g, t5xxl, llama_8b_3.1_instruct"
|
||||
|
||||
def load_clip(self, clip_name1, clip_name2, clip_name3, clip_name4):
|
||||
clip_path1 = folder_paths.get_full_path_or_raise("text_encoders", clip_name1)
|
||||
clip_path2 = folder_paths.get_full_path_or_raise("text_encoders", clip_name2)
|
||||
clip_path3 = folder_paths.get_full_path_or_raise("text_encoders", clip_name3)
|
||||
clip_path4 = folder_paths.get_full_path_or_raise("text_encoders", clip_name4)
|
||||
clip = comfy.sd.load_clip(ckpt_paths=[clip_path1, clip_path2, clip_path3, clip_path4], embedding_directory=folder_paths.get_folder_paths("embeddings"))
|
||||
return (clip,)
|
||||
|
||||
class CLIPTextEncodeHiDream:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": {
|
||||
"clip": ("CLIP", ),
|
||||
"clip_l": ("STRING", {"multiline": True, "dynamicPrompts": True}),
|
||||
"clip_g": ("STRING", {"multiline": True, "dynamicPrompts": True}),
|
||||
"t5xxl": ("STRING", {"multiline": True, "dynamicPrompts": True}),
|
||||
"llama": ("STRING", {"multiline": True, "dynamicPrompts": True})
|
||||
}}
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "encode"
|
||||
|
||||
CATEGORY = "advanced/conditioning"
|
||||
|
||||
def encode(self, clip, clip_l, clip_g, t5xxl, llama):
|
||||
|
||||
tokens = clip.tokenize(clip_g)
|
||||
tokens["l"] = clip.tokenize(clip_l)["l"]
|
||||
tokens["t5xxl"] = clip.tokenize(t5xxl)["t5xxl"]
|
||||
tokens["llama"] = clip.tokenize(llama)["llama"]
|
||||
return (clip.encode_from_tokens_scheduled(tokens), )
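A minimal wiring sketch for the two nodes above; the checkpoint file names are hypothetical placeholders, while the calls mirror the methods shown in this file:

# Hypothetical usage: load four text encoders, then encode one prompt per text stream.
clip = QuadrupleCLIPLoader().load_clip(
    "clip_l.safetensors", "clip_g.safetensors",
    "t5xxl_fp16.safetensors", "llama_3.1_8b_instruct.safetensors")[0]
prompt = "a photo of a cat"
conditioning = CLIPTextEncodeHiDream().encode(
    clip, clip_l=prompt, clip_g=prompt, t5xxl=prompt, llama=prompt)[0]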
|
||||
|
||||
NODE_CLASS_MAPPINGS = {
|
||||
"QuadrupleCLIPLoader": QuadrupleCLIPLoader,
|
||||
"CLIPTextEncodeHiDream": CLIPTextEncodeHiDream,
|
||||
}
|
||||
56
comfy_extras/nodes_optimalsteps.py
Normal file
56
comfy_extras/nodes_optimalsteps.py
Normal file
@ -0,0 +1,56 @@
|
||||
# from https://github.com/bebebe666/OptimalSteps
|
||||
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
def loglinear_interp(t_steps, num_steps):
|
||||
"""
|
||||
Performs log-linear interpolation of a given array of decreasing numbers.
|
||||
"""
|
||||
xs = np.linspace(0, 1, len(t_steps))
|
||||
ys = np.log(t_steps[::-1])
|
||||
|
||||
new_xs = np.linspace(0, 1, num_steps)
|
||||
new_ys = np.interp(new_xs, xs, ys)
|
||||
|
||||
interped_ys = np.exp(new_ys)[::-1].copy()
|
||||
return interped_ys
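A minimal sketch, assuming only numpy, of how loglinear_interp above resamples a decreasing sigma schedule:

# Hedged example: resample a 5-entry decreasing schedule to 9 log-linearly spaced entries.
import numpy as np

sigmas = [0.9968, 0.9158, 0.5512, 0.2808, 0.001]  # illustrative values taken from the FLUX table below
resampled = loglinear_interp(sigmas, 9)
print(np.round(resampled, 4))  # still decreasing, same endpoints, log-linear in between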
|
||||
|
||||
|
||||
NOISE_LEVELS = {"FLUX": [0.9968, 0.9886, 0.9819, 0.975, 0.966, 0.9471, 0.9158, 0.8287, 0.5512, 0.2808, 0.001],
|
||||
"Wan":[1.0, 0.997, 0.995, 0.993, 0.991, 0.989, 0.987, 0.985, 0.98, 0.975, 0.973, 0.968, 0.96, 0.946, 0.927, 0.902, 0.864, 0.776, 0.539, 0.208, 0.001],
|
||||
}
|
||||
|
||||
class OptimalStepsScheduler:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required":
|
||||
{"model_type": (["FLUX", "Wan"], ),
|
||||
"steps": ("INT", {"default": 20, "min": 3, "max": 1000}),
|
||||
"denoise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
|
||||
}
|
||||
}
|
||||
RETURN_TYPES = ("SIGMAS",)
|
||||
CATEGORY = "sampling/custom_sampling/schedulers"
|
||||
|
||||
FUNCTION = "get_sigmas"
|
||||
|
||||
def get_sigmas(self, model_type, steps, denoise):
|
||||
total_steps = steps
|
||||
if denoise < 1.0:
|
||||
if denoise <= 0.0:
|
||||
return (torch.FloatTensor([]),)
|
||||
total_steps = round(steps * denoise)
|
||||
|
||||
sigmas = NOISE_LEVELS[model_type][:]
|
||||
if (steps + 1) != len(sigmas):
|
||||
sigmas = loglinear_interp(sigmas, steps + 1)
|
||||
|
||||
sigmas = sigmas[-(total_steps + 1):]
|
||||
sigmas[-1] = 0
|
||||
return (torch.FloatTensor(sigmas), )
|
||||
|
||||
NODE_CLASS_MAPPINGS = {
|
||||
"OptimalStepsScheduler": OptimalStepsScheduler,
|
||||
}
|
||||
@ -1,4 +1,5 @@
comfyui-frontend-package
comfyui-workflow-templates
torch
torchvision
torchdiffeq>=0.2.3
|
||||
|
||||
2
setup.py
2
setup.py
@ -23,7 +23,7 @@ package_name = "comfyui"
"""
The current version.
"""
version = "0.3.23"
version = "0.3.29"

"""
The package index to the torch built with AMD ROCm.
|
||||
|
||||
@ -4,14 +4,16 @@ import tempfile
import pytest

from comfy.cmd.folder_paths import filter_files_content_types
from comfy.component_model.folder_path_types import extension_mimetypes_cache
from unittest.mock import patch

@pytest.fixture(scope="module")
def file_extensions():
    return {
        'image': ['gif', 'heif', 'ico', 'jpeg', 'jpg', 'png', 'pnm', 'ppm', 'svg', 'tiff', 'webp', 'xbm', 'xpm'],
        'audio': ['aif', 'aifc', 'aiff', 'au', 'flac', 'm4a', 'mp2', 'mp3', 'ogg', 'snd', 'wav'],
        'video': ['avi', 'm2v', 'm4v', 'mkv', 'mov', 'mp4', 'mpeg', 'mpg', 'ogv', 'qt', 'webm', 'wmv']
        'video': ['avi', 'm2v', 'm4v', 'mkv', 'mov', 'mp4', 'mpeg', 'mpg', 'ogv', 'qt', 'webm', 'wmv'],
        'model': ['gltf', 'glb', 'obj', 'fbx', 'stl']
    }

@ -25,7 +27,18 @@ def mock_dir(file_extensions):
    yield directory


def test_categorizes_all_correctly(mock_dir, file_extensions):
@pytest.fixture
def patched_mimetype_cache(file_extensions):
    # Mock model file extensions since they may not be in the test-runner system's mimetype cache
    new_cache = extension_mimetypes_cache.copy()
    for extension in file_extensions["model"]:
        new_cache[extension] = "model"

    with patch("folder_paths.extension_mimetypes_cache", new_cache):
        yield


def test_categorizes_all_correctly(mock_dir, file_extensions, patched_mimetype_cache):
    files = os.listdir(mock_dir)
    for content_type, extensions in file_extensions.items():
        filtered_files = filter_files_content_types(files, [content_type])
@ -33,7 +46,7 @@ def test_categorizes_all_correctly(mock_dir, file_extensions):
            assert f"sample_{content_type}.{extension}" in filtered_files


def test_categorizes_all_uniquely(mock_dir, file_extensions):
def test_categorizes_all_uniquely(mock_dir, file_extensions, patched_mimetype_cache):
    files = os.listdir(mock_dir)
    for content_type, extensions in file_extensions.items():
        filtered_files = filter_files_content_types(files, [content_type])
|
||||
|
||||