From 866a4619db2db56c77a86e5fc9968a2454928627 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 20 Jan 2026 06:21:35 +0800 Subject: [PATCH 01/19] chore: update workflow templates to v0.8.14 (#11974) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 622256973..312c7c137 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.36.14 -comfyui-workflow-templates==0.8.11 +comfyui-workflow-templates==0.8.14 comfyui-embedded-docs==0.4.0 torch torchsde From b931b37e30bb19b6e13ad8623e193ccdaf671a23 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Mon, 19 Jan 2026 16:47:14 -0800 Subject: [PATCH 02/19] feat(api-nodes): add Bria Edit node (#11978) Co-authored-by: Alexander Piskun --- comfy_api_nodes/apis/bria.py | 61 +++++++++ comfy_api_nodes/nodes_bria.py | 198 ++++++++++++++++++++++++++++ comfy_api_nodes/util/__init__.py | 2 + comfy_api_nodes/util/conversions.py | 6 + 4 files changed, 267 insertions(+) create mode 100644 comfy_api_nodes/apis/bria.py create mode 100644 comfy_api_nodes/nodes_bria.py diff --git a/comfy_api_nodes/apis/bria.py b/comfy_api_nodes/apis/bria.py new file mode 100644 index 000000000..9119cacc6 --- /dev/null +++ b/comfy_api_nodes/apis/bria.py @@ -0,0 +1,61 @@ +from typing import TypedDict + +from pydantic import BaseModel, Field + + +class InputModerationSettings(TypedDict): + prompt_content_moderation: bool + visual_input_moderation: bool + visual_output_moderation: bool + + +class BriaEditImageRequest(BaseModel): + instruction: str | None = Field(...) + structured_instruction: str | None = Field( + ..., + description="Use this instead of instruction for precise, programmatic control.", + ) + images: list[str] = Field( + ..., + description="Required. Publicly available URL or Base64-encoded. Must contain exactly one item.", + ) + mask: str | None = Field( + None, + description="Mask image (black and white). Black areas will be preserved, white areas will be edited. " + "If omitted, the edit applies to the entire image. " + "The input image and the input mask must be of the same size.", + ) + negative_prompt: str | None = Field(None) + guidance_scale: float = Field(...) + model_version: str = Field(...) + steps_num: int = Field(...) + seed: int = Field(...) + ip_signal: bool = Field( + False, + description="If true, returns a warning for potential IP content in the instruction.", + ) + prompt_content_moderation: bool = Field( + False, description="If true, returns 422 on instruction moderation failure." + ) + visual_input_content_moderation: bool = Field( + False, description="If true, returns 422 on images or mask moderation failure." + ) + visual_output_content_moderation: bool = Field( + False, description="If true, returns 422 on visual output moderation failure." + ) + + +class BriaStatusResponse(BaseModel): + request_id: str = Field(...) + status_url: str = Field(...) + warning: str | None = Field(None) + + +class BriaResult(BaseModel): + structured_prompt: str = Field(...) + image_url: str = Field(...) + + +class BriaResponse(BaseModel): + status: str = Field(...)
+ result: BriaResult | None = Field(None) diff --git a/comfy_api_nodes/nodes_bria.py b/comfy_api_nodes/nodes_bria.py new file mode 100644 index 000000000..72a3055a7 --- /dev/null +++ b/comfy_api_nodes/nodes_bria.py @@ -0,0 +1,198 @@ +from typing_extensions import override + +from comfy_api.latest import IO, ComfyExtension, Input +from comfy_api_nodes.apis.bria import ( + BriaEditImageRequest, + BriaResponse, + BriaStatusResponse, + InputModerationSettings, +) +from comfy_api_nodes.util import ( + ApiEndpoint, + convert_mask_to_image, + download_url_to_image_tensor, + get_number_of_images, + poll_op, + sync_op, + upload_images_to_comfyapi, +) + + +class BriaImageEditNode(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="BriaImageEditNode", + display_name="Bria Image Edit", + category="api node/image/Bria", + description="Edit images using Bria's latest model", + inputs=[ + IO.Combo.Input("model", options=["FIBO"]), + IO.Image.Input("image"), + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Instruction to edit image", + ), + IO.String.Input("negative_prompt", multiline=True, default=""), + IO.String.Input( + "structured_prompt", + multiline=True, + default="", + tooltip="A string containing the structured edit prompt in JSON format. " + "Use this instead of the usual prompt for precise, programmatic control.", + ), + IO.Int.Input( + "seed", + default=1, + min=1, + max=2147483647, + step=1, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + ), + IO.Float.Input( + "guidance_scale", + default=3, + min=3, + max=5, + step=0.01, + display_mode=IO.NumberDisplay.number, + tooltip="Higher value makes the image follow the prompt more closely.", + ), + IO.Int.Input( + "steps", + default=50, + min=20, + max=50, + step=1, + display_mode=IO.NumberDisplay.number, + ), + IO.DynamicCombo.Input( + "moderation", + options=[ + IO.DynamicCombo.Option( + "true", + [ + IO.Boolean.Input( + "prompt_content_moderation", default=False + ), + IO.Boolean.Input( + "visual_input_moderation", default=False + ), + IO.Boolean.Input( + "visual_output_moderation", default=True + ), + ], + ), + IO.DynamicCombo.Option("false", []), + ], + tooltip="Moderation settings", + ), + IO.Mask.Input( + "mask", + tooltip="If omitted, the edit applies to the entire image.", + optional=True, + ), + ], + outputs=[ + IO.Image.Output(), + IO.String.Output(display_name="structured_prompt"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + expr="""{"type":"usd","usd":0.04}""", + ), + ) + + @classmethod + async def execute( + cls, + model: str, + image: Input.Image, + prompt: str, + negative_prompt: str, + structured_prompt: str, + seed: int, + guidance_scale: float, + steps: int, + moderation: InputModerationSettings, + mask: Input.Image | None = None, + ) -> IO.NodeOutput: + if not prompt and not structured_prompt: + raise ValueError( + "One of prompt or structured_prompt is required to be non-empty."
+ ) + if get_number_of_images(image) != 1: + raise ValueError("Exactly one input image is required.") + mask_url = None + if mask is not None: + mask_url = ( + await upload_images_to_comfyapi( + cls, + convert_mask_to_image(mask), + max_images=1, + mime_type="image/png", + wait_label="Uploading mask", + ) + )[0] + response = await sync_op( + cls, + ApiEndpoint(path="proxy/bria/v2/image/edit", method="POST"), + data=BriaEditImageRequest( + instruction=prompt if prompt else None, + structured_instruction=structured_prompt if structured_prompt else None, + images=await upload_images_to_comfyapi( + cls, + image, + max_images=1, + mime_type="image/png", + wait_label="Uploading image", + ), + mask=mask_url, + negative_prompt=negative_prompt if negative_prompt else None, + guidance_scale=guidance_scale, + seed=seed, + model_version=model, + steps_num=steps, + prompt_content_moderation=moderation.get( + "prompt_content_moderation", False + ), + visual_input_content_moderation=moderation.get( + "visual_input_moderation", False + ), + visual_output_content_moderation=moderation.get( + "visual_output_moderation", False + ), + ), + response_model=BriaStatusResponse, + ) + response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/bria/v2/status/{response.request_id}"), + status_extractor=lambda r: r.status, + response_model=BriaResponse, + ) + return IO.NodeOutput( + await download_url_to_image_tensor(response.result.image_url), + response.result.structured_prompt, + ) + + +class BriaExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + BriaImageEditNode, + ] + + +async def comfy_entrypoint() -> BriaExtension: + return BriaExtension() diff --git a/comfy_api_nodes/util/__init__.py b/comfy_api_nodes/util/__init__.py index 4cc22abfb..364976000 100644 --- a/comfy_api_nodes/util/__init__.py +++ b/comfy_api_nodes/util/__init__.py @@ -11,6 +11,7 @@ from .conversions import ( audio_input_to_mp3, audio_to_base64_string, bytesio_to_image_tensor, + convert_mask_to_image, downscale_image_tensor, image_tensor_pair_to_batch, pil_to_bytesio, @@ -72,6 +73,7 @@ __all__ = [ "audio_input_to_mp3", "audio_to_base64_string", "bytesio_to_image_tensor", + "convert_mask_to_image", "downscale_image_tensor", "image_tensor_pair_to_batch", "pil_to_bytesio", diff --git a/comfy_api_nodes/util/conversions.py b/comfy_api_nodes/util/conversions.py index 99c302a2a..546741b7b 100644 --- a/comfy_api_nodes/util/conversions.py +++ b/comfy_api_nodes/util/conversions.py @@ -451,6 +451,12 @@ def resize_mask_to_image( return mask +def convert_mask_to_image(mask: Input.Image) -> torch.Tensor: + """Make mask have the expected amount of dims (4) and channels (3) to be recognized as an image.""" + mask = mask.unsqueeze(-1) + return torch.cat([mask] * 3, dim=-1) + + def text_filepath_to_base64_string(filepath: str) -> str: """Converts a text file to a base64 string.""" with open(filepath, "rb") as f: From 7458e20465a0efcf91eafc0c65d1929ab7b2238d Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Mon, 19 Jan 2026 16:58:30 -0800 Subject: [PATCH 03/19] Make Autogrow validation work properly (#11977) * In-progress autogrow validation fixes - properly looks at required/optional inputs, now working on the edge case that all inputs are optional and nothing is plugged in (should just be an empty dictionary passed into node) * Allow autogrow to work with all inputs being optional * Revert accidentally pushed changes to nodes_logic.py --- comfy_api/latest/_io.py | 53 
++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 9 deletions(-) diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index c30d92aaa..4969d3506 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -1000,20 +1000,38 @@ class Autogrow(ComfyTypeI): names = [f"{prefix}{i}" for i in range(max)] # need to create a new input based on the contents of input template_input = None - for _, dict_input in input.items(): - # for now, get just the first value from dict_input + template_required = True + for _input_type, dict_input in input.items(): + # for now, get just the first value from dict_input; if not required, min can be ignored + if len(dict_input) == 0: + continue template_input = list(dict_input.values())[0] + template_required = _input_type == "required" + break + if template_input is None: + raise Exception("template_input could not be determined from required or optional; this should never happen.") new_dict = {} + new_dict_added_to = False + # first, add possible inputs into out_dict for i, name in enumerate(names): expected_id = finalize_prefix(curr_prefix, name) + # required + if i < min and template_required: + out_dict["required"][expected_id] = template_input + type_dict = new_dict.setdefault("required", {}) + # optional + else: + out_dict["optional"][expected_id] = template_input + type_dict = new_dict.setdefault("optional", {}) if expected_id in live_inputs: - # required - if i < min: - type_dict = new_dict.setdefault("required", {}) - # optional - else: - type_dict = new_dict.setdefault("optional", {}) + # NOTE: prefix gets added in parse_class_inputs type_dict[name] = template_input + new_dict_added_to = True + # account for the edge case that all inputs are optional and no values are received + if not new_dict_added_to: + finalized_prefix = finalize_prefix(curr_prefix) + out_dict["dynamic_paths"][finalized_prefix] = finalized_prefix + out_dict["dynamic_paths_default_value"][finalized_prefix] = DynamicPathsDefaultValue.EMPTY_DICT parse_class_inputs(out_dict, live_inputs, new_dict, curr_prefix) @comfytype(io_type="COMFY_DYNAMICCOMBO_V3") @@ -1151,6 +1169,8 @@ class V3Data(TypedDict): 'Dictionary where the keys are the hidden input ids and the values are the values of the hidden inputs.' dynamic_paths: dict[str, Any] 'Dictionary where the keys are the input ids and the values dictate how to turn the inputs into a nested dictionary.' + dynamic_paths_default_value: dict[str, Any] + 'Dictionary where the keys are the input ids and the values are a string from DynamicPathsDefaultValue for the inputs if value is None.' create_dynamic_tuple: bool 'When True, the value of the dynamic input will be in the format (value, path_key).' 
@@ -1504,6 +1524,7 @@ def get_finalized_class_inputs(d: dict[str, Any], live_inputs: dict[str, Any], i "required": {}, "optional": {}, "dynamic_paths": {}, + "dynamic_paths_default_value": {}, } d = d.copy() # ignore hidden for parsing @@ -1513,8 +1534,12 @@ def get_finalized_class_inputs(d: dict[str, Any], live_inputs: dict[str, Any], i out_dict["hidden"] = hidden v3_data = {} dynamic_paths = out_dict.pop("dynamic_paths", None) - if dynamic_paths is not None: + if dynamic_paths is not None and len(dynamic_paths) > 0: v3_data["dynamic_paths"] = dynamic_paths + # this list is used for autogrow, in the case all inputs are optional and no values are passed + dynamic_paths_default_value = out_dict.pop("dynamic_paths_default_value", None) + if dynamic_paths_default_value is not None and len(dynamic_paths_default_value) > 0: + v3_data["dynamic_paths_default_value"] = dynamic_paths_default_value return out_dict, hidden, v3_data def parse_class_inputs(out_dict: dict[str, Any], live_inputs: dict[str, Any], curr_dict: dict[str, Any], curr_prefix: list[str] | None=None) -> None: @@ -1551,11 +1576,16 @@ def add_to_dict_v1(i: Input, d: dict): def add_to_dict_v3(io: Input | Output, d: dict): d[io.id] = (io.get_io_type(), io.as_dict()) +class DynamicPathsDefaultValue: + EMPTY_DICT = "empty_dict" + def build_nested_inputs(values: dict[str, Any], v3_data: V3Data): paths = v3_data.get("dynamic_paths", None) + default_value_dict = v3_data.get("dynamic_paths_default_value", {}) if paths is None: return values values = values.copy() + result = {} create_tuple = v3_data.get("create_dynamic_tuple", False) @@ -1569,6 +1599,11 @@ def build_nested_inputs(values: dict[str, Any], v3_data: V3Data): if is_last: value = values.pop(key, None) + if value is None: + # see if a default value was provided for this key + default_option = default_value_dict.get(key, None) + if default_option == DynamicPathsDefaultValue.EMPTY_DICT: + value = {} if create_tuple: value = (value, key) current[p] = value From e0eacb06883c1f7ddf8af249cd461d7c2ebcbaae Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 19 Jan 2026 19:00:36 -0800 Subject: [PATCH 04/19] Simpler way to implement the #11980 loras. 
(#11981) --- comfy/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comfy/utils.py b/comfy/utils.py index 2e33a4258..5e79fb449 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -639,6 +639,8 @@ def flux_to_diffusers(mmdit_config, output_prefix=""): "proj_out.bias": "linear2.bias", "attn.norm_q.weight": "norm.query_norm.scale", "attn.norm_k.weight": "norm.key_norm.scale", + "attn.to_qkv_mlp_proj.weight": "linear1.weight", # Flux 2 + "attn.to_out.weight": "linear2.weight", # Flux 2 } for k in block_map: From 0da5a0fe58ae940726a61b94698e303fb39d73c1 Mon Sep 17 00:00:00 2001 From: rkfg Date: Tue, 20 Jan 2026 06:12:02 +0300 Subject: [PATCH 05/19] Convert mono audio to fake stereo for LTXV VAE encoding (#11965) --- comfy/ldm/lightricks/vae/audio_vae.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/comfy/ldm/lightricks/vae/audio_vae.py b/comfy/ldm/lightricks/vae/audio_vae.py index a9111d3bd..29d9e6c29 100644 --- a/comfy/ldm/lightricks/vae/audio_vae.py +++ b/comfy/ldm/lightricks/vae/audio_vae.py @@ -189,9 +189,12 @@ class AudioVAE(torch.nn.Module): waveform = self.device_manager.move_to_load_device(waveform) expected_channels = self.autoencoder.encoder.in_channels if waveform.shape[1] != expected_channels: - raise ValueError( - f"Input audio must have {expected_channels} channels, got {waveform.shape[1]}" - ) + if waveform.shape[1] == 1: + waveform = waveform.expand(-1, expected_channels, *waveform.shape[2:]) + else: + raise ValueError( + f"Input audio must have {expected_channels} channels, got {waveform.shape[1]}" + ) mel_spec = self.preprocessor.waveform_to_mel( waveform, waveform_sample_rate, device=self.device_manager.load_device From 70c91b8248e08492cf16bfebdc83579b801a6ee0 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 19 Jan 2026 19:32:40 -0800 Subject: [PATCH 06/19] Fix #11963 (#11982) --- comfy/text_encoders/ovis.py | 1 + comfy/text_encoders/z_image.py | 1 + 2 files changed, 2 insertions(+) diff --git a/comfy/text_encoders/ovis.py b/comfy/text_encoders/ovis.py index 5754424d2..2cc0867c3 100644 --- a/comfy/text_encoders/ovis.py +++ b/comfy/text_encoders/ovis.py @@ -61,6 +61,7 @@ def te(dtype_llama=None, llama_quantization_metadata=None): if dtype_llama is not None: dtype = dtype_llama if llama_quantization_metadata is not None: + model_options = model_options.copy() model_options["quantization_metadata"] = llama_quantization_metadata super().__init__(device=device, dtype=dtype, model_options=model_options) return OvisTEModel_ diff --git a/comfy/text_encoders/z_image.py b/comfy/text_encoders/z_image.py index 19adde0b7..ad41bfb1e 100644 --- a/comfy/text_encoders/z_image.py +++ b/comfy/text_encoders/z_image.py @@ -40,6 +40,7 @@ def te(dtype_llama=None, llama_quantization_metadata=None): if dtype_llama is not None: dtype = dtype_llama if llama_quantization_metadata is not None: + model_options = model_options.copy() model_options["quantization_metadata"] = llama_quantization_metadata super().__init__(device=device, dtype=dtype, model_options=model_options) return ZImageTEModel_ From 9d273d3ab1fb1d2c8b34de4d54cabe50a5a3e5bc Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Mon, 19 Jan 2026 22:40:18 -0500 Subject: [PATCH 07/19] ComfyUI v0.10.0 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index dbb57b4e5..952d413db 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 
@@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.9.2" +__version__ = "0.10.0" diff --git a/pyproject.toml b/pyproject.toml index 9ea73da05..120b6c751 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.9.2" +version = "0.10.0" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.10" From 2108167f9f70cfd4874945b31a916680f959a6d7 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 19 Jan 2026 20:17:38 -0800 Subject: [PATCH 08/19] Support zimage omni base model. (#11979) --- comfy/ldm/lumina/model.py | 317 ++++++++++++++++++++++++++++------- comfy/model_base.py | 30 ++++ comfy/model_detection.py | 3 + comfy_extras/nodes_zimage.py | 88 ++++++++++ nodes.py | 1 + 5 files changed, 381 insertions(+), 58 deletions(-) create mode 100644 comfy_extras/nodes_zimage.py diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py index afbab2ac7..139f879a1 100644 --- a/comfy/ldm/lumina/model.py +++ b/comfy/ldm/lumina/model.py @@ -13,10 +13,53 @@ from comfy.ldm.modules.attention import optimized_attention_masked from comfy.ldm.flux.layers import EmbedND from comfy.ldm.flux.math import apply_rope import comfy.patcher_extension +import comfy.utils -def modulate(x, scale): - return x * (1 + scale.unsqueeze(1)) +def invert_slices(slices, length): + sorted_slices = sorted(slices) + result = [] + current = 0 + + for start, end in sorted_slices: + if current < start: + result.append((current, start)) + current = max(current, end) + + if current < length: + result.append((current, length)) + + return result + + +def modulate(x, scale, timestep_zero_index=None): + if timestep_zero_index is None: + return x * (1 + scale.unsqueeze(1)) + else: + scale = (1 + scale.unsqueeze(1)) + actual_batch = scale.size(0) // 2 + slices = timestep_zero_index + invert = invert_slices(timestep_zero_index, x.shape[1]) + for s in slices: + x[:, s[0]:s[1]] *= scale[actual_batch:] + for s in invert: + x[:, s[0]:s[1]] *= scale[:actual_batch] + return x + + +def apply_gate(gate, x, timestep_zero_index=None): + if timestep_zero_index is None: + return gate * x + else: + actual_batch = gate.size(0) // 2 + + slices = timestep_zero_index + invert = invert_slices(timestep_zero_index, x.shape[1]) + for s in slices: + x[:, s[0]:s[1]] *= gate[actual_batch:] + for s in invert: + x[:, s[0]:s[1]] *= gate[:actual_batch] + return x ############################################################################# # Core NextDiT Model # @@ -258,6 +301,7 @@ class JointTransformerBlock(nn.Module): x_mask: torch.Tensor, freqs_cis: torch.Tensor, adaln_input: Optional[torch.Tensor]=None, + timestep_zero_index=None, transformer_options={}, ): """ @@ -276,18 +320,18 @@ class JointTransformerBlock(nn.Module): assert adaln_input is not None scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).chunk(4, dim=1) - x = x + gate_msa.unsqueeze(1).tanh() * self.attention_norm2( + x = x + apply_gate(gate_msa.unsqueeze(1).tanh(), self.attention_norm2( clamp_fp16(self.attention( - modulate(self.attention_norm1(x), scale_msa), + modulate(self.attention_norm1(x), scale_msa, timestep_zero_index=timestep_zero_index), x_mask, freqs_cis, transformer_options=transformer_options, - )) + ))), timestep_zero_index=timestep_zero_index ) - x = x + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2( + x = x + apply_gate(gate_mlp.unsqueeze(1).tanh(), 
self.ffn_norm2( clamp_fp16(self.feed_forward( - modulate(self.ffn_norm1(x), scale_mlp), - )) + modulate(self.ffn_norm1(x), scale_mlp, timestep_zero_index=timestep_zero_index), + ))), timestep_zero_index=timestep_zero_index ) else: assert adaln_input is None @@ -345,13 +389,37 @@ class FinalLayer(nn.Module): ), ) - def forward(self, x, c): + def forward(self, x, c, timestep_zero_index=None): scale = self.adaLN_modulation(c) - x = modulate(self.norm_final(x), scale) + x = modulate(self.norm_final(x), scale, timestep_zero_index=timestep_zero_index) x = self.linear(x) return x +def pad_zimage(feats, pad_token, pad_tokens_multiple): + pad_extra = (-feats.shape[1]) % pad_tokens_multiple + return torch.cat((feats, pad_token.to(device=feats.device, dtype=feats.dtype, copy=True).unsqueeze(0).repeat(feats.shape[0], pad_extra, 1)), dim=1), pad_extra + + +def pos_ids_x(start_t, H_tokens, W_tokens, batch_size, device, transformer_options={}): + rope_options = transformer_options.get("rope_options", None) + h_scale = 1.0 + w_scale = 1.0 + h_start = 0 + w_start = 0 + if rope_options is not None: + h_scale = rope_options.get("scale_y", 1.0) + w_scale = rope_options.get("scale_x", 1.0) + + h_start = rope_options.get("shift_y", 0.0) + w_start = rope_options.get("shift_x", 0.0) + x_pos_ids = torch.zeros((batch_size, H_tokens * W_tokens, 3), dtype=torch.float32, device=device) + x_pos_ids[:, :, 0] = start_t + x_pos_ids[:, :, 1] = (torch.arange(H_tokens, dtype=torch.float32, device=device) * h_scale + h_start).view(-1, 1).repeat(1, W_tokens).flatten() + x_pos_ids[:, :, 2] = (torch.arange(W_tokens, dtype=torch.float32, device=device) * w_scale + w_start).view(1, -1).repeat(H_tokens, 1).flatten() + return x_pos_ids + + class NextDiT(nn.Module): """ Diffusion model with a Transformer backbone. @@ -378,6 +446,7 @@ class NextDiT(nn.Module): time_scale=1.0, pad_tokens_multiple=None, clip_text_dim=None, + siglip_feat_dim=None, image_model=None, device=None, dtype=None, @@ -491,6 +560,41 @@ class NextDiT(nn.Module): for layer_id in range(n_layers) ] ) + + if siglip_feat_dim is not None: + self.siglip_embedder = nn.Sequential( + operation_settings.get("operations").RMSNorm(siglip_feat_dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")), + operation_settings.get("operations").Linear( + siglip_feat_dim, + dim, + bias=True, + device=operation_settings.get("device"), + dtype=operation_settings.get("dtype"), + ), + ) + self.siglip_refiner = nn.ModuleList( + [ + JointTransformerBlock( + layer_id, + dim, + n_heads, + n_kv_heads, + multiple_of, + ffn_dim_multiplier, + norm_eps, + qk_norm, + modulation=False, + operation_settings=operation_settings, + ) + for layer_id in range(n_refiner_layers) + ] + ) + self.siglip_pad_token = nn.Parameter(torch.empty((1, dim), device=device, dtype=dtype)) + else: + self.siglip_embedder = None + self.siglip_refiner = None + self.siglip_pad_token = None + # This norm final is in the lumina 2.0 code but isn't actually used for anything. 
# self.norm_final = operation_settings.get("operations").RMSNorm(dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")) self.final_layer = FinalLayer(dim, patch_size, self.out_channels, z_image_modulation=z_image_modulation, operation_settings=operation_settings) @@ -531,70 +635,166 @@ class NextDiT(nn.Module): imgs = torch.stack(imgs, dim=0) return imgs - def patchify_and_embed( - self, x: List[torch.Tensor] | torch.Tensor, cap_feats: torch.Tensor, cap_mask: torch.Tensor, t: torch.Tensor, num_tokens, transformer_options={} - ) -> Tuple[torch.Tensor, torch.Tensor, List[Tuple[int, int]], List[int], torch.Tensor]: - bsz = len(x) - pH = pW = self.patch_size - device = x[0].device - orig_x = x - - if self.pad_tokens_multiple is not None: - pad_extra = (-cap_feats.shape[1]) % self.pad_tokens_multiple - cap_feats = torch.cat((cap_feats, self.cap_pad_token.to(device=cap_feats.device, dtype=cap_feats.dtype, copy=True).unsqueeze(0).repeat(cap_feats.shape[0], pad_extra, 1)), dim=1) + def embed_cap(self, cap_feats=None, offset=0, bsz=1, device=None, dtype=None): + if cap_feats is not None: + cap_feats = self.cap_embedder(cap_feats) + cap_feats_len = cap_feats.shape[1] + if self.pad_tokens_multiple is not None: + cap_feats, _ = pad_zimage(cap_feats, self.cap_pad_token, self.pad_tokens_multiple) + else: + cap_feats_len = 0 + cap_feats = self.cap_pad_token.to(device=device, dtype=dtype, copy=True).unsqueeze(0).repeat(bsz, self.pad_tokens_multiple, 1) cap_pos_ids = torch.zeros(bsz, cap_feats.shape[1], 3, dtype=torch.float32, device=device) - cap_pos_ids[:, :, 0] = torch.arange(cap_feats.shape[1], dtype=torch.float32, device=device) + 1.0 + cap_pos_ids[:, :, 0] = torch.arange(cap_feats.shape[1], dtype=torch.float32, device=device) + 1.0 + offset + embeds = (cap_feats,) + freqs_cis = (self.rope_embedder(cap_pos_ids).movedim(1, 2),) + return embeds, freqs_cis, cap_feats_len + + def embed_all(self, x, cap_feats=None, siglip_feats=None, offset=0, omni=False, transformer_options={}): + bsz = 1 + pH = pW = self.patch_size + device = x.device + embeds, freqs_cis, cap_feats_len = self.embed_cap(cap_feats, offset=offset, bsz=bsz, device=device, dtype=x.dtype) + + if not omni: + cap_feats_len = embeds[0].shape[1] + offset + embeds += (None,) + freqs_cis += (None,) + else: + cap_feats_len += offset + if siglip_feats is not None: + b, h, w, c = siglip_feats.shape + siglip_feats = siglip_feats.permute(0, 3, 1, 2).reshape(b, h * w, c) + siglip_feats = self.siglip_embedder(siglip_feats) + siglip_pos_ids = torch.zeros((bsz, siglip_feats.shape[1], 3), dtype=torch.float32, device=device) + siglip_pos_ids[:, :, 0] = cap_feats_len + 2 + siglip_pos_ids[:, :, 1] = (torch.linspace(0, h * 8 - 1, steps=h, dtype=torch.float32, device=device).floor()).view(-1, 1).repeat(1, w).flatten() + siglip_pos_ids[:, :, 2] = (torch.linspace(0, w * 8 - 1, steps=w, dtype=torch.float32, device=device).floor()).view(1, -1).repeat(h, 1).flatten() + if self.siglip_pad_token is not None: + siglip_feats, pad_extra = pad_zimage(siglip_feats, self.siglip_pad_token, self.pad_tokens_multiple) # TODO: double check + siglip_pos_ids = torch.nn.functional.pad(siglip_pos_ids, (0, 0, 0, pad_extra)) + else: + siglip_feats = self.siglip_pad_token.to(device=device, dtype=x.dtype, copy=True).unsqueeze(0).repeat(bsz, self.pad_tokens_multiple, 1) + siglip_pos_ids = torch.zeros((bsz, siglip_feats.shape[1], 3), dtype=torch.float32, device=device) + + if siglip_feats is None: + embeds += (None,) + 
freqs_cis += (None,) + else: + embeds += (siglip_feats,) + freqs_cis += (self.rope_embedder(siglip_pos_ids).movedim(1, 2),) B, C, H, W = x.shape x = self.x_embedder(x.view(B, C, H // pH, pH, W // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2)) - - rope_options = transformer_options.get("rope_options", None) - h_scale = 1.0 - w_scale = 1.0 - h_start = 0 - w_start = 0 - if rope_options is not None: - h_scale = rope_options.get("scale_y", 1.0) - w_scale = rope_options.get("scale_x", 1.0) - - h_start = rope_options.get("shift_y", 0.0) - w_start = rope_options.get("shift_x", 0.0) - - H_tokens, W_tokens = H // pH, W // pW - x_pos_ids = torch.zeros((bsz, x.shape[1], 3), dtype=torch.float32, device=device) - x_pos_ids[:, :, 0] = cap_feats.shape[1] + 1 - x_pos_ids[:, :, 1] = (torch.arange(H_tokens, dtype=torch.float32, device=device) * h_scale + h_start).view(-1, 1).repeat(1, W_tokens).flatten() - x_pos_ids[:, :, 2] = (torch.arange(W_tokens, dtype=torch.float32, device=device) * w_scale + w_start).view(1, -1).repeat(H_tokens, 1).flatten() - + x_pos_ids = pos_ids_x(cap_feats_len + 1, H // pH, W // pW, bsz, device, transformer_options=transformer_options) if self.pad_tokens_multiple is not None: - pad_extra = (-x.shape[1]) % self.pad_tokens_multiple - x = torch.cat((x, self.x_pad_token.to(device=x.device, dtype=x.dtype, copy=True).unsqueeze(0).repeat(x.shape[0], pad_extra, 1)), dim=1) + x, pad_extra = pad_zimage(x, self.x_pad_token, self.pad_tokens_multiple) x_pos_ids = torch.nn.functional.pad(x_pos_ids, (0, 0, 0, pad_extra)) - freqs_cis = self.rope_embedder(torch.cat((cap_pos_ids, x_pos_ids), dim=1)).movedim(1, 2) + embeds += (x,) + freqs_cis += (self.rope_embedder(x_pos_ids).movedim(1, 2),) + return embeds, freqs_cis, cap_feats_len + len(freqs_cis) - 1 + + + def patchify_and_embed( + self, x: torch.Tensor, cap_feats: torch.Tensor, cap_mask: torch.Tensor, t: torch.Tensor, num_tokens, ref_latents=[], ref_contexts=[], siglip_feats=[], transformer_options={} + ) -> Tuple[torch.Tensor, torch.Tensor, List[Tuple[int, int]], List[int], torch.Tensor]: + bsz = x.shape[0] + cap_mask = None # TODO? 
+ main_siglip = None + orig_x = x + + embeds = ([], [], []) + freqs_cis = ([], [], []) + leftover_cap = [] + + start_t = 0 + omni = len(ref_latents) > 0 + if omni: + for i, ref in enumerate(ref_latents): + if i < len(ref_contexts): + ref_con = ref_contexts[i] + else: + ref_con = None + if i < len(siglip_feats): + sig_feat = siglip_feats[i] + else: + sig_feat = None + + out = self.embed_all(ref, ref_con, sig_feat, offset=start_t, omni=omni, transformer_options=transformer_options) + for i, e in enumerate(out[0]): + embeds[i].append(comfy.utils.repeat_to_batch_size(e, bsz)) + freqs_cis[i].append(out[1][i]) + start_t = out[2] + leftover_cap = ref_contexts[len(ref_latents):] + + H, W = x.shape[-2], x.shape[-1] + img_sizes = [(H, W)] * bsz + out = self.embed_all(x, cap_feats, main_siglip, offset=start_t, omni=omni, transformer_options=transformer_options) + img_len = out[0][-1].shape[1] + cap_len = out[0][0].shape[1] + for i, e in enumerate(out[0]): + if e is not None: + e = comfy.utils.repeat_to_batch_size(e, bsz) + embeds[i].append(e) + freqs_cis[i].append(out[1][i]) + start_t = out[2] + + for cap in leftover_cap: + out = self.embed_cap(cap, offset=start_t, bsz=bsz, device=x.device, dtype=x.dtype) + cap_len += out[0][0].shape[1] + embeds[0].append(comfy.utils.repeat_to_batch_size(out[0][0], bsz)) + freqs_cis[0].append(out[1][0]) + start_t += out[2] patches = transformer_options.get("patches", {}) # refine context + cap_feats = torch.cat(embeds[0], dim=1) + cap_freqs_cis = torch.cat(freqs_cis[0], dim=1) for layer in self.context_refiner: - cap_feats = layer(cap_feats, cap_mask, freqs_cis[:, :cap_pos_ids.shape[1]], transformer_options=transformer_options) + cap_feats = layer(cap_feats, cap_mask, cap_freqs_cis, transformer_options=transformer_options) + + feats = (cap_feats,) + fc = (cap_freqs_cis,) + + if omni: + siglip_mask = None + siglip_feats_combined = torch.cat(embeds[1], dim=1) + siglip_feats_freqs_cis = torch.cat(freqs_cis[1], dim=1) + if self.siglip_refiner is not None: + for layer in self.siglip_refiner: + siglip_feats_combined = layer(siglip_feats_combined, siglip_mask, siglip_feats_freqs_cis, transformer_options=transformer_options) + feats += (siglip_feats_combined,) + fc += (siglip_feats_freqs_cis,) padded_img_mask = None + x = torch.cat(embeds[-1], dim=1) + fc_x = torch.cat(freqs_cis[-1], dim=1) + if omni: + timestep_zero_index = [(x.shape[1] - img_len, x.shape[1])] + else: + timestep_zero_index = None + x_input = x for i, layer in enumerate(self.noise_refiner): - x = layer(x, padded_img_mask, freqs_cis[:, cap_pos_ids.shape[1]:], t, transformer_options=transformer_options) + x = layer(x, padded_img_mask, fc_x, t, timestep_zero_index=timestep_zero_index, transformer_options=transformer_options) if "noise_refiner" in patches: for p in patches["noise_refiner"]: - out = p({"img": x, "img_input": x_input, "txt": cap_feats, "pe": freqs_cis[:, cap_pos_ids.shape[1]:], "vec": t, "x": orig_x, "block_index": i, "transformer_options": transformer_options, "block_type": "noise_refiner"}) + out = p({"img": x, "img_input": x_input, "txt": cap_feats, "pe": fc_x, "vec": t, "x": orig_x, "block_index": i, "transformer_options": transformer_options, "block_type": "noise_refiner"}) if "img" in out: x = out["img"] - padded_full_embed = torch.cat((cap_feats, x), dim=1) + padded_full_embed = torch.cat(feats + (x,), dim=1) + if timestep_zero_index is not None: + ind = padded_full_embed.shape[1] - x.shape[1] + timestep_zero_index = [(ind + x.shape[1] - img_len, ind + x.shape[1])] + 
timestep_zero_index.append((feats[0].shape[1] - cap_len, feats[0].shape[1])) + mask = None - img_sizes = [(H, W)] * bsz - l_effective_cap_len = [cap_feats.shape[1]] * bsz - return padded_full_embed, mask, img_sizes, l_effective_cap_len, freqs_cis + l_effective_cap_len = [padded_full_embed.shape[1] - img_len] * bsz + return padded_full_embed, mask, img_sizes, l_effective_cap_len, torch.cat(fc + (fc_x,), dim=1), timestep_zero_index def forward(self, x, timesteps, context, num_tokens, attention_mask=None, **kwargs): return comfy.patcher_extension.WrapperExecutor.new_class_executor( @@ -604,7 +804,11 @@ class NextDiT(nn.Module): ).execute(x, timesteps, context, num_tokens, attention_mask, **kwargs) # def forward(self, x, t, cap_feats, cap_mask): - def _forward(self, x, timesteps, context, num_tokens, attention_mask=None, transformer_options={}, **kwargs): + def _forward(self, x, timesteps, context, num_tokens, attention_mask=None, ref_latents=[], ref_contexts=[], siglip_feats=[], transformer_options={}, **kwargs): + omni = len(ref_latents) > 0 + if omni: + timesteps = torch.cat([timesteps * 0, timesteps], dim=0) + t = 1.0 - timesteps cap_feats = context cap_mask = attention_mask @@ -619,8 +823,6 @@ class NextDiT(nn.Module): t = self.t_embedder(t * self.time_scale, dtype=x.dtype) # (N, D) adaln_input = t - cap_feats = self.cap_embedder(cap_feats) # (N, L, D) # todo check if able to batchify w.o. redundant compute - if self.clip_text_pooled_proj is not None: pooled = kwargs.get("clip_text_pooled", None) if pooled is not None: @@ -632,7 +834,7 @@ class NextDiT(nn.Module): patches = transformer_options.get("patches", {}) x_is_tensor = isinstance(x, torch.Tensor) - img, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, adaln_input, num_tokens, transformer_options=transformer_options) + img, mask, img_size, cap_size, freqs_cis, timestep_zero_index = self.patchify_and_embed(x, cap_feats, cap_mask, adaln_input, num_tokens, ref_latents=ref_latents, ref_contexts=ref_contexts, siglip_feats=siglip_feats, transformer_options=transformer_options) freqs_cis = freqs_cis.to(img.device) transformer_options["total_blocks"] = len(self.layers) @@ -640,7 +842,7 @@ class NextDiT(nn.Module): img_input = img for i, layer in enumerate(self.layers): transformer_options["block_index"] = i - img = layer(img, mask, freqs_cis, adaln_input, transformer_options=transformer_options) + img = layer(img, mask, freqs_cis, adaln_input, timestep_zero_index=timestep_zero_index, transformer_options=transformer_options) if "double_block" in patches: for p in patches["double_block"]: out = p({"img": img[:, cap_size[0]:], "img_input": img_input[:, cap_size[0]:], "txt": img[:, :cap_size[0]], "pe": freqs_cis[:, cap_size[0]:], "vec": adaln_input, "x": x, "block_index": i, "transformer_options": transformer_options}) @@ -649,8 +851,7 @@ class NextDiT(nn.Module): if "txt" in out: img[:, :cap_size[0]] = out["txt"] - img = self.final_layer(img, adaln_input) + img = self.final_layer(img, adaln_input, timestep_zero_index=timestep_zero_index) img = self.unpatchify(img, img_size, cap_size, return_tensor=x_is_tensor)[:, :, :h, :w] - return -img diff --git a/comfy/model_base.py b/comfy/model_base.py index 49efd700b..28ba2643e 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -1150,6 +1150,7 @@ class CosmosPredict2(BaseModel): class Lumina2(BaseModel): def __init__(self, model_config, model_type=ModelType.FLOW, device=None): super().__init__(model_config, model_type, device=device, 
unet_model=comfy.ldm.lumina.model.NextDiT) + self.memory_usage_factor_conds = ("ref_latents",) def extra_conds(self, **kwargs): out = super().extra_conds(**kwargs) @@ -1169,6 +1170,35 @@ class Lumina2(BaseModel): if clip_text_pooled is not None: out['clip_text_pooled'] = comfy.conds.CONDRegular(clip_text_pooled) + clip_vision_outputs = kwargs.get("clip_vision_outputs", list(map(lambda a: a.get("clip_vision_output"), kwargs.get("unclip_conditioning", [{}])))) # Z Image omni + if clip_vision_outputs is not None and len(clip_vision_outputs) > 0: + sigfeats = [] + for clip_vision_output in clip_vision_outputs: + if clip_vision_output is not None: + image_size = clip_vision_output.image_sizes[0] + shape = clip_vision_output.last_hidden_state.shape + sigfeats.append(clip_vision_output.last_hidden_state.reshape(shape[0], image_size[1] // 16, image_size[2] // 16, shape[-1])) + if len(sigfeats) > 0: + out['siglip_feats'] = comfy.conds.CONDList(sigfeats) + + ref_latents = kwargs.get("reference_latents", None) + if ref_latents is not None: + latents = [] + for lat in ref_latents: + latents.append(self.process_latent_in(lat)) + out['ref_latents'] = comfy.conds.CONDList(latents) + + ref_contexts = kwargs.get("reference_latents_text_embeds", None) + if ref_contexts is not None: + out['ref_contexts'] = comfy.conds.CONDList(ref_contexts) + + return out + + def extra_conds_shapes(self, **kwargs): + out = {} + ref_latents = kwargs.get("reference_latents", None) + if ref_latents is not None: + out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()[2:]), ref_latents))]) return out class WAN21(BaseModel): diff --git a/comfy/model_detection.py b/comfy/model_detection.py index aff5a50b9..42884f797 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -446,6 +446,9 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["time_scale"] = 1000.0 if '{}cap_pad_token'.format(key_prefix) in state_dict_keys: dit_config["pad_tokens_multiple"] = 32 + sig_weight = state_dict.get('{}siglip_embedder.0.weight'.format(key_prefix), None) + if sig_weight is not None: + dit_config["siglip_feat_dim"] = sig_weight.shape[0] return dit_config diff --git a/comfy_extras/nodes_zimage.py b/comfy_extras/nodes_zimage.py new file mode 100644 index 000000000..2ee3c43b1 --- /dev/null +++ b/comfy_extras/nodes_zimage.py @@ -0,0 +1,88 @@ +import node_helpers +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io +import math +import comfy.utils + + +class TextEncodeZImageOmni(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="TextEncodeZImageOmni", + category="advanced/conditioning", + is_experimental=True, + inputs=[ + io.Clip.Input("clip"), + io.ClipVision.Input("image_encoder", optional=True), + io.String.Input("prompt", multiline=True, dynamic_prompts=True), + io.Boolean.Input("auto_resize_images", default=True), + io.Vae.Input("vae", optional=True), + io.Image.Input("image1", optional=True), + io.Image.Input("image2", optional=True), + io.Image.Input("image3", optional=True), + ], + outputs=[ + io.Conditioning.Output(), + ], + ) + + @classmethod + def execute(cls, clip, prompt, image_encoder=None, auto_resize_images=True, vae=None, image1=None, image2=None, image3=None) -> io.NodeOutput: + ref_latents = [] + images = list(filter(lambda a: a is not None, [image1, image2, image3])) + + prompt_list = [] + template = None + if len(images) > 0: + prompt_list = ["<|im_start|>user\n<|vision_start|>"] + prompt_list += 
["<|vision_end|><|vision_start|>"] * (len(images) - 1) + prompt_list += ["<|vision_end|><|im_end|>"] + template = "<|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n<|vision_start|>" + + encoded_images = [] + + for i, image in enumerate(images): + if image_encoder is not None: + encoded_images.append(image_encoder.encode_image(image)) + + if vae is not None: + if auto_resize_images: + samples = image.movedim(-1, 1) + total = int(1024 * 1024) + scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2])) + width = round(samples.shape[3] * scale_by / 8.0) * 8 + height = round(samples.shape[2] * scale_by / 8.0) * 8 + + image = comfy.utils.common_upscale(samples, width, height, "area", "disabled").movedim(1, -1) + ref_latents.append(vae.encode(image)) + + tokens = clip.tokenize(prompt, llama_template=template) + conditioning = clip.encode_from_tokens_scheduled(tokens) + + extra_text_embeds = [] + for p in prompt_list: + tokens = clip.tokenize(p, llama_template="{}") + text_embeds = clip.encode_from_tokens_scheduled(tokens) + extra_text_embeds.append(text_embeds[0][0]) + + if len(ref_latents) > 0: + conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_latents": ref_latents}, append=True) + if len(encoded_images) > 0: + conditioning = node_helpers.conditioning_set_values(conditioning, {"clip_vision_outputs": encoded_images}, append=True) + if len(extra_text_embeds) > 0: + conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_latents_text_embeds": extra_text_embeds}, append=True) + + return io.NodeOutput(conditioning) + + +class ZImageExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + TextEncodeZImageOmni, + ] + + +async def comfy_entrypoint() -> ZImageExtension: + return ZImageExtension() diff --git a/nodes.py b/nodes.py index cba8eacc2..ea5d6e525 100644 --- a/nodes.py +++ b/nodes.py @@ -2373,6 +2373,7 @@ async def init_builtin_extra_nodes(): "nodes_kandinsky5.py", "nodes_wanmove.py", "nodes_image_compare.py", + "nodes_zimage.py", ] import_failed = [] From 0fc3b6e3a6f1d8fdffca3a51cb4d10a06f4e079d Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Tue, 20 Jan 2026 12:17:56 +0800 Subject: [PATCH 09/19] chore: update workflow templates to v0.8.15 (#11984) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 312c7c137..35543525d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.36.14 -comfyui-workflow-templates==0.8.14 +comfyui-workflow-templates==0.8.15 comfyui-embedded-docs==0.4.0 torch torchsde From 4edb87aa50190139a38a2ccd6b6ee35ba9df4da1 Mon Sep 17 00:00:00 2001 From: Comfy Org PR Bot Date: Tue, 20 Jan 2026 13:57:50 +0900 Subject: [PATCH 10/19] Bump comfyui-frontend-package to 1.37.11 (#11976) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 35543525d..ec89dccd2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.36.14 +comfyui-frontend-package==1.37.11 comfyui-workflow-templates==0.8.15 comfyui-embedded-docs==0.4.0 torch From 8ccc0c94fa0d8e43fffe7190e6a36551a53df54a Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 19 Jan 2026 21:32:00 -0800 Subject: [PATCH 11/19] Make omni stuff work on regular z image for easier testing. 
(#11985) --- comfy/ldm/lumina/model.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py index 139f879a1..b114d9e31 100644 --- a/comfy/ldm/lumina/model.py +++ b/comfy/ldm/lumina/model.py @@ -657,7 +657,7 @@ class NextDiT(nn.Module): device = x.device embeds, freqs_cis, cap_feats_len = self.embed_cap(cap_feats, offset=offset, bsz=bsz, device=device, dtype=x.dtype) - if not omni: + if (not omni) or self.siglip_embedder is None: cap_feats_len = embeds[0].shape[1] + offset embeds += (None,) freqs_cis += (None,) @@ -675,8 +675,9 @@ class NextDiT(nn.Module): siglip_feats, pad_extra = pad_zimage(siglip_feats, self.siglip_pad_token, self.pad_tokens_multiple) # TODO: double check siglip_pos_ids = torch.nn.functional.pad(siglip_pos_ids, (0, 0, 0, pad_extra)) else: - siglip_feats = self.siglip_pad_token.to(device=device, dtype=x.dtype, copy=True).unsqueeze(0).repeat(bsz, self.pad_tokens_multiple, 1) - siglip_pos_ids = torch.zeros((bsz, siglip_feats.shape[1], 3), dtype=torch.float32, device=device) + if self.siglip_pad_token is not None: + siglip_feats = self.siglip_pad_token.to(device=device, dtype=x.dtype, copy=True).unsqueeze(0).repeat(bsz, self.pad_tokens_multiple, 1) + siglip_pos_ids = torch.zeros((bsz, siglip_feats.shape[1], 3), dtype=torch.float32, device=device) if siglip_feats is None: embeds += (None,) @@ -724,8 +725,9 @@ class NextDiT(nn.Module): out = self.embed_all(ref, ref_con, sig_feat, offset=start_t, omni=omni, transformer_options=transformer_options) for i, e in enumerate(out[0]): - embeds[i].append(comfy.utils.repeat_to_batch_size(e, bsz)) - freqs_cis[i].append(out[1][i]) + if e is not None: + embeds[i].append(comfy.utils.repeat_to_batch_size(e, bsz)) + freqs_cis[i].append(out[1][i]) start_t = out[2] leftover_cap = ref_contexts[len(ref_latents):] @@ -759,7 +761,7 @@ class NextDiT(nn.Module): feats = (cap_feats,) fc = (cap_freqs_cis,) - if omni: + if omni and len(embeds[1]) > 0: siglip_mask = None siglip_feats_combined = torch.cat(embeds[1], dim=1) siglip_feats_freqs_cis = torch.cat(freqs_cis[1], dim=1) From ddc541ffdae0fe626de5a33192001f31c6ab93c6 Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Tue, 20 Jan 2026 23:05:40 +0200 Subject: [PATCH 12/19] feat(api-nodes): add WaveSpeed nodes (#11945) --- comfy_api_nodes/apis/wavespeed.py | 35 ++++++ comfy_api_nodes/nodes_wavespeed.py | 178 +++++++++++++++++++++++++++++ 2 files changed, 213 insertions(+) create mode 100644 comfy_api_nodes/apis/wavespeed.py create mode 100644 comfy_api_nodes/nodes_wavespeed.py diff --git a/comfy_api_nodes/apis/wavespeed.py b/comfy_api_nodes/apis/wavespeed.py new file mode 100644 index 000000000..07a7bfa5d --- /dev/null +++ b/comfy_api_nodes/apis/wavespeed.py @@ -0,0 +1,35 @@ +from pydantic import BaseModel, Field + + +class SeedVR2ImageRequest(BaseModel): + image: str = Field(...) + target_resolution: str = Field(...) + output_format: str = Field("png") + enable_sync_mode: bool = Field(False) + + +class FlashVSRRequest(BaseModel): + target_resolution: str = Field(...) + video: str = Field(...) + duration: float = Field(...) + + +class TaskCreatedDataResponse(BaseModel): + id: str = Field(...) + + +class TaskCreatedResponse(BaseModel): + code: int = Field(...) + message: str = Field(...) + data: TaskCreatedDataResponse | None = Field(None) + + +class TaskResultDataResponse(BaseModel): + status: str = Field(...) 
+ outputs: list[str] = Field([]) + + +class TaskResultResponse(BaseModel): + code: int = Field(...) + message: str = Field(...) + data: TaskResultDataResponse | None = Field(None) diff --git a/comfy_api_nodes/nodes_wavespeed.py b/comfy_api_nodes/nodes_wavespeed.py new file mode 100644 index 000000000..c59fafd3b --- /dev/null +++ b/comfy_api_nodes/nodes_wavespeed.py @@ -0,0 +1,178 @@ +from typing_extensions import override + +from comfy_api.latest import IO, ComfyExtension, Input +from comfy_api_nodes.apis.wavespeed import ( + FlashVSRRequest, + TaskCreatedResponse, + TaskResultResponse, + SeedVR2ImageRequest, +) +from comfy_api_nodes.util import ( + ApiEndpoint, + download_url_to_video_output, + poll_op, + sync_op, + upload_video_to_comfyapi, + validate_container_format_is_mp4, + validate_video_duration, + upload_images_to_comfyapi, + get_number_of_images, + download_url_to_image_tensor, +) + + +class WavespeedFlashVSRNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="WavespeedFlashVSRNode", + display_name="FlashVSR Video Upscale", + category="api node/video/WaveSpeed", + description="Fast, high-quality video upscaler that " + "boosts resolution and restores clarity for low-resolution or blurry footage.", + inputs=[ + IO.Video.Input("video"), + IO.Combo.Input("target_resolution", options=["720p", "1080p", "2K", "4K"]), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["target_resolution"]), + expr=""" + ( + $price_for_1sec := {"720p": 0.012, "1080p": 0.018, "2k": 0.024, "4k": 0.032}; + { + "type":"usd", + "usd": $lookup($price_for_1sec, widgets.target_resolution), + "format":{"suffix": "/second", "approximate": true} + } + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + video: Input.Video, + target_resolution: str, + ) -> IO.NodeOutput: + validate_container_format_is_mp4(video) + validate_video_duration(video, min_duration=5, max_duration=60 * 10) + initial_res = await sync_op( + cls, + ApiEndpoint(path="/proxy/wavespeed/api/v3/wavespeed-ai/flashvsr", method="POST"), + response_model=TaskCreatedResponse, + data=FlashVSRRequest( + target_resolution=target_resolution.lower(), + video=await upload_video_to_comfyapi(cls, video), + duration=video.get_duration(), + ), + ) + if initial_res.code != 200: + raise ValueError(f"Task creation fails with code={initial_res.code} and message={initial_res.message}") + final_response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/wavespeed/api/v3/predictions/{initial_res.data.id}/result"), + response_model=TaskResultResponse, + status_extractor=lambda x: "failed" if x.data is None else x.data.status, + poll_interval=10.0, + max_poll_attempts=480, + ) + if final_response.code != 200: + raise ValueError( + f"Task processing failed with code={final_response.code} and message={final_response.message}" + ) + return IO.NodeOutput(await download_url_to_video_output(final_response.data.outputs[0])) + + +class WavespeedImageUpscaleNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="WavespeedImageUpscaleNode", + display_name="WaveSpeed Image Upscale", + category="api node/image/WaveSpeed", + description="Boost image resolution and quality, upscaling photos to 4K or 8K for sharp, detailed results.", + inputs=[ + IO.Combo.Input("model", options=["SeedVR2", "Ultimate"]), + 
IO.Image.Input("image"), + IO.Combo.Input("target_resolution", options=["2K", "4K", "8K"]), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["model"]), + expr=""" + ( + $prices := {"seedvr2": 0.01, "ultimate": 0.06}; + {"type":"usd", "usd": $lookup($prices, widgets.model)} + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + model: str, + image: Input.Image, + target_resolution: str, + ) -> IO.NodeOutput: + if get_number_of_images(image) != 1: + raise ValueError("Exactly one input image is required.") + if model == "SeedVR2": + model_path = "seedvr2/image" + else: + model_path = "ultimate-image-upscaler" + initial_res = await sync_op( + cls, + ApiEndpoint(path=f"/proxy/wavespeed/api/v3/wavespeed-ai/{model_path}", method="POST"), + response_model=TaskCreatedResponse, + data=SeedVR2ImageRequest( + target_resolution=target_resolution.lower(), + image=(await upload_images_to_comfyapi(cls, image, max_images=1))[0], + ), + ) + if initial_res.code != 200: + raise ValueError(f"Task creation fails with code={initial_res.code} and message={initial_res.message}") + final_response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/wavespeed/api/v3/predictions/{initial_res.data.id}/result"), + response_model=TaskResultResponse, + status_extractor=lambda x: "failed" if x.data is None else x.data.status, + poll_interval=10.0, + max_poll_attempts=480, + ) + if final_response.code != 200: + raise ValueError( + f"Task processing failed with code={final_response.code} and message={final_response.message}" + ) + return IO.NodeOutput(await download_url_to_image_tensor(final_response.data.outputs[0])) + + +class WavespeedExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + WavespeedFlashVSRNode, + WavespeedImageUpscaleNode, + ] + + +async def comfy_entrypoint() -> WavespeedExtension: + return WavespeedExtension() From 965d0ed509ce46a3328c342aee23a234ba6e4f88 Mon Sep 17 00:00:00 2001 From: Ivan Zorin Date: Wed, 21 Jan 2026 01:44:28 +0200 Subject: [PATCH 13/19] fix: remove normalization of audio in LTX Mel spectrogram creation (#11990) For LTX Audio VAE, remove normalization of audio during MEL spectrogram creation. This aligs inference with training and prevents loud audio from being attenuated. 
--- comfy/ldm/lightricks/vae/audio_vae.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/comfy/ldm/lightricks/vae/audio_vae.py b/comfy/ldm/lightricks/vae/audio_vae.py index 29d9e6c29..55a074661 100644 --- a/comfy/ldm/lightricks/vae/audio_vae.py +++ b/comfy/ldm/lightricks/vae/audio_vae.py @@ -103,20 +103,10 @@ class AudioPreprocessor: return waveform return torchaudio.functional.resample(waveform, source_rate, self.target_sample_rate) - @staticmethod - def normalize_amplitude( - waveform: torch.Tensor, max_amplitude: float = 0.5, eps: float = 1e-5 - ) -> torch.Tensor: - waveform = waveform - waveform.mean(dim=2, keepdim=True) - peak = torch.max(torch.abs(waveform)) + eps - scale = peak.clamp(max=max_amplitude) / peak - return waveform * scale - def waveform_to_mel( self, waveform: torch.Tensor, waveform_sample_rate: int, device ) -> torch.Tensor: waveform = self.resample(waveform, waveform_sample_rate) - waveform = self.normalize_amplitude(waveform) mel_transform = torchaudio.transforms.MelSpectrogram( sample_rate=self.target_sample_rate, From c4a14df9a35336dbfff096683c5015ce726c269d Mon Sep 17 00:00:00 2001 From: Mylo <36931363+gitmylo@users.noreply.github.com> Date: Wed, 21 Jan 2026 00:46:11 +0100 Subject: [PATCH 14/19] Dynamically detect chroma radiance patch size (#11991) --- comfy/model_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 42884f797..dad206a2f 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -253,7 +253,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["image_model"] = "chroma_radiance" dit_config["in_channels"] = 3 dit_config["out_channels"] = 3 - dit_config["patch_size"] = 16 + dit_config["patch_size"] = state_dict.get('{}img_in_patch.weight'.format(key_prefix)).size(dim=-1) dit_config["nerf_hidden_size"] = 64 dit_config["nerf_mlp_ratio"] = 4 dit_config["nerf_depth"] = 4 From e755268e7b7843695f52b87595afcb09c1e9fd87 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 20 Jan 2026 20:08:31 -0800 Subject: [PATCH 15/19] Config for Qwen 3 0.6B model. 
(#11998) --- comfy/text_encoders/llama.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py index 331a30f61..3080a3e09 100644 --- a/comfy/text_encoders/llama.py +++ b/comfy/text_encoders/llama.py @@ -77,6 +77,28 @@ class Qwen25_3BConfig: rope_scale = None final_norm: bool = True +@dataclass +class Qwen3_06BConfig: + vocab_size: int = 151936 + hidden_size: int = 1024 + intermediate_size: int = 3072 + num_hidden_layers: int = 28 + num_attention_heads: int = 16 + num_key_value_heads: int = 8 + max_position_embeddings: int = 32768 + rms_norm_eps: float = 1e-6 + rope_theta: float = 1000000.0 + transformer_type: str = "llama" + head_dim = 128 + rms_norm_add = False + mlp_activation = "silu" + qkv_bias = False + rope_dims = None + q_norm = "gemma3" + k_norm = "gemma3" + rope_scale = None + final_norm: bool = True + @dataclass class Qwen3_4BConfig: vocab_size: int = 151936 @@ -641,6 +663,15 @@ class Qwen25_3B(BaseLlama, torch.nn.Module): self.model = Llama2_(config, device=device, dtype=dtype, ops=operations) self.dtype = dtype +class Qwen3_06B(BaseLlama, torch.nn.Module): + def __init__(self, config_dict, dtype, device, operations): + super().__init__() + config = Qwen3_06BConfig(**config_dict) + self.num_layers = config.num_hidden_layers + + self.model = Llama2_(config, device=device, dtype=dtype, ops=operations) + self.dtype = dtype + class Qwen3_4B(BaseLlama, torch.nn.Module): def __init__(self, config_dict, dtype, device, operations): super().__init__() From 0fc15700be9b555f351034942b5bd7243bdf6bcc Mon Sep 17 00:00:00 2001 From: Markury Date: Tue, 20 Jan 2026 23:18:33 -0500 Subject: [PATCH 16/19] Add LyCoris LoKr MLP layer support for Flux2 (#11997) --- comfy/utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/comfy/utils.py b/comfy/utils.py index 5e79fb449..d97d753e6 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -611,6 +611,14 @@ def flux_to_diffusers(mmdit_config, output_prefix=""): "ff_context.net.0.proj.bias": "txt_mlp.0.bias", "ff_context.net.2.weight": "txt_mlp.2.weight", "ff_context.net.2.bias": "txt_mlp.2.bias", + "ff.linear_in.weight": "img_mlp.0.weight", # LyCoris LoKr + "ff.linear_in.bias": "img_mlp.0.bias", + "ff.linear_out.weight": "img_mlp.2.weight", + "ff.linear_out.bias": "img_mlp.2.bias", + "ff_context.linear_in.weight": "txt_mlp.0.weight", + "ff_context.linear_in.bias": "txt_mlp.0.bias", + "ff_context.linear_out.weight": "txt_mlp.2.weight", + "ff_context.linear_out.bias": "txt_mlp.2.bias", "attn.norm_q.weight": "img_attn.norm.query_norm.scale", "attn.norm_k.weight": "img_attn.norm.key_norm.scale", "attn.norm_added_q.weight": "txt_attn.norm.query_norm.scale", From 451af7015435df22e6313ae79f25fe2ef336a96d Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 21 Jan 2026 14:03:45 +0200 Subject: [PATCH 17/19] fix(api-nodes-Vidu): allow passing up to 7 subjects in Vidu Reference node (#12002) --- comfy_api_nodes/nodes_vidu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy_api_nodes/nodes_vidu.py b/comfy_api_nodes/nodes_vidu.py index 8edb02f39..b9114c4bb 100644 --- a/comfy_api_nodes/nodes_vidu.py +++ b/comfy_api_nodes/nodes_vidu.py @@ -703,7 +703,7 @@ class Vidu2ReferenceVideoNode(IO.ComfyNode): "subjects", template=IO.Autogrow.TemplateNames( IO.Image.Input("reference_images"), - names=["subject1", "subject2", "subject3"], + names=["subject1", "subject2", "subject3", "subject4", "subject5", 
"subject6", "subject7"], min=1, ), tooltip="For each subject, provide up to 3 reference images (7 images total across all subjects). " @@ -738,7 +738,7 @@ class Vidu2ReferenceVideoNode(IO.ComfyNode): control_after_generate=True, ), IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "4:3", "3:4", "1:1"]), - IO.Combo.Input("resolution", options=["720p"]), + IO.Combo.Input("resolution", options=["720p", "1080p"]), IO.Combo.Input( "movement_amplitude", options=["auto", "small", "medium", "large"], From bdeac8897e522b9637a6a427fdc8a50a6abd6b20 Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Wed, 21 Jan 2026 15:36:02 -0800 Subject: [PATCH 18/19] feat: Add search_aliases field to node schema (#12010) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: Add search_aliases field to node schema Adds `search_aliases` field to improve node discoverability. Users can define alternative search terms for nodes (e.g., "text concat" → StringConcatenate). Changes: - Add `search_aliases: list[str]` to V3 Schema - Add `SEARCH_ALIASES` support for V1 nodes - Include field in `/object_info` response - Add aliases to high-priority core nodes V1 usage: ```python class MyNode: SEARCH_ALIASES = ["alt name", "synonym"] ``` V3 usage: ```python io.Schema( node_id="MyNode", search_aliases=["alt name", "synonym"], ... ) ``` ## Related PRs - Frontend: Comfy-Org/ComfyUI_frontend#XXXX (draft - merge after this) - Docs: Comfy-Org/docs#XXXX (draft - merge after stable) * Propagate search_aliases through V3 Schema.get_v1_info to NodeInfoV1 --- comfy_api/latest/_io.py | 4 ++++ comfy_extras/nodes_post_processing.py | 1 + comfy_extras/nodes_preview_any.py | 1 + comfy_extras/nodes_string.py | 1 + comfy_extras/nodes_upscale_model.py | 1 + nodes.py | 15 +++++++++++++++ server.py | 2 ++ 7 files changed, 25 insertions(+) diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index 4969d3506..a60020ca8 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -1249,6 +1249,7 @@ class NodeInfoV1: experimental: bool=None api_node: bool=None price_badge: dict | None = None + search_aliases: list[str]=None @dataclass class NodeInfoV3: @@ -1346,6 +1347,8 @@ class Schema: hidden: list[Hidden] = field(default_factory=list) description: str="" """Node description, shown as a tooltip when hovering over the node.""" + search_aliases: list[str] = field(default_factory=list) + """Alternative names for search. Useful for synonyms, abbreviations, or old names after renaming.""" is_input_list: bool = False """A flag indicating if this node implements the additional code necessary to deal with OUTPUT_IS_LIST nodes. 
@@ -1483,6 +1486,7 @@ class Schema: api_node=self.is_api_node, python_module=getattr(cls, "RELATIVE_PYTHON_MODULE", "nodes"), price_badge=self.price_badge.as_dict(self.inputs) if self.price_badge is not None else None, + search_aliases=self.search_aliases if self.search_aliases else None, ) return info diff --git a/comfy_extras/nodes_post_processing.py b/comfy_extras/nodes_post_processing.py index 2e559c35c..6011275d6 100644 --- a/comfy_extras/nodes_post_processing.py +++ b/comfy_extras/nodes_post_processing.py @@ -550,6 +550,7 @@ class BatchImagesNode(io.ComfyNode): node_id="BatchImagesNode", display_name="Batch Images", category="image", + search_aliases=["batch", "image batch", "batch images", "combine images", "merge images", "stack images"], inputs=[ io.Autogrow.Input("images", template=autogrow_template) ], diff --git a/comfy_extras/nodes_preview_any.py b/comfy_extras/nodes_preview_any.py index 139b07c93..91502ebf2 100644 --- a/comfy_extras/nodes_preview_any.py +++ b/comfy_extras/nodes_preview_any.py @@ -16,6 +16,7 @@ class PreviewAny(): OUTPUT_NODE = True CATEGORY = "utils" + SEARCH_ALIASES = ["preview", "show", "display", "view", "show text", "display text", "preview text", "show output", "inspect", "debug"] def main(self, source=None): value = 'None' diff --git a/comfy_extras/nodes_string.py b/comfy_extras/nodes_string.py index 571d89f62..a2d5f0d94 100644 --- a/comfy_extras/nodes_string.py +++ b/comfy_extras/nodes_string.py @@ -11,6 +11,7 @@ class StringConcatenate(io.ComfyNode): node_id="StringConcatenate", display_name="Concatenate", category="utils/string", + search_aliases=["text concat", "join text", "merge text", "combine strings", "concat", "concatenate", "append text", "combine text", "string"], inputs=[ io.String.Input("string_a", multiline=True), io.String.Input("string_b", multiline=True), diff --git a/comfy_extras/nodes_upscale_model.py b/comfy_extras/nodes_upscale_model.py index ed587851c..97b9e948d 100644 --- a/comfy_extras/nodes_upscale_model.py +++ b/comfy_extras/nodes_upscale_model.py @@ -53,6 +53,7 @@ class ImageUpscaleWithModel(io.ComfyNode): node_id="ImageUpscaleWithModel", display_name="Upscale Image (using Model)", category="image/upscaling", + search_aliases=["upscale", "upscaler", "upsc", "enlarge image", "super resolution", "hires", "superres", "increase resolution"], inputs=[ io.UpscaleModel.Input("upscale_model"), io.Image.Input("image"), diff --git a/nodes.py b/nodes.py index ea5d6e525..67b61dcfe 100644 --- a/nodes.py +++ b/nodes.py @@ -70,6 +70,7 @@ class CLIPTextEncode(ComfyNodeABC): CATEGORY = "conditioning" DESCRIPTION = "Encodes a text prompt using a CLIP model into an embedding that can be used to guide the diffusion model towards generating specific images." + SEARCH_ALIASES = ["text", "prompt", "text prompt", "positive prompt", "negative prompt", "encode text", "text encoder", "encode prompt"] def encode(self, clip, text): if clip is None: @@ -86,6 +87,7 @@ class ConditioningCombine: FUNCTION = "combine" CATEGORY = "conditioning" + SEARCH_ALIASES = ["combine", "merge conditioning", "combine prompts", "merge prompts", "mix prompts", "add prompt"] def combine(self, conditioning_1, conditioning_2): return (conditioning_1 + conditioning_2, ) @@ -294,6 +296,7 @@ class VAEDecode: CATEGORY = "latent" DESCRIPTION = "Decodes latent images back into pixel space images." 
+ SEARCH_ALIASES = ["decode", "decode latent", "latent to image", "render latent"] def decode(self, vae, samples): latent = samples["samples"] @@ -346,6 +349,7 @@ class VAEEncode: FUNCTION = "encode" CATEGORY = "latent" + SEARCH_ALIASES = ["encode", "encode image", "image to latent"] def encode(self, vae, pixels): t = vae.encode(pixels) @@ -581,6 +585,7 @@ class CheckpointLoaderSimple: CATEGORY = "loaders" DESCRIPTION = "Loads a diffusion model checkpoint, diffusion models are used to denoise latents." + SEARCH_ALIASES = ["load model", "checkpoint", "model loader", "load checkpoint", "ckpt", "model"] def load_checkpoint(self, ckpt_name): ckpt_path = folder_paths.get_full_path_or_raise("checkpoints", ckpt_name) @@ -667,6 +672,7 @@ class LoraLoader: CATEGORY = "loaders" DESCRIPTION = "LoRAs are used to modify diffusion and CLIP models, altering the way in which latents are denoised such as applying styles. Multiple LoRA nodes can be linked together." + SEARCH_ALIASES = ["lora", "load lora", "apply lora", "lora loader", "lora model"] def load_lora(self, model, clip, lora_name, strength_model, strength_clip): if strength_model == 0 and strength_clip == 0: @@ -814,6 +820,7 @@ class ControlNetLoader: FUNCTION = "load_controlnet" CATEGORY = "loaders" + SEARCH_ALIASES = ["controlnet", "control net", "cn", "load controlnet", "controlnet loader"] def load_controlnet(self, control_net_name): controlnet_path = folder_paths.get_full_path_or_raise("controlnet", control_net_name) @@ -890,6 +897,7 @@ class ControlNetApplyAdvanced: FUNCTION = "apply_controlnet" CATEGORY = "conditioning/controlnet" + SEARCH_ALIASES = ["controlnet", "apply controlnet", "use controlnet", "control net"] def apply_controlnet(self, positive, negative, control_net, image, strength, start_percent, end_percent, vae=None, extra_concat=[]): if strength == 0: @@ -1200,6 +1208,7 @@ class EmptyLatentImage: CATEGORY = "latent" DESCRIPTION = "Create a new batch of empty latent images to be denoised via sampling." + SEARCH_ALIASES = ["empty", "empty latent", "new latent", "create latent", "blank latent", "blank"] def generate(self, width, height, batch_size=1): latent = torch.zeros([batch_size, 4, height // 8, width // 8], device=self.device) @@ -1540,6 +1549,7 @@ class KSampler: CATEGORY = "sampling" DESCRIPTION = "Uses the provided model, positive and negative conditioning to denoise the latent image." + SEARCH_ALIASES = ["sampler", "sample", "generate", "denoise", "diffuse", "txt2img", "img2img"] def sample(self, model, seed, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, denoise=1.0): return common_ksampler(model, seed, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, denoise=denoise) @@ -1604,6 +1614,7 @@ class SaveImage: CATEGORY = "image" DESCRIPTION = "Saves the input images to your ComfyUI output directory." 
+ SEARCH_ALIASES = ["save", "save image", "export image", "output image", "write image", "download"] def save_images(self, images, filename_prefix="ComfyUI", prompt=None, extra_pnginfo=None): filename_prefix += self.prefix_append @@ -1640,6 +1651,8 @@ class PreviewImage(SaveImage): self.prefix_append = "_temp_" + ''.join(random.choice("abcdefghijklmnopqrstupvxyz") for x in range(5)) self.compress_level = 1 + SEARCH_ALIASES = ["preview", "preview image", "show image", "view image", "display image", "image viewer"] + @classmethod def INPUT_TYPES(s): return {"required": @@ -1658,6 +1671,7 @@ class LoadImage: } CATEGORY = "image" + SEARCH_ALIASES = ["load image", "open image", "import image", "image input", "upload image", "read image", "image loader"] RETURN_TYPES = ("IMAGE", "MASK") FUNCTION = "load_image" @@ -1810,6 +1824,7 @@ class ImageScale: FUNCTION = "upscale" CATEGORY = "image/upscaling" + SEARCH_ALIASES = ["resize", "resize image", "scale image", "image resize", "zoom", "zoom in", "change size"] def upscale(self, image, upscale_method, width, height, crop): if width == 0 and height == 0: diff --git a/server.py b/server.py index 04a577488..1888745b7 100644 --- a/server.py +++ b/server.py @@ -682,6 +682,8 @@ class PromptServer(): if hasattr(obj_class, 'API_NODE'): info['api_node'] = obj_class.API_NODE + + info['search_aliases'] = getattr(obj_class, 'SEARCH_ALIASES', []) return info @routes.get("/object_info") From abe2ec26a61ff670b9c0e71e4821c873368c8728 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 21 Jan 2026 16:44:28 -0800 Subject: [PATCH 19/19] Support the Anima model. (#12012) --- comfy/ldm/anima/model.py | 202 +++++++++++++++++++++++++++++++++++ comfy/model_base.py | 22 ++++ comfy/model_detection.py | 2 + comfy/sd.py | 7 ++ comfy/supported_models.py | 33 +++++- comfy/text_encoders/anima.py | 61 +++++++++++ 6 files changed, 326 insertions(+), 1 deletion(-) create mode 100644 comfy/ldm/anima/model.py create mode 100644 comfy/text_encoders/anima.py diff --git a/comfy/ldm/anima/model.py b/comfy/ldm/anima/model.py new file mode 100644 index 000000000..2e6ed58fa --- /dev/null +++ b/comfy/ldm/anima/model.py @@ -0,0 +1,202 @@ +from comfy.ldm.cosmos.predict2 import MiniTrainDIT +import torch +from torch import nn +import torch.nn.functional as F + + +def rotate_half(x): + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(x, cos, sin, unsqueeze_dim=1): + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + x_embed = (x * cos) + (rotate_half(x) * sin) + return x_embed + + +class RotaryEmbedding(nn.Module): + def __init__(self, head_dim): + super().__init__() + self.rope_theta = 10000 + inv_freq = 1.0 / (self.rope_theta ** (torch.arange(0, head_dim, 2, dtype=torch.int64).to(dtype=torch.float) / head_dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + @torch.no_grad() + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = 
emb.sin() + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +class Attention(nn.Module): + def __init__(self, query_dim, context_dim, n_heads, head_dim, device=None, dtype=None, operations=None): + super().__init__() + + inner_dim = head_dim * n_heads + self.n_heads = n_heads + self.head_dim = head_dim + self.query_dim = query_dim + self.context_dim = context_dim + + self.q_proj = operations.Linear(query_dim, inner_dim, bias=False, device=device, dtype=dtype) + self.q_norm = operations.RMSNorm(self.head_dim, eps=1e-6, device=device, dtype=dtype) + + self.k_proj = operations.Linear(context_dim, inner_dim, bias=False, device=device, dtype=dtype) + self.k_norm = operations.RMSNorm(self.head_dim, eps=1e-6, device=device, dtype=dtype) + + self.v_proj = operations.Linear(context_dim, inner_dim, bias=False, device=device, dtype=dtype) + + self.o_proj = operations.Linear(inner_dim, query_dim, bias=False, device=device, dtype=dtype) + + def forward(self, x, mask=None, context=None, position_embeddings=None, position_embeddings_context=None): + context = x if context is None else context + input_shape = x.shape[:-1] + q_shape = (*input_shape, self.n_heads, self.head_dim) + context_shape = context.shape[:-1] + kv_shape = (*context_shape, self.n_heads, self.head_dim) + + query_states = self.q_norm(self.q_proj(x).view(q_shape)).transpose(1, 2) + key_states = self.k_norm(self.k_proj(context).view(kv_shape)).transpose(1, 2) + value_states = self.v_proj(context).view(kv_shape).transpose(1, 2) + + if position_embeddings is not None: + assert position_embeddings_context is not None + cos, sin = position_embeddings + query_states = apply_rotary_pos_emb(query_states, cos, sin) + cos, sin = position_embeddings_context + key_states = apply_rotary_pos_emb(key_states, cos, sin) + + attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask=mask) + + attn_output = attn_output.transpose(1, 2).reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output + + def init_weights(self): + torch.nn.init.zeros_(self.o_proj.weight) + + +class TransformerBlock(nn.Module): + def __init__(self, source_dim, model_dim, num_heads=16, mlp_ratio=4.0, use_self_attn=False, layer_norm=False, device=None, dtype=None, operations=None): + super().__init__() + self.use_self_attn = use_self_attn + + if self.use_self_attn: + self.norm_self_attn = operations.LayerNorm(model_dim, device=device, dtype=dtype) if layer_norm else operations.RMSNorm(model_dim, eps=1e-6, device=device, dtype=dtype) + self.self_attn = Attention( + query_dim=model_dim, + context_dim=model_dim, + n_heads=num_heads, + head_dim=model_dim//num_heads, + device=device, + dtype=dtype, + operations=operations, + ) + + self.norm_cross_attn = operations.LayerNorm(model_dim, device=device, dtype=dtype) if layer_norm else operations.RMSNorm(model_dim, eps=1e-6, device=device, dtype=dtype) + self.cross_attn = Attention( + query_dim=model_dim, + context_dim=source_dim, + n_heads=num_heads, + head_dim=model_dim//num_heads, + device=device, + dtype=dtype, + operations=operations, + ) + + self.norm_mlp = operations.LayerNorm(model_dim, device=device, dtype=dtype) if layer_norm else operations.RMSNorm(model_dim, eps=1e-6, device=device, dtype=dtype) + self.mlp = nn.Sequential( + operations.Linear(model_dim, int(model_dim * mlp_ratio), device=device, dtype=dtype), + nn.GELU(), + operations.Linear(int(model_dim * mlp_ratio), model_dim, device=device, dtype=dtype) + ) + + def forward(self, x, context, 
target_attention_mask=None, source_attention_mask=None, position_embeddings=None, position_embeddings_context=None): + if self.use_self_attn: + normed = self.norm_self_attn(x) + attn_out = self.self_attn(normed, mask=target_attention_mask, position_embeddings=position_embeddings, position_embeddings_context=position_embeddings) + x = x + attn_out + + normed = self.norm_cross_attn(x) + attn_out = self.cross_attn(normed, mask=source_attention_mask, context=context, position_embeddings=position_embeddings, position_embeddings_context=position_embeddings_context) + x = x + attn_out + + x = x + self.mlp(self.norm_mlp(x)) + return x + + def init_weights(self): + torch.nn.init.zeros_(self.mlp[2].weight) + self.cross_attn.init_weights() + + +class LLMAdapter(nn.Module): + def __init__( + self, + source_dim=1024, + target_dim=1024, + model_dim=1024, + num_layers=6, + num_heads=16, + use_self_attn=True, + layer_norm=False, + device=None, + dtype=None, + operations=None, + ): + super().__init__() + + self.embed = operations.Embedding(32128, target_dim, device=device, dtype=dtype) + if model_dim != target_dim: + self.in_proj = operations.Linear(target_dim, model_dim, device=device, dtype=dtype) + else: + self.in_proj = nn.Identity() + self.rotary_emb = RotaryEmbedding(model_dim//num_heads) + self.blocks = nn.ModuleList([ + TransformerBlock(source_dim, model_dim, num_heads=num_heads, use_self_attn=use_self_attn, layer_norm=layer_norm, device=device, dtype=dtype, operations=operations) for _ in range(num_layers) + ]) + self.out_proj = operations.Linear(model_dim, target_dim, device=device, dtype=dtype) + self.norm = operations.RMSNorm(target_dim, eps=1e-6, device=device, dtype=dtype) + + def forward(self, source_hidden_states, target_input_ids, target_attention_mask=None, source_attention_mask=None): + if target_attention_mask is not None: + target_attention_mask = target_attention_mask.to(torch.bool) + if target_attention_mask.ndim == 2: + target_attention_mask = target_attention_mask.unsqueeze(1).unsqueeze(1) + + if source_attention_mask is not None: + source_attention_mask = source_attention_mask.to(torch.bool) + if source_attention_mask.ndim == 2: + source_attention_mask = source_attention_mask.unsqueeze(1).unsqueeze(1) + + x = self.in_proj(self.embed(target_input_ids)) + context = source_hidden_states + position_ids = torch.arange(x.shape[1], device=x.device).unsqueeze(0) + position_ids_context = torch.arange(context.shape[1], device=x.device).unsqueeze(0) + position_embeddings = self.rotary_emb(x, position_ids) + position_embeddings_context = self.rotary_emb(x, position_ids_context) + for block in self.blocks: + x = block(x, context, target_attention_mask=target_attention_mask, source_attention_mask=source_attention_mask, position_embeddings=position_embeddings, position_embeddings_context=position_embeddings_context) + return self.norm(self.out_proj(x)) + + +class Anima(MiniTrainDIT): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.llm_adapter = LLMAdapter(device=kwargs.get("device"), dtype=kwargs.get("dtype"), operations=kwargs.get("operations")) + + def preprocess_text_embeds(self, text_embeds, text_ids): + if text_ids is not None: + return self.llm_adapter(text_embeds, text_ids) + else: + return text_embeds diff --git a/comfy/model_base.py b/comfy/model_base.py index 28ba2643e..1d57562cc 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -49,6 +49,7 @@ import comfy.ldm.ace.model import comfy.ldm.omnigen.omnigen2 import comfy.ldm.qwen_image.model 
import comfy.ldm.kandinsky5.model +import comfy.ldm.anima.model import comfy.model_management import comfy.patcher_extension @@ -1147,6 +1148,27 @@ class CosmosPredict2(BaseModel): sigma = (sigma / (sigma + 1)) return latent_image / (1.0 - sigma) +class Anima(BaseModel): + def __init__(self, model_config, model_type=ModelType.FLOW, device=None): + super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.anima.model.Anima) + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + cross_attn = kwargs.get("cross_attn", None) + t5xxl_ids = kwargs.get("t5xxl_ids", None) + t5xxl_weights = kwargs.get("t5xxl_weights", None) + device = kwargs["device"] + if cross_attn is not None: + if t5xxl_ids is not None: + cross_attn = self.diffusion_model.preprocess_text_embeds(cross_attn.to(device=device, dtype=self.get_dtype()), t5xxl_ids.unsqueeze(0).to(device=device)) + if t5xxl_weights is not None: + cross_attn *= t5xxl_weights.unsqueeze(0).unsqueeze(-1).to(cross_attn) + + if cross_attn.shape[1] < 512: + cross_attn = torch.nn.functional.pad(cross_attn, (0, 0, 0, 512 - cross_attn.shape[1])) + out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) + return out + class Lumina2(BaseModel): def __init__(self, model_config, model_type=ModelType.FLOW, device=None): super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.lumina.model.NextDiT) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index dad206a2f..b29a033cc 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -550,6 +550,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): if '{}blocks.0.mlp.layer1.weight'.format(key_prefix) in state_dict_keys: # Cosmos predict2 dit_config = {} dit_config["image_model"] = "cosmos_predict2" + if "{}llm_adapter.blocks.0.cross_attn.q_proj.weight".format(key_prefix) in state_dict_keys: + dit_config["image_model"] = "anima" dit_config["max_img_h"] = 240 dit_config["max_img_w"] = 240 dit_config["max_frames"] = 128 diff --git a/comfy/sd.py b/comfy/sd.py index 77700dfd3..f7f6a44a0 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -57,6 +57,7 @@ import comfy.text_encoders.ovis import comfy.text_encoders.kandinsky5 import comfy.text_encoders.jina_clip_2 import comfy.text_encoders.newbie +import comfy.text_encoders.anima import comfy.model_patcher import comfy.lora @@ -1048,6 +1049,7 @@ class TEModel(Enum): GEMMA_3_12B = 18 JINA_CLIP_2 = 19 QWEN3_8B = 20 + QWEN3_06B = 21 def detect_te_model(sd): @@ -1093,6 +1095,8 @@ def detect_te_model(sd): return TEModel.QWEN3_2B elif weight.shape[0] == 4096: return TEModel.QWEN3_8B + elif weight.shape[0] == 1024: + return TEModel.QWEN3_06B if weight.shape[0] == 5120: if "model.layers.39.post_attention_layernorm.weight" in sd: return TEModel.MISTRAL3_24B @@ -1233,6 +1237,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip elif te_model == TEModel.JINA_CLIP_2: clip_target.clip = comfy.text_encoders.jina_clip_2.JinaClip2TextModelWrapper clip_target.tokenizer = comfy.text_encoders.jina_clip_2.JinaClip2TokenizerWrapper + elif te_model == TEModel.QWEN3_06B: + clip_target.clip = comfy.text_encoders.anima.te(**llama_detect(clip_data)) + clip_target.tokenizer = comfy.text_encoders.anima.AnimaTokenizer else: # clip_l if clip_type == CLIPType.SD3: diff --git a/comfy/supported_models.py b/comfy/supported_models.py index c8a7f6efb..70abebf46 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -23,6 +23,7 @@ import 
comfy.text_encoders.qwen_image import comfy.text_encoders.hunyuan_image import comfy.text_encoders.kandinsky5 import comfy.text_encoders.z_image +import comfy.text_encoders.anima from . import supported_models_base from . import latent_formats @@ -992,6 +993,36 @@ class CosmosT2IPredict2(supported_models_base.BASE): t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref)) return supported_models_base.ClipTarget(comfy.text_encoders.cosmos.CosmosT5Tokenizer, comfy.text_encoders.cosmos.te(**t5_detect)) +class Anima(supported_models_base.BASE): + unet_config = { + "image_model": "anima", + } + + sampling_settings = { + "multiplier": 1.0, + "shift": 3.0, + } + + unet_extra_config = {} + latent_format = latent_formats.Wan21 + + memory_usage_factor = 1.0 + + supported_inference_dtypes = [torch.bfloat16, torch.float32] + + def __init__(self, unet_config): + super().__init__(unet_config) + self.memory_usage_factor = (unet_config.get("model_channels", 2048) / 2048) * 0.95 + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.Anima(self, device=device) + return out + + def clip_target(self, state_dict={}): + pref = self.text_encoder_key_prefix[0] + detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen3_06b.transformer.".format(pref)) + return supported_models_base.ClipTarget(comfy.text_encoders.anima.AnimaTokenizer, comfy.text_encoders.anima.te(**detect)) + class CosmosI2VPredict2(CosmosT2IPredict2): unet_config = { "image_model": "cosmos_predict2", @@ -1551,6 +1582,6 @@ class Kandinsky5Image(Kandinsky5): return supported_models_base.ClipTarget(comfy.text_encoders.kandinsky5.Kandinsky5TokenizerImage, comfy.text_encoders.kandinsky5.te(**hunyuan_detect)) -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5] +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima] models += [SVD_img2vid] diff --git 
a/comfy/text_encoders/anima.py b/comfy/text_encoders/anima.py new file mode 100644 index 000000000..41f95bcb6 --- /dev/null +++ b/comfy/text_encoders/anima.py @@ -0,0 +1,61 @@ +from transformers import Qwen2Tokenizer, T5TokenizerFast +import comfy.text_encoders.llama +from comfy import sd1_clip +import os +import torch + + +class Qwen3Tokenizer(sd1_clip.SDTokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "qwen25_tokenizer") + super().__init__(tokenizer_path, pad_with_end=False, embedding_size=1024, embedding_key='qwen3_06b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=151643, tokenizer_data=tokenizer_data) + +class T5XXLTokenizer(sd1_clip.SDTokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer") + super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_data=tokenizer_data) + +class AnimaTokenizer: + def __init__(self, embedding_directory=None, tokenizer_data={}): + self.qwen3_06b = Qwen3Tokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data) + self.t5xxl = T5XXLTokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data) + + def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs): + out = {} + qwen_ids = self.qwen3_06b.tokenize_with_weights(text, return_word_ids, **kwargs) + out["qwen3_06b"] = [[(token, 1.0) for token, _ in inner_list] for inner_list in qwen_ids] # Set weights to 1.0 + out["t5xxl"] = self.t5xxl.tokenize_with_weights(text, return_word_ids, **kwargs) + return out + + def untokenize(self, token_weight_pair): + return self.t5xxl.untokenize(token_weight_pair) + + def state_dict(self): + return {} + + +class Qwen3_06BModel(sd1_clip.SDClipModel): + def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}): + super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Qwen3_06B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options) + + +class AnimaTEModel(sd1_clip.SD1ClipModel): + def __init__(self, device="cpu", dtype=None, model_options={}): + super().__init__(device=device, dtype=dtype, name="qwen3_06b", clip_model=Qwen3_06BModel, model_options=model_options) + + def encode_token_weights(self, token_weight_pairs): + out = super().encode_token_weights(token_weight_pairs) + out[2]["t5xxl_ids"] = torch.tensor(list(map(lambda a: a[0], token_weight_pairs["t5xxl"][0])), dtype=torch.int) + out[2]["t5xxl_weights"] = torch.tensor(list(map(lambda a: a[1], token_weight_pairs["t5xxl"][0]))) + return out + +def te(dtype_llama=None, llama_quantization_metadata=None): + class AnimaTEModel_(AnimaTEModel): + def __init__(self, device="cpu", dtype=None, model_options={}): + if dtype_llama is not None: + dtype = dtype_llama + if llama_quantization_metadata is not None: + model_options = model_options.copy() + model_options["quantization_metadata"] = 
llama_quantization_metadata + super().__init__(device=device, dtype=dtype, model_options=model_options) + return AnimaTEModel_
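
A note on how the pieces of this last patch fit together: `AnimaTokenizer` produces two token streams (Qwen3-0.6B and T5-XXL ids), `AnimaTEModel.encode_token_weights` passes the raw `t5xxl_ids`/`t5xxl_weights` along with the Qwen3 hidden states, and `Anima.extra_conds` hands both to the model's `LLMAdapter`, which re-embeds the T5 ids and cross-attends them against the Qwen3 states. Below is a minimal sketch that exercises the adapter in isolation; it is illustrative only, assumes this patch series is applied and ComfyUI is importable, and uses plain `torch.nn` as a stand-in for ComfyUI's `operations` wrapper (compatible `Linear`, `Embedding`, `RMSNorm`, and `LayerNorm`; `nn.RMSNorm` requires torch >= 2.4).

```python
import torch
from comfy.ldm.anima.model import LLMAdapter

# torch.nn stands in for ComfyUI's ops wrapper here; constructor defaults match
# the patch: 1024-dim source/target, 6 blocks, 16 heads, self-attention enabled.
adapter = LLMAdapter(operations=torch.nn)

qwen_hidden = torch.randn(1, 20, 1024)      # Qwen3-0.6B hidden states (source context)
t5_ids = torch.randint(0, 32128, (1, 77))   # T5-XXL token ids (target sequence)

out = adapter(qwen_hidden, t5_ids)
print(out.shape)                            # torch.Size([1, 77, 1024])
```

In the full pipeline this call happens inside `Anima.extra_conds`, which then scales the result by `t5xxl_weights` and pads the sequence to 512 tokens before passing it to the DiT as `c_crossattn`.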