Merge branch 'comfyanonymous:master' into offloader-maifee

2025-12-20 03:23:00 +08:00 · 2025-11-25 22:08:51 +06:00 · 2025-11-25 22:08:51 +06:00 · 6d96d26795
commit 6d96d26795
parent e07a32c9b8 6b573ae0cb
62 changed files with 3558 additions and 709 deletions
--- a/.github/PULL_REQUEST_TEMPLATE/api-node.md
+++ b/.github/PULL_REQUEST_TEMPLATE/api-node.md
@ -0,0 +1,21 @@
 <!-- API_NODE_PR_CHECKLIST: do not remove -->
 ## API Node PR Checklist
 ### Scope
 - [ ] **Is API Node Change**
 ### Pricing & Billing
 - [ ] **Need pricing update**
 - [ ] **No pricing update**
 If **Need pricing update**:
 - [ ] Metronome rate cards updated
 - [ ] Auto‑billing tests updated and passing
 ### QA
 - [ ] **QA done**
 - [ ] **QA not required**
 ### Comms
 - [ ] Informed **Kosinkadink**
--- a/.github/workflows/api-node-template.yml
+++ b/.github/workflows/api-node-template.yml
@ -0,0 +1,58 @@
 # Appends the API Node PR checklist to the description of pull requests that
 # touch comfy_api_nodes/, unless the checklist marker is already in the body.
 name: Append API Node PR template
 # NOTE(review): pull_request_target is presumably used so the job gets a token
 # that may edit the PR description even for forks; confirm. The only step calls
 # the REST API and does not check out or run code from the pull request.
 on:
  pull_request_target:
    types: [opened, reopened, synchronize, ready_for_review]
    paths:
      - 'comfy_api_nodes/**'   # only run if these files changed
 # Read access to fetch the template file, write access to update the PR body.
 permissions:
  contents: read
  pull-requests: write
 jobs:
  inject:
    runs-on: ubuntu-latest
    steps:
      # The script below:
      #   1. loads the template from the PR's base branch and fails the job if
      #      it cannot be fetched or lacks the marker comment;
      #   2. exits without changes when the PR body already contains the marker;
      #   3. otherwise appends the template text to the existing PR body.
      - name: Ensure template exists and append to PR body
        uses: actions/github-script@v7
        with:
          script: |
            const { owner, repo } = context.repo;
            const number = context.payload.pull_request.number;
            const templatePath = '.github/PULL_REQUEST_TEMPLATE/api-node.md';
            const marker = '<!-- API_NODE_PR_CHECKLIST: do not remove -->';
            const { data: pr } = await github.rest.pulls.get({ owner, repo, pull_number: number });
            let templateText;
            try {
              const res = await github.rest.repos.getContent({
                owner,
                repo,
                path: templatePath,
                ref: pr.base.ref
              });
              const buf = Buffer.from(res.data.content, res.data.encoding || 'base64');
              templateText = buf.toString('utf8');
            } catch (e) {
              core.setFailed(`Required PR template not found at "${templatePath}" on ${pr.base.ref}. Please add it to the repo.`);
              return;
            }
            // Enforce the presence of the marker inside the template (for idempotence)
            if (!templateText.includes(marker)) {
              core.setFailed(`Template at "${templatePath}" does not contain the required marker:\n${marker}\nAdd it so we can detect duplicates safely.`);
              return;
            }
            // If the PR already contains the marker, do not append again.
            const body = pr.body || '';
            if (body.includes(marker)) {
              core.info('Template already present in PR body; nothing to inject.');
              return;
            }
            const newBody = (body ? body + '\n\n' : '') + templateText + '\n';
            await github.rest.pulls.update({ owner, repo, pull_number: number, body: newBody });
            core.notice('API Node template appended to PR description.');
--- a/.github/workflows/release-stable-all.yml
+++ b/.github/workflows/release-stable-all.yml
@ -14,7 +14,7 @@ jobs:
      contents: "write"
      packages: "write"
      pull-requests: "read"
-    name: "Release NVIDIA Default (cu129)"
+    name: "Release NVIDIA Default (cu130)"
    uses: ./.github/workflows/stable-release.yml
    with:
      git_tag: ${{ inputs.git_tag }}
@ -43,6 +43,23 @@ jobs:
      test_release: true
    secrets: inherit
  release_nvidia_cu126:
    permissions:
      contents: "write"
      packages: "write"
      pull-requests: "read"
    name: "Release NVIDIA cu126"
    uses: ./.github/workflows/stable-release.yml
    with:
      git_tag: ${{ inputs.git_tag }}
      cache_tag: "cu126"
      python_minor: "12"
      python_patch: "10"
      rel_name: "nvidia"
      rel_extra_name: "_cu126"
      test_release: true
    secrets: inherit
  release_amd_rocm:
    permissions:
      contents: "write"
--- a/.github/workflows/test-ci.yml
+++ b/.github/workflows/test-ci.yml
@ -21,14 +21,15 @@ jobs:
      fail-fast: false
      matrix:
        # os: [macos, linux, windows]
-        os: [macos, linux]
+        # os: [macos, linux]
-        python_version: ["3.9", "3.10", "3.11", "3.12"]
+        os: [linux]
        python_version: ["3.10", "3.11", "3.12"]
        cuda_version: ["12.1"]
        torch_version: ["stable"]
        include:
-          - os: macos
+          # - os: macos
-            runner_label: [self-hosted, macOS]
+          #   runner_label: [self-hosted, macOS]
-            flags: "--use-pytorch-cross-attention"
+          #   flags: "--use-pytorch-cross-attention"
          - os: linux
            runner_label: [self-hosted, Linux]
            flags: ""
@ -73,14 +74,15 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        os: [macos, linux]
+        # os: [macos, linux]
        os: [linux]
        python_version: ["3.11"]
        cuda_version: ["12.1"]
        torch_version: ["nightly"]
        include:
-          - os: macos
+          # - os: macos
-            runner_label: [self-hosted, macOS]
+          #   runner_label: [self-hosted, macOS]
-            flags: "--use-pytorch-cross-attention"
+          #   flags: "--use-pytorch-cross-attention"
          - os: linux
            runner_label: [self-hosted, Linux]
            flags: ""
--- a/QUANTIZATION.md
+++ b/QUANTIZATION.md
@ -0,0 +1,168 @@
 # The Comfy guide to Quantization
 ## How does quantization work?
 Quantization aims to map a high-precision value x_f to a lower-precision format with minimal loss in accuracy. These smaller formats reduce the model's memory footprint and increase throughput by using specialized hardware.
 When simply converting a value from FP16 to FP8 using the round-nearest method we might hit two issues:
 - The dynamic range of FP16 (-65,504, 65,504) far exceeds FP8 formats like E4M3 (-448, 448) or E5M2 (-57,344, 57,344), potentially resulting in clipped values
 - The original values are concentrated in a small range (e.g. -1,1) leaving many FP8-bits "unused"
 By using a scaling factor, we aim to map these values into the quantized-dtype range, making use of the full spectrum. One of the simplest and most common approaches is per-tensor absolute-maximum scaling.
 ```
 absmax = max(abs(tensor))
 scale = absmax / max_dynamic_range_low_precision
 # Quantization
 tensor_q = (tensor / scale).to(low_precision_dtype)
 # De-Quantization
 tensor_dq = tensor_q.to(fp16) * scale
 tensor_dq ~ tensor
 ```
 Given that additional information (scaling factor) is needed to "interpret" the quantized values, we describe those as derived datatypes.
 ## Quantization in Comfy
 ```
 QuantizedTensor (torch.Tensor subclass)
  ↓ __torch_dispatch__
 Two-Level Registry (generic + layout handlers)
  ↓
 MixedPrecisionOps + Metadata Detection
 ```
 ### Representation
 To represent these derived datatypes, ComfyUI uses a subclass of torch.Tensor: the `QuantizedTensor` class found in `comfy/quant_ops.py`.
 A `Layout` class defines how a specific quantization format behaves:
 - Required parameters
 - Quantize method
 - De-Quantize method
 ```python
 from comfy.quant_ops import QuantizedLayout
 class MyLayout(QuantizedLayout):
    @classmethod
    def quantize(cls, tensor, **kwargs):
        # Convert to quantized format
        qdata = ...
        params = {'scale': ..., 'orig_dtype': tensor.dtype}
        return qdata, params
    @staticmethod
    def dequantize(qdata, scale, orig_dtype, **kwargs):
        return qdata.to(orig_dtype) * scale
 ```
 To then run operations using these QuantizedTensors we use two registry systems to define supported operations. 
 The first is a **generic registry** that handles operations common to all quantized formats (e.g., `.to()`, `.clone()`, `.reshape()`).
 The second registry is layout-specific and allows implementing fast paths for operations such as nn.Linear.
 ```python
 from comfy.quant_ops import register_layout_op
@register_layout_op(torch.ops.aten.linear.default, MyLayout)
 def my_linear(func, args, kwargs):
    # Extract tensors, call optimized kernel
    ...
 ```
 When `torch.nn.functional.linear()` is called with QuantizedTensor arguments, `__torch_dispatch__` automatically routes to the registered implementation.
 For any unsupported operation, QuantizedTensor falls back to calling `dequantize` and dispatching to the high-precision implementation.
 ### Mixed Precision
 The `MixedPrecisionOps` class (lines 542-648 in `comfy/ops.py`) enables per-layer quantization decisions, allowing different layers in a model to use different precisions. This is activated when a model config contains a `layer_quant_config` dictionary that specifies which layers should be quantized and how.
 **Architecture:**
 ```python
 class MixedPrecisionOps(disable_weight_init):
    _layer_quant_config = {}  # Maps layer names to quantization configs
    _compute_dtype = torch.bfloat16  # Default compute / dequantize precision
 ```
 **Key mechanism:**
 The custom `Linear._load_from_state_dict()` method inspects each layer during model loading:
 - If the layer name is **not** in `_layer_quant_config`: load weight as regular tensor in `_compute_dtype`
 - If the layer name **is** in `_layer_quant_config`: 
  - Load weight as `QuantizedTensor` with the specified layout (e.g., `TensorCoreFP8Layout`)
  - Load associated quantization parameters (scales, block_size, etc.)
 **Why it's needed:**
 Not all layers tolerate quantization equally. Sensitive operations like final projections can be kept in higher precision, while compute-heavy matmuls are quantized. This provides most of the performance benefits while maintaining quality.
 The system is selected in `pick_operations()` when `model_config.layer_quant_config` is present, making it the highest-priority operation mode.
 ## Checkpoint Format
 Quantized checkpoints are stored as standard safetensors files with quantized weight tensors and associated scaling parameters, plus a `_quantization_metadata` JSON entry describing the quantization scheme.
 The quantized checkpoint will contain the same layers as the original checkpoint but:
 - The weights are stored as quantized values, sometimes using a different storage datatype. E.g. uint8 container for fp8.
 - For each quantized weight a number of additional scaling parameters are stored alongside depending on the recipe.
 - We store a metadata.json in the metadata of the final safetensors file containing the `_quantization_metadata`, which describes which layers are quantized and which layout has been used.
 ### Scaling Parameters details
 We define 4 possible scaling parameters that should cover most recipes in the near-future:
 - **weight_scale**: quantization scalers for the weights
 - **weight_scale_2**: global scalers in the context of double scaling
 - **pre_quant_scale**: scalers used for smoothing salient weights
 - **input_scale**: quantization scalers for the activations
 | Format | Storage dtype | weight_scale | weight_scale_2 | pre_quant_scale | input_scale |
 |--------|---------------|--------------|----------------|-----------------|-------------|
 | float8_e4m3fn | float32 | float32 (scalar) | - | - | float32 (scalar) |
 You can find the defined formats in `comfy/quant_ops.py` (QUANT_ALGOS).
 ### Quantization Metadata
 The metadata stored alongside the checkpoint contains:
 - **format_version**: String to define a version of the standard
 - **layers**: A dictionary mapping layer names to their quantization format. The format string maps to the definitions found in `QUANT_ALGOS`. 
 Example:
 ```json
 {
  "_quantization_metadata": {
    "format_version": "1.0",
    "layers": {
      "model.layers.0.mlp.up_proj": "float8_e4m3fn",
      "model.layers.0.mlp.down_proj": "float8_e4m3fn",
      "model.layers.1.mlp.up_proj": "float8_e4m3fn"
    }
  }
 }
 ```
 ## Creating Quantized Checkpoints
 To create compatible checkpoints, use any quantization tool provided the output follows the checkpoint format described above and uses a layout defined in `QUANT_ALGOS`.
 ### Weight Quantization
 Weight quantization is straightforward - compute the scaling factor directly from the weight tensor using the absolute maximum method described earlier. Each layer's weights are quantized independently and stored with their corresponding `weight_scale` parameter.
 ### Calibration (for Activation Quantization)
 Activation quantization (e.g., for FP8 Tensor Core operations) requires `input_scale` parameters that cannot be determined from static weights alone. Since activation values depend on actual inputs, we calibrate them as part of **post-training quantization (PTQ)**:
 1. **Collect statistics**: Run inference on N representative samples
 2. **Track activations**: Record the absolute maximum (`amax`) of inputs to each quantized layer
 3. **Compute scales**: Derive `input_scale` from collected statistics
 4. **Store in checkpoint**: Save `input_scale` parameters alongside weights
 The calibration dataset should be representative of your target use case. For diffusion models, this typically means a diverse set of prompts and generation parameters.
--- a/README.md
+++ b/README.md
@ -173,7 +173,7 @@ There is a portable standalone build for Windows that should work for running on
 ### [Direct link to download](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia.7z)
-Simply download, extract with [7-Zip](https://7-zip.org) and run. Make sure you put your Stable Diffusion checkpoints/models (the huge ckpt/safetensors files) in: ComfyUI\models\checkpoints
+Simply download, extract with [7-Zip](https://7-zip.org) or with Windows Explorer on recent Windows versions, and run. For smaller models you normally only need to put the checkpoints (the huge ckpt/safetensors files) in ComfyUI\models\checkpoints, but many of the larger models have multiple files. Make sure to follow each model's instructions to know which subfolder of ComfyUI\models\ to put them in.
 If you have trouble extracting it, right click the file -> properties -> unblock
@ -183,7 +183,9 @@ Update your Nvidia drivers if it doesn't start.
 [Experimental portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z)
-[Portable with pytorch cuda 12.8 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu128.7z) (Supports Nvidia 10 series and older GPUs).
+[Portable with pytorch cuda 12.8 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu128.7z).
 [Portable with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs).
 #### How do I share models between another UI and ComfyUI?
@ -200,7 +202,7 @@ comfy install
 ## Manual Install (Windows, Linux)
-Python 3.14 will work if you comment out the `kornia` dependency in the requirements.txt file (breaks the canny node) but it is not recommended.
+Python 3.14 works but you may encounter issues with the torch compile node. The free threaded variant is still missing some dependencies.
 Python 3.13 is very well supported. If you have trouble with some custom node dependencies on 3.13 you can try 3.12
@ -221,7 +223,7 @@ AMD users can install rocm and pytorch with pip if you don't have it already ins
 This is the command to install the nightly with ROCm 7.0 which might have some performance improvements:
-```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm7.0```
+```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm7.1```
 ### AMD GPUs (Experimental: Windows and Linux), RDNA 3, 3.5 and 4 only.
@ -242,7 +244,7 @@ RDNA 4 (RX 9000 series):
 ### Intel GPUs (Windows and Linux)
-(Option 1) Intel Arc GPU users can install native PyTorch with torch.xpu support using pip. More information can be found [here](https://pytorch.org/docs/main/notes/get_start_xpu.html)
+Intel Arc GPU users can install native PyTorch with torch.xpu support using pip. More information can be found [here](https://pytorch.org/docs/main/notes/get_start_xpu.html)
 1. To install PyTorch xpu, use the following command:
@ -252,10 +254,6 @@ This is the command to install the Pytorch xpu nightly which might have some per
 ```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu```
 (Option 2) Alternatively, Intel GPUs supported by Intel Extension for PyTorch (IPEX) can leverage IPEX for improved performance.
 1. visit [Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu) for more information.
 ### NVIDIA
 Nvidia users should install stable pytorch using this command:
--- a/app/frontend_management.py
+++ b/app/frontend_management.py
@ -10,7 +10,8 @@ import importlib
 from dataclasses import dataclass
 from functools import cached_property
 from pathlib import Path
-from typing import TypedDict, Optional
+from typing import Dict, TypedDict, Optional
 from aiohttp import web
 from importlib.metadata import version
 import requests
@ -257,7 +258,54 @@ comfyui-frontend-package is not installed.
            sys.exit(-1)
    @classmethod
-    def templates_path(cls) -> str:
+    def template_asset_map(cls) -> Optional[Dict[str, str]]:
        """Return a mapping of template asset names to their absolute paths.

        Returns:
            A dict keyed by asset filename whose values are whatever
            ``get_asset_path(template_id, filename)`` returns, or ``None`` when
            the ``comfyui_workflow_templates`` package cannot be imported,
            when enumerating templates or resolving paths raises, or when no
            assets are found. Failures are logged with ``logging.error``
            rather than raised.
        """
        # Imported lazily so a missing package is reported instead of crashing.
        try:
            from comfyui_workflow_templates import (
                get_asset_path,
                iter_templates,
            )
        except ImportError:
            logging.error(
                f"""
 ********** ERROR ***********
 comfyui-workflow-templates is not installed.
 {frontend_install_warning_message()}
 ********** ERROR ***********
 """.strip()
            )
            return None
        # Materialise the iterator up front so enumeration errors are reported
        # separately from path-resolution errors below.
        try:
            template_entries = list(iter_templates())
        except Exception as exc:
            logging.error(f"Failed to enumerate workflow templates: {exc}")
            return None
        asset_map: Dict[str, str] = {}
        try:
            for entry in template_entries:
                for asset in entry.assets:
                    # Keyed by filename only: if two templates ship an asset
                    # with the same filename, the later one replaces the earlier.
                    asset_map[asset.filename] = get_asset_path(
                        entry.template_id, asset.filename
                    )
        except Exception as exc:
            logging.error(f"Failed to resolve template asset paths: {exc}")
            return None
        # An empty mapping is treated as a failure, not as a valid result.
        if not asset_map:
            logging.error("No workflow template assets found. Did the packages install correctly?")
            return None
        return asset_map
    @classmethod
    def legacy_templates_path(cls) -> Optional[str]:
        """Return the legacy templates directory shipped inside the meta package."""
        try:
            import comfyui_workflow_templates
@ -276,6 +324,7 @@ comfyui-workflow-templates is not installed.
 ********** ERROR ***********
 """.strip()
            )
            return None
    @classmethod
    def embedded_docs_path(cls) -> str:
@ -392,3 +441,17 @@ comfyui-workflow-templates is not installed.
            logging.info("Falling back to the default frontend.")
            check_frontend_version()
            return cls.default_frontend_path()
    @classmethod
    def template_asset_handler(cls):
        """Build a request handler that serves workflow template assets.

        Returns:
            An async handler that looks up the ``path`` match-info value in
            the mapping from ``cls.template_asset_map()`` and responds with the
            matching file, or ``None`` when that mapping is missing or empty.
        """
        asset_paths = cls.template_asset_map()
        if not asset_paths:
            return None

        async def serve_template(request: web.Request) -> web.StreamResponse:
            # Only names present in the mapping are served; anything else is a 404.
            asset_file = asset_paths.get(request.match_info.get("path", ""))
            if asset_file is not None:
                return web.FileResponse(asset_file)
            raise web.HTTPNotFound()

        return serve_template
--- a/comfy/cldm/cldm.py
+++ b/comfy/cldm/cldm.py
@ -413,7 +413,8 @@ class ControlNet(nn.Module):
        out_middle = []
        if self.num_classes is not None:
-            assert y.shape[0] == x.shape[0]
+            if y is None:
                raise ValueError("y is None, did you try using a controlnet for SDXL on SD1?")
            emb = emb + self.label_emb(y)
        h = x
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@ -171,7 +171,7 @@ parser.add_argument("--windows-standalone-build", action="store_true", help="Win
 parser.add_argument("--disable-metadata", action="store_true", help="Disable saving prompt metadata in files.")
 parser.add_argument("--disable-all-custom-nodes", action="store_true", help="Disable loading all custom nodes.")
 parser.add_argument("--whitelist-custom-nodes", type=str, nargs='+', default=[], help="Specify custom node folders to load even when --disable-all-custom-nodes is enabled.")
-parser.add_argument("--disable-api-nodes", action="store_true", help="Disable loading all api nodes.")
+parser.add_argument("--disable-api-nodes", action="store_true", help="Disable loading all api nodes. Also prevents the frontend from communicating with the internet.")
 parser.add_argument("--multi-user", action="store_true", help="Enables per-user storage.")
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@ -178,6 +178,15 @@ class Flux(SD3):
    def process_out(self, latent):
        return (latent / self.scale_factor) + self.shift_factor
 class Flux2(LatentFormat):
    """Latent format with 128 channels whose in/out processing is the identity."""

    latent_channels = 128

    def process_out(self, latent):
        """Return ``latent`` unchanged."""
        return latent

    def process_in(self, latent):
        """Return ``latent`` unchanged."""
        return latent
 class Mochi(LatentFormat):
    latent_channels = 12
    latent_dimensions = 3
@ -611,6 +620,66 @@ class HunyuanImage21Refiner(LatentFormat):
    latent_dimensions = 3
    scale_factor = 1.03682
    def process_in(self, latent):
        """Scale a 5-D latent and fold pairs of dim-2 slices into dim 1.

        The first slice along dim 2 is duplicated and prepended, then every
        two consecutive dim-2 slices are concatenated along dim 1, so an input
        of shape (b, c, n, h, w) becomes (b, 2 * c, (n + 1) // 2, h, w).
        NOTE(review): dim 2 is presumably the frame axis -- confirm with callers.
        """
        scaled = latent * self.scale_factor
        # Prepend a copy of the first dim-2 slice.
        padded = torch.cat((scaled[:, :, :1], scaled), dim=2)
        slices_first = padded.permute(0, 2, 1, 3, 4)
        batch, doubled, channels, height, width = slices_first.shape
        paired = slices_first.reshape(batch, doubled // 2, 2 * channels, height, width)
        return paired.permute(0, 2, 1, 3, 4).contiguous()
    def process_out(self, latent):
        """Unscale a 5-D latent and unfold dim 1 back into pairs along dim 2.

        Each dim-2 slice is split into two halves along dim 1, which become
        two consecutive dim-2 slices; the first resulting slice is then
        dropped. An input of shape (b, c, n, h, w) becomes
        (b, c // 2, 2 * n - 1, h, w).
        NOTE(review): dim 2 is presumably the frame axis -- confirm with callers.
        """
        unscaled = latent / self.scale_factor
        slices_first = unscaled.permute(0, 2, 1, 3, 4)
        batch, count, channels, height, width = slices_first.shape
        half = channels // 2
        split = slices_first.reshape(batch, count, 2, half, height, width)
        unfolded = split.reshape(batch, count * 2, half, height, width)
        # Move channels back to dim 1 and drop the first dim-2 slice.
        return unfolded.permute(0, 2, 1, 3, 4)[:, :, 1:]
 class HunyuanVideo15(LatentFormat):
    """Latent format constants for HunyuanVideo 1.5 (data only, no methods)."""
    # 32 rows of 3 coefficients each, one row per latent channel (the row
    # count matches latent_channels below).
    # NOTE(review): presumably the columns are R, G, B weights used to build a
    # preview image from the latent -- confirm against the code that reads them.
    latent_rgb_factors = [
        [ 0.0568, -0.0521, -0.0131],
        [ 0.0014,  0.0735,  0.0326],
        [ 0.0186,  0.0531, -0.0138],
        [-0.0031,  0.0051,  0.0288],
        [ 0.0110,  0.0556,  0.0432],
        [-0.0041, -0.0023, -0.0485],
        [ 0.0530,  0.0413,  0.0253],
        [ 0.0283,  0.0251,  0.0339],
        [ 0.0277, -0.0372, -0.0093],
        [ 0.0393,  0.0944,  0.1131],
        [ 0.0020,  0.0251,  0.0037],
        [-0.0017,  0.0012,  0.0234],
        [ 0.0468,  0.0436,  0.0203],
        [ 0.0354,  0.0439, -0.0233],
        [ 0.0090,  0.0123,  0.0346],
        [ 0.0382,  0.0029,  0.0217],
        [ 0.0261, -0.0300,  0.0030],
        [-0.0088, -0.0220, -0.0283],
        [-0.0272, -0.0121, -0.0363],
        [-0.0664, -0.0622,  0.0144],
        [ 0.0414,  0.0479,  0.0529],
        [ 0.0355,  0.0612, -0.0247],
        [ 0.0147,  0.0264,  0.0174],
        [ 0.0438,  0.0038,  0.0542],
        [ 0.0431, -0.0573, -0.0033],
        [-0.0162, -0.0211, -0.0406],
        [-0.0487, -0.0295, -0.0393],
        [ 0.0005, -0.0109,  0.0253],
        [ 0.0296,  0.0591,  0.0353],
        [ 0.0119,  0.0181, -0.0306],
        [-0.0085, -0.0362,  0.0229],
        [ 0.0005, -0.0106,  0.0242]
    ]
    # Three-element offset list paired with latent_rgb_factors.
    # NOTE(review): presumably added per colour channel after projection -- confirm.
    latent_rgb_factors_bias = [ 0.0456, -0.0202, -0.0644]
    # Number of latent channels.
    latent_channels = 32
    # NOTE(review): presumably the count of non-batch, non-channel axes
    # (e.g. frames, height, width) -- confirm against LatentFormat.
    latent_dimensions = 3
    # Same value as the scale_factor shown just above process_in in this diff.
    scale_factor = 1.03682
 class Hunyuan3Dv2(LatentFormat):
    latent_channels = 64
    latent_dimensions = 1
--- a/comfy/ldm/chroma/layers.py
+++ b/comfy/ldm/chroma/layers.py
@ -1,15 +1,15 @@
 import torch
 from torch import Tensor, nn
 from comfy.ldm.flux.math import attention
 from comfy.ldm.flux.layers import (
    MLPEmbedder,
    RMSNorm,
    QKNorm,
    SelfAttention,
    ModulationOut,
 )
 # TODO: remove this in a few months
 SingleStreamBlock = None
 DoubleStreamBlock = None
 class ChromaModulationOut(ModulationOut):
@ -48,124 +48,6 @@ class Approximator(nn.Module):
        return x
 class DoubleStreamBlock(nn.Module):
    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, dtype=None, device=None, operations=None):
        super().__init__()
        mlp_hidden_dim = int(hidden_size * mlp_ratio)
        self.num_heads = num_heads
        self.hidden_size = hidden_size
        self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
        self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
        self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
        self.img_mlp = nn.Sequential(
            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
            nn.GELU(approximate="tanh"),
            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
        )
        self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
        self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
        self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
        self.txt_mlp = nn.Sequential(
            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
            nn.GELU(approximate="tanh"),
            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
        )
        self.flipped_img_txt = flipped_img_txt
    def forward(self, img: Tensor, txt: Tensor, pe: Tensor, vec: Tensor, attn_mask=None, transformer_options={}):
        (img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec
        # prepare image for attention
        img_modulated = torch.addcmul(img_mod1.shift, 1 + img_mod1.scale, self.img_norm1(img))
        img_qkv = self.img_attn.qkv(img_modulated)
        img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
        # prepare txt for attention
        txt_modulated = torch.addcmul(txt_mod1.shift, 1 + txt_mod1.scale, self.txt_norm1(txt))
        txt_qkv = self.txt_attn.qkv(txt_modulated)
        txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
        # run actual attention
        attn = attention(torch.cat((txt_q, img_q), dim=2),
                         torch.cat((txt_k, img_k), dim=2),
                         torch.cat((txt_v, img_v), dim=2),
                         pe=pe, mask=attn_mask, transformer_options=transformer_options)
        txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
        # calculate the img bloks
        img.addcmul_(img_mod1.gate, self.img_attn.proj(img_attn))
        img.addcmul_(img_mod2.gate, self.img_mlp(torch.addcmul(img_mod2.shift, 1 + img_mod2.scale, self.img_norm2(img))))
        # calculate the txt bloks
        txt.addcmul_(txt_mod1.gate, self.txt_attn.proj(txt_attn))
        txt.addcmul_(txt_mod2.gate, self.txt_mlp(torch.addcmul(txt_mod2.shift, 1 + txt_mod2.scale, self.txt_norm2(txt))))
        if txt.dtype == torch.float16:
            txt = torch.nan_to_num(txt, nan=0.0, posinf=65504, neginf=-65504)
        return img, txt
 class SingleStreamBlock(nn.Module):
    """
    A DiT block with parallel linear layers as described in
    https://arxiv.org/abs/2302.05442 and adapted modulation interface.
    """
    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qk_scale: float = None,
        dtype=None,
        device=None,
        operations=None
    ):
        super().__init__()
        self.hidden_dim = hidden_size
        self.num_heads = num_heads
        head_dim = hidden_size // num_heads
        self.scale = qk_scale or head_dim**-0.5
        self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
        # qkv and mlp_in
        self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim, dtype=dtype, device=device)
        # proj and mlp_out
        self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, dtype=dtype, device=device)
        self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
        self.hidden_size = hidden_size
        self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
        self.mlp_act = nn.GELU(approximate="tanh")
    def forward(self, x: Tensor, pe: Tensor, vec: Tensor, attn_mask=None, transformer_options={}) -> Tensor:
        mod = vec
        x_mod = torch.addcmul(mod.shift, 1 + mod.scale, self.pre_norm(x))
        qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
        q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        q, k = self.norm(q, k, v)
        # compute attention
        attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options)
        # compute activation in mlp stream, cat again and run second linear layer
        output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
        x.addcmul_(mod.gate, output)
        if x.dtype == torch.float16:
            x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
        return x
 class LastLayer(nn.Module):
    def __init__(self, hidden_size: int, patch_size: int, out_channels: int, dtype=None, device=None, operations=None):
        super().__init__()
--- a/comfy/ldm/chroma/model.py
+++ b/comfy/ldm/chroma/model.py
@ -11,12 +11,12 @@ import comfy.ldm.common_dit
 from comfy.ldm.flux.layers import (
    EmbedND,
    timestep_embedding,
    DoubleStreamBlock,
    SingleStreamBlock,
 )
 from .layers import (
    DoubleStreamBlock,
    LastLayer,
    SingleStreamBlock,
    Approximator,
    ChromaModulationOut,
 )
@ -90,6 +90,7 @@ class Chroma(nn.Module):
                    self.num_heads,
                    mlp_ratio=params.mlp_ratio,
                    qkv_bias=params.qkv_bias,
                    modulation=False,
                    dtype=dtype, device=device, operations=operations
                )
                for _ in range(params.depth)
@ -98,7 +99,7 @@ class Chroma(nn.Module):
        self.single_blocks = nn.ModuleList(
            [
-                SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, dtype=dtype, device=device, operations=operations)
+                SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, modulation=False, dtype=dtype, device=device, operations=operations)
                for _ in range(params.depth_single_blocks)
            ]
        )
@ -178,7 +179,10 @@ class Chroma(nn.Module):
        pe = self.pe_embedder(ids)
        blocks_replace = patches_replace.get("dit", {})
        transformer_options["total_blocks"] = len(self.double_blocks)
        transformer_options["block_type"] = "double"
        for i, block in enumerate(self.double_blocks):
            transformer_options["block_index"] = i
            if i not in self.skip_mmdit:
                double_mod = (
                    self.get_modulations(mod_vectors, "double_img", idx=i),
@ -221,7 +225,10 @@ class Chroma(nn.Module):
        img = torch.cat((txt, img), 1)
        transformer_options["total_blocks"] = len(self.single_blocks)
        transformer_options["block_type"] = "single"
        for i, block in enumerate(self.single_blocks):
            transformer_options["block_index"] = i
            if i not in self.skip_dit:
                single_mod = self.get_modulations(mod_vectors, "single", idx=i)
                if ("single_block", i) in blocks_replace:
--- a/comfy/ldm/chroma_radiance/model.py
+++ b/comfy/ldm/chroma_radiance/model.py
@ -10,12 +10,10 @@ from torch import Tensor, nn
 from einops import repeat
 import comfy.ldm.common_dit
-from comfy.ldm.flux.layers import EmbedND
+from comfy.ldm.flux.layers import EmbedND, DoubleStreamBlock, SingleStreamBlock
 from comfy.ldm.chroma.model import Chroma, ChromaParams
 from comfy.ldm.chroma.layers import (
    DoubleStreamBlock,
    SingleStreamBlock,
    Approximator,
 )
 from .layers import (
@ -89,7 +87,6 @@ class ChromaRadiance(Chroma):
                    dtype=dtype, device=device, operations=operations
                )
        self.double_blocks = nn.ModuleList(
            [
                DoubleStreamBlock(
@ -97,6 +94,7 @@ class ChromaRadiance(Chroma):
                    self.num_heads,
                    mlp_ratio=params.mlp_ratio,
                    qkv_bias=params.qkv_bias,
                    modulation=False,
                    dtype=dtype, device=device, operations=operations
                )
                for _ in range(params.depth)
@ -109,6 +107,7 @@ class ChromaRadiance(Chroma):
                    self.hidden_size,
                    self.num_heads,
                    mlp_ratio=params.mlp_ratio,
                    modulation=False,
                    dtype=dtype, device=device, operations=operations,
                )
                for _ in range(params.depth_single_blocks)
--- a/comfy/ldm/flux/layers.py
+++ b/comfy/ldm/flux/layers.py
@ -48,11 +48,11 @@ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 10
    return embedding
 class MLPEmbedder(nn.Module):
-    def __init__(self, in_dim: int, hidden_dim: int, dtype=None, device=None, operations=None):
+    def __init__(self, in_dim: int, hidden_dim: int, bias=True, dtype=None, device=None, operations=None):
        super().__init__()
-        self.in_layer = operations.Linear(in_dim, hidden_dim, bias=True, dtype=dtype, device=device)
+        self.in_layer = operations.Linear(in_dim, hidden_dim, bias=bias, dtype=dtype, device=device)
        self.silu = nn.SiLU()
-        self.out_layer = operations.Linear(hidden_dim, hidden_dim, bias=True, dtype=dtype, device=device)
+        self.out_layer = operations.Linear(hidden_dim, hidden_dim, bias=bias, dtype=dtype, device=device)
    def forward(self, x: Tensor) -> Tensor:
        return self.out_layer(self.silu(self.in_layer(x)))
@ -80,14 +80,14 @@ class QKNorm(torch.nn.Module):
 class SelfAttention(nn.Module):
-    def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False, dtype=None, device=None, operations=None):
+    def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False, proj_bias: bool = True, dtype=None, device=None, operations=None):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device)
        self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
-        self.proj = operations.Linear(dim, dim, dtype=dtype, device=device)
+        self.proj = operations.Linear(dim, dim, bias=proj_bias, dtype=dtype, device=device)
@dataclass
@ -98,11 +98,11 @@ class ModulationOut:
 class Modulation(nn.Module):
-    def __init__(self, dim: int, double: bool, dtype=None, device=None, operations=None):
+    def __init__(self, dim: int, double: bool, bias=True, dtype=None, device=None, operations=None):
        super().__init__()
        self.is_double = double
        self.multiplier = 6 if double else 3
-        self.lin = operations.Linear(dim, self.multiplier * dim, bias=True, dtype=dtype, device=device)
+        self.lin = operations.Linear(dim, self.multiplier * dim, bias=bias, dtype=dtype, device=device)
    def forward(self, vec: Tensor) -> tuple:
        if vec.ndim == 2:
@ -129,77 +129,129 @@ def apply_mod(tensor, m_mult, m_add=None, modulation_dims=None):
        return tensor
 class SiLUActivation(nn.Module):
    """Gated SiLU activation.

    Splits the input into two chunks along the last dimension, applies SiLU
    to the first chunk and multiplies the result elementwise by the second.
    """
    def __init__(self):
        super().__init__()
        # Activation applied to the first ("gate") chunk only; see forward().
        self.gate_fn = nn.SiLU()
    def forward(self, x: Tensor) -> Tensor:
        """Return ``SiLU(x1) * x2`` where ``x1, x2 = x.chunk(2, dim=-1)``.

        NOTE(review): presumably the last dimension of ``x`` is even, so both
        chunks have the same width and the output's last dimension is half of
        the input's -- confirm against callers.
        """
        # x1 is the gate, x2 is the value that gets scaled by the activated gate.
        x1, x2 = x.chunk(2, dim=-1)
        return self.gate_fn(x1) * x2
 class DoubleStreamBlock(nn.Module):
-    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, dtype=None, device=None, operations=None):
+    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, modulation=True, mlp_silu_act=False, proj_bias=True, dtype=None, device=None, operations=None):
        super().__init__()
        mlp_hidden_dim = int(hidden_size * mlp_ratio)
        self.num_heads = num_heads
        self.hidden_size = hidden_size
-        self.img_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
+        self.modulation = modulation
        if self.modulation:
            self.img_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
        self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
+        self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, proj_bias=proj_bias, dtype=dtype, device=device, operations=operations)
        self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
        self.img_mlp = nn.Sequential(
            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
            nn.GELU(approximate="tanh"),
            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
        )
-        self.txt_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
+        if mlp_silu_act:
            self.img_mlp = nn.Sequential(
                operations.Linear(hidden_size, mlp_hidden_dim * 2, bias=False, dtype=dtype, device=device),
                SiLUActivation(),
                operations.Linear(mlp_hidden_dim, hidden_size, bias=False, dtype=dtype, device=device),
            )
        else:
            self.img_mlp = nn.Sequential(
                operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
                nn.GELU(approximate="tanh"),
                operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
            )
        if self.modulation:
            self.txt_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations)
        self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations)
+        self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, proj_bias=proj_bias, dtype=dtype, device=device, operations=operations)
        self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.txt_mlp = nn.Sequential(
+
-            operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
+        if mlp_silu_act:
-            nn.GELU(approximate="tanh"),
+            self.txt_mlp = nn.Sequential(
-            operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
+                operations.Linear(hidden_size, mlp_hidden_dim * 2, bias=False, dtype=dtype, device=device),
-        )
+                SiLUActivation(),
                operations.Linear(mlp_hidden_dim, hidden_size, bias=False, dtype=dtype, device=device),
            )
        else:
            self.txt_mlp = nn.Sequential(
                operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device),
                nn.GELU(approximate="tanh"),
                operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
            )
        self.flipped_img_txt = flipped_img_txt
    def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims_img=None, modulation_dims_txt=None, transformer_options={}):
-        img_mod1, img_mod2 = self.img_mod(vec)
+        if self.modulation:
-        txt_mod1, txt_mod2 = self.txt_mod(vec)
+            img_mod1, img_mod2 = self.img_mod(vec)
            txt_mod1, txt_mod2 = self.txt_mod(vec)
        else:
            (img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec
        # prepare image for attention
        img_modulated = self.img_norm1(img)
        img_modulated = apply_mod(img_modulated, (1 + img_mod1.scale), img_mod1.shift, modulation_dims_img)
        img_qkv = self.img_attn.qkv(img_modulated)
        del img_modulated
        img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        del img_qkv
        img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
        # prepare txt for attention
        txt_modulated = self.txt_norm1(txt)
        txt_modulated = apply_mod(txt_modulated, (1 + txt_mod1.scale), txt_mod1.shift, modulation_dims_txt)
        txt_qkv = self.txt_attn.qkv(txt_modulated)
        del txt_modulated
        txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        del txt_qkv
        txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
        if self.flipped_img_txt:
            q = torch.cat((img_q, txt_q), dim=2)
            del img_q, txt_q
            k = torch.cat((img_k, txt_k), dim=2)
            del img_k, txt_k
            v = torch.cat((img_v, txt_v), dim=2)
            del img_v, txt_v
            # run actual attention
-            attn = attention(torch.cat((img_q, txt_q), dim=2),
+            attn = attention(q, k, v,
                             torch.cat((img_k, txt_k), dim=2),
                             torch.cat((img_v, txt_v), dim=2),
                             pe=pe, mask=attn_mask, transformer_options=transformer_options)
            del q, k, v
            img_attn, txt_attn = attn[:, : img.shape[1]], attn[:, img.shape[1]:]
        else:
            q = torch.cat((txt_q, img_q), dim=2)
            del txt_q, img_q
            k = torch.cat((txt_k, img_k), dim=2)
            del txt_k, img_k
            v = torch.cat((txt_v, img_v), dim=2)
            del txt_v, img_v
            # run actual attention
-            attn = attention(torch.cat((txt_q, img_q), dim=2),
+            attn = attention(q, k, v,
                             torch.cat((txt_k, img_k), dim=2),
                             torch.cat((txt_v, img_v), dim=2),
                             pe=pe, mask=attn_mask, transformer_options=transformer_options)
            del q, k, v
            txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:]
        # calculate the img blocks
        img += apply_mod(self.img_attn.proj(img_attn), img_mod1.gate, None, modulation_dims_img)
        del img_attn
        img += apply_mod(self.img_mlp(apply_mod(self.img_norm2(img), (1 + img_mod2.scale), img_mod2.shift, modulation_dims_img)), img_mod2.gate, None, modulation_dims_img)
        # calculate the txt blocks
        txt += apply_mod(self.txt_attn.proj(txt_attn), txt_mod1.gate, None, modulation_dims_txt)
        del txt_attn
        txt += apply_mod(self.txt_mlp(apply_mod(self.txt_norm2(txt), (1 + txt_mod2.scale), txt_mod2.shift, modulation_dims_txt)), txt_mod2.gate, None, modulation_dims_txt)
        if txt.dtype == torch.float16:
@ -220,6 +272,9 @@ class SingleStreamBlock(nn.Module):
        num_heads: int,
        mlp_ratio: float = 4.0,
        qk_scale: float = None,
        modulation=True,
        mlp_silu_act=False,
        bias=True,
        dtype=None,
        device=None,
        operations=None
@ -231,30 +286,47 @@ class SingleStreamBlock(nn.Module):
        self.scale = qk_scale or head_dim**-0.5
        self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
        self.mlp_hidden_dim_first = self.mlp_hidden_dim
        if mlp_silu_act:
            self.mlp_hidden_dim_first = int(hidden_size * mlp_ratio * 2)
            self.mlp_act = SiLUActivation()
        else:
            self.mlp_act = nn.GELU(approximate="tanh")
        # qkv and mlp_in
-        self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim, dtype=dtype, device=device)
+        self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim_first, bias=bias, dtype=dtype, device=device)
        # proj and mlp_out
-        self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, dtype=dtype, device=device)
+        self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, bias=bias, dtype=dtype, device=device)
        self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations)
        self.hidden_size = hidden_size
        self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.mlp_act = nn.GELU(approximate="tanh")
+        if modulation:
-        self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations)
+            self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations)
        else:
            self.modulation = None
    def forward(self, x: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims=None, transformer_options={}) -> Tensor:
-        mod, _ = self.modulation(vec)
+        if self.modulation:
-        qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
+            mod, _ = self.modulation(vec)
        else:
            mod = vec
        qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim_first], dim=-1)
        q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        del qkv
        q, k = self.norm(q, k, v)
        # compute attention
        attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options)
        del q, k, v
        # compute activation in mlp stream, cat again and run second linear layer
-        output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
+        mlp = self.mlp_act(mlp)
        output = self.linear2(torch.cat((attn, mlp), 2))
        x += apply_mod(output, mod.gate, None, modulation_dims)
        if x.dtype == torch.float16:
            x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
@ -262,11 +334,11 @@ class SingleStreamBlock(nn.Module):
 class LastLayer(nn.Module):
-    def __init__(self, hidden_size: int, patch_size: int, out_channels: int, dtype=None, device=None, operations=None):
+    def __init__(self, hidden_size: int, patch_size: int, out_channels: int, bias=True, dtype=None, device=None, operations=None):
        super().__init__()
        self.norm_final = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device)
-        self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, dtype=dtype, device=device)
+        self.linear = operations.Linear(hidden_size, patch_size * patch_size * out_channels, bias=bias, dtype=dtype, device=device)
-        self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(hidden_size, 2 * hidden_size, bias=True, dtype=dtype, device=device))
+        self.adaLN_modulation = nn.Sequential(nn.SiLU(), operations.Linear(hidden_size, 2 * hidden_size, bias=bias, dtype=dtype, device=device))
    def forward(self, x: Tensor, vec: Tensor, modulation_dims=None) -> Tensor:
        if vec.ndim == 2:
--- a/comfy/ldm/flux/math.py
+++ b/comfy/ldm/flux/math.py
@ -7,7 +7,8 @@ import comfy.model_management
 def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None, transformer_options={}) -> Tensor:
-    q, k = apply_rope(q, k, pe)
+    if pe is not None:
        q, k = apply_rope(q, k, pe)
    heads = q.shape[1]
    x = optimized_attention(q, k, v, heads, skip_reshape=True, mask=mask, transformer_options=transformer_options)
    return x
--- a/comfy/ldm/flux/model.py
+++ b/comfy/ldm/flux/model.py
@ -15,6 +15,7 @@ from .layers import (
    MLPEmbedder,
    SingleStreamBlock,
    timestep_embedding,
    Modulation
 )
@dataclass
@ -33,6 +34,11 @@ class FluxParams:
    patch_size: int
    qkv_bias: bool
    guidance_embed: bool
    global_modulation: bool = False
    mlp_silu_act: bool = False
    ops_bias: bool = True
    default_ref_method: str = "offset"
    ref_index_scale: float = 1.0
 class Flux(nn.Module):
@ -58,13 +64,17 @@ class Flux(nn.Module):
        self.hidden_size = params.hidden_size
        self.num_heads = params.num_heads
        self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
-        self.img_in = operations.Linear(self.in_channels, self.hidden_size, bias=True, dtype=dtype, device=device)
+        self.img_in = operations.Linear(self.in_channels, self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device)
-        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations)
+        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device, operations=operations)
-        self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size, dtype=dtype, device=device, operations=operations)
+        if params.vec_in_dim is not None:
            self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size, dtype=dtype, device=device, operations=operations)
        else:
            self.vector_in = None
        self.guidance_in = (
-            MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, dtype=dtype, device=device, operations=operations) if params.guidance_embed else nn.Identity()
+            MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device, operations=operations) if params.guidance_embed else nn.Identity()
        )
-        self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, dtype=dtype, device=device)
+        self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, bias=params.ops_bias, dtype=dtype, device=device)
        self.double_blocks = nn.ModuleList(
            [
@ -73,6 +83,9 @@ class Flux(nn.Module):
                    self.num_heads,
                    mlp_ratio=params.mlp_ratio,
                    qkv_bias=params.qkv_bias,
                    modulation=params.global_modulation is False,
                    mlp_silu_act=params.mlp_silu_act,
                    proj_bias=params.ops_bias,
                    dtype=dtype, device=device, operations=operations
                )
                for _ in range(params.depth)
@ -81,13 +94,30 @@ class Flux(nn.Module):
        self.single_blocks = nn.ModuleList(
            [
-                SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, dtype=dtype, device=device, operations=operations)
+                SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, modulation=params.global_modulation is False, mlp_silu_act=params.mlp_silu_act, bias=params.ops_bias, dtype=dtype, device=device, operations=operations)
                for _ in range(params.depth_single_blocks)
            ]
        )
        if final_layer:
-            self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, dtype=dtype, device=device, operations=operations)
+            self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, bias=params.ops_bias, dtype=dtype, device=device, operations=operations)
        if params.global_modulation:
            self.double_stream_modulation_img = Modulation(
                self.hidden_size,
                double=True,
                bias=False,
                dtype=dtype, device=device, operations=operations
            )
            self.double_stream_modulation_txt = Modulation(
                self.hidden_size,
                double=True,
                bias=False,
                dtype=dtype, device=device, operations=operations
            )
            self.single_stream_modulation = Modulation(
                self.hidden_size, double=False, bias=False, dtype=dtype, device=device, operations=operations
            )
    def forward_orig(
        self,
@ -103,9 +133,6 @@ class Flux(nn.Module):
        attn_mask: Tensor = None,
    ) -> Tensor:
        if y is None:
            y = torch.zeros((img.shape[0], self.params.vec_in_dim), device=img.device, dtype=img.dtype)
        patches = transformer_options.get("patches", {})
        patches_replace = transformer_options.get("patches_replace", {})
        if img.ndim != 3 or txt.ndim != 3:
@ -118,9 +145,17 @@ class Flux(nn.Module):
            if guidance is not None:
                vec = vec + self.guidance_in(timestep_embedding(guidance, 256).to(img.dtype))
-        vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
+        if self.vector_in is not None:
            if y is None:
                y = torch.zeros((img.shape[0], self.params.vec_in_dim), device=img.device, dtype=img.dtype)
            vec = vec + self.vector_in(y[:, :self.params.vec_in_dim])
        txt = self.txt_in(txt)
        vec_orig = vec
        if self.params.global_modulation:
            vec = (self.double_stream_modulation_img(vec_orig), self.double_stream_modulation_txt(vec_orig))
        if "post_input" in patches:
            for p in patches["post_input"]:
                out = p({"img": img, "txt": txt, "img_ids": img_ids, "txt_ids": txt_ids})
@ -177,6 +212,9 @@ class Flux(nn.Module):
        img = torch.cat((txt, img), 1)
        if self.params.global_modulation:
            vec, _ = self.single_stream_modulation(vec_orig)
        for i, block in enumerate(self.single_blocks):
            if ("single_block", i) in blocks_replace:
                def block_wrap(args):
@ -207,7 +245,7 @@ class Flux(nn.Module):
        img = img[:, txt.shape[1] :, ...]
-        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
+        img = self.final_layer(img, vec_orig)  # (N, T, patch_size ** 2 * out_channels)
        return img
    def process_img(self, x, index=0, h_offset=0, w_offset=0, transformer_options={}):
@ -234,10 +272,10 @@ class Flux(nn.Module):
            h_offset += rope_options.get("shift_y", 0.0)
            w_offset += rope_options.get("shift_x", 0.0)
-        img_ids = torch.zeros((steps_h, steps_w, 3), device=x.device, dtype=x.dtype)
+        img_ids = torch.zeros((steps_h, steps_w, len(self.params.axes_dim)), device=x.device, dtype=torch.float32)
        img_ids[:, :, 0] = img_ids[:, :, 1] + index
-        img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=steps_h, device=x.device, dtype=x.dtype).unsqueeze(1)
+        img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=steps_h, device=x.device, dtype=torch.float32).unsqueeze(1)
-        img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=steps_w, device=x.device, dtype=x.dtype).unsqueeze(0)
+        img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=steps_w, device=x.device, dtype=torch.float32).unsqueeze(0)
        return img, repeat(img_ids, "h w c -> b (h w) c", b=bs)
    def forward(self, x, timestep, context, y=None, guidance=None, ref_latents=None, control=None, transformer_options={}, **kwargs):
@ -259,10 +297,10 @@ class Flux(nn.Module):
            h = 0
            w = 0
            index = 0
-            ref_latents_method = kwargs.get("ref_latents_method", "offset")
+            ref_latents_method = kwargs.get("ref_latents_method", self.params.default_ref_method)
            for ref in ref_latents:
                if ref_latents_method == "index":
-                    index += 1
+                    index += self.params.ref_index_scale
                    h_offset = 0
                    w_offset = 0
                elif ref_latents_method == "uxo":
@ -286,7 +324,11 @@ class Flux(nn.Module):
                img = torch.cat([img, kontext], dim=1)
                img_ids = torch.cat([img_ids, kontext_ids], dim=1)
-        txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype)
+        txt_ids = torch.zeros((bs, context.shape[1], len(self.params.axes_dim)), device=x.device, dtype=torch.float32)
        if len(self.params.axes_dim) == 4: # Flux 2
            txt_ids[:, :, 3] = torch.linspace(0, context.shape[1] - 1, steps=context.shape[1], device=x.device, dtype=torch.float32)
        out = self.forward_orig(img, img_ids, context, txt_ids, timestep, y, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None))
        out = out[:, :img_tokens]
-        return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)[:,:,:h_orig,:w_orig]
+        return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=self.patch_size, pw=self.patch_size)[:,:,:h_orig,:w_orig]
--- a/comfy/ldm/hunyuan_video/model.py
+++ b/comfy/ldm/hunyuan_video/model.py
@ -6,7 +6,6 @@ import comfy.ldm.flux.layers
 import comfy.ldm.modules.diffusionmodules.mmdit
 from comfy.ldm.modules.attention import optimized_attention
 from dataclasses import dataclass
 from einops import repeat
@ -42,6 +41,8 @@ class HunyuanVideoParams:
    guidance_embed: bool
    byt5: bool
    meanflow: bool
    use_cond_type_embedding: bool
    vision_in_dim: int
 class SelfAttentionRef(nn.Module):
@ -157,7 +158,10 @@ class TokenRefiner(nn.Module):
        t = self.t_embedder(timestep_embedding(timesteps, 256, time_factor=1.0).to(x.dtype))
        # m = mask.float().unsqueeze(-1)
        # c = (x.float() * m).sum(dim=1) / m.sum(dim=1) #TODO: the following works when the x.shape is the same length as the tokens but might break otherwise
-        c = x.sum(dim=1) / x.shape[1]
+        if x.dtype == torch.float16:
            c = x.float().sum(dim=1) / x.shape[1]
        else:
            c = x.sum(dim=1) / x.shape[1]
        c = t + self.c_embedder(c.to(x.dtype))
        x = self.input_embedder(x)
@ -196,11 +200,15 @@ class HunyuanVideo(nn.Module):
    def __init__(self, image_model=None, final_layer=True, dtype=None, device=None, operations=None, **kwargs):
        super().__init__()
        self.dtype = dtype
        operation_settings = {"operations": operations, "device": device, "dtype": dtype}
        params = HunyuanVideoParams(**kwargs)
        self.params = params
        self.patch_size = params.patch_size
        self.in_channels = params.in_channels
        self.out_channels = params.out_channels
        self.use_cond_type_embedding = params.use_cond_type_embedding
        self.vision_in_dim = params.vision_in_dim
        if params.hidden_size % params.num_heads != 0:
            raise ValueError(
                f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
@ -266,6 +274,18 @@ class HunyuanVideo(nn.Module):
        if final_layer:
            self.final_layer = LastLayer(self.hidden_size, self.patch_size[-1], self.out_channels, dtype=dtype, device=device, operations=operations)
        # HunyuanVideo 1.5 specific modules
        if self.vision_in_dim is not None:
            from comfy.ldm.wan.model import MLPProj
            self.vision_in = MLPProj(in_dim=self.vision_in_dim, out_dim=self.hidden_size, operation_settings=operation_settings)
        else:
            self.vision_in = None
        if self.use_cond_type_embedding:
            # 0: text_encoder feature 1: byt5 feature 2: vision_encoder feature
            self.cond_type_embedding = nn.Embedding(3, self.hidden_size)
        else:
            self.cond_type_embedding = None
    def forward_orig(
        self,
        img: Tensor,
@ -276,6 +296,7 @@ class HunyuanVideo(nn.Module):
        timesteps: Tensor,
        y: Tensor = None,
        txt_byt5=None,
        clip_fea=None,
        guidance: Tensor = None,
        guiding_frame_index=None,
        ref_latent=None,
@ -331,12 +352,31 @@ class HunyuanVideo(nn.Module):
        txt = self.txt_in(txt, timesteps, txt_mask, transformer_options=transformer_options)
        if self.cond_type_embedding is not None:
            self.cond_type_embedding.to(txt.device)
            cond_emb = self.cond_type_embedding(torch.zeros_like(txt[:, :, 0], device=txt.device, dtype=torch.long))
            txt = txt + cond_emb.to(txt.dtype)
        if self.byt5_in is not None and txt_byt5 is not None:
            txt_byt5 = self.byt5_in(txt_byt5)
            if self.cond_type_embedding is not None:
                cond_emb = self.cond_type_embedding(torch.ones_like(txt_byt5[:, :, 0], device=txt_byt5.device, dtype=torch.long))
                txt_byt5 = txt_byt5 + cond_emb.to(txt_byt5.dtype)
                txt = torch.cat((txt_byt5, txt), dim=1) # byt5 first for HunyuanVideo1.5
            else:
                txt = torch.cat((txt, txt_byt5), dim=1)
            txt_byt5_ids = torch.zeros((txt_ids.shape[0], txt_byt5.shape[1], txt_ids.shape[-1]), device=txt_ids.device, dtype=txt_ids.dtype)
            txt = torch.cat((txt, txt_byt5), dim=1)
            txt_ids = torch.cat((txt_ids, txt_byt5_ids), dim=1)
        if clip_fea is not None:
            txt_vision_states = self.vision_in(clip_fea)
            if self.cond_type_embedding is not None:
                cond_emb = self.cond_type_embedding(2 * torch.ones_like(txt_vision_states[:, :, 0], dtype=torch.long, device=txt_vision_states.device))
                txt_vision_states = txt_vision_states + cond_emb
            txt = torch.cat((txt_vision_states.to(txt.dtype), txt), dim=1)
            extra_txt_ids = torch.zeros((txt_ids.shape[0], txt_vision_states.shape[1], txt_ids.shape[-1]), device=txt_ids.device, dtype=txt_ids.dtype)
            txt_ids = torch.cat((txt_ids, extra_txt_ids), dim=1)
        ids = torch.cat((img_ids, txt_ids), dim=1)
        pe = self.pe_embedder(ids)
@ -349,7 +389,10 @@ class HunyuanVideo(nn.Module):
            attn_mask = None
        blocks_replace = patches_replace.get("dit", {})
        transformer_options["total_blocks"] = len(self.double_blocks)
        transformer_options["block_type"] = "double"
        for i, block in enumerate(self.double_blocks):
            transformer_options["block_index"] = i
            if ("double_block", i) in blocks_replace:
                def block_wrap(args):
                    out = {}
@ -371,7 +414,10 @@ class HunyuanVideo(nn.Module):
        img = torch.cat((img, txt), 1)
        transformer_options["total_blocks"] = len(self.single_blocks)
        transformer_options["block_type"] = "single"
        for i, block in enumerate(self.single_blocks):
            transformer_options["block_index"] = i
            if ("single_block", i) in blocks_replace:
                def block_wrap(args):
                    out = {}
@ -430,14 +476,14 @@ class HunyuanVideo(nn.Module):
        img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
        return repeat(img_ids, "h w c -> b (h w) c", b=bs)
-    def forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs):
+    def forward(self, x, timestep, context, y=None, txt_byt5=None, clip_fea=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs):
        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
            self._forward,
            self,
            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
-        ).execute(x, timestep, context, y, txt_byt5, guidance, attention_mask, guiding_frame_index, ref_latent, disable_time_r, control, transformer_options, **kwargs)
+        ).execute(x, timestep, context, y, txt_byt5, clip_fea, guidance, attention_mask, guiding_frame_index, ref_latent, disable_time_r, control, transformer_options, **kwargs)
-    def _forward(self, x, timestep, context, y=None, txt_byt5=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs):
+    def _forward(self, x, timestep, context, y=None, txt_byt5=None, clip_fea=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs):
        bs = x.shape[0]
        if len(self.patch_size) == 3:
            img_ids = self.img_ids(x)
@ -445,5 +491,5 @@ class HunyuanVideo(nn.Module):
        else:
            img_ids = self.img_ids_2d(x)
            txt_ids = torch.zeros((bs, context.shape[1], 2), device=x.device, dtype=x.dtype)
-        out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, txt_byt5, guidance, guiding_frame_index, ref_latent, disable_time_r=disable_time_r, control=control, transformer_options=transformer_options)
+        out = self.forward_orig(x, img_ids, context, txt_ids, attention_mask, timestep, y, txt_byt5, clip_fea, guidance, guiding_frame_index, ref_latent, disable_time_r=disable_time_r, control=control, transformer_options=transformer_options)
        return out
--- a/comfy/ldm/hunyuan_video/upsampler.py
+++ b/comfy/ldm/hunyuan_video/upsampler.py
@ -0,0 +1,120 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from comfy.ldm.hunyuan_video.vae_refiner import RMS_norm, ResnetBlock, VideoConv3d
 import model_management, model_patcher
 class SRResidualCausalBlock3D(nn.Module):
    """Residual block: three ``VideoConv3d`` layers (kernel_size=3) separated by SiLU, added back onto the input."""
    def __init__(self, channels: int):
        super().__init__()
        # Built in the order conv, act, conv, act, conv so the Sequential
        # indices (and therefore the parameter names) are 0, 2 and 4 for the convs.
        layers = []
        for idx in range(3):
            if idx > 0:
                layers.append(nn.SiLU(inplace=True))
            layers.append(VideoConv3d(channels, channels, kernel_size=3))
        self.block = nn.Sequential(*layers)
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` plus the output of the conv stack applied to ``x``."""
        branch = self.block(x)
        return x + branch
 class SRModel3DV2(nn.Module):
    """Input conv -> ``num_blocks`` residual blocks -> output conv, with an optional global skip connection.

    Args:
        in_channels: channels of the input tensor.
        out_channels: channels produced by the output conv.
        hidden_channels: width used between the input and output convs.
        num_blocks: number of ``SRResidualCausalBlock3D`` blocks.
        global_residual: when truthy, the input is added to the output if (and only if) their shapes are equal.
    """
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        hidden_channels: int = 64,
        num_blocks: int = 6,
        global_residual: bool = False,
    ):
        super().__init__()
        self.in_conv = VideoConv3d(in_channels, hidden_channels, kernel_size=3)
        residual_blocks = []
        for _ in range(num_blocks):
            residual_blocks.append(SRResidualCausalBlock3D(hidden_channels))
        self.blocks = nn.ModuleList(residual_blocks)
        self.out_conv = VideoConv3d(hidden_channels, out_channels, kernel_size=3)
        self.global_residual = bool(global_residual)
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the conv stack; add ``x`` back when the global residual is enabled and shapes match."""
        h = self.in_conv(x)
        for block in self.blocks:
            h = block(h)
        h = self.out_conv(h)
        # The skip is silently dropped when the output shape differs from the input shape.
        if not self.global_residual or h.shape != x.shape:
            return h
        return h + x
 class Upsampler(nn.Module):
    """Stack of ``ResnetBlock`` stages between an input and an output ``VideoConv3d``.

    Args:
        z_channels: channels of the incoming latent.
        out_channels: channels produced by ``conv_out``.
        block_out_channels: output width of each stage; the first entry is also the width of ``conv_in``.
        num_res_blocks: each stage holds ``num_res_blocks + 1`` resnet blocks.
    """
    def __init__(
        self,
        z_channels: int,
        out_channels: int,
        block_out_channels: tuple[int, ...],
        num_res_blocks: int = 2,
    ):
        super().__init__()
        self.num_res_blocks = num_res_blocks
        self.block_out_channels = block_out_channels
        self.z_channels = z_channels
        ch = block_out_channels[0]
        self.conv_in = VideoConv3d(z_channels, ch, kernel_size=3)
        self.up = nn.ModuleList()
        for tgt in block_out_channels:
            # Plain nn.Module used as a namespace so parameters are named up.<i>.block.<j>.*
            stage = nn.Module()
            stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt,
                                                    out_channels=tgt,
                                                    temb_channels=0,
                                                    conv_shortcut=False,
                                                    conv_op=VideoConv3d, norm_op=RMS_norm)
                                        for j in range(num_res_blocks + 1)])
            ch = tgt
            self.up.append(stage)
        self.norm_out = RMS_norm(ch)
        self.conv_out = VideoConv3d(ch, out_channels, kernel_size=3)
    def forward(self, z):
        """
        Args:
            z: (B, C, T, H, W)

        Returns:
            ``conv_out`` applied to the SiLU-activated output of ``norm_out``.
        """
        # z to block_in: the latent is channel-repeated and added to conv_in(z) as a shortcut.
        # The channel counts only line up when block_out_channels[0] is a multiple of z_channels.
        repeats = self.block_out_channels[0] // (self.z_channels)
        x = self.conv_in(z) + z.repeat_interleave(repeats=repeats, dim=1)
        # resnet stages; this loop only runs the blocks, no separate resampling module is called
        for stage in self.up:
            for blk in stage.block:
                x = blk(x)
        out = self.conv_out(F.silu(self.norm_out(x)))
        return out
 # Registry of upsampler architectures, keyed by the model_type string that
 # HunyuanVideo15SRModel looks up when it is constructed.
 UPSAMPLERS = {
    "720p": SRModel3DV2,
    "1080p": Upsampler,
 }
 class HunyuanVideo15SRModel():
    """Wraps one of the ``UPSAMPLERS`` architectures together with a ``ModelPatcher`` for device placement.

    Args:
        model_type: key into ``UPSAMPLERS`` selecting the architecture.
        config: keyword arguments forwarded to the selected class.

    Raises:
        ValueError: if ``model_type`` is not a key of ``UPSAMPLERS``.
    """
    def __init__(self, model_type, config):
        # Validate first so an unknown type fails with a readable message instead of
        # "'NoneType' object is not callable", and before any device queries are made.
        model_class = UPSAMPLERS.get(model_type)
        if model_class is None:
            raise ValueError(f"Unknown upsampler model type {model_type!r}; expected one of {list(UPSAMPLERS.keys())}")
        self.load_device = model_management.vae_device()
        offload_device = model_management.vae_offload_device()
        # Stored on the instance; this class does not itself cast the model or inputs to it.
        self.dtype = model_management.vae_dtype(self.load_device)
        self.model_class = model_class
        self.model = self.model_class(**config).eval()
        self.patcher = model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
    def load_sd(self, sd):
        """Load ``sd`` into the wrapped model with ``strict=True`` and return the result of ``load_state_dict``."""
        return self.model.load_state_dict(sd, strict=True)
    def get_sd(self):
        """Return the wrapped model's state dict."""
        return self.model.state_dict()
    def resample_latent(self, latent):
        """Load the model through the patcher, then run it on ``latent`` moved to ``load_device``."""
        model_management.load_model_gpu(self.patcher)
        return self.model(latent.to(self.load_device))
--- a/comfy/ldm/hunyuan_video/vae_refiner.py
+++ b/comfy/ldm/hunyuan_video/vae_refiner.py
@ -4,8 +4,40 @@ import torch.nn.functional as F
 from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock, VideoConv3d, Normalize
 import comfy.ops
 import comfy.ldm.models.autoencoder
 import comfy.model_management
 ops = comfy.ops.disable_weight_init
 class NoPadConv3d(nn.Module):
    """3D convolution wrapper that drops the ``padding`` argument.

    ``padding`` is accepted so the class can be called like other conv ops, but it is
    not forwarded to the wrapped ``ops.Conv3d``; ``conv_carry_causal_3d`` pads the input
    explicitly when it sees an op of this type.
    """
    def __init__(self, n_channels, out_channels, kernel_size, stride=1, dilation=1, padding=0, **kwargs):
        super().__init__()
        self.conv = ops.Conv3d(
            n_channels,
            out_channels,
            kernel_size,
            stride=stride,
            dilation=dilation,
            **kwargs,
        )
    def forward(self, x):
        """Apply the wrapped convolution to ``x`` as-is."""
        result = self.conv(x)
        return result
 def conv_carry_causal_3d(xl, op, conv_carry_in=None, conv_carry_out=None):
    """Apply ``op`` to a chunk, stitching in frames carried over from the previous chunk.

    Args:
        xl: single-element list holding the input tensor; the list is emptied here so the
            caller's reference to the tensor is dropped.
        op: the convolution (or any callable) to apply.
        conv_carry_in: list of carried tensors from the previous chunk, consumed from the front; or None.
        conv_carry_out: list that receives a clone of the last (up to) two entries along dim 2
            of the input; or None to record nothing.

    Returns:
        ``op`` applied to the (possibly padded / prefixed) input.
    """
    x = xl[0]
    xl.clear()

    if conv_carry_out is not None:
        conv_carry_out.append(x[:, :, -2:, :, :].clone())

    if isinstance(op, NoPadConv3d):
        # Two entries of front context are needed along dim 2; whatever the carry
        # does not supply is filled by replicate padding.
        front_pad = 2
        if conv_carry_in is not None:
            front_pad -= conv_carry_in[0].shape[2]
            x = torch.cat([conv_carry_in.pop(0), x], dim=2)
        x = torch.nn.functional.pad(x, (1, 1, 1, 1, front_pad, 0), mode = 'replicate')

    return op(x)
 class RMS_norm(nn.Module):
    def __init__(self, dim):
        super().__init__()
@ -14,7 +46,7 @@ class RMS_norm(nn.Module):
        self.gamma = nn.Parameter(torch.empty(shape))
    def forward(self, x):
-        return F.normalize(x, dim=1) * self.scale * self.gamma
+        return F.normalize(x, dim=1) * self.scale * comfy.model_management.cast_to(self.gamma, dtype=x.dtype, device=x.device)
 class DnSmpl(nn.Module):
    def __init__(self, ic, oc, tds=True, refiner_vae=True, op=VideoConv3d):
@ -27,11 +59,12 @@ class DnSmpl(nn.Module):
        self.tds = tds
        self.gs = fct * ic // oc
-    def forward(self, x):
+    def forward(self, x, conv_carry_in=None, conv_carry_out=None):
        r1 = 2 if self.tds else 1
-        h = self.conv(x)
+        h = conv_carry_causal_3d([x], self.conv, conv_carry_in, conv_carry_out)
        if self.tds and self.refiner_vae and conv_carry_in is None:
        if self.tds and self.refiner_vae:
            hf = h[:, :, :1, :, :]
            b, c, f, ht, wd = hf.shape
            hf = hf.reshape(b, c, f, ht // 2, 2, wd // 2, 2)
@ -39,14 +72,7 @@ class DnSmpl(nn.Module):
            hf = hf.reshape(b, 2 * 2 * c, f, ht // 2, wd // 2)
            hf = torch.cat([hf, hf], dim=1)
-            hn = h[:, :, 1:, :, :]
+            h = h[:, :, 1:, :, :]
            b, c, frms, ht, wd = hn.shape
            nf = frms // r1
            hn = hn.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2)
            hn = hn.permute(0, 3, 5, 7, 1, 2, 4, 6)
            hn = hn.reshape(b, r1 * 2 * 2 * c, nf, ht // 2, wd // 2)
            h = torch.cat([hf, hn], dim=2)
            xf = x[:, :, :1, :, :]
            b, ci, f, ht, wd = xf.shape
@ -54,34 +80,32 @@ class DnSmpl(nn.Module):
            xf = xf.permute(0, 4, 6, 1, 2, 3, 5)
            xf = xf.reshape(b, 2 * 2 * ci, f, ht // 2, wd // 2)
            B, C, T, H, W = xf.shape
-            xf = xf.view(B, h.shape[1], self.gs // 2, T, H, W).mean(dim=2)
+            xf = xf.view(B, hf.shape[1], self.gs // 2, T, H, W).mean(dim=2)
-            xn = x[:, :, 1:, :, :]
+            x = x[:, :, 1:, :, :]
            b, ci, frms, ht, wd = xn.shape
            nf = frms // r1
            xn = xn.reshape(b, ci, nf, r1, ht // 2, 2, wd // 2, 2)
            xn = xn.permute(0, 3, 5, 7, 1, 2, 4, 6)
            xn = xn.reshape(b, r1 * 2 * 2 * ci, nf, ht // 2, wd // 2)
            B, C, T, H, W = xn.shape
            xn = xn.view(B, h.shape[1], self.gs, T, H, W).mean(dim=2)
            sc = torch.cat([xf, xn], dim=2)
        else:
            b, c, frms, ht, wd = h.shape
-            nf = frms // r1
+        if h.shape[2] == 0:
-            h = h.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2)
+            return hf + xf
            h = h.permute(0, 3, 5, 7, 1, 2, 4, 6)
            h = h.reshape(b, r1 * 2 * 2 * c, nf, ht // 2, wd // 2)
-            b, ci, frms, ht, wd = x.shape
+        b, c, frms, ht, wd = h.shape
-            nf = frms // r1
+        nf = frms // r1
-            sc = x.reshape(b, ci, nf, r1, ht // 2, 2, wd // 2, 2)
+        h = h.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2)
-            sc = sc.permute(0, 3, 5, 7, 1, 2, 4, 6)
+        h = h.permute(0, 3, 5, 7, 1, 2, 4, 6)
-            sc = sc.reshape(b, r1 * 2 * 2 * ci, nf, ht // 2, wd // 2)
+        h = h.reshape(b, r1 * 2 * 2 * c, nf, ht // 2, wd // 2)
            B, C, T, H, W = sc.shape
            sc = sc.view(B, h.shape[1], self.gs, T, H, W).mean(dim=2)
-        return h + sc
+        b, ci, frms, ht, wd = x.shape
        nf = frms // r1
        x = x.reshape(b, ci, nf, r1, ht // 2, 2, wd // 2, 2)
        x = x.permute(0, 3, 5, 7, 1, 2, 4, 6)
        x = x.reshape(b, r1 * 2 * 2 * ci, nf, ht // 2, wd // 2)
        B, C, T, H, W = x.shape
        x = x.view(B, h.shape[1], self.gs, T, H, W).mean(dim=2)
        if self.tds and self.refiner_vae and conv_carry_in is None:
            h = torch.cat([hf, h], dim=2)
            x = torch.cat([xf, x], dim=2)
        return h + x
 class UpSmpl(nn.Module):
@ -94,11 +118,11 @@ class UpSmpl(nn.Module):
        self.tus = tus
        self.rp = fct * oc // ic
-    def forward(self, x):
+    def forward(self, x, conv_carry_in=None, conv_carry_out=None):
        r1 = 2 if self.tus else 1
-        h = self.conv(x)
+        h = conv_carry_causal_3d([x], self.conv, conv_carry_in, conv_carry_out)
-        if self.tus and self.refiner_vae:
+        if self.tus and self.refiner_vae and conv_carry_in is None:
            hf = h[:, :, :1, :, :]
            b, c, f, ht, wd = hf.shape
            nc = c // (2 * 2)
@ -107,14 +131,7 @@ class UpSmpl(nn.Module):
            hf = hf.reshape(b, nc, f, ht * 2, wd * 2)
            hf = hf[:, : hf.shape[1] // 2]
-            hn = h[:, :, 1:, :, :]
+            h = h[:, :, 1:, :, :]
            b, c, frms, ht, wd = hn.shape
            nc = c // (r1 * 2 * 2)
            hn = hn.reshape(b, r1, 2, 2, nc, frms, ht, wd)
            hn = hn.permute(0, 4, 5, 1, 6, 2, 7, 3)
            hn = hn.reshape(b, nc, frms * r1, ht * 2, wd * 2)
            h = torch.cat([hf, hn], dim=2)
            xf = x[:, :, :1, :, :]
            b, ci, f, ht, wd = xf.shape
@ -125,29 +142,43 @@ class UpSmpl(nn.Module):
            xf = xf.permute(0, 3, 4, 5, 1, 6, 2)
            xf = xf.reshape(b, nc, f, ht * 2, wd * 2)
-            xn = x[:, :, 1:, :, :]
+            x = x[:, :, 1:, :, :]
            xn = xn.repeat_interleave(repeats=self.rp, dim=1)
            b, c, frms, ht, wd = xn.shape
            nc = c // (r1 * 2 * 2)
            xn = xn.reshape(b, r1, 2, 2, nc, frms, ht, wd)
            xn = xn.permute(0, 4, 5, 1, 6, 2, 7, 3)
            xn = xn.reshape(b, nc, frms * r1, ht * 2, wd * 2)
            sc = torch.cat([xf, xn], dim=2)
        else:
            b, c, frms, ht, wd = h.shape
            nc = c // (r1 * 2 * 2)
            h = h.reshape(b, r1, 2, 2, nc, frms, ht, wd)
            h = h.permute(0, 4, 5, 1, 6, 2, 7, 3)
            h = h.reshape(b, nc, frms * r1, ht * 2, wd * 2)
-            sc = x.repeat_interleave(repeats=self.rp, dim=1)
+        b, c, frms, ht, wd = h.shape
-            b, c, frms, ht, wd = sc.shape
+        nc = c // (r1 * 2 * 2)
-            nc = c // (r1 * 2 * 2)
+        h = h.reshape(b, r1, 2, 2, nc, frms, ht, wd)
-            sc = sc.reshape(b, r1, 2, 2, nc, frms, ht, wd)
+        h = h.permute(0, 4, 5, 1, 6, 2, 7, 3)
-            sc = sc.permute(0, 4, 5, 1, 6, 2, 7, 3)
+        h = h.reshape(b, nc, frms * r1, ht * 2, wd * 2)
            sc = sc.reshape(b, nc, frms * r1, ht * 2, wd * 2)
-        return h + sc
+        x = x.repeat_interleave(repeats=self.rp, dim=1)
        b, c, frms, ht, wd = x.shape
        nc = c // (r1 * 2 * 2)
        x = x.reshape(b, r1, 2, 2, nc, frms, ht, wd)
        x = x.permute(0, 4, 5, 1, 6, 2, 7, 3)
        x = x.reshape(b, nc, frms * r1, ht * 2, wd * 2)
        if self.tus and self.refiner_vae and conv_carry_in is None:
            h = torch.cat([hf, h], dim=2)
            x = torch.cat([xf, x], dim=2)
        return h + x
 class HunyuanRefinerResnetBlock(ResnetBlock):
    """``ResnetBlock`` without time embedding whose two convs go through ``conv_carry_causal_3d``."""
    def __init__(self, in_channels, out_channels, conv_op=NoPadConv3d, norm_op=RMS_norm):
        super().__init__(in_channels=in_channels, out_channels=out_channels, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
    def forward(self, x, conv_carry_in=None, conv_carry_out=None):
        """Run norm -> swish -> conv twice and add the (optionally projected) input.

        ``conv_carry_in`` / ``conv_carry_out`` are passed to both conv calls, so each call
        consumes one carried tensor and records one, in conv1-then-conv2 order.
        """
        # Activations are wrapped in one-element lists because conv_carry_causal_3d
        # clears the list it is given.
        h = [ self.swish(self.norm1(x)) ]
        h = conv_carry_causal_3d(h, self.conv1, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out)
        h = [ self.dropout(self.swish(self.norm2(h))) ]
        h = conv_carry_causal_3d(h, self.conv2, conv_carry_in=conv_carry_in, conv_carry_out=conv_carry_out)
        if self.in_channels != self.out_channels:
            x = self.nin_shortcut(x)
        return x+h
 class Encoder(nn.Module):
    def __init__(self, in_channels, z_channels, block_out_channels, num_res_blocks,
@ -160,7 +191,7 @@ class Encoder(nn.Module):
        self.refiner_vae = refiner_vae
        if self.refiner_vae:
-            conv_op = VideoConv3d
+            conv_op = NoPadConv3d
            norm_op = RMS_norm
        else:
            conv_op = ops.Conv3d
@ -175,10 +206,9 @@ class Encoder(nn.Module):
        for i, tgt in enumerate(block_out_channels):
            stage = nn.Module()
-            stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt,
+            stage.block = nn.ModuleList([HunyuanRefinerResnetBlock(in_channels=ch if j == 0 else tgt,
-                                                     out_channels=tgt,
+                                                                   out_channels=tgt,
-                                                     temb_channels=0,
+                                                                   conv_op=conv_op, norm_op=norm_op)
                                                     conv_op=conv_op, norm_op=norm_op)
                                        for j in range(num_res_blocks)])
            ch = tgt
            if i < depth:
@ -188,9 +218,9 @@ class Encoder(nn.Module):
            self.down.append(stage)
        self.mid = nn.Module()
-        self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
+        self.mid.block_1 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
        self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op)
-        self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
+        self.mid.block_2 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
        self.norm_out = norm_op(ch)
        self.conv_out = conv_op(ch, z_channels << 1, 3, 1, 1)
@ -201,31 +231,50 @@ class Encoder(nn.Module):
        if not self.refiner_vae and x.shape[2] == 1:
            x = x.expand(-1, -1, self.ffactor_temporal, -1, -1)
-        x = self.conv_in(x)
+        if self.refiner_vae:
            xl = [x[:, :, :1, :, :]]
            if x.shape[2] > self.ffactor_temporal:
                xl += torch.split(x[:, :, 1: 1 + ((x.shape[2] - 1) // self.ffactor_temporal) * self.ffactor_temporal, :, :], self.ffactor_temporal * 2, dim=2)
            x = xl
        else:
            x = [x]
        out = []
-        for stage in self.down:
+        conv_carry_in = None
            for blk in stage.block:
                x = blk(x)
            if hasattr(stage, 'downsample'):
                x = stage.downsample(x)
-        x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(x)))
+        for i, x1 in enumerate(x):
            conv_carry_out = []
            if i == len(x) - 1:
                conv_carry_out = None
            x1 = [ x1 ]
            x1 = conv_carry_causal_3d(x1, self.conv_in, conv_carry_in, conv_carry_out)
            for stage in self.down:
                for blk in stage.block:
                    x1 = blk(x1, conv_carry_in, conv_carry_out)
                if hasattr(stage, 'downsample'):
                    x1 = stage.downsample(x1, conv_carry_in, conv_carry_out)
            out.append(x1)
            conv_carry_in = conv_carry_out
        if len(out) > 1:
            out = torch.cat(out, dim=2)
        else:
            out = out[0]
        x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(out)))
        del out
        b, c, t, h, w = x.shape
        grp = c // (self.z_channels << 1)
        skip = x.view(b, c // grp, grp, t, h, w).mean(2)
-        out = self.conv_out(F.silu(self.norm_out(x))) + skip
+        out = conv_carry_causal_3d([F.silu(self.norm_out(x))], self.conv_out) + skip
        if self.refiner_vae:
            out = self.regul(out)[0]
            out = torch.cat((out[:, :, :1], out), dim=2)
            out = out.permute(0, 2, 1, 3, 4)
            b, f_times_2, c, h, w = out.shape
            out = out.reshape(b, f_times_2 // 2, 2 * c, h, w)
            out = out.permute(0, 2, 1, 3, 4).contiguous()
        return out
 class Decoder(nn.Module):
@ -239,7 +288,7 @@ class Decoder(nn.Module):
        self.refiner_vae = refiner_vae
        if self.refiner_vae:
-            conv_op = VideoConv3d
+            conv_op = NoPadConv3d
            norm_op = RMS_norm
        else:
            conv_op = ops.Conv3d
@ -249,9 +298,9 @@ class Decoder(nn.Module):
        self.conv_in = conv_op(z_channels, ch, kernel_size=3, stride=1, padding=1)
        self.mid = nn.Module()
-        self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
+        self.mid.block_1 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch, conv_op=conv_op, norm_op=norm_op)
        self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op)
-        self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op)
+        self.mid.block_2 = HunyuanRefinerResnetBlock(in_channels=ch, out_channels=ch,  conv_op=conv_op, norm_op=norm_op)
        self.up = nn.ModuleList()
        depth = (ffactor_spatial >> 1).bit_length()
@ -259,10 +308,9 @@ class Decoder(nn.Module):
        for i, tgt in enumerate(block_out_channels):
            stage = nn.Module()
-            stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt,
+            stage.block = nn.ModuleList([HunyuanRefinerResnetBlock(in_channels=ch if j == 0 else tgt,
-                                                     out_channels=tgt,
+                                                                   out_channels=tgt,
-                                                     temb_channels=0,
+                                                                   conv_op=conv_op, norm_op=norm_op)
                                                     conv_op=conv_op, norm_op=norm_op)
                                        for j in range(num_res_blocks + 1)])
            ch = tgt
            if i < depth:
@ -275,27 +323,41 @@ class Decoder(nn.Module):
        self.conv_out = conv_op(ch, out_channels, 3, stride=1, padding=1)
    def forward(self, z):
-        if self.refiner_vae:
+        x = conv_carry_causal_3d([z], self.conv_in) + z.repeat_interleave(self.block_out_channels[0] // self.z_channels, 1)
            z = z.permute(0, 2, 1, 3, 4)
            b, f, c, h, w = z.shape
            z = z.reshape(b, f, 2, c // 2, h, w)
            z = z.permute(0, 1, 2, 3, 4, 5).reshape(b, f * 2, c // 2, h, w)
            z = z.permute(0, 2, 1, 3, 4)
            z = z[:, :, 1:]
        x = self.conv_in(z) + z.repeat_interleave(self.block_out_channels[0] // self.z_channels, 1)
        x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(x)))
-        for stage in self.up:
+        if self.refiner_vae:
-            for blk in stage.block:
+            x = torch.split(x, 2, dim=2)
-                x = blk(x)
+        else:
-            if hasattr(stage, 'upsample'):
+            x = [ x ]
-                x = stage.upsample(x)
+        out = []
-        out = self.conv_out(F.silu(self.norm_out(x)))
+        conv_carry_in = None
        for i, x1 in enumerate(x):
            conv_carry_out = []
            if i == len(x) - 1:
                conv_carry_out = None
            for stage in self.up:
                for blk in stage.block:
                    x1 = blk(x1, conv_carry_in, conv_carry_out)
                if hasattr(stage, 'upsample'):
                    x1 = stage.upsample(x1, conv_carry_in, conv_carry_out)
            x1 = [ F.silu(self.norm_out(x1)) ]
            x1 = conv_carry_causal_3d(x1, self.conv_out, conv_carry_in, conv_carry_out)
            out.append(x1)
            conv_carry_in = conv_carry_out
        del x
        if len(out) > 1:
            out = torch.cat(out, dim=2)
        else:
            out = out[0]
        if not self.refiner_vae:
            if z.shape[-3] == 1:
                out = out[:, :, -1:]
        return out
--- a/comfy/ldm/models/autoencoder.py
+++ b/comfy/ldm/models/autoencoder.py
@ -9,6 +9,8 @@ from comfy.ldm.modules.distributions.distributions import DiagonalGaussianDistri
 from comfy.ldm.util import get_obj_from_str, instantiate_from_config
 from comfy.ldm.modules.ema import LitEma
 import comfy.ops
 from einops import rearrange
 import comfy.model_management
 class DiagonalGaussianRegularizer(torch.nn.Module):
    def __init__(self, sample: bool = False):
@ -179,6 +181,21 @@ class AutoencodingEngineLegacy(AutoencodingEngine):
        self.post_quant_conv = conv_op(embed_dim, ddconfig["z_channels"], 1)
        self.embed_dim = embed_dim
        if ddconfig.get("batch_norm_latent", False):
            self.bn_eps = 1e-4
            self.bn_momentum = 0.1
            self.ps = [2, 2]
            self.bn = torch.nn.BatchNorm2d(math.prod(self.ps) * ddconfig["z_channels"],
                                           eps=self.bn_eps,
                                           momentum=self.bn_momentum,
                                           affine=False,
                                           track_running_stats=True,
                                           )
            self.bn.eval()
        else:
            self.bn = None
    def get_autoencoder_params(self) -> list:
        params = super().get_autoencoder_params()
        return params
@ -201,11 +218,36 @@ class AutoencodingEngineLegacy(AutoencodingEngine):
            z = torch.cat(z, 0)
        z, reg_log = self.regularization(z)
        if self.bn is not None:
            z = rearrange(z,
                          "... c (i pi) (j pj)  -> ... (c pi pj) i j",
                          pi=self.ps[0],
                          pj=self.ps[1],
                          )
            z = torch.nn.functional.batch_norm(z,
                                               comfy.model_management.cast_to(self.bn.running_mean, dtype=z.dtype, device=z.device),
                                               comfy.model_management.cast_to(self.bn.running_var, dtype=z.dtype, device=z.device),
                                               momentum=self.bn_momentum,
                                               eps=self.bn_eps)
        if return_reg_log:
            return z, reg_log
        return z
    def decode(self, z: torch.Tensor, **decoder_kwargs) -> torch.Tensor:
        if self.bn is not None:
            s = torch.sqrt(comfy.model_management.cast_to(self.bn.running_var.view(1, -1, 1, 1), dtype=z.dtype, device=z.device) + self.bn_eps)
            m = comfy.model_management.cast_to(self.bn.running_mean.view(1, -1, 1, 1), dtype=z.dtype, device=z.device)
            z = z * s + m
            z = rearrange(
                z,
                "... (c pi pj) i j -> ... c (i pi) (j pj)",
                pi=self.ps[0],
                pj=self.ps[1],
            )
        if self.max_batch_size is None:
            dec = self.post_quant_conv(z)
            dec = self.decoder(dec, **decoder_kwargs)
--- a/comfy/ldm/qwen_image/model.py
+++ b/comfy/ldm/qwen_image/model.py
@ -236,10 +236,10 @@ class QwenImageTransformerBlock(nn.Module):
        img_mod1, img_mod2 = img_mod_params.chunk(2, dim=-1)
        txt_mod1, txt_mod2 = txt_mod_params.chunk(2, dim=-1)
-        img_normed = self.img_norm1(hidden_states)
+        img_modulated, img_gate1 = self._modulate(self.img_norm1(hidden_states), img_mod1)
-        img_modulated, img_gate1 = self._modulate(img_normed, img_mod1)
+        del img_mod1
-        txt_normed = self.txt_norm1(encoder_hidden_states)
+        txt_modulated, txt_gate1 = self._modulate(self.txt_norm1(encoder_hidden_states), txt_mod1)
-        txt_modulated, txt_gate1 = self._modulate(txt_normed, txt_mod1)
+        del txt_mod1
        img_attn_output, txt_attn_output = self.attn(
            hidden_states=img_modulated,
@ -248,16 +248,20 @@ class QwenImageTransformerBlock(nn.Module):
            image_rotary_emb=image_rotary_emb,
            transformer_options=transformer_options,
        )
        del img_modulated
        del txt_modulated
        hidden_states = hidden_states + img_gate1 * img_attn_output
        encoder_hidden_states = encoder_hidden_states + txt_gate1 * txt_attn_output
        del img_attn_output
        del txt_attn_output
        del img_gate1
        del txt_gate1
-        img_normed2 = self.img_norm2(hidden_states)
+        img_modulated2, img_gate2 = self._modulate(self.img_norm2(hidden_states), img_mod2)
        img_modulated2, img_gate2 = self._modulate(img_normed2, img_mod2)
        hidden_states = torch.addcmul(hidden_states, img_gate2, self.img_mlp(img_modulated2))
-        txt_normed2 = self.txt_norm2(encoder_hidden_states)
+        txt_modulated2, txt_gate2 = self._modulate(self.txt_norm2(encoder_hidden_states), txt_mod2)
        txt_modulated2, txt_gate2 = self._modulate(txt_normed2, txt_mod2)
        encoder_hidden_states = torch.addcmul(encoder_hidden_states, txt_gate2, self.txt_mlp(txt_modulated2))
        return encoder_hidden_states, hidden_states
@ -435,7 +439,10 @@ class QwenImageTransformer2DModel(nn.Module):
        patches = transformer_options.get("patches", {})
        blocks_replace = patches_replace.get("dit", {})
        transformer_options["total_blocks"] = len(self.transformer_blocks)
        transformer_options["block_type"] = "double"
        for i, block in enumerate(self.transformer_blocks):
            transformer_options["block_index"] = i
            if ("double_block", i) in blocks_replace:
                def block_wrap(args):
                    out = {}
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@ -898,12 +898,13 @@ class Flux(BaseModel):
        attention_mask = kwargs.get("attention_mask", None)
        if attention_mask is not None:
            shape = kwargs["noise"].shape
-            mask_ref_size = kwargs["attention_mask_img_shape"]
+            mask_ref_size = kwargs.get("attention_mask_img_shape", None)
-            # the model will pad to the patch size, and then divide
+            if mask_ref_size is not None:
-            # essentially dividing and rounding up
+                # the model will pad to the patch size, and then divide
-            (h_tok, w_tok) = (math.ceil(shape[2] / self.diffusion_model.patch_size), math.ceil(shape[3] / self.diffusion_model.patch_size))
+                # essentially dividing and rounding up
-            attention_mask = utils.upscale_dit_mask(attention_mask, mask_ref_size, (h_tok, w_tok))
+                (h_tok, w_tok) = (math.ceil(shape[2] / self.diffusion_model.patch_size), math.ceil(shape[3] / self.diffusion_model.patch_size))
-            out['attention_mask'] = comfy.conds.CONDRegular(attention_mask)
+                attention_mask = utils.upscale_dit_mask(attention_mask, mask_ref_size, (h_tok, w_tok))
                out['attention_mask'] = comfy.conds.CONDRegular(attention_mask)
        guidance = kwargs.get("guidance", 3.5)
        if guidance is not None:
@ -928,6 +929,16 @@ class Flux(BaseModel):
            out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16])
        return out
 class Flux2(Flux):
    """Flux variant that front-pads the text conditioning to a minimum length of 512 along dim 1."""
    def extra_conds(self, **kwargs):
        """Extend the parent's conds with a length-padded ``c_crossattn`` when ``cross_attn`` is supplied."""
        conds = super().extra_conds(**kwargs)
        text_emb = kwargs.get("cross_attn", None)
        if text_emb is None:
            return conds
        min_text_len = 512
        missing = min_text_len - text_emb.shape[1]
        if missing > 0:
            # F.pad pairs run from the last dimension backwards: (0, 0) leaves the last
            # dim alone and (missing, 0) prepends zeros along the second-to-last dim
            # (dim 1 for a 3-D tensor — presumably (batch, tokens, channels); confirm).
            text_emb = torch.nn.functional.pad(text_emb, (0, 0, missing, 0))
        conds['c_crossattn'] = comfy.conds.CONDRegular(text_emb)
        return conds
 class GenmoMochi(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
@ -1536,3 +1547,94 @@ class HunyuanImage21Refiner(HunyuanImage21):
        out = super().extra_conds(**kwargs)
        out['disable_time_r'] = comfy.conds.CONDConstant(True)
        return out
 class HunyuanVideo15(HunyuanVideo):
    """HunyuanVideo 1.5 model wrapper: builds the concat conditioning and extra conds."""
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device)

    def concat_cond(self, **kwargs):
        """Return the tensor to concatenate to the noise along dim 1, or None.

        The result is the conditioning latent followed by a single mask channel.
        None is returned when the model's input projection has no room for
        conditioning-latent channels.
        """
        noise = kwargs.get("noise", None)
        # input channels of img_in = noise channels + cond image channels + 1 mask channel
        # (original note: noise 32, img cond 32, mask 1)
        model_in_channels = self.diffusion_model.img_in.proj.weight.shape[1]
        cond_channels = model_in_channels - noise.shape[1] - 1
        if cond_channels == 0:
            return None

        cond_latent = kwargs.get("concat_latent_image", None)
        device = kwargs["device"]
        width = noise.shape[-1]
        height = noise.shape[-2]
        batch = noise.shape[0]

        if cond_latent is None:
            # No conditioning image supplied: use zeros shaped like the noise,
            # but with the channel count the model expects.
            zeros_shape = list(noise.shape)
            zeros_shape[1] = cond_channels
            cond_latent = torch.zeros(zeros_shape, dtype=noise.dtype, layout=noise.layout, device=noise.device)
        else:
            group = self.latent_format.latent_channels
            cond_latent = utils.common_upscale(cond_latent.to(device), width, height, "bilinear", "center")
            # Run process_latent_in on each group of latent_channels channels in turn.
            for start in range(0, cond_latent.shape[1], group):
                stop = start + group
                cond_latent[:, start: stop] = self.process_latent_in(cond_latent[:, start: stop])
            cond_latent = utils.resize_to_batch_size(cond_latent, batch)

        cond_mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
        if cond_mask is None:
            cond_mask = torch.zeros_like(noise)[:, :1]
        else:
            # The supplied mask is inverted before being handed to the model.
            cond_mask = 1.0 - cond_mask
            cond_mask = utils.common_upscale(cond_mask.to(device), width, height, "bilinear", "center")
            frames_missing = noise.shape[-3] - cond_mask.shape[-3]
            if frames_missing > 0:
                # Zero-pad at the end of the third-to-last dim to match the noise.
                cond_mask = torch.nn.functional.pad(cond_mask, (0, 0, 0, 0, 0, frames_missing), mode='constant', value=0)
            cond_mask = utils.resize_to_batch_size(cond_mask, batch)

        return torch.cat((cond_latent, cond_mask), dim=1)

    def extra_conds(self, **kwargs):
        """Extend the parent's conds with the optional inputs found in kwargs."""
        conds = super().extra_conds(**kwargs)

        attn_mask = kwargs.get("attention_mask", None)
        # Only forward the mask when its sum differs from its element count.
        if attn_mask is not None and torch.numel(attn_mask) != attn_mask.sum():
            conds['attention_mask'] = comfy.conds.CONDRegular(attn_mask)

        text_embeds = kwargs.get("cross_attn", None)
        if text_embeds is not None:
            conds['c_crossattn'] = comfy.conds.CONDRegular(text_embeds)

        byt5_embeds = kwargs.get("conditioning_byt5small", None)
        if byt5_embeds is not None:
            conds['txt_byt5'] = comfy.conds.CONDRegular(byt5_embeds)

        guidance_scale = kwargs.get("guidance", 6.0)
        if guidance_scale is not None:
            conds['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance_scale]))

        vision_out = kwargs.get("clip_vision_output", None)
        if vision_out is not None:
            conds['clip_fea'] = comfy.conds.CONDRegular(vision_out.last_hidden_state)

        return conds
 class HunyuanVideo15_SR_Distilled(HunyuanVideo15):
    """HunyuanVideo 1.5 variant with its own concat conditioning and disable_time_r set to False."""
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device)

    def concat_cond(self, **kwargs):
        """Return the concat conditioning tensor.

        Without a "concat_latent_image" kwarg this is an all-zero tensor with
        2 * C + 2 channels, where C is the noise channel count. Otherwise the
        supplied latent is resized, and its channels [C + 1, 2 * C + 1) are
        either scaled by 0.75 or blended with seeded gaussian noise.
        """
        noise = kwargs.get("noise", None)
        cond = kwargs.get("concat_latent_image", None)
        noise_augmentation = kwargs.get("noise_augmentation", 0.0)
        device = kwargs["device"]
        channels = noise.shape[1]

        if cond is None:
            zeros_shape = [noise.shape[0], channels * 2 + 2, noise.shape[-3], noise.shape[-2], noise.shape[-1]]
            return torch.zeros(zeros_shape, device=comfy.model_management.intermediate_device())

        cond = utils.common_upscale(cond.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
        # process_latent_in is intentionally not applied: scaling wasn't applied in reference code
        cond = utils.resize_to_batch_size(cond, noise.shape[0])

        # presumably the low-quality input image channels (per the original name) - confirm
        lq_channels = slice(channels + 1, 2 * channels + 1)
        if noise_augmentation > 0:
            # Noise is drawn on the CPU from a generator seeded with (seed - 10),
            # then moved to the conditioning tensor's device.
            rng = torch.Generator(device="cpu")
            rng.manual_seed(kwargs.get("seed", 0) - 10)
            aug_noise = torch.randn(cond[:, lq_channels].shape, generator=rng, dtype=cond.dtype, device="cpu").to(cond.device)
            keep_scale = min(1.0 - noise_augmentation, 0.75)
            cond[:, lq_channels] = noise_augmentation * aug_noise + keep_scale * cond[:, lq_channels]
        else:
            cond[:, lq_channels] = 0.75 * cond[:, lq_channels]
        return cond

    def extra_conds(self, **kwargs):
        """Return the parent's conds with 'disable_time_r' set to a constant False."""
        conds = super().extra_conds(**kwargs)
        conds['disable_time_r'] = comfy.conds.CONDConstant(False)
        return conds
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@ -186,30 +186,68 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
        guidance_keys = list(filter(lambda a: a.startswith("{}guidance_in.".format(key_prefix)), state_dict_keys))
        dit_config["guidance_embed"] = len(guidance_keys) > 0
        # HunyuanVideo 1.5
        if '{}cond_type_embedding.weight'.format(key_prefix) in state_dict_keys:
            dit_config["use_cond_type_embedding"] = True
        else:
            dit_config["use_cond_type_embedding"] = False
        if '{}vision_in.proj.0.weight'.format(key_prefix) in state_dict_keys:
            dit_config["vision_in_dim"] = state_dict['{}vision_in.proj.0.weight'.format(key_prefix)].shape[0]
        else:
            dit_config["vision_in_dim"] = None
        return dit_config
    if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in state_dict_keys and ('{}img_in.weight'.format(key_prefix) in state_dict_keys or f"{key_prefix}distilled_guidance_layer.norms.0.scale" in state_dict_keys): #Flux, Chroma or Chroma Radiance (has no img_in.weight)
        dit_config = {}
-        dit_config["image_model"] = "flux"
+        if '{}double_stream_modulation_img.lin.weight'.format(key_prefix) in state_dict_keys:
            dit_config["image_model"] = "flux2"
            dit_config["axes_dim"] = [32, 32, 32, 32]
            dit_config["num_heads"] = 48
            dit_config["mlp_ratio"] = 3.0
            dit_config["theta"] = 2000
            dit_config["out_channels"] = 128
            dit_config["global_modulation"] = True
            dit_config["vec_in_dim"] = None
            dit_config["mlp_silu_act"] = True
            dit_config["qkv_bias"] = False
            dit_config["ops_bias"] = False
            dit_config["default_ref_method"] = "index"
            dit_config["ref_index_scale"] = 10.0
            patch_size = 1
        else:
            dit_config["image_model"] = "flux"
            dit_config["axes_dim"] = [16, 56, 56]
            dit_config["num_heads"] = 24
            dit_config["mlp_ratio"] = 4.0
            dit_config["theta"] = 10000
            dit_config["out_channels"] = 16
            dit_config["qkv_bias"] = True
            patch_size = 2
        dit_config["in_channels"] = 16
-        patch_size = 2
+        dit_config["hidden_size"] = 3072
        dit_config["context_in_dim"] = 4096
        dit_config["patch_size"] = patch_size
        in_key = "{}img_in.weight".format(key_prefix)
        if in_key in state_dict_keys:
-            dit_config["in_channels"] = state_dict[in_key].shape[1] // (patch_size * patch_size)
+            w = state_dict[in_key]
-        dit_config["out_channels"] = 16
+            dit_config["in_channels"] = w.shape[1] // (patch_size * patch_size)
            dit_config["hidden_size"] = w.shape[0]
        txt_in_key = "{}txt_in.weight".format(key_prefix)
        if txt_in_key in state_dict_keys:
            w = state_dict[txt_in_key]
            dit_config["context_in_dim"] = w.shape[1]
            dit_config["hidden_size"] = w.shape[0]
        vec_in_key = '{}vector_in.in_layer.weight'.format(key_prefix)
        if vec_in_key in state_dict_keys:
            dit_config["vec_in_dim"] = state_dict[vec_in_key].shape[1]
-        dit_config["context_in_dim"] = 4096
+
        dit_config["hidden_size"] = 3072
        dit_config["mlp_ratio"] = 4.0
        dit_config["num_heads"] = 24
        dit_config["depth"] = count_blocks(state_dict_keys, '{}double_blocks.'.format(key_prefix) + '{}.')
        dit_config["depth_single_blocks"] = count_blocks(state_dict_keys, '{}single_blocks.'.format(key_prefix) + '{}.')
        dit_config["axes_dim"] = [16, 56, 56]
        dit_config["theta"] = 10000
        dit_config["qkv_bias"] = True
        if '{}distilled_guidance_layer.0.norms.0.scale'.format(key_prefix) in state_dict_keys or '{}distilled_guidance_layer.norms.0.scale'.format(key_prefix) in state_dict_keys: #Chroma
            dit_config["image_model"] = "chroma"
            dit_config["in_channels"] = 64
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@ -504,6 +504,7 @@ class LoadedModel:
        if use_more_vram == 0:
            use_more_vram = 1e32
        self.model_use_more_vram(use_more_vram, force_patch_weights=force_patch_weights)
        real_model = self.model.model
        if is_intel_xpu() and not args.disable_ipex_optimize and 'ipex' in globals() and real_model is not None:
@ -689,7 +690,10 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu
            current_free_mem = get_free_memory(torch_dev) + loaded_memory
            lowvram_model_memory = max(128 * 1024 * 1024, (current_free_mem - minimum_memory_required), min(current_free_mem * MIN_WEIGHT_MEMORY_RATIO, current_free_mem - minimum_inference_memory()))
-            lowvram_model_memory = max(0.1, lowvram_model_memory - loaded_memory)
+            lowvram_model_memory = lowvram_model_memory - loaded_memory
            if lowvram_model_memory == 0:
                lowvram_model_memory = 0.1
        if vram_set_state == VRAMState.NO_VRAM:
            lowvram_model_memory = 0.1
@ -1094,12 +1098,16 @@ if not args.disable_pinned_memory:
            MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.95
        logging.info("Enabled pinned memory {}".format(MAX_PINNED_MEMORY // (1024 * 1024)))
 PINNING_ALLOWED_TYPES = set(["Parameter", "QuantizedTensor"])
 def pin_memory(tensor):
    global TOTAL_PINNED_MEMORY
    if MAX_PINNED_MEMORY <= 0:
        return False
    if type(tensor).__name__ not in PINNING_ALLOWED_TYPES:
        return False
    if not is_device_cpu(tensor.device):
        return False
@ -1109,11 +1117,17 @@ def pin_memory(tensor):
        #on the GPU async. So dont trust the CUDA API and guard here
        return False
    if not tensor.is_contiguous():
        return False
    size = tensor.numel() * tensor.element_size()
    if (TOTAL_PINNED_MEMORY + size) > MAX_PINNED_MEMORY:
        return False
    ptr = tensor.data_ptr()
    if ptr == 0:
        return False
    if torch.cuda.cudart().cudaHostRegister(ptr, size, 1) == 0:
        PINNED_MEMORY[ptr] = size
        TOTAL_PINNED_MEMORY += size
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@ -231,7 +231,6 @@ class ModelPatcher:
        self.object_patches_backup = {}
        self.weight_wrapper_patches = {}
        self.model_options = {"transformer_options":{}}
        self.model_size()
        self.load_device = load_device
        self.offload_device = offload_device
        self.weight_inplace_update = weight_inplace_update
@ -286,7 +285,7 @@ class ModelPatcher:
        return self.model.lowvram_patch_counter
    def clone(self):
-        n = self.__class__(self.model, self.load_device, self.offload_device, self.size, weight_inplace_update=self.weight_inplace_update)
+        n = self.__class__(self.model, self.load_device, self.offload_device, self.model_size(), weight_inplace_update=self.weight_inplace_update)
        n.patches = {}
        for k in self.patches:
            n.patches[k] = self.patches[k][:]
@ -843,7 +842,7 @@ class ModelPatcher:
        self.object_patches_backup.clear()
-    def partially_unload(self, device_to, memory_to_free=0):
+    def partially_unload(self, device_to, memory_to_free=0, force_patch_weights=False):
        with self.use_ejected():
            hooks_unpatched = False
            memory_freed = 0
@ -887,13 +886,19 @@ class ModelPatcher:
                        module_mem += move_weight_functions(m, device_to)
                        if lowvram_possible:
                            if weight_key in self.patches:
-                                _, set_func, convert_func = get_key_weight(self.model, weight_key)
+                                if force_patch_weights:
-                                m.weight_function.append(LowVramPatch(weight_key, self.patches, convert_func, set_func))
+                                    self.patch_weight_to_device(weight_key)
-                                patch_counter += 1
+                                else:
                                    _, set_func, convert_func = get_key_weight(self.model, weight_key)
                                    m.weight_function.append(LowVramPatch(weight_key, self.patches, convert_func, set_func))
                                    patch_counter += 1
                            if bias_key in self.patches:
-                                _, set_func, convert_func = get_key_weight(self.model, bias_key)
+                                if force_patch_weights:
-                                m.bias_function.append(LowVramPatch(bias_key, self.patches, convert_func, set_func))
+                                    self.patch_weight_to_device(bias_key)
-                                patch_counter += 1
+                                else:
                                    _, set_func, convert_func = get_key_weight(self.model, bias_key)
                                    m.bias_function.append(LowVramPatch(bias_key, self.patches, convert_func, set_func))
                                    patch_counter += 1
                            cast_weight = True
                        if cast_weight:
@ -909,6 +914,7 @@ class ModelPatcher:
            self.model.model_lowvram = True
            self.model.lowvram_patch_counter += patch_counter
            self.model.model_loaded_weight_memory -= memory_freed
            logging.info("loaded partially: {:.2f} MB loaded, lowvram patches: {}".format(self.model.model_loaded_weight_memory / (1024 * 1024), self.model.lowvram_patch_counter))
            return memory_freed
    def partially_load(self, device_to, extra_memory=0, force_patch_weights=False):
@ -921,6 +927,9 @@ class ModelPatcher:
                extra_memory += (used - self.model.model_loaded_weight_memory)
            self.patch_model(load_weights=False)
            if extra_memory < 0 and not unpatch_weights:
                self.partially_unload(self.offload_device, -extra_memory, force_patch_weights=force_patch_weights)
                return 0
            full_load = False
            if self.model.model_lowvram == False and self.model.model_loaded_weight_memory > 0:
                self.apply_hooks(self.forced_hooks, force_apply=True)
--- a/comfy/ops.py
+++ b/comfy/ops.py
@ -58,7 +58,8 @@ except (ModuleNotFoundError, TypeError):
 NVIDIA_MEMORY_CONV_BUG_WORKAROUND = False
 try:
    if comfy.model_management.is_nvidia():
-        if torch.backends.cudnn.version() >= 91002 and comfy.model_management.torch_version_numeric >= (2, 9) and comfy.model_management.torch_version_numeric <= (2, 10):
+        cudnn_version = torch.backends.cudnn.version()
        if (cudnn_version >= 91002 and cudnn_version < 91500) and comfy.model_management.torch_version_numeric >= (2, 9) and comfy.model_management.torch_version_numeric <= (2, 10):
            #TODO: change upper bound version once it's fixed
            NVIDIA_MEMORY_CONV_BUG_WORKAROUND = True
            logging.info("working around nvidia conv3d memory bug.")
@ -77,7 +78,10 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of
    # will add async-offload support to your cast and improve performance.
    if input is not None:
        if dtype is None:
-            dtype = input.dtype
+            if isinstance(input, QuantizedTensor):
                dtype = input._layout_params["orig_dtype"]
            else:
                dtype = input.dtype
        if bias_dtype is None:
            bias_dtype = dtype
        if device is None:
@ -110,9 +114,9 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of
                for f in s.bias_function:
                    bias = f(bias)
-    weight = weight.to(dtype=dtype)
+    if weight_has_function or weight.dtype != dtype:
    if weight_has_function:
        with wf_context:
            weight = weight.to(dtype=dtype)
            for f in s.weight_function:
                weight = f(weight)
@ -534,127 +538,120 @@ if CUBLAS_IS_AVAILABLE:
 # ==============================================================================
 # Mixed Precision Operations
 # ==============================================================================
-from .quant_ops import QuantizedTensor
+from .quant_ops import QuantizedTensor, QUANT_ALGOS
 QUANT_FORMAT_MIXINS = {
    "float8_e4m3fn": {
        "dtype": torch.float8_e4m3fn,
        "layout_type": "TensorCoreFP8Layout",
        "parameters": {
            "weight_scale": torch.nn.Parameter(torch.zeros((), dtype=torch.float32), requires_grad=False),
            "input_scale": torch.nn.Parameter(torch.zeros((), dtype=torch.float32), requires_grad=False),
        }
    }
 }
-class MixedPrecisionOps(disable_weight_init):
+def mixed_precision_ops(layer_quant_config={}, compute_dtype=torch.bfloat16, full_precision_mm=False):
-    _layer_quant_config = {}
+    class MixedPrecisionOps(manual_cast):
-    _compute_dtype = torch.bfloat16
+        _layer_quant_config = layer_quant_config
        _compute_dtype = compute_dtype
        _full_precision_mm = full_precision_mm
-    class Linear(torch.nn.Module, CastWeightBiasOp):
+        class Linear(torch.nn.Module, CastWeightBiasOp):
-        def __init__(
+            def __init__(
-            self,
+                self,
-            in_features: int,
+                in_features: int,
-            out_features: int,
+                out_features: int,
-            bias: bool = True,
+                bias: bool = True,
-            device=None,
+                device=None,
-            dtype=None,
+                dtype=None,
-        ) -> None:
+            ) -> None:
-            super().__init__()
+                super().__init__()
-            self.factory_kwargs = {"device": device, "dtype": MixedPrecisionOps._compute_dtype}
+                self.factory_kwargs = {"device": device, "dtype": MixedPrecisionOps._compute_dtype}
-            # self.factory_kwargs = {"device": device, "dtype": dtype}
+                # self.factory_kwargs = {"device": device, "dtype": dtype}
-            self.in_features = in_features
+                self.in_features = in_features
-            self.out_features = out_features
+                self.out_features = out_features
-            if bias:
+                if bias:
-                self.bias = torch.nn.Parameter(torch.empty(out_features, **self.factory_kwargs))
+                    self.bias = torch.nn.Parameter(torch.empty(out_features, **self.factory_kwargs))
-            else:
+                else:
-                self.register_parameter("bias", None)
+                    self.register_parameter("bias", None)
-            self.tensor_class = None
+                self.tensor_class = None
                self._full_precision_mm = MixedPrecisionOps._full_precision_mm
-        def reset_parameters(self):
+            def reset_parameters(self):
-            return None
+                return None
-        def _load_from_state_dict(self, state_dict, prefix, local_metadata,
+            def _load_from_state_dict(self, state_dict, prefix, local_metadata,
-                                  strict, missing_keys, unexpected_keys, error_msgs):
+                                    strict, missing_keys, unexpected_keys, error_msgs):
-            device = self.factory_kwargs["device"]
+                device = self.factory_kwargs["device"]
-            layer_name = prefix.rstrip('.')
+                layer_name = prefix.rstrip('.')
-            weight_key = f"{prefix}weight"
+                weight_key = f"{prefix}weight"
-            weight = state_dict.pop(weight_key, None)
+                weight = state_dict.pop(weight_key, None)
-            if weight is None:
+                if weight is None:
-                raise ValueError(f"Missing weight for layer {layer_name}")
+                    raise ValueError(f"Missing weight for layer {layer_name}")
-            manually_loaded_keys = [weight_key]
+                manually_loaded_keys = [weight_key]
-            if layer_name not in MixedPrecisionOps._layer_quant_config:
+                if layer_name not in MixedPrecisionOps._layer_quant_config:
-                self.weight = torch.nn.Parameter(weight.to(device=device, dtype=MixedPrecisionOps._compute_dtype), requires_grad=False)
+                    self.weight = torch.nn.Parameter(weight.to(device=device, dtype=MixedPrecisionOps._compute_dtype), requires_grad=False)
-            else:
+                else:
-                quant_format = MixedPrecisionOps._layer_quant_config[layer_name].get("format", None)
+                    quant_format = MixedPrecisionOps._layer_quant_config[layer_name].get("format", None)
-                if quant_format is None:
+                    if quant_format is None:
-                    raise ValueError(f"Unknown quantization format for layer {layer_name}")
+                        raise ValueError(f"Unknown quantization format for layer {layer_name}")
-                mixin = QUANT_FORMAT_MIXINS[quant_format]
+                    qconfig = QUANT_ALGOS[quant_format]
-                self.layout_type = mixin["layout_type"]
+                    self.layout_type = qconfig["comfy_tensor_layout"]
-                scale_key = f"{prefix}weight_scale"
+                    weight_scale_key = f"{prefix}weight_scale"
-                layout_params = {
+                    layout_params = {
-                    'scale': state_dict.pop(scale_key, None),
+                        'scale': state_dict.pop(weight_scale_key, None),
-                    'orig_dtype': MixedPrecisionOps._compute_dtype
+                        'orig_dtype': MixedPrecisionOps._compute_dtype,
-                }
+                        'block_size': qconfig.get("group_size", None),
-                if layout_params['scale'] is not None:
+                    }
-                    manually_loaded_keys.append(scale_key)
+                    if layout_params['scale'] is not None:
                        manually_loaded_keys.append(weight_scale_key)
-                self.weight = torch.nn.Parameter(
+                    self.weight = torch.nn.Parameter(
-                    QuantizedTensor(weight.to(device=device, dtype=mixin["dtype"]), self.layout_type, layout_params),
+                        QuantizedTensor(weight.to(device=device), self.layout_type, layout_params),
-                    requires_grad=False
+                        requires_grad=False
-                )
+                    )
-                for param_name, param_value in mixin["parameters"].items():
+                    for param_name in qconfig["parameters"]:
-                    param_key = f"{prefix}{param_name}"
+                        param_key = f"{prefix}{param_name}"
-                    _v = state_dict.pop(param_key, None)
+                        _v = state_dict.pop(param_key, None)
-                    if _v is None:
+                        if _v is None:
-                        continue
+                            continue
-                    setattr(self, param_name, torch.nn.Parameter(_v.to(device=device), requires_grad=False))
+                        setattr(self, param_name, torch.nn.Parameter(_v.to(device=device), requires_grad=False))
-                    manually_loaded_keys.append(param_key)
+                        manually_loaded_keys.append(param_key)
-            super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
+                super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
-            for key in manually_loaded_keys:
+                for key in manually_loaded_keys:
-                if key in missing_keys:
+                    if key in missing_keys:
-                    missing_keys.remove(key)
+                        missing_keys.remove(key)
-        def _forward(self, input, weight, bias):
+            def _forward(self, input, weight, bias):
-            return torch.nn.functional.linear(input, weight, bias)
+                return torch.nn.functional.linear(input, weight, bias)
-        def forward_comfy_cast_weights(self, input):
+            def forward_comfy_cast_weights(self, input):
-            weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)
+                weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)
-            x = self._forward(input, weight, bias)
+                x = self._forward(input, weight, bias)
-            uncast_bias_weight(self, weight, bias, offload_stream)
+                uncast_bias_weight(self, weight, bias, offload_stream)
-            return x
+                return x
-        def forward(self, input, *args, **kwargs):
+            def forward(self, input, *args, **kwargs):
-            run_every_op()
+                run_every_op()
            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
                return self.forward_comfy_cast_weights(input, *args, **kwargs)
            if (getattr(self, 'layout_type', None) is not None and
                getattr(self, 'input_scale', None) is not None and
                not isinstance(input, QuantizedTensor)):
                input = QuantizedTensor.from_float(input, self.layout_type, scale=self.input_scale, fp8_dtype=self.weight.dtype)
            return self._forward(input, self.weight, self.bias)
                if self._full_precision_mm or self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
                    return self.forward_comfy_cast_weights(input, *args, **kwargs)
                if (getattr(self, 'layout_type', None) is not None and
                    getattr(self, 'input_scale', None) is not None and
                    not isinstance(input, QuantizedTensor)):
                    input = QuantizedTensor.from_float(input, self.layout_type, scale=self.input_scale, dtype=self.weight.dtype)
                return self._forward(input, self.weight, self.bias)
    return MixedPrecisionOps
 def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, scaled_fp8=None, model_config=None):
-    if model_config and hasattr(model_config, 'layer_quant_config') and model_config.layer_quant_config:
+    fp8_compute = comfy.model_management.supports_fp8_compute(load_device) # TODO: if we support more ops this needs to be more granular
-        MixedPrecisionOps._layer_quant_config = model_config.layer_quant_config
+
-        MixedPrecisionOps._compute_dtype = compute_dtype
+    if model_config and hasattr(model_config, 'layer_quant_config') and model_config.layer_quant_config:
-        logging.info(f"Using mixed precision operations: {len(model_config.layer_quant_config)} quantized layers")
+        logging.info(f"Using mixed precision operations: {len(model_config.layer_quant_config)} quantized layers")
-        return MixedPrecisionOps
+        return mixed_precision_ops(model_config.layer_quant_config, compute_dtype, full_precision_mm=not fp8_compute)
    fp8_compute = comfy.model_management.supports_fp8_compute(load_device)
    if scaled_fp8 is not None:
        return scaled_fp8_ops(fp8_matrix_mult=fp8_compute and fp8_optimizations, scale_input=fp8_optimizations, override_dtype=scaled_fp8)
--- a/comfy/quant_ops.py
+++ b/comfy/quant_ops.py
@ -74,6 +74,12 @@ def _copy_layout_params(params):
            new_params[k] = v
    return new_params
 def _copy_layout_params_inplace(src, dst, non_blocking=False):
    for k, v in src.items():
        if isinstance(v, torch.Tensor):
            dst[k].copy_(v, non_blocking=non_blocking)
        else:
            dst[k] = v
 class QuantizedLayout:
    """
@ -222,6 +228,14 @@ class QuantizedTensor(torch.Tensor):
        new_kwargs = dequant_arg(kwargs)
        return func(*new_args, **new_kwargs)
    def data_ptr(self):
        """Return the data pointer of the underlying quantized storage tensor."""
        qdata = self._qdata
        return qdata.data_ptr()
    def is_pinned(self):
        """Report whether the underlying quantized storage tensor is pinned."""
        qdata = self._qdata
        return qdata.is_pinned()
    def is_contiguous(self):
        """Report whether the underlying quantized storage tensor is contiguous."""
        qdata = self._qdata
        return qdata.is_contiguous()
 # ==============================================================================
 # Generic Utilities (Layout-Agnostic Operations)
@ -318,13 +332,13 @@ def generic_to_dtype_layout(func, args, kwargs):
 def generic_copy_(func, args, kwargs):
    qt_dest = args[0]
    src = args[1]
-
+    non_blocking = args[2] if len(args) > 2 else False
    if isinstance(qt_dest, QuantizedTensor):
        if isinstance(src, QuantizedTensor):
            # Copy from another quantized tensor
-            qt_dest._qdata.copy_(src._qdata)
+            qt_dest._qdata.copy_(src._qdata, non_blocking=non_blocking)
            qt_dest._layout_type = src._layout_type
-            qt_dest._layout_params = _copy_layout_params(src._layout_params)
+            _copy_layout_params_inplace(src._layout_params, qt_dest._layout_params, non_blocking=non_blocking)
        else:
            # Copy from regular tensor - just copy raw data
            qt_dest._qdata.copy_(src)
@ -332,10 +346,42 @@ def generic_copy_(func, args, kwargs):
    return func(*args, **kwargs)
@register_generic_util(torch.ops.aten.to.dtype)
 def generic_to_dtype(func, args, kwargs):
    """Handle .to(dtype) calls - dtype conversion only.

    For a QuantizedTensor no data is converted: only the recorded "orig_dtype"
    layout parameter is updated and the same tensor object is returned.
    Anything else is forwarded to func unchanged.
    """
    tensor = args[0]
    if not isinstance(tensor, QuantizedTensor):
        return func(*args, **kwargs)

    # The target dtype may arrive positionally or as the 'dtype' keyword.
    if len(args) > 1:
        new_dtype = args[1]
    else:
        new_dtype = kwargs.get('dtype')
    # NOTE(review): this mutates the source tensor's layout params in place and
    # returns the source itself, so the caller's original tensor changes too.
    tensor._layout_params["orig_dtype"] = new_dtype
    return tensor
@register_generic_util(torch.ops.aten._has_compatible_shallow_copy_type.default)
 def generic_has_compatible_shallow_copy_type(func, args, kwargs):
    """Always report a compatible shallow-copy type; func, args and kwargs are ignored."""
    return True
@register_generic_util(torch.ops.aten.empty_like.default)
 def generic_empty_like(func, args, kwargs):
    """Empty_like operation - creates an empty tensor with the same quantized structure.

    For a QuantizedTensor the uninitialized storage is allocated with
    torch.empty_like() on the quantized data, the layout params are moved to the
    device of that new storage, and "orig_dtype" is set to the requested dtype
    (or kept when no dtype is requested). Any other input is forwarded to func.
    """
    qt = args[0]
    if not isinstance(qt, QuantizedTensor):
        return func(*args, **kwargs)

    # Work on a copy so the dispatcher's kwargs dict is never mutated.
    qdata_kwargs = dict(kwargs)
    # 'dtype' is the high-precision dtype and must not reach the quantized storage.
    # An explicit dtype=None means "same as input", so keep the recorded
    # orig_dtype instead of overwriting it with None.
    hp_dtype = qdata_kwargs.pop('dtype', None)
    if hp_dtype is None:
        hp_dtype = qt._layout_params["orig_dtype"]
    new_qdata = torch.empty_like(qt._qdata, **qdata_kwargs)
    # Use the device the storage actually landed on: this also covers an
    # explicit device=None, which would otherwise be passed on as the target.
    new_params = _move_layout_params_to_device(qt._layout_params, new_qdata.device)
    new_params['orig_dtype'] = hp_dtype
    return QuantizedTensor(new_qdata, qt._layout_type, new_params)
 # ==============================================================================
 # FP8 Layout + Operation Handlers
 # ==============================================================================
@ -359,8 +405,8 @@ class TensorCoreFP8Layout(QuantizedLayout):
        tensor_scaled = tensor * (1.0 / scale).to(tensor.dtype)
        # TODO: uncomment this if it's actually needed because the clamp has a small performance penalty
-        # lp_amax = torch.finfo(dtype).max
+        lp_amax = torch.finfo(dtype).max
-        # torch.clamp(tensor_scaled, min=-lp_amax, max=lp_amax, out=tensor_scaled)
+        torch.clamp(tensor_scaled, min=-lp_amax, max=lp_amax, out=tensor_scaled)
        qdata = tensor_scaled.to(dtype, memory_format=torch.contiguous_format)
        layout_params = {
@ -378,6 +424,13 @@ class TensorCoreFP8Layout(QuantizedLayout):
    def get_plain_tensors(cls, qtensor):
        return qtensor._qdata, qtensor._layout_params['scale']
 # Table of quantization formats, keyed by format name. Each entry holds:
 #   "storage_t"           - the torch dtype named for storage,
 #   "parameters"          - a set of parameter names (presumably the extra
 #                           per-layer tensors that accompany the weight; TODO confirm),
 #   "comfy_tensor_layout" - a layout name; the one used here also appears as a
 #                           key of LAYOUTS.
 QUANT_ALGOS = {
    "float8_e4m3fn": {
        "storage_t": torch.float8_e4m3fn,
        "parameters": {"weight_scale", "input_scale"},
        "comfy_tensor_layout": "TensorCoreFP8Layout",
    },
 }
 LAYOUTS = {
    "TensorCoreFP8Layout": TensorCoreFP8Layout,
--- a/comfy/sd.py
+++ b/comfy/sd.py
@ -356,7 +356,7 @@ class VAE:
                    self.memory_used_encode = lambda shape, dtype: (700 * shape[2] * shape[3]) * model_management.dtype_size(dtype)
                    self.memory_used_decode = lambda shape, dtype: (700 * shape[2] * shape[3] * 32 * 32) * model_management.dtype_size(dtype)
-                elif sd['decoder.conv_in.weight'].shape[1] == 32:
+                elif sd['decoder.conv_in.weight'].shape[1] == 32 and sd['decoder.conv_in.weight'].ndim == 5:
                    ddconfig = {"block_out_channels": [128, 256, 512, 1024, 1024], "in_channels": 3, "out_channels": 3, "num_res_blocks": 2, "ffactor_spatial": 16, "ffactor_temporal": 4, "downsample_match_channel": True, "upsample_match_channel": True, "refiner_vae": False}
                    self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1]
                    self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32]
@ -382,6 +382,17 @@ class VAE:
                        self.upscale_ratio = 4
                    self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1]
                    if 'decoder.post_quant_conv.weight' in sd:
                        sd = comfy.utils.state_dict_prefix_replace(sd, {"decoder.post_quant_conv.": "post_quant_conv.", "encoder.quant_conv.": "quant_conv."})
                    if 'bn.running_mean' in sd:
                        ddconfig["batch_norm_latent"] = True
                        self.downscale_ratio *= 2
                        self.upscale_ratio *= 2
                        self.latent_channels *= 4
                        old_memory_used_decode = self.memory_used_decode
                        self.memory_used_decode = lambda shape, dtype: old_memory_used_decode(shape, dtype) *  4.0
                    if 'post_quant_conv.weight' in sd:
                        self.first_stage_model = AutoencoderKL(ddconfig=ddconfig, embed_dim=sd['post_quant_conv.weight'].shape[1])
                    else:
@ -441,20 +452,20 @@ class VAE:
            elif "decoder.conv_in.conv.weight" in sd and sd['decoder.conv_in.conv.weight'].shape[1] == 32:
                ddconfig = {"block_out_channels": [128, 256, 512, 1024, 1024], "in_channels": 3, "out_channels": 3, "num_res_blocks": 2, "ffactor_spatial": 16, "ffactor_temporal": 4, "downsample_match_channel": True, "upsample_match_channel": True}
                ddconfig['z_channels'] = sd["decoder.conv_in.conv.weight"].shape[1]
-                self.latent_channels = 64
+                self.latent_channels = 32
                self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 16, 16)
                self.upscale_index_formula = (4, 16, 16)
                self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 16, 16)
                self.downscale_index_formula = (4, 16, 16)
                self.latent_dim = 3
-                self.not_video = True
+                self.not_video = False
                self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32]
                self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.EmptyRegularizer"},
                                                            encoder_config={'target': "comfy.ldm.hunyuan_video.vae_refiner.Encoder", 'params': ddconfig},
                                                            decoder_config={'target': "comfy.ldm.hunyuan_video.vae_refiner.Decoder", 'params': ddconfig})
-                self.memory_used_encode = lambda shape, dtype: (1400 * shape[-2] * shape[-1]) * model_management.dtype_size(dtype)
+                self.memory_used_encode = lambda shape, dtype: (1400 * 9 * shape[-2] * shape[-1]) * model_management.dtype_size(dtype)
-                self.memory_used_decode = lambda shape, dtype: (1400 * shape[-3] * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype)
+                self.memory_used_decode = lambda shape, dtype: (2800 * 4 * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype)
            elif "decoder.conv_in.conv.weight" in sd:
                ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
                ddconfig["conv3d"] = True
@ -911,12 +922,18 @@ class CLIPType(Enum):
    OMNIGEN2 = 17
    QWEN_IMAGE = 18
    HUNYUAN_IMAGE = 19
    HUNYUAN_VIDEO_15 = 20
 # Load every checkpoint listed in ckpt_paths and hand the resulting state
 # dicts to load_text_encoder_state_dicts, whose result is returned.
 # embedding_directory, clip_type and model_options are passed through unchanged.
 def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
    clip_data = []
    for p in ckpt_paths:
-        clip_data.append(comfy.utils.load_torch_file(p, safe_load=True))
+        sd, metadata = comfy.utils.load_torch_file(p, safe_load=True, return_metadata=True)
        # When the file metadata has a "_quantization_metadata" entry, stash it
        # inside the state dict under the same key so it travels with the weights.
        if metadata is not None:
            quant_metadata = metadata.get("_quantization_metadata", None)
            if quant_metadata is not None:
                sd["_quantization_metadata"] = quant_metadata
        clip_data.append(sd)
    return load_text_encoder_state_dicts(clip_data, embedding_directory=embedding_directory, clip_type=clip_type, model_options=model_options)
@ -934,6 +951,8 @@ class TEModel(Enum):
    QWEN25_7B = 11
    BYT5_SMALL_GLYPH = 12
    GEMMA_3_4B = 13
    MISTRAL3_24B = 14
    MISTRAL3_24B_PRUNED_FLUX2 = 15
 def detect_te_model(sd):
    if "text_model.encoder.layers.30.mlp.fc1.weight" in sd:
@ -966,6 +985,13 @@ def detect_te_model(sd):
        if weight.shape[0] == 512:
            return TEModel.QWEN25_7B
    if "model.layers.0.post_attention_layernorm.weight" in sd:
        weight = sd['model.layers.0.post_attention_layernorm.weight']
        if weight.shape[0] == 5120:
            if "model.layers.39.post_attention_layernorm.weight" in sd:
                return TEModel.MISTRAL3_24B
            else:
                return TEModel.MISTRAL3_24B_PRUNED_FLUX2
        return TEModel.LLAMA3_8
    return None
@ -1080,6 +1106,10 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
            else:
                clip_target.clip = comfy.text_encoders.qwen_image.te(**llama_detect(clip_data))
                clip_target.tokenizer = comfy.text_encoders.qwen_image.QwenImageTokenizer
        elif te_model == TEModel.MISTRAL3_24B or te_model == TEModel.MISTRAL3_24B_PRUNED_FLUX2:
            clip_target.clip = comfy.text_encoders.flux.flux2_te(**llama_detect(clip_data), pruned=te_model == TEModel.MISTRAL3_24B_PRUNED_FLUX2)
            clip_target.tokenizer = comfy.text_encoders.flux.Flux2Tokenizer
            tokenizer_data["tekken_model"] = clip_data[0].get("tekken_model", None)
        else:
            # clip_l
            if clip_type == CLIPType.SD3:
@ -1126,6 +1156,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
        elif clip_type == CLIPType.HUNYUAN_IMAGE:
            clip_target.clip = comfy.text_encoders.hunyuan_image.te(**llama_detect(clip_data))
            clip_target.tokenizer = comfy.text_encoders.hunyuan_image.HunyuanImageTokenizer
        elif clip_type == CLIPType.HUNYUAN_VIDEO_15:
            clip_target.clip = comfy.text_encoders.hunyuan_image.te(**llama_detect(clip_data))
            clip_target.tokenizer = comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer
        else:
            clip_target.clip = sdxl_clip.SDXLClipModel
            clip_target.tokenizer = sdxl_clip.SDXLTokenizer
@ -1138,6 +1171,8 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
    parameters = 0
    for c in clip_data:
        if "_quantization_metadata" in c:
            c.pop("_quantization_metadata")
        parameters += comfy.utils.calculate_parameters(c)
        tokenizer_data, model_options = comfy.text_encoders.long_clipl.model_options_long_clip(c, tokenizer_data, model_options)
--- a/comfy/sd1_clip.py
+++ b/comfy/sd1_clip.py
@ -109,13 +109,23 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
        operations = model_options.get("custom_operations", None)
        scaled_fp8 = None
        quantization_metadata = model_options.get("quantization_metadata", None)
        if operations is None:
-            scaled_fp8 = model_options.get("scaled_fp8", None)
+            layer_quant_config = None
-            if scaled_fp8 is not None:
+            if quantization_metadata is not None:
-                operations = comfy.ops.scaled_fp8_ops(fp8_matrix_mult=False, override_dtype=scaled_fp8)
+                layer_quant_config = json.loads(quantization_metadata).get("layers", None)
            if layer_quant_config is not None:
                operations = comfy.ops.mixed_precision_ops(layer_quant_config, dtype, full_precision_mm=True)
                logging.info(f"Using MixedPrecisionOps for text encoder: {len(layer_quant_config)} quantized layers")
            else:
-                operations = comfy.ops.manual_cast
+                # Fallback to scaled_fp8_ops for backward compatibility
                scaled_fp8 = model_options.get("scaled_fp8", None)
                if scaled_fp8 is not None:
                    operations = comfy.ops.scaled_fp8_ops(fp8_matrix_mult=False, override_dtype=scaled_fp8)
                else:
                    operations = comfy.ops.manual_cast
        self.operations = operations
        self.transformer = model_class(config, dtype, device, self.operations)
@ -460,7 +470,7 @@ def load_embed(embedding_name, embedding_directory, embedding_size, embed_key=No
    return embed_out
 class SDTokenizer:
-    def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, tokenizer_data={}, tokenizer_args={}):
+    def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, pad_left=False, tokenizer_data={}, tokenizer_args={}):
        if tokenizer_path is None:
            tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_tokenizer")
        self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path, **tokenizer_args)
@ -468,6 +478,7 @@ class SDTokenizer:
        self.min_length = tokenizer_data.get("{}_min_length".format(embedding_key), min_length)
        self.end_token = None
        self.min_padding = min_padding
        self.pad_left = pad_left
        empty = self.tokenizer('')["input_ids"]
        self.tokenizer_adds_end_token = has_end_token
@ -522,6 +533,12 @@ class SDTokenizer:
                return (embed, "{} {}".format(embedding_name[len(stripped):], leftover))
        return (embed, leftover)
    def pad_tokens(self, tokens, amount):
        """Pad ``tokens`` in place with ``amount`` pad entries.

        Each pad entry is the tuple ``(self.pad_token, 1.0, 0)``. The entries
        are prepended when ``self.pad_left`` is true and appended otherwise.
        A zero or negative ``amount`` leaves ``tokens`` untouched.

        Args:
            tokens: list of token tuples; modified in place.
            amount: number of pad entries to add.

        Returns:
            None.
        """
        padding = [(self.pad_token, 1.0, 0)] * amount
        if self.pad_left:
            # A single slice assignment prepends everything at once; calling
            # insert(0, ...) in a loop shifts the whole list on every iteration.
            tokens[:0] = padding
        else:
            tokens.extend(padding)
    def tokenize_with_weights(self, text:str, return_word_ids=False, tokenizer_options={}, **kwargs):
        '''
@ -600,7 +617,7 @@ class SDTokenizer:
                        if self.end_token is not None:
                            batch.append((self.end_token, 1.0, 0))
                        if self.pad_to_max_length:
-                            batch.extend([(self.pad_token, 1.0, 0)] * (remaining_length))
+                            self.pad_tokens(batch, remaining_length)
                    #start new batch
                    batch = []
                    if self.start_token is not None:
@ -614,11 +631,11 @@ class SDTokenizer:
        if self.end_token is not None:
            batch.append((self.end_token, 1.0, 0))
        if min_padding is not None:
-            batch.extend([(self.pad_token, 1.0, 0)] * min_padding)
+            self.pad_tokens(batch, min_padding)
        if self.pad_to_max_length and len(batch) < self.max_length:
-            batch.extend([(self.pad_token, 1.0, 0)] * (self.max_length - len(batch)))
+            self.pad_tokens(batch, self.max_length - len(batch))
        if min_length is not None and len(batch) < min_length:
-            batch.extend([(self.pad_token, 1.0, 0)] * (min_length - len(batch)))
+            self.pad_tokens(batch, min_length - len(batch))
        if not return_word_ids:
            batched_tokens = [[(t, w) for t, w,_ in x] for x in batched_tokens]
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@ -741,6 +741,37 @@ class FluxSchnell(Flux):
        out = model_base.Flux(self, model_type=model_base.ModelType.FLOW, device=device)
        return out
 class Flux2(Flux):
    # Supported-model entry for the "flux2" image model, derived from Flux.
    unet_config = {
        "image_model": "flux2",
    }

    sampling_settings = {
        "shift": 2.02,
    }

    unet_extra_config = {}
    latent_format = latent_formats.Flux2

    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]

    vae_key_prefix = ["vae."]
    text_encoder_key_prefix = ["text_encoders."]

    def __init__(self, unet_config):
        super().__init__(unet_config)
        # Scale the memory usage factor set up by the parent constructor.
        self.memory_usage_factor = self.memory_usage_factor * (2.0 * 2.0) * 2.36

    def get_model(self, state_dict, prefix="", device=None):
        # state_dict and prefix are accepted for interface compatibility but are
        # not used when building the model object.
        out = model_base.Flux2(self, device=device)
        return out

    def clip_target(self, state_dict={}):
        # TODO: no text encoder target is wired up for Flux2 yet, so None is returned.
        # The statements that previously followed this return were unreachable.
        # They sketched a T5-XXL based target: t5_xxl_detect on the
        # "<text_encoder_key_prefix[0]>t5xxl.transformer." keys, combined with
        # FluxTokenizer and flux_clip. Re-introduce real logic here once the
        # Flux2 text encoder target is decided.
        return None
 class GenmoMochi(supported_models_base.BASE):
    unet_config = {
        "image_model": "mochi_preview",
@ -1374,6 +1405,55 @@ class HunyuanImage21Refiner(HunyuanVideo):
        out = model_base.HunyuanImage21Refiner(self, device=device)
        return out
-models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage]
+class HunyuanVideo15(HunyuanVideo):
    # Supported-model entry derived from HunyuanVideo. It keeps the
    # "hunyuan_video" image_model value and adds a vision_in_dim of 1152 to the
    # unet config (presumably this is what distinguishes it during model
    # detection; TODO confirm against the detection code).
    unet_config = {
        "image_model": "hunyuan_video",
        "vision_in_dim": 1152,
    }
    sampling_settings = {
        "shift": 7.0,
    }
    # Placeholder value, flagged by the author as still to be tuned.
    memory_usage_factor = 4.0 #TODO
    supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]
    latent_format = latent_formats.HunyuanVideo15
    def get_model(self, state_dict, prefix="", device=None):
        # state_dict and prefix are accepted but not used here.
        out = model_base.HunyuanVideo15(self, device=device)
        return out
    def clip_target(self, state_dict={}):
        # Run llama_detect on the keys under "<first text encoder prefix>qwen25_7b.transformer."
        # and forward whatever it reports as keyword arguments to hunyuan_image.te;
        # the tokenizer is HunyuanVideo15Tokenizer.
        pref = self.text_encoder_key_prefix[0]
        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer, comfy.text_encoders.hunyuan_image.te(**hunyuan_detect))
 class HunyuanVideo15_SR_Distilled(HunyuanVideo):
    # Supported-model entry derived from HunyuanVideo. Its unet config carries
    # the same image_model and vision_in_dim values as HunyuanVideo15 plus
    # in_channels == 98, and it uses a sampling shift of 2.0.
    # NOTE(review): this entry is listed before HunyuanVideo15 in `models`,
    # presumably so the more specific config is tried first - confirm.
    unet_config = {
        "image_model": "hunyuan_video",
        "vision_in_dim": 1152,
        "in_channels": 98,
    }
    sampling_settings = {
        "shift": 2.0,
    }
    # Placeholder value, flagged by the author as still to be tuned.
    memory_usage_factor = 4.0 #TODO
    supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]
    latent_format = latent_formats.HunyuanVideo15
    def get_model(self, state_dict, prefix="", device=None):
        # state_dict and prefix are accepted but not used here.
        out = model_base.HunyuanVideo15_SR_Distilled(self, device=device)
        return out
    def clip_target(self, state_dict={}):
        # Same text encoder target as HunyuanVideo15: llama_detect on the
        # "<first text encoder prefix>qwen25_7b.transformer." keys, forwarded to
        # hunyuan_image.te, with HunyuanVideo15Tokenizer as the tokenizer.
        pref = self.text_encoder_key_prefix[0]
        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer, comfy.text_encoders.hunyuan_image.te(**hunyuan_detect))
 models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Flux2]
 models += [SVD_img2vid]
--- a/comfy/text_encoders/flux.py
+++ b/comfy/text_encoders/flux.py
@ -1,10 +1,13 @@
 from comfy import sd1_clip
 import comfy.text_encoders.t5
 import comfy.text_encoders.sd3_clip
 import comfy.text_encoders.llama
 import comfy.model_management
-from transformers import T5TokenizerFast
+from transformers import T5TokenizerFast, LlamaTokenizerFast
 import torch
 import os
 import json
 import base64
 class T5XXLTokenizer(sd1_clip.SDTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
@ -68,3 +71,105 @@ def flux_clip(dtype_t5=None, t5xxl_scaled_fp8=None):
                model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8
            super().__init__(dtype_t5=dtype_t5, device=device, dtype=dtype, model_options=model_options)
    return FluxClipModel_
 def load_mistral_tokenizer(data):
    """Turn a serialized Mistral vocabulary into tokenizer constructor kwargs.

    ``data`` is JSON text/bytes, or a tensor whose raw bytes are that JSON.
    The JSON must provide ``config.default_vocab_size``, a ``vocab`` list and
    a ``special_tokens`` list. Returns a dict with a ``tokenizer_object``
    produced by MistralConverter and ``legacy`` set to False.
    """
    if torch.is_tensor(data):
        data = data.numpy().tobytes()

    # The converter lives in different modules depending on the installed
    # transformers version.
    try:
        from transformers.integrations.mistral import MistralConverter
    except ModuleNotFoundError:
        from transformers.models.pixtral.convert_pixtral_weights_to_hf import MistralConverter

    parsed = json.loads(data)
    size_limit = parsed["config"]["default_vocab_size"]

    # Regular tokens: only ranks below the configured vocabulary size are kept.
    regular = {
        base64.b64decode(entry["token_bytes"]): entry["rank"]
        for entry in parsed["vocab"]
        if entry["rank"] < size_limit
    }

    # Special tokens are keyed by their decoded bytes when available, otherwise
    # by their string form.
    specials = {}
    for entry in parsed["special_tokens"]:
        rank = entry["rank"]
        if "token_bytes" in entry:
            specials[base64.b64decode(entry["token_bytes"])] = rank
        else:
            specials[entry["token_str"]] = rank

    special_keys = list(specials)

    # Special tokens come first in the merged vocabulary; a regular token with
    # the same key overrides the rank but keeps the earlier position.
    merged = {**specials, **regular}
    converter = MistralConverter(vocab=merged, additional_special_tokens=special_keys)
    return {"tokenizer_object": converter.converted(), "legacy": False}
 # Minimal stand-in for a tokenizer class: it offers the from_pretrained()
 # entry point that SDTokenizer calls, ignores the path argument and builds a
 # LlamaTokenizerFast directly from the keyword arguments it is given.
 class MistralTokenizerClass:
    @staticmethod
    def from_pretrained(path, **kwargs):
        return LlamaTokenizerFast(**kwargs)
 class Mistral3Tokenizer(sd1_clip.SDTokenizer):
    """SDTokenizer configured from the "tekken_model" entry of tokenizer_data.

    The tokenizer itself is built by MistralTokenizerClass from the kwargs that
    load_mistral_tokenizer derives from the tekken data. Padding is on the left
    with pad token 11, and no end token is used.
    """

    def __init__(self, embedding_directory=None, tokenizer_data={}):
        # NOTE(review): embedding_directory is accepted but not forwarded to the
        # parent constructor - confirm this is intentional.
        self.tekken_data = tokenizer_data.get("tekken_model", None)
        converter_kwargs = load_mistral_tokenizer(self.tekken_data)
        super().__init__(
            "",
            pad_with_end=False,
            embedding_size=5120,
            embedding_key='mistral3_24b',
            tokenizer_class=MistralTokenizerClass,
            has_end_token=False,
            pad_to_max_length=False,
            pad_token=11,
            max_length=99999999,
            min_length=1,
            pad_left=True,
            tokenizer_args=converter_kwargs,
            tokenizer_data=tokenizer_data,
        )

    def state_dict(self):
        # Expose the raw tekken data under the same key it was read from.
        return {"tekken_model": self.tekken_data}
 class Flux2Tokenizer(sd1_clip.SD1Tokenizer):
    """SD1Tokenizer wrapper around Mistral3Tokenizer that applies a chat template."""

    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(
            embedding_directory=embedding_directory,
            tokenizer_data=tokenizer_data,
            name="mistral3_24b",
            tokenizer=Mistral3Tokenizer,
        )
        # Default template; "{}" is replaced with the prompt text.
        self.llama_template = '[SYSTEM_PROMPT]You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object\nattribution and actions without speculation.[/SYSTEM_PROMPT][INST]{}[/INST]'

    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, **kwargs):
        """Format ``text`` into the template and tokenize it with weights disabled.

        ``llama_template`` overrides the instance's default template when given.
        """
        template = self.llama_template if llama_template is None else llama_template
        return super().tokenize_with_weights(
            template.format(text),
            return_word_ids=return_word_ids,
            disable_weights=True,
            **kwargs,
        )
 class Mistral3_24BModel(sd1_clip.SDClipModel):
    """SDClipModel wrapper around the Mistral3Small24B text model."""

    def __init__(self, device="cpu", layer="all", layer_idx=None, dtype=None, attention_mask=True, model_options={}):
        # An optional "num_layers" model option overrides the layer count; any
        # value below 40 also switches the final norm off.
        config_overrides = {}
        layer_count = model_options.get("num_layers", None)
        if layer_count is not None:
            config_overrides["num_hidden_layers"] = layer_count
            if layer_count < 40:
                config_overrides["final_norm"] = False

        super().__init__(
            device=device,
            layer=layer,
            layer_idx=layer_idx,
            textmodel_json_config=config_overrides,
            dtype=dtype,
            special_tokens={"start": 1, "pad": 0},
            layer_norm_hidden_state=False,
            model_class=comfy.text_encoders.llama.Mistral3Small24B,
            enable_attention_masks=attention_mask,
            return_attention_masks=attention_mask,
            model_options=model_options,
        )
 class Flux2TEModel(sd1_clip.SD1ClipModel):
    """SD1ClipModel wrapper that merges three selected slices of the encoder output."""

    def __init__(self, device="cpu", dtype=None, model_options={}, name="mistral3_24b", clip_model=Mistral3_24BModel):
        super().__init__(device=device, dtype=dtype, name=name, clip_model=clip_model, model_options=model_options)

    def encode_token_weights(self, token_weight_pairs):
        """Encode, then keep indices 10, 20 and 30 of dim 1 merged into the last dim.

        Returns the reshaped output together with the parent's pooled and extra values.
        """
        hidden, pooled, extra = super().encode_token_weights(token_weight_pairs)
        # Presumably dim 1 indexes the stacked per-layer outputs (the model is
        # created with layer="all") - TODO confirm.
        selected = torch.stack([hidden[:, idx] for idx in (10, 20, 30)], dim=1)
        # Move the stacked axis next to the last one, then flatten everything
        # after the first two dims into a single trailing dim.
        swapped = selected.movedim(1, 2)
        merged = swapped.reshape(swapped.shape[0], swapped.shape[1], -1)
        return merged, pooled, extra
 def flux2_te(dtype_llama=None, llama_scaled_fp8=None, llama_quantization_metadata=None, pruned=False):
    """Create a Flux2TEModel subclass with detected text-encoder options baked in.

    Args:
        dtype_llama: when not None, replaces the dtype passed to the model constructor.
        llama_scaled_fp8: when not None, stored as model_options["scaled_fp8"]
            unless that key is already present.
        llama_quantization_metadata: when not None, stored as
            model_options["quantization_metadata"].
        pruned: when true, model_options["num_layers"] is set to 30.

    Returns:
        The generated class (not an instance).
    """
    class Flux2TEModel_(Flux2TEModel):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            # Always work on a private shallow copy. Previously the
            # quantization metadata could be written straight into the caller's
            # dict (or into the shared default {}) whenever no earlier branch
            # had made a copy, leaking state between unrelated model instances.
            model_options = model_options.copy()
            if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options:
                model_options["scaled_fp8"] = llama_scaled_fp8
            if dtype_llama is not None:
                dtype = dtype_llama
            if llama_quantization_metadata is not None:
                model_options["quantization_metadata"] = llama_quantization_metadata
            if pruned:
                model_options["num_layers"] = 30
            super().__init__(device=device, dtype=dtype, model_options=model_options)
    return Flux2TEModel_
--- a/comfy/text_encoders/hunyuan_video.py
+++ b/comfy/text_encoders/hunyuan_video.py
@ -1,6 +1,7 @@
 from comfy import sd1_clip
 import comfy.model_management
 import comfy.text_encoders.llama
 from .hunyuan_image import HunyuanImageTokenizer
 from transformers import LlamaTokenizerFast
 import torch
 import os
@ -17,6 +18,9 @@ def llama_detect(state_dict, prefix=""):
    if scaled_fp8_key in state_dict:
        out["llama_scaled_fp8"] = state_dict[scaled_fp8_key].dtype
    if "_quantization_metadata" in state_dict:
        out["llama_quantization_metadata"] = state_dict["_quantization_metadata"]
    return out
@ -73,6 +77,14 @@ class HunyuanVideoTokenizer:
        return {}
 # HunyuanImageTokenizer variant that swaps in a video-description system
 # prompt and always asks the parent to guard against empty prompt text.
 class HunyuanVideo15Tokenizer(HunyuanImageTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
        # Overrides the template set by the parent constructor; the "{}"
        # placeholder is presumably filled with the user prompt by the parent's
        # tokenize_with_weights - TODO confirm.
        self.llama_template = "<|im_start|>system\nYou are a helpful assistant. Describe the video by detailing the following aspects:\n1. The main content and theme of the video.\n2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.\n3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.\n4. background environment, light, style and atmosphere.\n5. camera angles, movements, and transitions used in the video.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
    def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
        # Same as the parent call, but with prevent_empty_text forced to True.
        # NOTE(review): a caller that also passes prevent_empty_text in kwargs
        # would trigger a duplicate-keyword TypeError here.
        return super().tokenize_with_weights(text, return_word_ids, prevent_empty_text=True, **kwargs)
 class HunyuanVideoClipModel(torch.nn.Module):
    def __init__(self, dtype_llama=None, device="cpu", dtype=None, model_options={}):
        super().__init__()
--- a/comfy/text_encoders/llama.py
+++ b/comfy/text_encoders/llama.py
@ -32,6 +32,29 @@ class Llama2Config:
    q_norm = None
    k_norm = None
    rope_scale = None
    final_norm: bool = True
# Configuration values for the Mistral3Small24B text model.
# Only the annotated attributes below are dataclass fields and can therefore be
# overridden through the constructor (e.g. num_hidden_layers, final_norm); the
# un-annotated ones (head_dim ... rope_scale) are plain class attributes, and
# passing them as constructor keywords raises a TypeError.
@dataclass
 class Mistral3Small24BConfig:
    vocab_size: int = 131072
    hidden_size: int = 5120
    intermediate_size: int = 32768
    num_hidden_layers: int = 40
    num_attention_heads: int = 32
    num_key_value_heads: int = 8
    max_position_embeddings: int = 8192
    rms_norm_eps: float = 1e-5
    rope_theta: float = 1000000000.0
    transformer_type: str = "llama"
    head_dim = 128
    rms_norm_add = False
    mlp_activation = "silu"
    qkv_bias = False
    rope_dims = None
    q_norm = None
    k_norm = None
    rope_scale = None
    # When False, Llama2_ creates no final RMSNorm and skips it in forward().
    final_norm: bool = True
@dataclass
 class Qwen25_3BConfig:
@ -53,6 +76,7 @@ class Qwen25_3BConfig:
    q_norm = None
    k_norm = None
    rope_scale = None
    final_norm: bool = True
@dataclass
 class Qwen25_7BVLI_Config:
@ -74,6 +98,7 @@ class Qwen25_7BVLI_Config:
    q_norm = None
    k_norm = None
    rope_scale = None
    final_norm: bool = True
@dataclass
 class Gemma2_2B_Config:
@ -96,6 +121,7 @@ class Gemma2_2B_Config:
    k_norm = None
    sliding_attention = None
    rope_scale = None
    final_norm: bool = True
@dataclass
 class Gemma3_4B_Config:
@ -118,6 +144,7 @@ class Gemma3_4B_Config:
    k_norm = "gemma3"
    sliding_attention = [False, False, False, False, False, 1024]
    rope_scale = [1.0, 8.0]
    final_norm: bool = True
 class RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-5, add=False, device=None, dtype=None):
@ -366,7 +393,12 @@ class Llama2_(nn.Module):
            transformer(config, index=i, device=device, dtype=dtype, ops=ops)
            for i in range(config.num_hidden_layers)
        ])
-        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
+
        if config.final_norm:
            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
        else:
            self.norm = None
        # self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype)
    def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, position_ids=None, embeds_info=[]):
@ -421,14 +453,16 @@ class Llama2_(nn.Module):
            if i == intermediate_output:
                intermediate = x.clone()
-        x = self.norm(x)
+        if self.norm is not None:
            x = self.norm(x)
        if all_intermediate is not None:
            all_intermediate.append(x.unsqueeze(1).clone())
        if all_intermediate is not None:
            intermediate = torch.cat(all_intermediate, dim=1)
-        if intermediate is not None and final_layer_norm_intermediate:
+        if intermediate is not None and final_layer_norm_intermediate and self.norm is not None:
            intermediate = self.norm(intermediate)
        return x, intermediate
@ -453,6 +487,15 @@ class Llama2(BaseLlama, torch.nn.Module):
        self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
        self.dtype = dtype
 class Mistral3Small24B(BaseLlama, torch.nn.Module):
    """Llama2_ transformer built from a Mistral3Small24BConfig."""

    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
        # config_dict supplies overrides for the config's dataclass fields.
        cfg = Mistral3Small24BConfig(**config_dict)
        self.dtype = dtype
        self.num_layers = cfg.num_hidden_layers
        self.model = Llama2_(cfg, device=device, dtype=dtype, ops=operations)
 class Qwen25_3B(BaseLlama, torch.nn.Module):
    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
--- a/comfy/text_encoders/qwen_image.py
+++ b/comfy/text_encoders/qwen_image.py
@ -17,12 +17,14 @@ class QwenImageTokenizer(sd1_clip.SD1Tokenizer):
        self.llama_template = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
        self.llama_template_images = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"
-    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], **kwargs):
+    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, **kwargs):
        skip_template = False
        if text.startswith('<|im_start|>'):
            skip_template = True
        if text.startswith('<|start_header_id|>'):
            skip_template = True
        if prevent_empty_text and text == '':
            text = ' '
        if skip_template:
            llama_text = text
--- a/comfy_api/internal/async_to_sync.py
+++ b/comfy_api/internal/async_to_sync.py
@ -8,7 +8,7 @@ import os
 import textwrap
 import threading
 from enum import Enum
-from typing import Optional, Type, get_origin, get_args
+from typing import Optional, Type, get_origin, get_args, get_type_hints
 class TypeTracker:
@ -220,11 +220,18 @@ class AsyncToSyncConverter:
            self._async_instance = async_class(*args, **kwargs)
            # Handle annotated class attributes (like execution: Execution)
-            # Get all annotations from the class hierarchy
+            # Get all annotations from the class hierarchy and resolve string annotations
-            all_annotations = {}
+            try:
-            for base_class in reversed(inspect.getmro(async_class)):
+                # get_type_hints resolves string annotations to actual type objects
-                if hasattr(base_class, "__annotations__"):
+                # This handles classes using 'from __future__ import annotations'
-                    all_annotations.update(base_class.__annotations__)
+                all_annotations = get_type_hints(async_class)
            except Exception:
                # Fallback to raw annotations if get_type_hints fails
                # (e.g., for undefined forward references)
                all_annotations = {}
                for base_class in reversed(inspect.getmro(async_class)):
                    if hasattr(base_class, "__annotations__"):
                        all_annotations.update(base_class.__annotations__)
            # For each annotated attribute, check if it needs to be created or wrapped
            for attr_name, attr_type in all_annotations.items():
@ -625,15 +632,19 @@ class AsyncToSyncConverter:
        """Extract class attributes that are classes themselves."""
        class_attributes = []
        # Get resolved type hints to handle string annotations
        try:
            type_hints = get_type_hints(async_class)
        except Exception:
            type_hints = {}
        # Look for class attributes that are classes
        for name, attr in sorted(inspect.getmembers(async_class)):
            if isinstance(attr, type) and not name.startswith("_"):
                class_attributes.append((name, attr))
-            elif (
+            elif name in type_hints:
-                hasattr(async_class, "__annotations__")
+                # Use resolved type hint instead of raw annotation
-                and name in async_class.__annotations__
+                annotation = type_hints[name]
            ):
                annotation = async_class.__annotations__[name]
                if isinstance(annotation, type):
                    class_attributes.append((name, annotation))
@ -908,11 +919,15 @@ class AsyncToSyncConverter:
            attribute_mappings = {}
            # First check annotations for typed attributes (including from parent classes)
-            # Collect all annotations from the class hierarchy
+            # Resolve string annotations to actual types
-            all_annotations = {}
+            try:
-            for base_class in reversed(inspect.getmro(async_class)):
+                all_annotations = get_type_hints(async_class)
-                if hasattr(base_class, "__annotations__"):
+            except Exception:
-                    all_annotations.update(base_class.__annotations__)
+                # Fallback to raw annotations
                all_annotations = {}
                for base_class in reversed(inspect.getmro(async_class)):
                    if hasattr(base_class, "__annotations__"):
                        all_annotations.update(base_class.__annotations__)
            for attr_name, attr_type in sorted(all_annotations.items()):
                for class_name, class_type in class_attributes:
--- a/comfy_api/latest/init.py
+++ b/comfy_api/latest/init.py
@ -7,7 +7,7 @@ from comfy_api.internal.singleton import ProxiedSingleton
 from comfy_api.internal.async_to_sync import create_sync_class
 from comfy_api.latest._input import ImageInput, AudioInput, MaskInput, LatentInput, VideoInput
 from comfy_api.latest._input_impl import VideoFromFile, VideoFromComponents
-from comfy_api.latest._util import VideoCodec, VideoContainer, VideoComponents
+from comfy_api.latest._util import VideoCodec, VideoContainer, VideoComponents, MESH, VOXEL
 from . import _io as io
 from . import _ui as ui
 # from comfy_api.latest._resources import _RESOURCES as resources  #noqa: F401
@ -104,6 +104,8 @@ class Types:
    VideoCodec = VideoCodec
    VideoContainer = VideoContainer
    VideoComponents = VideoComponents
    MESH = MESH
    VOXEL = VOXEL
 ComfyAPI = ComfyAPI_latest
--- a/comfy_api/latest/_input/video_types.py
+++ b/comfy_api/latest/_input/video_types.py
@ -1,5 +1,6 @@
 from __future__ import annotations
 from abc import ABC, abstractmethod
 from fractions import Fraction
 from typing import Optional, Union, IO
 import io
 import av
@ -72,6 +73,33 @@ class VideoInput(ABC):
        frame_count = components.images.shape[0]
        return float(frame_count / components.frame_rate)
    def get_frame_count(self) -> int:
        """
        Count the frames contained in this video.

        This fallback goes through :meth:`get_components` and reads the size of
        the first dimension of ``images``, so it may have to load every frame
        into memory. Implementations backed by a file should override it and
        consult container/stream metadata instead.

        Returns:
            The total frame count, as an ``int``.
        """
        components = self.get_components()
        frame_total = components.images.shape[0]
        return int(frame_total)
    def get_frame_rate(self) -> Fraction:
        """
        Report the frame rate of this video.

        This fallback reads ``frame_rate`` from the result of
        `get_components()`, which materializes the video in memory. Subclasses
        able to inspect the underlying container (e.g. `VideoFromFile`) should
        provide a cheaper override.

        Returns:
            The frame rate, as a Fraction.
        """
        components = self.get_components()
        return components.frame_rate
    def get_container_format(self) -> str:
        """
        Returns the container format of the video (e.g., 'mp4', 'mov', 'avi').
--- a/comfy_api/latest/_input_impl/video_types.py
+++ b/comfy_api/latest/_input_impl/video_types.py
@ -121,6 +121,71 @@ class VideoFromFile(VideoInput):
        raise ValueError(f"Could not determine duration for file '{self.__file}'")
    def get_frame_count(self) -> int:
        """
        Determine how many frames the video holds without materializing them
        as torch tensors.

        Strategies are tried in order and the first positive result wins:
          1. the ``frames`` field of the first video stream;
          2. container duration multiplied by the stream's average rate;
          3. stream duration multiplied by the stream's average rate;
          4. decoding the stream and counting the frames one by one.

        Returns:
            The frame count as an int.

        Raises:
            ValueError: if decoding also yields zero frames, or (raised by
                `_get_first_video_stream`) if there is no video stream.
        """
        if isinstance(self.__file, io.BytesIO):
            # Rewind the in-memory buffer before handing it to av.open
            self.__file.seek(0)

        with av.open(self.__file, mode="r") as container:
            stream = self._get_first_video_stream(container)

            # 1. Prefer the frames field if available
            if stream.frames and stream.frames > 0:
                return int(stream.frames)

            # 2. Metadata-only estimate: container duration x average rate
            if container.duration is not None and stream.average_rate:
                seconds = float(container.duration / av.time_base)
                guess = int(round(seconds * float(stream.average_rate)))
                if guess > 0:
                    return guess

            # 3. Metadata-only estimate: stream duration x average rate
            has_stream_timing = (
                getattr(stream, "duration", None) is not None
                and getattr(stream, "time_base", None) is not None
                and stream.average_rate
            )
            if has_stream_timing:
                seconds = float(stream.duration * stream.time_base)
                guess = int(round(seconds * float(stream.average_rate)))
                if guess > 0:
                    return guess

            # 4. Last resort: decode frames and count them (streaming)
            container.seek(0)
            decoded = sum(1 for packet in container.demux(stream) for _ in packet.decode())
            if decoded == 0:
                raise ValueError(f"Could not determine frame count for file '{self.__file}'")
            return decoded
    def get_frame_rate(self) -> Fraction:
        """
        Report the average frame rate from container metadata, without
        decoding all frames.

        Returns:
            The stream's ``average_rate`` when it is set; otherwise the frame
            count divided by the container duration when both are available;
            otherwise ``Fraction(1)``.
        """
        if isinstance(self.__file, io.BytesIO):
            # Rewind the in-memory buffer before handing it to av.open
            self.__file.seek(0)

        with av.open(self.__file, mode="r") as container:
            stream = self._get_first_video_stream(container)

            # Preferred: use PyAV's average_rate (usually already a Fraction-like)
            if stream.average_rate:
                return Fraction(stream.average_rate)

            # Fallback: estimate from frames + duration if available
            if stream.frames and container.duration:
                seconds = float(container.duration / av.time_base)
                if seconds > 0:
                    approx = Fraction(stream.frames / seconds)
                    return approx.limit_denominator()

            # Last resort: match get_components_internal default
            return Fraction(1)
    def get_container_format(self) -> str:
        """
        Returns the container format of the video (e.g., 'mp4', 'mov', 'avi').
@ -238,6 +303,13 @@ class VideoFromFile(VideoInput):
                        packet.stream = stream_map[packet.stream]
                        output_container.mux(packet)
    def _get_first_video_stream(self, container: InputContainer):
        video_stream = next((s for s in container.streams if s.type == "video"), None)
        if video_stream is None:
            raise ValueError(f"No video stream found in file '{self.__file}'")
        return video_stream
 class VideoFromComponents(VideoInput):
    """
    Class representing video input from tensors.
--- a/comfy_api/latest/_io.py
+++ b/comfy_api/latest/_io.py
@ -27,6 +27,7 @@ from comfy_api.internal import (_ComfyNodeInternal, _NodeOutputInternal, classpr
    prune_dict, shallow_clone_class)
 from comfy_api.latest._resources import Resources, ResourcesLocal
 from comfy_execution.graph_utils import ExecutionBlocker
 from ._util import MESH, VOXEL
 # from comfy_extras.nodes_images import SVG as SVG_ # NOTE: needs to be moved before can be imported due to circular reference
@ -628,6 +629,10 @@ class UpscaleModel(ComfyTypeIO):
    if TYPE_CHECKING:
        Type = ImageModelDescriptor
@comfytype(io_type="LATENT_UPSCALE_MODEL")
 class LatentUpscaleModel(ComfyTypeIO):
    Type = Any
@comfytype(io_type="AUDIO")
 class Audio(ComfyTypeIO):
    class AudioDict(TypedDict):
@ -656,11 +661,11 @@ class LossMap(ComfyTypeIO):
@comfytype(io_type="VOXEL")
 class Voxel(ComfyTypeIO):
-    Type = Any # TODO: VOXEL class is defined in comfy_extras/nodes_hunyuan3d.py; should be moved to somewhere else before referenced directly in v3
+    Type = VOXEL
@comfytype(io_type="MESH")
 class Mesh(ComfyTypeIO):
-    Type = Any # TODO: MESH class is defined in comfy_extras/nodes_hunyuan3d.py; should be moved to somewhere else before referenced directly in v3
+    Type = MESH
@comfytype(io_type="HOOKS")
 class Hooks(ComfyTypeIO):
--- a/comfy_api/latest/_util/init.py
+++ b/comfy_api/latest/_util/init.py
@ -1,8 +1,11 @@
 from .video_types import VideoContainer, VideoCodec, VideoComponents
 from .geometry_types import VOXEL, MESH
 __all__ = [
    # Utility Types
    "VideoContainer",
    "VideoCodec",
    "VideoComponents",
    "VOXEL",
    "MESH",
 ]
--- a/comfy_api/latest/_util/geometry_types.py
+++ b/comfy_api/latest/_util/geometry_types.py
@ -0,0 +1,12 @@
 import torch
 class VOXEL:
    """Thin wrapper that stores voxel data as a single tensor in ``data``."""
    def __init__(self, data: torch.Tensor):
        # NOTE(review): the tensor's shape/dtype is not constrained here - confirm with producers.
        self.data = data
 class MESH:
    """Thin wrapper that stores mesh geometry as ``vertices`` and ``faces`` tensors."""
    def __init__(self, vertices: torch.Tensor, faces: torch.Tensor):
        # NOTE(review): tensor shapes/dtypes are not constrained here - confirm with producers.
        self.vertices = vertices
        self.faces = faces
--- a/comfy_api_nodes/apis/gemini_api.py
+++ b/comfy_api_nodes/apis/gemini_api.py
@ -1,22 +1,230 @@
-from typing import Optional
+from datetime import date
 from enum import Enum
 from typing import Any
-from comfy_api_nodes.apis import GeminiGenerationConfig, GeminiContent, GeminiSafetySetting, GeminiSystemInstructionContent, GeminiTool, GeminiVideoMetadata
+from pydantic import BaseModel, Field
-from pydantic import BaseModel
+
 class GeminiSafetyCategory(str, Enum):
    HARM_CATEGORY_SEXUALLY_EXPLICIT = "HARM_CATEGORY_SEXUALLY_EXPLICIT"
    HARM_CATEGORY_HATE_SPEECH = "HARM_CATEGORY_HATE_SPEECH"
    HARM_CATEGORY_HARASSMENT = "HARM_CATEGORY_HARASSMENT"
    HARM_CATEGORY_DANGEROUS_CONTENT = "HARM_CATEGORY_DANGEROUS_CONTENT"
 class GeminiSafetyThreshold(str, Enum):
    OFF = "OFF"
    BLOCK_NONE = "BLOCK_NONE"
    BLOCK_LOW_AND_ABOVE = "BLOCK_LOW_AND_ABOVE"
    BLOCK_MEDIUM_AND_ABOVE = "BLOCK_MEDIUM_AND_ABOVE"
    BLOCK_ONLY_HIGH = "BLOCK_ONLY_HIGH"
 class GeminiSafetySetting(BaseModel):
    category: GeminiSafetyCategory
    threshold: GeminiSafetyThreshold
 class GeminiRole(str, Enum):
    user = "user"
    model = "model"
 class GeminiMimeType(str, Enum):
    application_pdf = "application/pdf"
    audio_mpeg = "audio/mpeg"
    audio_mp3 = "audio/mp3"
    audio_wav = "audio/wav"
    image_png = "image/png"
    image_jpeg = "image/jpeg"
    image_webp = "image/webp"
    text_plain = "text/plain"
    video_mov = "video/mov"
    video_mpeg = "video/mpeg"
    video_mp4 = "video/mp4"
    video_mpg = "video/mpg"
    video_avi = "video/avi"
    video_wmv = "video/wmv"
    video_mpegps = "video/mpegps"
    video_flv = "video/flv"
 class GeminiInlineData(BaseModel):
    data: str | None = Field(
        None,
        description="The base64 encoding of the image, PDF, or video to include inline in the prompt. "
        "When including media inline, you must also specify the media type (mimeType) of the data. Size limit: 20MB",
    )
    mimeType: GeminiMimeType | None = Field(None)
 class GeminiPart(BaseModel):
    inlineData: GeminiInlineData | None = Field(None)
    text: str | None = Field(None)
 class GeminiTextPart(BaseModel):
    text: str | None = Field(None)
 # One message: a list of parts plus the role that produced it.
 class GeminiContent(BaseModel):
    # Defaults to an empty list when omitted.
    parts: list[GeminiPart] = Field([])
    # Required; must be one of the GeminiRole values.
    role: GeminiRole = Field(..., examples=["user"])
 class GeminiSystemInstructionContent(BaseModel):
    parts: list[GeminiTextPart] = Field(
        ...,
        description="A list of ordered parts that make up a single message. "
        "Different parts may have different IANA MIME types.",
    )
    role: GeminiRole = Field(
        ...,
        description="The identity of the entity that creates the message. "
        "The following values are supported: "
        "user: This indicates that the message is sent by a real person, typically a user-generated message. "
        "model: This indicates that the message is generated by the model. "
        "The model value is used to insert messages from model into the conversation during multi-turn conversations. "
        "For non-multi-turn conversations, this field can be left blank or unset.",
    )
 class GeminiFunctionDeclaration(BaseModel):
    description: str | None = Field(None)
    name: str = Field(...)
    parameters: dict[str, Any] = Field(..., description="JSON schema for the function parameters")
 class GeminiTool(BaseModel):
    functionDeclarations: list[GeminiFunctionDeclaration] | None = Field(None)
 class GeminiOffset(BaseModel):
    nanos: int | None = Field(None, ge=0, le=999999999)
    seconds: int | None = Field(None, ge=-315576000000, le=315576000000)
 class GeminiVideoMetadata(BaseModel):
    endOffset: GeminiOffset | None = Field(None)
    startOffset: GeminiOffset | None = Field(None)
 # Generation parameters; every field is optional and defaults to None.
 # Numeric bounds below are enforced through the Field ge/le constraints.
 class GeminiGenerationConfig(BaseModel):
    # Allowed range: 16..8192
    maxOutputTokens: int | None = Field(None, ge=16, le=8192)
    seed: int | None = Field(None)
    stopSequences: list[str] | None = Field(None)
    # Allowed range: 0.0..2.0
    temperature: float | None = Field(None, ge=0.0, le=2.0)
    # Must be at least 1; no upper bound is declared
    topK: int | None = Field(None, ge=1)
    # Allowed range: 0.0..1.0
    topP: float | None = Field(None, ge=0.0, le=1.0)
 class GeminiImageConfig(BaseModel):
-    aspectRatio: Optional[str] = None
+    aspectRatio: str | None = Field(None)
    imageSize: str | None = Field(None)
 class GeminiImageGenerationConfig(GeminiGenerationConfig):
-    responseModalities: Optional[list[str]] = None
+    responseModalities: list[str] | None = Field(None)
-    imageConfig: Optional[GeminiImageConfig] = None
+    imageConfig: GeminiImageConfig | None = Field(None)
 class GeminiImageGenerateContentRequest(BaseModel):
-    contents: list[GeminiContent]
+    contents: list[GeminiContent] = Field(...)
-    generationConfig: Optional[GeminiImageGenerationConfig] = None
+    generationConfig: GeminiImageGenerationConfig | None = Field(None)
-    safetySettings: Optional[list[GeminiSafetySetting]] = None
+    safetySettings: list[GeminiSafetySetting] | None = Field(None)
-    systemInstruction: Optional[GeminiSystemInstructionContent] = None
+    systemInstruction: GeminiSystemInstructionContent | None = Field(None)
-    tools: Optional[list[GeminiTool]] = None
+    tools: list[GeminiTool] | None = Field(None)
-    videoMetadata: Optional[GeminiVideoMetadata] = None
+    videoMetadata: GeminiVideoMetadata | None = Field(None)
 class GeminiGenerateContentRequest(BaseModel):
    contents: list[GeminiContent] = Field(...)
    generationConfig: GeminiGenerationConfig | None = Field(None)
    safetySettings: list[GeminiSafetySetting] | None = Field(None)
    systemInstruction: GeminiSystemInstructionContent | None = Field(None)
    tools: list[GeminiTool] | None = Field(None)
    videoMetadata: GeminiVideoMetadata | None = Field(None)
 class Modality(str, Enum):
    MODALITY_UNSPECIFIED = "MODALITY_UNSPECIFIED"
    TEXT = "TEXT"
    IMAGE = "IMAGE"
    VIDEO = "VIDEO"
    AUDIO = "AUDIO"
    DOCUMENT = "DOCUMENT"
 class ModalityTokenCount(BaseModel):
    modality: Modality | None = None
    tokenCount: int | None = Field(None, description="Number of tokens for the given modality.")
 class Probability(str, Enum):
    NEGLIGIBLE = "NEGLIGIBLE"
    LOW = "LOW"
    MEDIUM = "MEDIUM"
    HIGH = "HIGH"
    UNKNOWN = "UNKNOWN"
 class GeminiSafetyRating(BaseModel):
    category: GeminiSafetyCategory | None = None
    probability: Probability | None = Field(
        None,
        description="The probability that the content violates the specified safety category",
    )
 class GeminiCitation(BaseModel):
    authors: list[str] | None = None
    endIndex: int | None = None
    license: str | None = None
    publicationDate: date | None = None
    startIndex: int | None = None
    title: str | None = None
    uri: str | None = None
 class GeminiCitationMetadata(BaseModel):
    citations: list[GeminiCitation] | None = None
 class GeminiCandidate(BaseModel):
    citationMetadata: GeminiCitationMetadata | None = None
    content: GeminiContent | None = None
    finishReason: str | None = None
    safetyRatings: list[GeminiSafetyRating] | None = None
 class GeminiPromptFeedback(BaseModel):
    blockReason: str | None = None
    blockReasonMessage: str | None = None
    safetyRatings: list[GeminiSafetyRating] | None = None
 class GeminiUsageMetadata(BaseModel):
    cachedContentTokenCount: int | None = Field(
        None,
        description="Output only. Number of tokens in the cached part in the input (the cached content).",
    )
    candidatesTokenCount: int | None = Field(None, description="Number of tokens in the response(s).")
    candidatesTokensDetails: list[ModalityTokenCount] | None = Field(
        None, description="Breakdown of candidate tokens by modality."
    )
    promptTokenCount: int | None = Field(
        None,
        description="Number of tokens in the request. When cachedContent is set, this is still the total effective prompt size meaning this includes the number of tokens in the cached content.",
    )
    promptTokensDetails: list[ModalityTokenCount] | None = Field(
        None, description="Breakdown of prompt tokens by modality."
    )
    thoughtsTokenCount: int | None = Field(None, description="Number of tokens present in thoughts output.")
    toolUsePromptTokenCount: int | None = Field(None, description="Number of tokens present in tool-use prompt(s).")
 # Parsed response; every field is optional, so consumers must handle None
 # for candidates, promptFeedback, usageMetadata and modelVersion.
 class GeminiGenerateContentResponse(BaseModel):
    candidates: list[GeminiCandidate] | None = Field(None)
    promptFeedback: GeminiPromptFeedback | None = Field(None)
    usageMetadata: GeminiUsageMetadata | None = Field(None)
    modelVersion: str | None = Field(None)
--- a/comfy_api_nodes/apis/topaz_api.py
+++ b/comfy_api_nodes/apis/topaz_api.py
@ -0,0 +1,133 @@
 from typing import Optional, Union
 from pydantic import BaseModel, Field
 # Payload model for an image-enhance request (per the class name).
 # Only source_url is required; every other field carries a default.
 class ImageEnhanceRequest(BaseModel):
    model: str = Field("Reimagine")
    output_format: str = Field("jpeg")
    subject_detection: str = Field("All")
    face_enhancement: bool = Field(True)
    face_enhancement_creativity: float = Field(0, description="Is ignored if face_enhancement is false")
    face_enhancement_strength: float = Field(0.8, description="Is ignored if face_enhancement is false")
    source_url: str = Field(...)
    output_width: Optional[int] = Field(None)
    output_height: Optional[int] = Field(None)
    crop_to_fill: bool = Field(False)
    prompt: Optional[str] = Field(None, description="Text prompt for creative upscaling guidance")
    # NOTE(review): the 1..9 range is stated only in the description; no ge/le constraint enforces it.
    creativity: int = Field(3, description="Creativity settings range from 1 to 9")
    # NOTE(review): these two flags are strings ("true"), not bools - presumably what the API expects; confirm.
    face_preservation: str = Field("true", description="To preserve the identity of characters")
    color_preservation: str = Field("true", description="To preserve the original color")
 class ImageAsyncTaskResponse(BaseModel):
    process_id: str = Field(...)
 class ImageStatusResponse(BaseModel):
    process_id: str = Field(...)
    status: str = Field(...)
    progress: Optional[int] = Field(None)
    credits: int = Field(...)
 class ImageDownloadResponse(BaseModel):
    download_url: str = Field(...)
    expiry: int = Field(...)
 class Resolution(BaseModel):
    width: int = Field(...)
    height: int = Field(...)
 # Description of the source video; all fields are required.
 class CreateCreateVideoRequestSource(BaseModel):
    container: str = Field(...)
    size: int = Field(..., description="Size of the video file in bytes")
    # NOTE(review): typed int, so a fractional duration cannot be represented - confirm callers round/truncate as intended.
    duration: int = Field(..., description="Duration of the video file in seconds")
    frameCount: int = Field(..., description="Total number of frames in the video")
    # NOTE(review): typed int, so non-integer rates (e.g. 30000/1001) cannot be represented - confirm with callers.
    frameRate: int = Field(...)
    resolution: Resolution = Field(...)
 class VideoFrameInterpolationFilter(BaseModel):
    model: str = Field(...)
    slowmo: Optional[int] = Field(None)
    fps: int = Field(...)
    duplicate: bool = Field(...)
    duplicate_threshold: float = Field(...)
 class VideoEnhancementFilter(BaseModel):
    model: str = Field(...)
    auto: Optional[str] = Field(None, description="Auto, Manual, Relative")
    focusFixLevel: Optional[str] = Field(None, description="Downscales video input for correction of blurred subjects")
    compression: Optional[float] = Field(None, description="Strength of compression recovery")
    details: Optional[float] = Field(None, description="Amount of detail reconstruction")
    prenoise: Optional[float] = Field(None, description="Amount of noise to add to input to reduce over-smoothing")
    noise: Optional[float] = Field(None, description="Amount of noise reduction")
    halo: Optional[float] = Field(None, description="Amount of halo reduction")
    preblur: Optional[float] = Field(None, description="Anti-aliasing and deblurring strength")
    blur: Optional[float] = Field(None, description="Amount of sharpness applied")
    grain: Optional[float] = Field(None, description="Grain after AI model processing")
    grainSize: Optional[float] = Field(None, description="Size of generated grain")
    recoverOriginalDetailValue: Optional[float] = Field(None, description="Source details into the output video")
    creativity: Optional[str] = Field(None, description="Creativity level(high, low) for slc-1 only")
    isOptimizedMode: Optional[bool] = Field(None, description="Set to true for Starlight Creative (slc-1) only")
 # Desired properties of the output video; all fields must be supplied.
 class OutputInformationVideo(BaseModel):
    resolution: Resolution = Field(...)
    frameRate: int = Field(...)
    # Declared Optional but without a default: the key must be passed explicitly, though None is accepted.
    audioCodec: Optional[str] = Field(..., description="Required if audioTransfer is Copy or Convert")
    # NOTE(review): the allowed values below are listed only in the descriptions; nothing validates them.
    audioTransfer: str = Field(..., description="Copy, Convert, None")
    dynamicCompressionLevel: str = Field(..., description="Low, Mid, High")
 class Overrides(BaseModel):
    isPaidDiffusion: bool = Field(True)
 # Top-level payload combining the source description, the filters to apply and the output settings.
 class CreateVideoRequest(BaseModel):
    source: CreateCreateVideoRequestSource = Field(...)
    # Each entry is either a frame-interpolation filter or an enhancement filter.
    filters: list[Union[VideoFrameInterpolationFilter, VideoEnhancementFilter]] = Field(...)
    output: OutputInformationVideo = Field(...)
    # Defaults to Overrides(isPaidDiffusion=True) when not provided.
    overrides: Overrides = Field(Overrides(isPaidDiffusion=True))
 class CreateVideoResponse(BaseModel):
    requestId: str = Field(...)
 class VideoAcceptResponse(BaseModel):
    uploadId: str = Field(...)
    urls: list[str] = Field(...)
 class VideoCompleteUploadRequestPart(BaseModel):
    partNum: int = Field(...)
    eTag: str = Field(...)
 class VideoCompleteUploadRequest(BaseModel):
    uploadResults: list[VideoCompleteUploadRequestPart] = Field(...)
 class VideoCompleteUploadResponse(BaseModel):
    message: str = Field(..., description="Confirmation message")
 class VideoStatusResponseEstimates(BaseModel):
    cost: list[int] = Field(...)
 class VideoStatusResponseDownloadUrl(BaseModel):
    url: str = Field(...)
 # Status payload for a video request; only status is required.
 class VideoStatusResponse(BaseModel):
    status: str = Field(...)
    estimates: Optional[VideoStatusResponseEstimates] = Field(None)
    progress: Optional[float] = Field(None)
    # Defaults to an empty string rather than None, although None is also accepted.
    message: Optional[str] = Field("")
    download: Optional[VideoStatusResponseDownloadUrl] = Field(None)
--- a/comfy_api_nodes/nodes_gemini.py
+++ b/comfy_api_nodes/nodes_gemini.py
@ -3,8 +3,6 @@ API Nodes for Gemini Multimodal LLM Usage via Remote API
 See: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference
 """
 from __future__ import annotations
 import base64
 import json
 import os
@ -12,7 +10,7 @@ import time
 import uuid
 from enum import Enum
 from io import BytesIO
-from typing import Literal, Optional
+from typing import Literal
 import torch
 from typing_extensions import override
@ -20,23 +18,24 @@ from typing_extensions import override
 import folder_paths
 from comfy_api.latest import IO, ComfyExtension, Input
 from comfy_api.util import VideoCodec, VideoContainer
-from comfy_api_nodes.apis import (
+from comfy_api_nodes.apis.gemini_api import (
    GeminiContent,
    GeminiGenerateContentRequest,
    GeminiGenerateContentResponse,
    GeminiInlineData,
    GeminiMimeType,
    GeminiPart,
 )
 from comfy_api_nodes.apis.gemini_api import (
    GeminiImageConfig,
    GeminiImageGenerateContentRequest,
    GeminiImageGenerationConfig,
    GeminiInlineData,
    GeminiMimeType,
    GeminiPart,
    GeminiRole,
    Modality,
 )
 from comfy_api_nodes.util import (
    ApiEndpoint,
    audio_to_base64_string,
    bytesio_to_image_tensor,
    get_number_of_images,
    sync_op,
    tensor_to_base64_string,
    validate_string,
@ -57,6 +56,7 @@ class GeminiModel(str, Enum):
    gemini_2_5_flash_preview_04_17 = "gemini-2.5-flash-preview-04-17"
    gemini_2_5_pro = "gemini-2.5-pro"
    gemini_2_5_flash = "gemini-2.5-flash"
    gemini_3_0_pro = "gemini-3-pro-preview"
 class GeminiImageModel(str, Enum):
@ -103,6 +103,16 @@ def get_parts_by_type(response: GeminiGenerateContentResponse, part_type: Litera
    Returns:
        List of response parts matching the requested type.
    """
    if response.candidates is None:
        if response.promptFeedback and response.promptFeedback.blockReason:
            feedback = response.promptFeedback
            raise ValueError(
                f"Gemini API blocked the request. Reason: {feedback.blockReason} ({feedback.blockReasonMessage})"
            )
        raise ValueError(
            "Gemini API returned no response candidates. If you are using the `IMAGE` modality, "
            "try changing it to `IMAGE+TEXT` to view the model's reasoning and understand why image generation failed."
        )
    parts = []
    for part in response.candidates[0].content.parts:
        if part_type == "text" and hasattr(part, "text") and part.text:
@ -139,6 +149,50 @@ def get_image_from_response(response: GeminiGenerateContentResponse) -> torch.Te
    return torch.cat(image_tensors, dim=0)
 def calculate_tokens_price(response: GeminiGenerateContentResponse) -> float | None:
    """
    Compute the price of a Gemini response from its token usage.

    Prices are expressed per 1,000,000 tokens and selected by
    ``response.modelVersion``.

    Args:
        response: Parsed Gemini response carrying ``modelVersion`` and
            ``usageMetadata``.

    Returns:
        Token counts multiplied by the per-million prices, divided by
        1,000,000. ``None`` when the model version is missing or not in the
        price list, or when the response carries no usage metadata.
    """
    if not response.modelVersion:
        return None
    # Define prices (Cost per 1,000,000 tokens), see https://cloud.google.com/vertex-ai/generative-ai/pricing
    if response.modelVersion in ("gemini-2.5-pro-preview-05-06", "gemini-2.5-pro"):
        input_tokens_price = 1.25
        output_text_tokens_price = 10.0
        output_image_tokens_price = 0.0
    elif response.modelVersion in (
        "gemini-2.5-flash-preview-04-17",
        "gemini-2.5-flash",
    ):
        input_tokens_price = 0.30
        output_text_tokens_price = 2.50
        output_image_tokens_price = 0.0
    elif response.modelVersion in (
        "gemini-2.5-flash-image-preview",
        "gemini-2.5-flash-image",
    ):
        input_tokens_price = 0.30
        output_text_tokens_price = 2.50
        output_image_tokens_price = 30.0
    elif response.modelVersion == "gemini-3-pro-preview":
        input_tokens_price = 2
        output_text_tokens_price = 12.0
        output_image_tokens_price = 0.0
    elif response.modelVersion == "gemini-3-pro-image-preview":
        input_tokens_price = 2
        output_text_tokens_price = 12.0
        output_image_tokens_price = 120.0
    else:
        return None
    usage = response.usageMetadata
    if usage is None:
        # usageMetadata is optional in the response model; without it no price can be derived.
        return None
    # Token counts are optional as well; a missing count contributes nothing.
    final_price = (usage.promptTokenCount or 0) * input_tokens_price
    for details in usage.candidatesTokensDetails or []:
        token_count = details.tokenCount or 0
        if details.modality == Modality.IMAGE:
            final_price += output_image_tokens_price * token_count  # for Nano Banana models
        else:
            final_price += output_text_tokens_price * token_count
    if usage.thoughtsTokenCount:
        final_price += output_text_tokens_price * usage.thoughtsTokenCount
    return final_price / 1_000_000.0
 class GeminiNode(IO.ComfyNode):
    """
    Node to generate text responses from a Gemini model.
@ -272,10 +326,10 @@ class GeminiNode(IO.ComfyNode):
        prompt: str,
        model: str,
        seed: int,
-        images: Optional[torch.Tensor] = None,
+        images: torch.Tensor | None = None,
-        audio: Optional[Input.Audio] = None,
+        audio: Input.Audio | None = None,
-        video: Optional[Input.Video] = None,
+        video: Input.Video | None = None,
-        files: Optional[list[GeminiPart]] = None,
+        files: list[GeminiPart] | None = None,
    ) -> IO.NodeOutput:
        validate_string(prompt, strip_whitespace=False)
@ -300,15 +354,15 @@ class GeminiNode(IO.ComfyNode):
            data=GeminiGenerateContentRequest(
                contents=[
                    GeminiContent(
-                        role="user",
+                        role=GeminiRole.user,
                        parts=parts,
                    )
                ]
            ),
            response_model=GeminiGenerateContentResponse,
            price_extractor=calculate_tokens_price,
        )
        # Get result output
        output_text = get_text_from_response(response)
        if output_text:
            # Not a true chat history like the OpenAI Chat node. It is emulated so the frontend can show a copy button.
@ -406,7 +460,7 @@ class GeminiInputFiles(IO.ComfyNode):
        )
    @classmethod
-    def execute(cls, file: str, GEMINI_INPUT_FILES: Optional[list[GeminiPart]] = None) -> IO.NodeOutput:
+    def execute(cls, file: str, GEMINI_INPUT_FILES: list[GeminiPart] | None = None) -> IO.NodeOutput:
        """Loads and formats input files for Gemini API."""
        if GEMINI_INPUT_FILES is None:
            GEMINI_INPUT_FILES = []
@ -421,7 +475,7 @@ class GeminiImage(IO.ComfyNode):
    def define_schema(cls):
        return IO.Schema(
            node_id="GeminiImageNode",
-            display_name="Google Gemini Image",
+            display_name="Nano Banana (Google Gemini Image)",
            category="api node/image/Gemini",
            description="Edit images synchronously via Google API.",
            inputs=[
@ -469,6 +523,13 @@ class GeminiImage(IO.ComfyNode):
                    "or otherwise generates 1:1 squares.",
                    optional=True,
                ),
                IO.Combo.Input(
                    "response_modalities",
                    options=["IMAGE+TEXT", "IMAGE"],
                    tooltip="Choose 'IMAGE' for image-only output, or "
                    "'IMAGE+TEXT' to return both the generated image and a text response.",
                    optional=True,
                ),
            ],
            outputs=[
                IO.Image.Output(),
@ -488,9 +549,10 @@ class GeminiImage(IO.ComfyNode):
        prompt: str,
        model: str,
        seed: int,
-        images: Optional[torch.Tensor] = None,
+        images: torch.Tensor | None = None,
-        files: Optional[list[GeminiPart]] = None,
+        files: list[GeminiPart] | None = None,
        aspect_ratio: str = "auto",
        response_modalities: str = "IMAGE+TEXT",
    ) -> IO.NodeOutput:
        validate_string(prompt, strip_whitespace=True, min_length=1)
        parts: list[GeminiPart] = [GeminiPart(text=prompt)]
@ -510,20 +572,19 @@ class GeminiImage(IO.ComfyNode):
            endpoint=ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"),
            data=GeminiImageGenerateContentRequest(
                contents=[
-                    GeminiContent(role="user", parts=parts),
+                    GeminiContent(role=GeminiRole.user, parts=parts),
                ],
                generationConfig=GeminiImageGenerationConfig(
-                    responseModalities=["TEXT", "IMAGE"],
+                    responseModalities=(["IMAGE"] if response_modalities == "IMAGE" else ["TEXT", "IMAGE"]),
                    imageConfig=None if aspect_ratio == "auto" else image_config,
                ),
            ),
            response_model=GeminiGenerateContentResponse,
            price_extractor=calculate_tokens_price,
        )
        output_image = get_image_from_response(response)
        output_text = get_text_from_response(response)
        if output_text:
            # Not a true chat history like the OpenAI Chat node. It is emulated so the frontend can show a copy button.
            render_spec = {
                "node_id": cls.hidden.unique_id,
                "component": "ChatHistoryWidget",
@ -544,9 +605,150 @@ class GeminiImage(IO.ComfyNode):
                "display_component",
                render_spec,
            )
        return IO.NodeOutput(get_image_from_response(response), output_text)
-        output_text = output_text or "Empty response from Gemini model..."
+
-        return IO.NodeOutput(output_image, output_text)
+class GeminiImage2(IO.ComfyNode):
    @classmethod
    def define_schema(cls):
        """Describe the "Nano Banana Pro" Gemini image node.

        Returns:
            The ``IO.Schema`` for node id ``GeminiImage2Node``: a prompt, model, seed,
            aspect ratio, resolution and response-modality selector, plus optional
            reference images and input files; outputs are one image and one string.
        """
        return IO.Schema(
            node_id="GeminiImage2Node",
            display_name="Nano Banana Pro (Google Gemini Image)",
            category="api node/image/Gemini",
            description="Generate or edit images synchronously via Google Vertex API.",
            inputs=[
                IO.String.Input(
                    "prompt",
                    multiline=True,
                    tooltip="Text prompt describing the image to generate or the edits to apply. "
                    "Include any constraints, styles, or details the model should follow.",
                    default="",
                ),
                IO.Combo.Input(
                    "model",
                    options=["gemini-3-pro-image-preview"],
                ),
                IO.Int.Input(
                    "seed",
                    default=42,
                    min=0,
                    max=0xFFFFFFFFFFFFFFFF,
                    control_after_generate=True,
                    tooltip="When the seed is fixed to a specific value, the model makes a best effort to provide "
                    "the same response for repeated requests. Deterministic output isn't guaranteed. "
                    "Also, changing the model or parameter settings, such as the temperature, "
                    "can cause variations in the response even when you use the same seed value. "
                    "By default, a random seed value is used.",
                ),
                IO.Combo.Input(
                    "aspect_ratio",
                    options=["auto", "1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"],
                    default="auto",
                    # The earlier wording ("a 16:9 square") was self-contradictory.
                    # NOTE(review): the 16:9 figure is kept from the original text -- confirm it
                    # against the model's actual default output.
                    tooltip="If set to 'auto', matches your input image's aspect ratio; "
                    "if no image is provided, a 16:9 image is usually generated.",
                ),
                IO.Combo.Input(
                    "resolution",
                    options=["1K", "2K", "4K"],
                    tooltip="Target output resolution. For 2K/4K the native Gemini upscaler is used.",
                ),
                IO.Combo.Input(
                    "response_modalities",
                    options=["IMAGE+TEXT", "IMAGE"],
                    tooltip="Choose 'IMAGE' for image-only output, or "
                    "'IMAGE+TEXT' to return both the generated image and a text response.",
                ),
                IO.Image.Input(
                    "images",
                    optional=True,
                    tooltip="Optional reference image(s). "
                    "To include multiple images, use the Batch Images node (up to 14).",
                ),
                IO.Custom("GEMINI_INPUT_FILES").Input(
                    "files",
                    optional=True,
                    tooltip="Optional file(s) to use as context for the model. "
                    "Accepts inputs from the Gemini Generate Content Input Files node.",
                ),
            ],
            outputs=[
                IO.Image.Output(),
                IO.String.Output(),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
        )
    @classmethod
    async def execute(
        cls,
        prompt: str,
        model: str,
        seed: int,
        aspect_ratio: str,
        resolution: str,
        response_modalities: str,
        images: torch.Tensor | None = None,
        files: list[GeminiPart] | None = None,
    ) -> IO.NodeOutput:
        """Send the prompt (plus optional reference images/files) and return the image and text.

        ``seed`` is part of the node interface but is not referenced in this method body.

        Raises:
            ValueError: if more than 14 reference images are supplied.
        """
        validate_string(prompt, strip_whitespace=True, min_length=1)

        # The prompt always comes first, followed by any reference images and then any files.
        request_parts: list[GeminiPart] = [GeminiPart(text=prompt)]
        if images is not None:
            if get_number_of_images(images) > 14:
                raise ValueError("The current maximum number of supported images is 14.")
            request_parts += create_image_parts(images)
        if files is not None:
            request_parts += files

        # The aspect ratio is only set explicitly when the user picked something other than "auto".
        image_config = GeminiImageConfig(imageSize=resolution)
        if aspect_ratio != "auto":
            image_config.aspectRatio = aspect_ratio

        if response_modalities == "IMAGE":
            modalities = ["IMAGE"]
        else:
            modalities = ["TEXT", "IMAGE"]

        endpoint = ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST")
        payload = GeminiImageGenerateContentRequest(
            contents=[GeminiContent(role=GeminiRole.user, parts=request_parts)],
            generationConfig=GeminiImageGenerationConfig(
                responseModalities=modalities,
                imageConfig=image_config,
            ),
        )
        response = await sync_op(
            cls,
            endpoint,
            data=payload,
            response_model=GeminiGenerateContentResponse,
            price_extractor=calculate_tokens_price,
        )

        output_text = get_text_from_response(response)
        if output_text:
            # Push a single-entry "history" to the frontend's ChatHistoryWidget for this node.
            node_id = cls.hidden.unique_id
            history_json = json.dumps(
                [
                    {
                        "prompt": prompt,
                        "response": output_text,
                        "response_id": str(uuid.uuid4()),
                        "timestamp": time.time(),
                    }
                ]
            )
            PromptServer.instance.send_sync(
                "display_component",
                {
                    "node_id": node_id,
                    "component": "ChatHistoryWidget",
                    "props": {
                        "history": history_json,
                    },
                },
            )
        return IO.NodeOutput(get_image_from_response(response), output_text)
 class GeminiExtension(ComfyExtension):
@ -555,6 +757,7 @@ class GeminiExtension(ComfyExtension):
        return [
            GeminiNode,
            GeminiImage,
            GeminiImage2,
            GeminiInputFiles,
        ]
--- a/comfy_api_nodes/nodes_kling.py
+++ b/comfy_api_nodes/nodes_kling.py
@ -518,7 +518,9 @@ async def execute_lipsync(
    # Upload the audio file to Comfy API and get download URL
    if audio:
-        audio_url = await upload_audio_to_comfyapi(cls, audio)
+        audio_url = await upload_audio_to_comfyapi(
            cls, audio, container_format="mp3", codec_name="libmp3lame", mime_type="audio/mpeg", filename="output.mp3"
        )
        logging.info("Uploaded audio to Comfy API. URL: %s", audio_url)
    else:
        audio_url = None
--- a/comfy_api_nodes/nodes_topaz.py
+++ b/comfy_api_nodes/nodes_topaz.py
@ -0,0 +1,418 @@
 import builtins
 from io import BytesIO
 import aiohttp
 import torch
 from typing_extensions import override
 from comfy_api.latest import IO, ComfyExtension, Input
 from comfy_api_nodes.apis import topaz_api
 from comfy_api_nodes.util import (
    ApiEndpoint,
    download_url_to_image_tensor,
    download_url_to_video_output,
    get_fs_object_size,
    get_number_of_images,
    poll_op,
    sync_op,
    upload_images_to_comfyapi,
    validate_container_format_is_mp4,
 )
 # Labels offered by the "upscaler_model" combo -> model id placed in the video enhancement filter.
 UPSCALER_MODELS_MAP = {
    "Starlight (Astra) Fast": "slf-1",
    "Starlight (Astra) Creative": "slc-1",
 }
 # Labels offered by the "upscaler_resolution" combo -> pixel value. TopazVideoEnhance.execute
 # assigns this single value to both the target width and the target height.
 UPSCALER_VALUES_MAP = {
    "FullHD (1080p)": 1920,
    "4K (2160p)": 3840,
 }
 class TopazImageEnhance(IO.ComfyNode):
    @classmethod
    def define_schema(cls):
        """Build the schema for the Topaz image-enhancement API node.

        Returns:
            The ``IO.Schema`` for node id ``TopazImageEnhance``: a model selector and one
            image as required inputs, a set of optional tuning inputs, and one image output.
        """
        node_inputs = [
            # Required inputs.
            IO.Combo.Input("model", options=["Reimagine"]),
            IO.Image.Input("image"),
            # Optional guidance and subject selection.
            IO.String.Input(
                "prompt",
                multiline=True,
                default="",
                tooltip="Optional text prompt for creative upscaling guidance.",
                optional=True,
            ),
            IO.Combo.Input(
                "subject_detection",
                options=["All", "Foreground", "Background"],
                optional=True,
            ),
            # Face enhancement controls.
            IO.Boolean.Input(
                "face_enhancement",
                default=True,
                optional=True,
                tooltip="Enhance faces (if present) during processing.",
            ),
            IO.Float.Input(
                "face_enhancement_creativity",
                default=0.0,
                min=0.0,
                max=1.0,
                step=0.01,
                display_mode=IO.NumberDisplay.number,
                optional=True,
                tooltip="Set the creativity level for face enhancement.",
            ),
            IO.Float.Input(
                "face_enhancement_strength",
                default=1.0,
                min=0.0,
                max=1.0,
                step=0.01,
                display_mode=IO.NumberDisplay.number,
                optional=True,
                tooltip="Controls how sharp enhanced faces are relative to the background.",
            ),
            # Output geometry.
            IO.Boolean.Input(
                "crop_to_fill",
                default=False,
                optional=True,
                tooltip="By default, the image is letterboxed when the output aspect ratio differs. "
                "Enable to crop the image to fill the output dimensions.",
            ),
            IO.Int.Input(
                "output_width",
                default=0,
                min=0,
                max=32000,
                step=1,
                display_mode=IO.NumberDisplay.number,
                optional=True,
                tooltip="Zero value means to calculate automatically (usually it will be original size or output_height if specified).",
            ),
            IO.Int.Input(
                "output_height",
                default=0,
                min=0,
                max=32000,
                step=1,
                display_mode=IO.NumberDisplay.number,
                optional=True,
                tooltip="Zero value means to output in the same height as original or output width.",
            ),
            # Creativity and preservation switches.
            IO.Int.Input(
                "creativity",
                default=3,
                min=1,
                max=9,
                step=1,
                display_mode=IO.NumberDisplay.slider,
                optional=True,
            ),
            IO.Boolean.Input(
                "face_preservation",
                default=True,
                optional=True,
                tooltip="Preserve subjects' facial identity.",
            ),
            IO.Boolean.Input(
                "color_preservation",
                default=True,
                optional=True,
                tooltip="Preserve the original colors.",
            ),
        ]
        node_outputs = [
            IO.Image.Output(),
        ]
        hidden_fields = [
            IO.Hidden.auth_token_comfy_org,
            IO.Hidden.api_key_comfy_org,
            IO.Hidden.unique_id,
        ]
        return IO.Schema(
            node_id="TopazImageEnhance",
            display_name="Topaz Image Enhance",
            category="api node/image/Topaz",
            description="Industry-standard upscaling and image enhancement.",
            inputs=node_inputs,
            outputs=node_outputs,
            hidden=hidden_fields,
            is_api_node=True,
        )
    @classmethod
    async def execute(
        cls,
        model: str,
        image: torch.Tensor,
        prompt: str = "",
        subject_detection: str = "All",
        face_enhancement: bool = True,
        face_enhancement_creativity: float = 0.0,
        face_enhancement_strength: float = 1.0,
        crop_to_fill: bool = False,
        output_width: int = 0,
        output_height: int = 0,
        creativity: int = 3,
        face_preservation: bool = True,
        color_preservation: bool = True,
    ) -> IO.NodeOutput:
        """Upload one image, run an asynchronous Topaz enhancement job, and return the result.

        The keyword defaults mirror the defaults declared in ``define_schema`` so that a
        caller omitting an optional input gets the same value the schema advertises
        (``face_enhancement_creativity`` 0.0, ``face_enhancement_strength`` 1.0).

        Args:
            model: Enhancement model name forwarded to the API.
            image: Input image tensor; must contain exactly one image.
            output_width / output_height: ``0`` is sent as ``None`` (let the service decide).
            Remaining arguments are forwarded to ``topaz_api.ImageEnhanceRequest``.

        Returns:
            ``IO.NodeOutput`` holding the enhanced image downloaded from the result URL.

        Raises:
            ValueError: if ``image`` does not contain exactly one image.
        """
        if get_number_of_images(image) != 1:
            raise ValueError("Only one input image is supported.")
        download_url = await upload_images_to_comfyapi(cls, image, max_images=1, mime_type="image/png")
        initial_response = await sync_op(
            cls,
            ApiEndpoint(path="/proxy/topaz/image/v1/enhance-gen/async", method="POST"),
            response_model=topaz_api.ImageAsyncTaskResponse,
            data=topaz_api.ImageEnhanceRequest(
                model=model,
                prompt=prompt,
                subject_detection=subject_detection,
                face_enhancement=face_enhancement,
                face_enhancement_creativity=face_enhancement_creativity,
                face_enhancement_strength=face_enhancement_strength,
                crop_to_fill=crop_to_fill,
                # Zero means "not specified": send None instead of 0.
                output_width=output_width if output_width else None,
                output_height=output_height if output_height else None,
                creativity=creativity,
                # These two flags are sent as the strings "true"/"false" rather than booleans.
                face_preservation=str(face_preservation).lower(),
                color_preservation=str(color_preservation).lower(),
                source_url=download_url[0],
                output_format="png",
            ),
            content_type="multipart/form-data",
        )
        # Poll the job status until it finishes; the displayed price is credits * 0.08.
        await poll_op(
            cls,
            poll_endpoint=ApiEndpoint(path=f"/proxy/topaz/image/v1/status/{initial_response.process_id}"),
            response_model=topaz_api.ImageStatusResponse,
            status_extractor=lambda x: x.status,
            progress_extractor=lambda x: getattr(x, "progress", 0),
            price_extractor=lambda x: x.credits * 0.08,
            poll_interval=8.0,
            max_poll_attempts=160,
            estimated_duration=60,
        )
        # Fetch the download URL for the finished job, then load it as an image tensor.
        results = await sync_op(
            cls,
            ApiEndpoint(path=f"/proxy/topaz/image/v1/download/{initial_response.process_id}"),
            response_model=topaz_api.ImageDownloadResponse,
            monitor_progress=False,
        )
        return IO.NodeOutput(await download_url_to_image_tensor(results.download_url))
 class TopazVideoEnhance(IO.ComfyNode):
    @classmethod
    def define_schema(cls):
        """Build the schema for the Topaz video-enhancement API node.

        Returns:
            The ``IO.Schema`` for node id ``TopazVideoEnhance``: a video input, upscaler
            settings, optional frame-interpolation settings and a compression level, with
            a single video output.
        """
        upscaler_inputs = [
            IO.Boolean.Input("upscaler_enabled", default=True),
            IO.Combo.Input("upscaler_model", options=list(UPSCALER_MODELS_MAP.keys())),
            IO.Combo.Input("upscaler_resolution", options=list(UPSCALER_VALUES_MAP.keys())),
            IO.Combo.Input(
                "upscaler_creativity",
                options=["low", "middle", "high"],
                default="low",
                tooltip="Creativity level (applies only to Starlight (Astra) Creative).",
                optional=True,
            ),
        ]
        interpolation_inputs = [
            IO.Boolean.Input("interpolation_enabled", default=False, optional=True),
            IO.Combo.Input("interpolation_model", options=["apo-8"], default="apo-8", optional=True),
            IO.Int.Input(
                "interpolation_slowmo",
                default=1,
                min=1,
                max=16,
                display_mode=IO.NumberDisplay.number,
                tooltip="Slow-motion factor applied to the input video. "
                "For example, 2 makes the output twice as slow and doubles the duration.",
                optional=True,
            ),
            IO.Int.Input(
                "interpolation_frame_rate",
                default=60,
                min=15,
                max=240,
                display_mode=IO.NumberDisplay.number,
                tooltip="Output frame rate.",
                optional=True,
            ),
            IO.Boolean.Input(
                "interpolation_duplicate",
                default=False,
                tooltip="Analyze the input for duplicate frames and remove them.",
                optional=True,
            ),
            IO.Float.Input(
                "interpolation_duplicate_threshold",
                default=0.01,
                min=0.001,
                max=0.1,
                step=0.001,
                display_mode=IO.NumberDisplay.number,
                tooltip="Detection sensitivity for duplicate frames.",
                optional=True,
            ),
        ]
        compression_input = IO.Combo.Input(
            "dynamic_compression_level",
            options=["Low", "Mid", "High"],
            default="Low",
            tooltip="CQP level.",
            optional=True,
        )
        # Input order: video, upscaler group, interpolation group, compression level.
        node_inputs = [IO.Video.Input("video"), *upscaler_inputs, *interpolation_inputs, compression_input]
        node_outputs = [
            IO.Video.Output(),
        ]
        hidden_fields = [
            IO.Hidden.auth_token_comfy_org,
            IO.Hidden.api_key_comfy_org,
            IO.Hidden.unique_id,
        ]
        return IO.Schema(
            node_id="TopazVideoEnhance",
            display_name="Topaz Video Enhance",
            category="api node/video/Topaz",
            description="Breathe new life into video with powerful upscaling and recovery technology.",
            inputs=node_inputs,
            outputs=node_outputs,
            hidden=hidden_fields,
            is_api_node=True,
        )
    @classmethod
    async def execute(
        cls,
        video: Input.Video,
        upscaler_enabled: bool,
        upscaler_model: str,
        upscaler_resolution: str,
        upscaler_creativity: str = "low",
        interpolation_enabled: bool = False,
        interpolation_model: str = "apo-8",
        interpolation_slowmo: int = 1,
        interpolation_frame_rate: int = 60,
        interpolation_duplicate: bool = False,
        interpolation_duplicate_threshold: float = 0.01,
        dynamic_compression_level: str = "Low",
    ) -> IO.NodeOutput:
        """Create a Topaz video task, upload the source, wait for completion, and return the result.

        Steps, in order: create the task, accept it to obtain upload URL(s), PUT the source
        video to the first URL, report the upload's ETag, poll the task status, then download
        the output video.

        Returns:
            ``IO.NodeOutput`` wrapping the video fetched from ``final_response.download.url``.

        Raises:
            ValueError: if both upscaling and interpolation are disabled.
            NotImplementedError: if the service returns more than one upload URL.
        """
        if upscaler_enabled is False and interpolation_enabled is False:
            raise ValueError("There is nothing to do: both upscaling and interpolation are disabled.")
        validate_container_format_is_mp4(video)
        src_width, src_height = video.get_dimensions()
        # NOTE(review): int() truncates a fractional frame rate (e.g. 29.97 -> 29) -- confirm intended.
        src_frame_rate = int(video.get_frame_rate())
        duration_sec = video.get_duration()
        src_video_stream = video.get_stream_source()
        # Output parameters start as a copy of the source and are overridden by enabled filters.
        target_width = src_width
        target_height = src_height
        target_frame_rate = src_frame_rate
        filters = []
        if upscaler_enabled:
            # NOTE(review): the same map value is used for width and height, so the requested
            # output resolution is square (e.g. 1920x1920) regardless of the source aspect
            # ratio -- confirm this is what the service expects.
            target_width = UPSCALER_VALUES_MAP[upscaler_resolution]
            target_height = UPSCALER_VALUES_MAP[upscaler_resolution]
            filters.append(
                topaz_api.VideoEnhancementFilter(
                    model=UPSCALER_MODELS_MAP[upscaler_model],
                    # creativity and isOptimizedMode are only populated for the "slc-1" model.
                    creativity=(upscaler_creativity if UPSCALER_MODELS_MAP[upscaler_model] == "slc-1" else None),
                    isOptimizedMode=(True if UPSCALER_MODELS_MAP[upscaler_model] == "slc-1" else None),
                ),
            )
        if interpolation_enabled:
            target_frame_rate = interpolation_frame_rate
            filters.append(
                topaz_api.VideoFrameInterpolationFilter(
                    model=interpolation_model,
                    slowmo=interpolation_slowmo,
                    fps=interpolation_frame_rate,
                    duplicate=interpolation_duplicate,
                    duplicate_threshold=interpolation_duplicate_threshold,
                ),
            )
        # Step 1: create the task, describing the source, the filters and the desired output.
        initial_res = await sync_op(
            cls,
            ApiEndpoint(path="/proxy/topaz/video/", method="POST"),
            response_model=topaz_api.CreateVideoResponse,
            data=topaz_api.CreateVideoRequest(
                source=topaz_api.CreateCreateVideoRequestSource(
                    container="mp4",
                    size=get_fs_object_size(src_video_stream),
                    duration=int(duration_sec),
                    frameCount=video.get_frame_count(),
                    frameRate=src_frame_rate,
                    resolution=topaz_api.Resolution(width=src_width, height=src_height),
                ),
                filters=filters,
                output=topaz_api.OutputInformationVideo(
                    resolution=topaz_api.Resolution(width=target_width, height=target_height),
                    frameRate=target_frame_rate,
                    audioCodec="AAC",
                    audioTransfer="Copy",
                    dynamicCompressionLevel=dynamic_compression_level,
                ),
            ),
            wait_label="Creating task",
            final_label_on_success="Task created",
        )
        # Step 2: accept the task; the response carries the upload URL(s).
        upload_res = await sync_op(
            cls,
            ApiEndpoint(
                path=f"/proxy/topaz/video/{initial_res.requestId}/accept",
                method="PATCH",
            ),
            response_model=topaz_api.VideoAcceptResponse,
            wait_label="Preparing upload",
            final_label_on_success="Upload started",
        )
        # Only a single upload URL is handled; anything more is rejected.
        if len(upload_res.urls) > 1:
            raise NotImplementedError(
                "Large files are not currently supported. Please open an issue in the ComfyUI repository."
            )
        # Step 3: PUT the source video to the first upload URL and keep the returned ETag.
        async with aiohttp.ClientSession(headers={"Content-Type": "video/mp4"}) as session:
            if isinstance(src_video_stream, BytesIO):
                # In-memory source: rewind before sending.
                src_video_stream.seek(0)
                async with session.put(upload_res.urls[0], data=src_video_stream, raise_for_status=True) as res:
                    upload_etag = res.headers["Etag"]
            else:
                # Otherwise the source is passed to open() and read in binary mode.
                with builtins.open(src_video_stream, "rb") as video_file:
                    async with session.put(upload_res.urls[0], data=video_file, raise_for_status=True) as res:
                        upload_etag = res.headers["Etag"]
        # Step 4: report the upload as complete, as part number 1 with its ETag.
        await sync_op(
            cls,
            ApiEndpoint(
                path=f"/proxy/topaz/video/{initial_res.requestId}/complete-upload",
                method="PATCH",
            ),
            response_model=topaz_api.VideoCompleteUploadResponse,
            data=topaz_api.VideoCompleteUploadRequest(
                uploadResults=[
                    topaz_api.VideoCompleteUploadRequestPart(
                        partNum=1,
                        eTag=upload_etag,
                    ),
                ],
            ),
            wait_label="Finalizing upload",
            final_label_on_success="Upload completed",
        )
        # Step 5: poll the task status. The price is the first cost estimate * 0.08, or None
        # when estimates are missing or that first value is falsy.
        final_response = await poll_op(
            cls,
            ApiEndpoint(path=f"/proxy/topaz/video/{initial_res.requestId}/status"),
            response_model=topaz_api.VideoStatusResponse,
            status_extractor=lambda x: x.status,
            progress_extractor=lambda x: getattr(x, "progress", 0),
            price_extractor=lambda x: (x.estimates.cost[0] * 0.08 if x.estimates and x.estimates.cost[0] else None),
            poll_interval=10.0,
            max_poll_attempts=320,
        )
        return IO.NodeOutput(await download_url_to_video_output(final_response.download.url))
 class TopazExtension(ComfyExtension):
    @override
    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
        """Return the node classes provided by this extension (image enhance, then video enhance)."""
        node_classes: list[type[IO.ComfyNode]] = [TopazImageEnhance, TopazVideoEnhance]
        return node_classes
 async def comfy_entrypoint() -> TopazExtension:
    """Create and return a new ``TopazExtension`` instance."""
    extension = TopazExtension()
    return extension
--- a/comfy_api_nodes/util/client.py
+++ b/comfy_api_nodes/util/client.py
@ -63,6 +63,7 @@ class _RequestConfig:
    estimated_total: Optional[int] = None
    final_label_on_success: Optional[str] = "Completed"
    progress_origin_ts: Optional[float] = None
    price_extractor: Optional[Callable[[dict[str, Any]], Optional[float]]] = None
@dataclass
@ -77,9 +78,9 @@ class _PollUIState:
 _RETRY_STATUS = {408, 429, 500, 502, 503, 504}
-COMPLETED_STATUSES = ["succeeded", "succeed", "success", "completed", "finished", "done"]
+COMPLETED_STATUSES = ["succeeded", "succeed", "success", "completed", "finished", "done", "complete"]
-FAILED_STATUSES = ["cancelled", "canceled", "fail", "failed", "error"]
+FAILED_STATUSES = ["cancelled", "canceled", "canceling", "fail", "failed", "error"]
-QUEUED_STATUSES = ["created", "queued", "queueing", "submitted"]
+QUEUED_STATUSES = ["created", "queued", "queueing", "submitted", "initializing"]
 async def sync_op(
@ -87,6 +88,7 @@ async def sync_op(
    endpoint: ApiEndpoint,
    *,
    response_model: Type[M],
    price_extractor: Optional[Callable[[M], Optional[float]]] = None,
    data: Optional[BaseModel] = None,
    files: Optional[Union[dict[str, Any], list[tuple[str, Any]]]] = None,
    content_type: str = "application/json",
@ -104,6 +106,7 @@ async def sync_op(
    raw = await sync_op_raw(
        cls,
        endpoint,
        price_extractor=_wrap_model_extractor(response_model, price_extractor),
        data=data,
        files=files,
        content_type=content_type,
@ -175,6 +178,7 @@ async def sync_op_raw(
    cls: type[IO.ComfyNode],
    endpoint: ApiEndpoint,
    *,
    price_extractor: Optional[Callable[[dict[str, Any]], Optional[float]]] = None,
    data: Optional[Union[dict[str, Any], BaseModel]] = None,
    files: Optional[Union[dict[str, Any], list[tuple[str, Any]]]] = None,
    content_type: str = "application/json",
@ -216,6 +220,7 @@ async def sync_op_raw(
        estimated_total=estimated_duration,
        final_label_on_success=final_label_on_success,
        progress_origin_ts=progress_origin_ts,
        price_extractor=price_extractor,
    )
    return await _request_base(cfg, expect_binary=as_binary)
@ -424,7 +429,9 @@ def _display_text(
    if status:
        display_lines.append(f"Status: {status.capitalize() if isinstance(status, str) else status}")
    if price is not None:
-        display_lines.append(f"Price: ${float(price):,.4f}")
+        p = f"{float(price):,.4f}".rstrip("0").rstrip(".")
        if p != "0":
            display_lines.append(f"Price: ${p}")
    if text is not None:
        display_lines.append(text)
    if display_lines:
@ -580,6 +587,7 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool):
    delay = cfg.retry_delay
    operation_succeeded: bool = False
    final_elapsed_seconds: Optional[int] = None
    extracted_price: Optional[float] = None
    while True:
        attempt += 1
        stop_event = asyncio.Event()
@ -767,6 +775,8 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool):
                        except json.JSONDecodeError:
                            payload = {"_raw": text}
                        response_content_to_log = payload if isinstance(payload, dict) else text
                    with contextlib.suppress(Exception):
                        extracted_price = cfg.price_extractor(payload) if cfg.price_extractor else None
                    operation_succeeded = True
                    final_elapsed_seconds = int(time.monotonic() - start_time)
                    try:
@ -871,7 +881,7 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool):
                        else int(time.monotonic() - start_time)
                    ),
                    estimated_total=cfg.estimated_total,
-                    price=None,
+                    price=extracted_price,
                    is_queued=False,
                    processing_elapsed_seconds=final_elapsed_seconds,
                )
--- a/comfy_extras/nodes_easycache.py
+++ b/comfy_extras/nodes_easycache.py
@ -11,13 +11,13 @@ if TYPE_CHECKING:
 def easycache_forward_wrapper(executor, *args, **kwargs):
    # get values from args
    x: torch.Tensor = args[0]
    transformer_options: dict[str] = args[-1]
    if not isinstance(transformer_options, dict):
        transformer_options = kwargs.get("transformer_options")
        if not transformer_options:
            transformer_options = args[-2]
    easycache: EasyCacheHolder = transformer_options["easycache"]
    x: torch.Tensor = args[0][:, :easycache.output_channels]
    sigmas = transformer_options["sigmas"]
    uuids = transformer_options["uuids"]
    if sigmas is not None and easycache.is_past_end_timestep(sigmas):
@ -82,13 +82,13 @@ def easycache_forward_wrapper(executor, *args, **kwargs):
 def lazycache_predict_noise_wrapper(executor, *args, **kwargs):
    # get values from args
    x: torch.Tensor = args[0]
    timestep: float = args[1]
    model_options: dict[str] = args[2]
    easycache: LazyCacheHolder = model_options["transformer_options"]["easycache"]
    if easycache.is_past_end_timestep(timestep):
        return executor(*args, **kwargs)
    # prepare next x_prev
    x: torch.Tensor = args[0][:, :easycache.output_channels]
    next_x_prev = x
    input_change = None
    do_easycache = easycache.should_do_easycache(timestep)
@ -173,7 +173,7 @@ def easycache_sample_wrapper(executor, *args, **kwargs):
 class EasyCacheHolder:
-    def __init__(self, reuse_threshold: float, start_percent: float, end_percent: float, subsample_factor: int, offload_cache_diff: bool, verbose: bool=False):
+    def __init__(self, reuse_threshold: float, start_percent: float, end_percent: float, subsample_factor: int, offload_cache_diff: bool, verbose: bool=False, output_channels: int=None):
        self.name = "EasyCache"
        self.reuse_threshold = reuse_threshold
        self.start_percent = start_percent
@ -202,6 +202,7 @@ class EasyCacheHolder:
        self.allow_mismatch = True
        self.cut_from_start = True
        self.state_metadata = None
        self.output_channels = output_channels
    def is_past_end_timestep(self, timestep: float) -> bool:
        """Tell whether the given timestep has reached or passed ``self.end_t``.

        Returns True when the first element of ``timestep`` is NOT greater
        than ``self.end_t``, and False otherwise.
        """
        # NOTE(review): `timestep` is indexed and `.item()` is called on the
        # comparison result, so it looks like a tensor rather than the
        # annotated `float` -- confirm against callers.
        return not (timestep[0] > self.end_t).item()
@ -264,7 +265,7 @@ class EasyCacheHolder:
                    else:
                        slicing.append(slice(None))
                batch_slice = batch_slice + slicing
-            x[batch_slice] += self.uuid_cache_diffs[uuid].to(x.device)
+            x[tuple(batch_slice)] += self.uuid_cache_diffs[uuid].to(x.device)
        return x
    def update_cache_diff(self, output: torch.Tensor, x: torch.Tensor, uuids: list[UUID]):
@ -283,7 +284,7 @@ class EasyCacheHolder:
                else:
                    slicing.append(slice(None))
                skip_dim = False
-            x = x[slicing]
+            x = x[tuple(slicing)]
        diff = output - x
        batch_offset = diff.shape[0] // len(uuids)
        for i, uuid in enumerate(uuids):
@ -323,7 +324,7 @@ class EasyCacheHolder:
        return self
    def clone(self):
-        return EasyCacheHolder(self.reuse_threshold, self.start_percent, self.end_percent, self.subsample_factor, self.offload_cache_diff, self.verbose)
+        return EasyCacheHolder(self.reuse_threshold, self.start_percent, self.end_percent, self.subsample_factor, self.offload_cache_diff, self.verbose, output_channels=self.output_channels)
 class EasyCacheNode(io.ComfyNode):
@ -350,7 +351,7 @@ class EasyCacheNode(io.ComfyNode):
    @classmethod
    def execute(cls, model: io.Model.Type, reuse_threshold: float, start_percent: float, end_percent: float, verbose: bool) -> io.NodeOutput:
        model = model.clone()
-        model.model_options["transformer_options"]["easycache"] = EasyCacheHolder(reuse_threshold, start_percent, end_percent, subsample_factor=8, offload_cache_diff=False, verbose=verbose)
+        model.model_options["transformer_options"]["easycache"] = EasyCacheHolder(reuse_threshold, start_percent, end_percent, subsample_factor=8, offload_cache_diff=False, verbose=verbose, output_channels=model.model.latent_format.latent_channels)
        model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.OUTER_SAMPLE, "easycache", easycache_sample_wrapper)
        model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.CALC_COND_BATCH, "easycache", easycache_calc_cond_batch_wrapper)
        model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, "easycache", easycache_forward_wrapper)
@ -358,7 +359,7 @@ class EasyCacheNode(io.ComfyNode):
 class LazyCacheHolder:
-    def __init__(self, reuse_threshold: float, start_percent: float, end_percent: float, subsample_factor: int, offload_cache_diff: bool, verbose: bool=False):
+    def __init__(self, reuse_threshold: float, start_percent: float, end_percent: float, subsample_factor: int, offload_cache_diff: bool, verbose: bool=False, output_channels: int=None):
        self.name = "LazyCache"
        self.reuse_threshold = reuse_threshold
        self.start_percent = start_percent
@ -382,6 +383,7 @@ class LazyCacheHolder:
        self.approx_output_change_rates = []
        self.total_steps_skipped = 0
        self.state_metadata = None
        self.output_channels = output_channels
    def has_cache_diff(self) -> bool:
        """Return True when ``self.cache_diff`` holds a value (is not None)."""
        return self.cache_diff is not None
@ -456,7 +458,7 @@ class LazyCacheHolder:
        return self
    def clone(self):
-        return LazyCacheHolder(self.reuse_threshold, self.start_percent, self.end_percent, self.subsample_factor, self.offload_cache_diff, self.verbose)
+        return LazyCacheHolder(self.reuse_threshold, self.start_percent, self.end_percent, self.subsample_factor, self.offload_cache_diff, self.verbose, output_channels=self.output_channels)
 class LazyCacheNode(io.ComfyNode):
    @classmethod
@ -482,7 +484,7 @@ class LazyCacheNode(io.ComfyNode):
    @classmethod
    def execute(cls, model: io.Model.Type, reuse_threshold: float, start_percent: float, end_percent: float, verbose: bool) -> io.NodeOutput:
        model = model.clone()
-        model.model_options["transformer_options"]["easycache"] = LazyCacheHolder(reuse_threshold, start_percent, end_percent, subsample_factor=8, offload_cache_diff=False, verbose=verbose)
+        model.model_options["transformer_options"]["easycache"] = LazyCacheHolder(reuse_threshold, start_percent, end_percent, subsample_factor=8, offload_cache_diff=False, verbose=verbose, output_channels=model.model.latent_format.latent_channels)
        model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.OUTER_SAMPLE, "lazycache", easycache_sample_wrapper)
        model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.PREDICT_NOISE, "lazycache", lazycache_predict_noise_wrapper)
        return io.NodeOutput(model)
--- a/comfy_extras/nodes_flux.py
+++ b/comfy_extras/nodes_flux.py
@ -2,7 +2,10 @@ import node_helpers
 import comfy.utils
 from typing_extensions import override
 from comfy_api.latest import ComfyExtension, io
-
+import comfy.model_management
 import torch
 import math
 import nodes
 class CLIPTextEncodeFlux(io.ComfyNode):
    @classmethod
@ -30,6 +33,27 @@ class CLIPTextEncodeFlux(io.ComfyNode):
    encode = execute  # TODO: remove
 class EmptyFlux2LatentImage(io.ComfyNode):
    # Emits an all-zero latent with 128 channels whose spatial size is the
    # requested pixel size integer-divided by 16 on each axis.
    @classmethod
    def define_schema(cls):
        """Declare the width/height/batch_size inputs and the single latent output."""
        # width and height share identical constraints, so build them once.
        size_limits = dict(default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16)
        return io.Schema(
            node_id="EmptyFlux2LatentImage",
            display_name="Empty Flux 2 Latent",
            category="latent",
            inputs=[
                io.Int.Input("width", **size_limits),
                io.Int.Input("height", **size_limits),
                io.Int.Input("batch_size", default=1, min=1, max=4096),
            ],
            outputs=[io.Latent.Output()],
        )
    @classmethod
    def execute(cls, width, height, batch_size=1) -> io.NodeOutput:
        """Allocate the zero latent on the intermediate device and return it under the "samples" key."""
        latent_shape = [batch_size, 128, height // 16, width // 16]
        target_device = comfy.model_management.intermediate_device()
        return io.NodeOutput({"samples": torch.zeros(latent_shape, device=target_device)})
 class FluxGuidance(io.ComfyNode):
    @classmethod
@ -154,6 +178,58 @@ class FluxKontextMultiReferenceLatentMethod(io.ComfyNode):
    append = execute  # TODO: remove
 def generalized_time_snr_shift(t, mu: float, sigma: float):
    """Apply the shift ``exp(mu) / (exp(mu) + (1 / t - 1) ** sigma)`` to ``t``.

    ``t`` only needs to support the arithmetic used here, so a plain number
    works, as does any object implementing those operators elementwise.
    """
    # exp(mu) appears in both numerator and denominator; compute it once.
    exp_mu = math.exp(mu)
    return exp_mu / (exp_mu + (1 / t - 1) ** sigma)
 def compute_empirical_mu(image_seq_len: int, num_steps: int) -> float:
    """Compute the schedule shift ``mu`` from sequence length and step count.

    Two linear functions of ``image_seq_len`` are evaluated. For sequence
    lengths above 4300 the second one is returned directly. Otherwise the
    result lies on the straight line (in ``num_steps``) that passes through
    the first value at 10 steps and the second value at 200 steps.
    """
    slope_10, intercept_10 = 8.73809524e-05, 1.89833333
    slope_200, intercept_200 = 0.00016927, 0.45666666
    mu_at_200 = slope_200 * image_seq_len + intercept_200
    if image_seq_len > 4300:
        # Long sequences ignore the step count entirely.
        return float(mu_at_200)
    mu_at_10 = slope_10 * image_seq_len + intercept_10
    # Line through (10, mu_at_10) and (200, mu_at_200); 190 == 200 - 10.
    step_slope = (mu_at_200 - mu_at_10) / 190.0
    step_intercept = mu_at_200 - 200.0 * step_slope
    return float(step_slope * num_steps + step_intercept)
 def get_schedule(num_steps: int, image_seq_len: int) -> torch.Tensor:
    """Build the shifted timestep schedule for ``num_steps`` sampling steps.

    Starts from ``num_steps + 1`` evenly spaced values going from 1 down to 0
    and passes them through ``generalized_time_snr_shift`` with ``sigma`` fixed
    at 1.0 and ``mu`` taken from ``compute_empirical_mu``.

    Returns:
        A 1-D tensor with ``num_steps + 1`` entries (not a Python list).
    """
    mu = compute_empirical_mu(image_seq_len, num_steps)
    timesteps = torch.linspace(1, 0, num_steps + 1)
    timesteps = generalized_time_snr_shift(timesteps, mu, 1.0)
    return timesteps
 class Flux2Scheduler(io.ComfyNode):
    # Scheduler node: turns a step count and an image size into a sigma
    # schedule by delegating to get_schedule().
    @classmethod
    def define_schema(cls):
        """Declare the steps/width/height inputs and the sigmas output."""
        node_inputs = [
            io.Int.Input("steps", default=20, min=1, max=4096),
            io.Int.Input("width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=1),
            io.Int.Input("height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=1),
        ]
        return io.Schema(
            node_id="Flux2Scheduler",
            category="sampling/custom_sampling/schedulers",
            inputs=node_inputs,
            outputs=[io.Sigmas.Output()],
        )
    @classmethod
    def execute(cls, steps, width, height) -> io.NodeOutput:
        """Compute the schedule for an image of ``width`` x ``height`` pixels."""
        # Sequence length is the pixel area divided by a 16x16 cell, rounded.
        token_count = round(width * height / (16 * 16))
        return io.NodeOutput(get_schedule(steps, token_count))
 class FluxExtension(ComfyExtension):
    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
@ -163,6 +239,8 @@ class FluxExtension(ComfyExtension):
            FluxDisableGuidance,
            FluxKontextImageScale,
            FluxKontextMultiReferenceLatentMethod,
            EmptyFlux2LatentImage,
            Flux2Scheduler,
        ]
--- a/comfy_extras/nodes_hunyuan.py
+++ b/comfy_extras/nodes_hunyuan.py
@ -4,7 +4,8 @@ import torch
 import comfy.model_management
 from typing_extensions import override
 from comfy_api.latest import ComfyExtension, io
-
+from comfy.ldm.hunyuan_video.upsampler import HunyuanVideo15SRModel
 import folder_paths
 class CLIPTextEncodeHunyuanDiT(io.ComfyNode):
    @classmethod
@ -37,6 +38,7 @@ class EmptyHunyuanLatentVideo(io.ComfyNode):
    def define_schema(cls):
        return io.Schema(
            node_id="EmptyHunyuanLatentVideo",
            display_name="Empty HunyuanVideo 1.0 Latent",
            category="latent/video",
            inputs=[
                io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
@ -57,6 +59,198 @@ class EmptyHunyuanLatentVideo(io.ComfyNode):
    generate = execute  # TODO: remove
 class EmptyHunyuanVideo15Latent(EmptyHunyuanLatentVideo):
    # Variant of EmptyHunyuanLatentVideo that reuses the parent's schema under
    # a new id/display name and allocates a 32-channel latent at 1/16 scale.
    @classmethod
    def define_schema(cls):
        """Return the parent's schema with node_id and display_name overridden."""
        inherited = super().define_schema()
        inherited.node_id = "EmptyHunyuanVideo15Latent"
        inherited.display_name = "Empty HunyuanVideo 1.5 Latent"
        return inherited
    @classmethod
    def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput:
        """Allocate the zero video latent and return it under the "samples" key."""
        # Spatial scale factor is 16 here (instead of 8); every 4 frames
        # collapse into one latent frame, plus one for the first frame.
        latent_frames = ((length - 1) // 4) + 1
        latent_shape = [batch_size, 32, latent_frames, height // 16, width // 16]
        zeros = torch.zeros(latent_shape, device=comfy.model_management.intermediate_device())
        return io.NodeOutput({"samples": zeros})
 class HunyuanVideo15ImageToVideo(io.ComfyNode):
    # Creates an empty 32-channel video latent and, when a start image and/or
    # a CLIP vision output is supplied, attaches them to both conditionings.
    @classmethod
    def define_schema(cls):
        """Declare the conditioning/VAE/size inputs and the three outputs."""
        node_inputs = [
            io.Conditioning.Input("positive"),
            io.Conditioning.Input("negative"),
            io.Vae.Input("vae"),
            io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
            io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
            io.Int.Input("length", default=33, min=1, max=nodes.MAX_RESOLUTION, step=4),
            io.Int.Input("batch_size", default=1, min=1, max=4096),
            io.Image.Input("start_image", optional=True),
            io.ClipVisionOutput.Input("clip_vision_output", optional=True),
        ]
        node_outputs = [
            io.Conditioning.Output(display_name="positive"),
            io.Conditioning.Output(display_name="negative"),
            io.Latent.Output(display_name="latent"),
        ]
        return io.Schema(
            node_id="HunyuanVideo15ImageToVideo",
            category="conditioning/video_models",
            inputs=node_inputs,
            outputs=node_outputs,
        )
    @classmethod
    def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, clip_vision_output=None) -> io.NodeOutput:
        """Build the empty latent and enrich both conditionings.

        With ``start_image``: the image is resized to ``width`` x ``height``,
        VAE-encoded, written into the leading frames of a zero latent, and
        stored as "concat_latent_image" together with a "concat_mask" that is
        0 on the frames covered by the image and 1 elsewhere.
        With ``clip_vision_output``: it is stored as "clip_vision_output".
        """
        latent_frames = ((length - 1) // 4) + 1
        latent = torch.zeros([batch_size, 32, latent_frames, height // 16, width // 16], device=comfy.model_management.intermediate_device())
        if start_image is not None:
            # common_upscale is fed channels-first data; move channels back afterwards.
            resized = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center")
            start_image = resized.movedim(1, -1)
            encoded = vae.encode(start_image[:, :, :, :3])
            n_batch, _, n_frames, lat_h, lat_w = latent.shape
            concat_latent_image = torch.zeros((n_batch, 32, n_frames, lat_h, lat_w), device=comfy.model_management.intermediate_device())
            concat_latent_image[:, :, :encoded.shape[2], :, :] = encoded
            mask = torch.ones((1, 1, n_frames, lat_h, lat_w), device=start_image.device, dtype=start_image.dtype)
            # Latent frames covered by the provided image frames are marked 0.
            known_frames = ((start_image.shape[0] - 1) // 4) + 1
            mask[:, :, :known_frames] = 0.0
            positive, negative = [
                node_helpers.conditioning_set_values(cond, {"concat_latent_image": concat_latent_image, "concat_mask": mask})
                for cond in (positive, negative)
            ]
        if clip_vision_output is not None:
            positive, negative = [
                node_helpers.conditioning_set_values(cond, {"clip_vision_output": clip_vision_output})
                for cond in (positive, negative)
            ]
        return io.NodeOutput(positive, negative, {"samples": latent})
 class HunyuanVideo15SuperResolution(io.ComfyNode):
    # Attaches a "concat_latent_image" tensor (built from the input latent and
    # an optional VAE-encoded start image) plus a "noise_augmentation" value
    # to both conditionings; the input latent is passed through unchanged.
    @classmethod
    def define_schema(cls):
        """Declare the conditioning, optional VAE/image/CLIP-vision, latent and noise inputs."""
        return io.Schema(
            node_id="HunyuanVideo15SuperResolution",
            inputs=[
                io.Conditioning.Input("positive"),
                io.Conditioning.Input("negative"),
                io.Vae.Input("vae", optional=True),
                io.Image.Input("start_image", optional=True),
                io.ClipVisionOutput.Input("clip_vision_output", optional=True),
                io.Latent.Input("latent"),
                io.Float.Input("noise_augmentation", default=0.70, min=0.0, max=1.0, step=0.01),
            ],
            outputs=[
                io.Conditioning.Output(display_name="positive"),
                io.Conditioning.Output(display_name="negative"),
                io.Latent.Output(display_name="latent"),
            ],
        )
    @classmethod
    def execute(cls, positive, negative, latent, noise_augmentation, vae=None, start_image=None, clip_vision_output=None) -> io.NodeOutput:
        """Build the conditioning latent and attach it to both conditionings.

        The conditioning latent has ``2 * C + 2`` channels, where ``C`` is the
        channel count of ``latent["samples"]``:
          * channels ``[0, C)`` receive the encoded start image (if any);
          * channels ``[C + 1, 2 * C + 1)`` receive the input latent;
          * channel ``2 * C + 1`` is filled with ones;
          * channel ``C`` is left at zero.

        Raises:
            ValueError: if ``start_image`` is given without a ``vae`` to
                encode it (both inputs are optional in the schema).
        """
        # Fail early with a clear message instead of an AttributeError on None.
        if start_image is not None and vae is None:
            raise ValueError("HunyuanVideo15SuperResolution: a VAE is required when start_image is provided.")
        in_latent = latent["samples"]
        in_channels = in_latent.shape[1]
        cond_latent = torch.zeros([in_latent.shape[0], in_channels * 2 + 2, in_latent.shape[-3], in_latent.shape[-2], in_latent.shape[-1]], device=comfy.model_management.intermediate_device())
        cond_latent[:, in_channels + 1 : 2 * in_channels + 1] = in_latent
        cond_latent[:, 2 * in_channels + 1] = 1
        if start_image is not None:
            # Resize the image to the pixel size implied by the latent (x16 per axis).
            start_image = comfy.utils.common_upscale(start_image.movedim(-1, 1), in_latent.shape[-1] * 16, in_latent.shape[-2] * 16, "bilinear", "center").movedim(1, -1)
            encoded = vae.encode(start_image[:, :, :, :3])
            cond_latent[:, :in_channels, :encoded.shape[2], :, :] = encoded
            # NOTE(review): this overwrites frame 0 of channel C + 1, which is
            # also the first channel of the input-latent copy above -- confirm
            # that C + 1 (and not C) is the intended flag channel.
            cond_latent[:, in_channels + 1, 0] = 1
        positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": cond_latent, "noise_augmentation": noise_augmentation})
        negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": cond_latent, "noise_augmentation": noise_augmentation})
        if clip_vision_output is not None:
            positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
            negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})
        return io.NodeOutput(positive, negative, latent)
 class LatentUpscaleModelLoader(io.ComfyNode):
    # Loads a latent upscale checkpoint from the "latent_upscale_models"
    # folder, infers its configuration from tensor shapes in the state dict,
    # and returns a HunyuanVideo15SRModel with the weights loaded.
    @classmethod
    def define_schema(cls):
        """Declare the model_name combo input and the latent-upscale-model output."""
        return io.Schema(
            node_id="LatentUpscaleModelLoader",
            display_name="Load Latent Upscale Model",
            category="loaders",
            inputs=[
                io.Combo.Input("model_name", options=folder_paths.get_filename_list("latent_upscale_models")),
            ],
            outputs=[
                io.LatentUpscaleModel.Output(),
            ],
        )
    @classmethod
    def execute(cls, model_name) -> io.NodeOutput:
        """Load ``model_name`` and build the matching super-resolution model.

        The checkpoint layout is detected from marker keys in the state dict:
        ``blocks.0.block.0.conv.weight`` selects the "720p" variant and
        ``up.0.block.0.conv1.conv.weight`` selects the "1080p" variant.

        Raises:
            ValueError: if the state dict contains neither marker key.
        """
        model_path = folder_paths.get_full_path_or_raise("latent_upscale_models", model_name)
        sd = comfy.utils.load_torch_file(model_path, safe_load=True)
        if "blocks.0.block.0.conv.weight" in sd:
            config = {
                "in_channels": sd["in_conv.conv.weight"].shape[1],
                "out_channels": sd["out_conv.conv.weight"].shape[0],
                "hidden_channels": sd["in_conv.conv.weight"].shape[0],
                "num_blocks": len([k for k in sd.keys() if k.startswith("blocks.") and k.endswith(".block.0.conv.weight")]),
                "global_residual": False,
            }
            model_type = "720p"
        elif "up.0.block.0.conv1.conv.weight" in sd:
            # Rename the first "nin_shortcut" occurrence in each key to "nin_shortcut.conv".
            sd = {key.replace("nin_shortcut", "nin_shortcut.conv", 1): value for key, value in sd.items()}
            config = {
                "z_channels": sd["conv_in.conv.weight"].shape[1],
                "out_channels": sd["conv_out.conv.weight"].shape[0],
                "block_out_channels": tuple(sd[f"up.{i}.block.0.conv1.conv.weight"].shape[0] for i in range(len([k for k in sd.keys() if k.startswith("up.") and k.endswith(".block.0.conv1.conv.weight")]))),
            }
            model_type = "1080p"
        else:
            # Without this branch `config`/`model_type` would be unbound and the
            # constructor call below would fail with an UnboundLocalError.
            raise ValueError(f"Unrecognized latent upscale model format: {model_name}")
        model = HunyuanVideo15SRModel(model_type, config)
        model.load_sd(sd)
        return io.NodeOutput(model)
 class HunyuanVideo15LatentUpscaleWithModel(io.ComfyNode):
    # Resizes a latent to the requested pixel size (divided by 16 per axis)
    # and then passes it through the upscale model's resample_latent().
    @classmethod
    def define_schema(cls):
        """Declare the model, latent, resize-method, size and crop inputs."""
        resize_methods = ["nearest-exact", "bilinear", "area", "bicubic", "bislerp"]
        return io.Schema(
            node_id="HunyuanVideo15LatentUpscaleWithModel",
            display_name="Hunyuan Video 15 Latent Upscale With Model",
            category="latent",
            inputs=[
                io.LatentUpscaleModel.Input("model"),
                io.Latent.Input("samples"),
                io.Combo.Input("upscale_method", options=resize_methods, default="bilinear"),
                io.Int.Input("width", default=1280, min=0, max=16384, step=8),
                io.Int.Input("height", default=720, min=0, max=16384, step=8),
                io.Combo.Input("crop", options=["disabled", "center"]),
            ],
            outputs=[io.Latent.Output()],
        )
    @classmethod
    def execute(cls, model, samples, upscale_method, width, height, crop) -> io.NodeOutput:
        """Resize and resample the latent.

        A width and height of 0 returns ``samples`` untouched. When only one
        of them is 0, it is derived from the other so that the latent's
        aspect ratio is kept. Every dimension is clamped to at least 64.
        """
        if width == 0 and height == 0:
            return io.NodeOutput(samples)
        source = samples["samples"]
        if width == 0:
            height = max(64, height)
            width = max(64, round(source.shape[-1] * height / source.shape[-2]))
        elif height == 0:
            width = max(64, width)
            height = max(64, round(source.shape[-2] * width / source.shape[-1]))
        else:
            width = max(64, width)
            height = max(64, height)
        resized = comfy.utils.common_upscale(source, width // 16, height // 16, upscale_method, crop)
        resampled = model.resample_latent(resized)
        return io.NodeOutput({"samples": resampled.cpu().float()})
 PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = (
    "<|start_header_id|>system<|end_header_id|>\n\n<image>\nDescribe the video by detailing the following aspects according to the reference image: "
    "1. The main content and theme of the video."
@ -210,6 +404,11 @@ class HunyuanExtension(ComfyExtension):
            CLIPTextEncodeHunyuanDiT,
            TextEncodeHunyuanVideo_ImageToVideo,
            EmptyHunyuanLatentVideo,
            EmptyHunyuanVideo15Latent,
            HunyuanVideo15ImageToVideo,
            HunyuanVideo15SuperResolution,
            HunyuanVideo15LatentUpscaleWithModel,
            LatentUpscaleModelLoader,
            HunyuanImageToVideo,
            EmptyHunyuanImageLatent,
            HunyuanRefinerLatent,
--- a/comfy_extras/nodes_hunyuan3d.py
+++ b/comfy_extras/nodes_hunyuan3d.py
@ -7,63 +7,79 @@ from comfy.ldm.modules.diffusionmodules.mmdit import get_1d_sincos_pos_embed_fro
 import folder_paths
 import comfy.model_management
 from comfy.cli_args import args
 from typing_extensions import override
 from comfy_api.latest import ComfyExtension, IO, Types
 from comfy_api.latest._util import MESH, VOXEL  # only for backward compatibility if someone import it from this file (will be removed later) # noqa
-class EmptyLatentHunyuan3Dv2:
+
 class EmptyLatentHunyuan3Dv2(IO.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
+    def define_schema(cls):
-        return {
+        return IO.Schema(
-            "required": {
+            node_id="EmptyLatentHunyuan3Dv2",
-                "resolution": ("INT", {"default": 3072, "min": 1, "max": 8192}),
+            category="latent/3d",
-                "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096, "tooltip": "The number of latent images in the batch."}),
+            inputs=[
-            }
+                IO.Int.Input("resolution", default=3072, min=1, max=8192),
-        }
+                IO.Int.Input("batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."),
            ],
            outputs=[
                IO.Latent.Output(),
            ]
        )
-    RETURN_TYPES = ("LATENT",)
+    @classmethod
-    FUNCTION = "generate"
+    def execute(cls, resolution, batch_size) -> IO.NodeOutput:
    CATEGORY = "latent/3d"
    def generate(self, resolution, batch_size):
        latent = torch.zeros([batch_size, 64, resolution], device=comfy.model_management.intermediate_device())
-        return ({"samples": latent, "type": "hunyuan3dv2"}, )
+        return IO.NodeOutput({"samples": latent, "type": "hunyuan3dv2"})
-class Hunyuan3Dv2Conditioning:
+    generate = execute  # TODO: remove
 class Hunyuan3Dv2Conditioning(IO.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
+    def define_schema(cls):
-        return {"required": {"clip_vision_output": ("CLIP_VISION_OUTPUT",),
+        return IO.Schema(
-                             }}
+            node_id="Hunyuan3Dv2Conditioning",
            category="conditioning/video_models",
            inputs=[
                IO.ClipVisionOutput.Input("clip_vision_output"),
            ],
            outputs=[
                IO.Conditioning.Output(display_name="positive"),
                IO.Conditioning.Output(display_name="negative"),
            ]
        )
-    RETURN_TYPES = ("CONDITIONING", "CONDITIONING")
+    @classmethod
-    RETURN_NAMES = ("positive", "negative")
+    def execute(cls, clip_vision_output) -> IO.NodeOutput:
    FUNCTION = "encode"
    CATEGORY = "conditioning/video_models"
    def encode(self, clip_vision_output):
        embeds = clip_vision_output.last_hidden_state
        positive = [[embeds, {}]]
        negative = [[torch.zeros_like(embeds), {}]]
-        return (positive, negative)
+        return IO.NodeOutput(positive, negative)
    encode = execute  # TODO: remove
-class Hunyuan3Dv2ConditioningMultiView:
+class Hunyuan3Dv2ConditioningMultiView(IO.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
+    def define_schema(cls):
-        return {"required": {},
+        return IO.Schema(
-                "optional": {"front": ("CLIP_VISION_OUTPUT",),
+            node_id="Hunyuan3Dv2ConditioningMultiView",
-                             "left": ("CLIP_VISION_OUTPUT",),
+            category="conditioning/video_models",
-                             "back": ("CLIP_VISION_OUTPUT",),
+            inputs=[
-                             "right": ("CLIP_VISION_OUTPUT",), }}
+                IO.ClipVisionOutput.Input("front", optional=True),
                IO.ClipVisionOutput.Input("left", optional=True),
                IO.ClipVisionOutput.Input("back", optional=True),
                IO.ClipVisionOutput.Input("right", optional=True),
            ],
            outputs=[
                IO.Conditioning.Output(display_name="positive"),
                IO.Conditioning.Output(display_name="negative"),
            ]
        )
-    RETURN_TYPES = ("CONDITIONING", "CONDITIONING")
+    @classmethod
-    RETURN_NAMES = ("positive", "negative")
+    def execute(cls, front=None, left=None, back=None, right=None) -> IO.NodeOutput:
    FUNCTION = "encode"
    CATEGORY = "conditioning/video_models"
    def encode(self, front=None, left=None, back=None, right=None):
        all_embeds = [front, left, back, right]
        out = []
        pos_embeds = None
@ -76,29 +92,35 @@ class Hunyuan3Dv2ConditioningMultiView:
        embeds = torch.cat(out, dim=1)
        positive = [[embeds, {}]]
        negative = [[torch.zeros_like(embeds), {}]]
-        return (positive, negative)
+        return IO.NodeOutput(positive, negative)
    encode = execute  # TODO: remove
-class VOXEL:
+class VAEDecodeHunyuan3D(IO.ComfyNode):
    def __init__(self, data):
        self.data = data
 class VAEDecodeHunyuan3D:
    @classmethod
-    def INPUT_TYPES(s):
+    def define_schema(cls):
-        return {"required": {"samples": ("LATENT", ),
+        return IO.Schema(
-                             "vae": ("VAE", ),
+            node_id="VAEDecodeHunyuan3D",
-                             "num_chunks": ("INT", {"default": 8000, "min": 1000, "max": 500000}),
+            category="latent/3d",
-                             "octree_resolution": ("INT", {"default": 256, "min": 16, "max": 512}),
+            inputs=[
-                             }}
+                IO.Latent.Input("samples"),
-    RETURN_TYPES = ("VOXEL",)
+                IO.Vae.Input("vae"),
-    FUNCTION = "decode"
+                IO.Int.Input("num_chunks", default=8000, min=1000, max=500000),
                IO.Int.Input("octree_resolution", default=256, min=16, max=512),
            ],
            outputs=[
                IO.Voxel.Output(),
            ]
        )
-    CATEGORY = "latent/3d"
+    @classmethod
    def execute(cls, vae, samples, num_chunks, octree_resolution) -> IO.NodeOutput:
        voxels = Types.VOXEL(vae.decode(samples["samples"], vae_options={"num_chunks": num_chunks, "octree_resolution": octree_resolution}))
        return IO.NodeOutput(voxels)
    decode = execute  # TODO: remove
    def decode(self, vae, samples, num_chunks, octree_resolution):
        voxels = VOXEL(vae.decode(samples["samples"], vae_options={"num_chunks": num_chunks, "octree_resolution": octree_resolution}))
        return (voxels, )
 def voxel_to_mesh(voxels, threshold=0.5, device=None):
    if device is None:
@ -396,24 +418,24 @@ def voxel_to_mesh_surfnet(voxels, threshold=0.5, device=None):
    return final_vertices, faces
 class MESH:
    def __init__(self, vertices, faces):
        self.vertices = vertices
        self.faces = faces
-
+class VoxelToMeshBasic(IO.ComfyNode):
 class VoxelToMeshBasic:
    @classmethod
-    def INPUT_TYPES(s):
+    def define_schema(cls):
-        return {"required": {"voxel": ("VOXEL", ),
+        return IO.Schema(
-                             "threshold": ("FLOAT", {"default": 0.6, "min": -1.0, "max": 1.0, "step": 0.01}),
+            node_id="VoxelToMeshBasic",
-                             }}
+            category="3d",
-    RETURN_TYPES = ("MESH",)
+            inputs=[
-    FUNCTION = "decode"
+                IO.Voxel.Input("voxel"),
                IO.Float.Input("threshold", default=0.6, min=-1.0, max=1.0, step=0.01),
            ],
            outputs=[
                IO.Mesh.Output(),
            ]
        )
-    CATEGORY = "3d"
+    @classmethod
-
+    def execute(cls, voxel, threshold) -> IO.NodeOutput:
    def decode(self, voxel, threshold):
        vertices = []
        faces = []
        for x in voxel.data:
@ -421,21 +443,29 @@ class VoxelToMeshBasic:
            vertices.append(v)
            faces.append(f)
-        return (MESH(torch.stack(vertices), torch.stack(faces)), )
+        return IO.NodeOutput(Types.MESH(torch.stack(vertices), torch.stack(faces)))
-class VoxelToMesh:
+    decode = execute  # TODO: remove
 class VoxelToMesh(IO.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
+    def define_schema(cls):
-        return {"required": {"voxel": ("VOXEL", ),
+        return IO.Schema(
-                             "algorithm": (["surface net", "basic"], ),
+            node_id="VoxelToMesh",
-                             "threshold": ("FLOAT", {"default": 0.6, "min": -1.0, "max": 1.0, "step": 0.01}),
+            category="3d",
-                             }}
+            inputs=[
-    RETURN_TYPES = ("MESH",)
+                IO.Voxel.Input("voxel"),
-    FUNCTION = "decode"
+                IO.Combo.Input("algorithm", options=["surface net", "basic"]),
                IO.Float.Input("threshold", default=0.6, min=-1.0, max=1.0, step=0.01),
            ],
            outputs=[
                IO.Mesh.Output(),
            ]
        )
-    CATEGORY = "3d"
+    @classmethod
-
+    def execute(cls, voxel, algorithm, threshold) -> IO.NodeOutput:
    def decode(self, voxel, algorithm, threshold):
        vertices = []
        faces = []
@ -449,7 +479,9 @@ class VoxelToMesh:
            vertices.append(v)
            faces.append(f)
-        return (MESH(torch.stack(vertices), torch.stack(faces)), )
+        return IO.NodeOutput(Types.MESH(torch.stack(vertices), torch.stack(faces)))
    decode = execute  # TODO: remove
 def save_glb(vertices, faces, filepath, metadata=None):
@ -581,31 +613,32 @@ def save_glb(vertices, faces, filepath, metadata=None):
    return filepath
-class SaveGLB:
+class SaveGLB(IO.ComfyNode):
    @classmethod
-    def INPUT_TYPES(s):
+    def define_schema(cls):
-        return {"required": {"mesh": ("MESH", ),
+        return IO.Schema(
-                             "filename_prefix": ("STRING", {"default": "mesh/ComfyUI"}), },
+            node_id="SaveGLB",
-                "hidden": {"prompt": "PROMPT", "extra_pnginfo": "EXTRA_PNGINFO"}, }
+            category="3d",
            is_output_node=True,
            inputs=[
                IO.Mesh.Input("mesh"),
                IO.String.Input("filename_prefix", default="mesh/ComfyUI"),
            ],
            hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo]
        )
-    RETURN_TYPES = ()
+    @classmethod
-    FUNCTION = "save"
+    def execute(cls, mesh, filename_prefix) -> IO.NodeOutput:
    OUTPUT_NODE = True
    CATEGORY = "3d"
    def save(self, mesh, filename_prefix, prompt=None, extra_pnginfo=None):
        full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, folder_paths.get_output_directory())
        results = []
        metadata = {}
        if not args.disable_metadata:
-            if prompt is not None:
+            if cls.hidden.prompt is not None:
-                metadata["prompt"] = json.dumps(prompt)
+                metadata["prompt"] = json.dumps(cls.hidden.prompt)
-            if extra_pnginfo is not None:
+            if cls.hidden.extra_pnginfo is not None:
-                for x in extra_pnginfo:
+                for x in cls.hidden.extra_pnginfo:
-                    metadata[x] = json.dumps(extra_pnginfo[x])
+                    metadata[x] = json.dumps(cls.hidden.extra_pnginfo[x])
        for i in range(mesh.vertices.shape[0]):
            f = f"{filename}_{counter:05}_.glb"
@ -616,15 +649,22 @@ class SaveGLB:
                "type": "output"
            })
            counter += 1
-        return {"ui": {"3d": results}}
+        return IO.NodeOutput(ui={"3d": results})
-NODE_CLASS_MAPPINGS = {
+class Hunyuan3dExtension(ComfyExtension):
-    "EmptyLatentHunyuan3Dv2": EmptyLatentHunyuan3Dv2,
+    @override
-    "Hunyuan3Dv2Conditioning": Hunyuan3Dv2Conditioning,
+    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
-    "Hunyuan3Dv2ConditioningMultiView": Hunyuan3Dv2ConditioningMultiView,
+        return [
-    "VAEDecodeHunyuan3D": VAEDecodeHunyuan3D,
+            EmptyLatentHunyuan3Dv2,
-    "VoxelToMeshBasic": VoxelToMeshBasic,
+            Hunyuan3Dv2Conditioning,
-    "VoxelToMesh": VoxelToMesh,
+            Hunyuan3Dv2ConditioningMultiView,
-    "SaveGLB": SaveGLB,
+            VAEDecodeHunyuan3D,
-}
+            VoxelToMeshBasic,
            VoxelToMesh,
            SaveGLB,
        ]
 async def comfy_entrypoint() -> Hunyuan3dExtension:
    """Create and return the Hunyuan3dExtension, whose get_node_list() enumerates this module's nodes."""
    return Hunyuan3dExtension()
--- a/comfy_extras/nodes_nop.py
+++ b/comfy_extras/nodes_nop.py
@ -0,0 +1,39 @@
 from comfy_api.latest import ComfyExtension, io
 from typing_extensions import override
 # If you write a node that is so useless that it breaks ComfyUI it will be featured in this exclusive list
 # "native" block swap nodes are placebo at best and break the ComfyUI memory management system.
 # They are also considered harmful because instead of users reporting issues with the built in
 # memory management they install these stupid nodes and complain even harder. Now it completely
 # breaks with some of the new ComfyUI memory optimizations so I have made the decision to NOP it
 # out of all workflows.
 class wanBlockSwap(io.ComfyNode):
    """Deliberate no-op node registered under the id "wanBlockSwap".
    It accepts a model and returns that same model untouched; the schema
    marks the node as deprecated and describes it as "NOP".
    """
    @classmethod
    def define_schema(cls):
        """Return the node schema: one model input, one model output, deprecated."""
        return io.Schema(
            node_id="wanBlockSwap",
            category="",
            description="NOP",
            inputs=[
                io.Model.Input("model"),
            ],
            outputs=[
                io.Model.Output(),
            ],
            is_deprecated=True,
        )
    @classmethod
    def execute(cls, model) -> io.NodeOutput:
        """Wrap the incoming model in a NodeOutput without modifying it."""
        return io.NodeOutput(model)
 class NopExtension(ComfyExtension):
    """Extension whose node list contains only the no-op wanBlockSwap node."""
    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
        """Return the node classes provided by this extension."""
        return [
            wanBlockSwap
        ]
 async def comfy_entrypoint() -> NopExtension:
    """Create and return the NopExtension instance for this module."""
    return NopExtension()
--- a/comfy_extras/nodes_preview_any.py
+++ b/comfy_extras/nodes_preview_any.py
@ -39,5 +39,5 @@ NODE_CLASS_MAPPINGS = {
 }
 NODE_DISPLAY_NAME_MAPPINGS = {
-    "PreviewAny": "Preview Any",
+    "PreviewAny": "Preview as Text",
 }
--- a/comfyui_version.py
+++ b/comfyui_version.py
@ -1,3 +1,3 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.3.68"
+__version__ = "0.3.71"
--- a/folder_paths.py
+++ b/folder_paths.py
@ -38,6 +38,8 @@ folder_names_and_paths["gligen"] = ([os.path.join(models_dir, "gligen")], suppor
 folder_names_and_paths["upscale_models"] = ([os.path.join(models_dir, "upscale_models")], supported_pt_extensions)
 folder_names_and_paths["latent_upscale_models"] = ([os.path.join(models_dir, "latent_upscale_models")], supported_pt_extensions)
 folder_names_and_paths["custom_nodes"] = ([os.path.join(base_path, "custom_nodes")], set())
 folder_names_and_paths["hypernetworks"] = ([os.path.join(models_dir, "hypernetworks")], supported_pt_extensions)
--- a/models/latent_upscale_models/put_latent_upscale_models_here
+++ b/models/latent_upscale_models/put_latent_upscale_models_here
--- a/nodes.py
+++ b/nodes.py
@ -929,7 +929,7 @@ class CLIPLoader:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ),
-                              "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image"], ),
+                              "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2"], ),
                              },
                "optional": {
                              "device": (["default", "cpu"], {"advanced": True}),
@ -957,7 +957,7 @@ class DualCLIPLoader:
    def INPUT_TYPES(s):
        return {"required": { "clip_name1": (folder_paths.get_filename_list("text_encoders"), ),
                              "clip_name2": (folder_paths.get_filename_list("text_encoders"), ),
-                              "type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream", "hunyuan_image"], ),
+                              "type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream", "hunyuan_image", "hunyuan_video_15"], ),
                              },
                "optional": {
                              "device": (["default", "cpu"], {"advanced": True}),
@ -1852,6 +1852,11 @@ class ImageBatch:
    CATEGORY = "image"
    def batch(self, image1, image2):
        if image1.shape[-1] != image2.shape[-1]:
            if image1.shape[-1] > image2.shape[-1]:
                image2 = torch.nn.functional.pad(image2, (0,1), mode='constant', value=1.0)
            else:
                image1 = torch.nn.functional.pad(image1, (0,1), mode='constant', value=1.0)
        if image1.shape[1:] != image2.shape[1:]:
            image2 = comfy.utils.common_upscale(image2.movedim(-1,1), image1.shape[2], image1.shape[1], "bilinear", "center").movedim(1,-1)
        s = torch.cat((image1, image2), dim=0)
@ -2331,6 +2336,7 @@ async def init_builtin_extra_nodes():
        "nodes_audio_encoder.py",
        "nodes_gds.py",
        "nodes_rope.py",
        "nodes_nop.py",
    ]
    import_failed = []
@ -2359,6 +2365,7 @@ async def init_builtin_api_nodes():
        "nodes_pika.py",
        "nodes_runway.py",
        "nodes_sora.py",
        "nodes_topaz.py",
        "nodes_tripo.py",
        "nodes_moonvalley.py",
        "nodes_rodin.py",
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [project]
 name = "ComfyUI"
-version = "0.3.68"
+version = "0.3.71"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.9"
@ -24,7 +24,7 @@ lint.select = [
 exclude = ["*.ipynb", "**/generated/*.pyi"]
 [tool.pylint]
-master.py-version = "3.9"
+master.py-version = "3.10"
 master.extension-pkg-allow-list = [
  "pydantic",
 ]
--- a/requirements.txt
+++ b/requirements.txt
@ -1,5 +1,5 @@
-comfyui-frontend-package==1.28.8
+comfyui-frontend-package==1.30.6
-comfyui-workflow-templates==0.2.11
+comfyui-workflow-templates==0.7.9
 comfyui-embedded-docs==0.3.1
 torch
 torchsde
@ -7,7 +7,7 @@ torchvision
 torchaudio
 numpy>=1.25.0
 einops
-transformers>=4.37.2
+transformers>=4.50.3
 tokenizers>=0.13.3
 sentencepiece
 safetensors>=0.4.2
--- a/server.py
+++ b/server.py
@ -2,6 +2,7 @@ import os
 import sys
 import asyncio
 import traceback
 import time
 import nodes
 import folder_paths
@ -29,7 +30,7 @@ import comfy.model_management
 from comfy_api import feature_flags
 import node_helpers
 from comfyui_version import __version__
-from app.frontend_management import FrontendManager
+from app.frontend_management import FrontendManager, parse_version
 from comfy_api.internal import _ComfyNodeInternal
 from app.user_manager import UserManager
@ -163,6 +164,22 @@ def create_origin_only_middleware():
    return origin_only_middleware
 def create_block_external_middleware():
    """Build a web middleware that adds a Content-Security-Policy header to every response.
    The policy limits each listed resource type to 'self', additionally
    allowing inline scripts/styles, blob: scripts, and data:/blob: images.
    PromptServer appends this middleware when args.disable_api_nodes is set.
    Returns:
        The middleware coroutine function to pass to the web application.
    """
    @web.middleware
    async def block_external_middleware(request: web.Request, handler):
        if request.method == "OPTIONS":
            # Pre-flight request. Reply successfully:
            # (the wrapped handler is not invoked for OPTIONS)
            response = web.Response()
        else:
            response = await handler(request)
        # Same policy string is applied to both pre-flight and regular responses.
        response.headers['Content-Security-Policy'] = "default-src 'self'; script-src 'self' 'unsafe-inline' blob:; style-src 'self' 'unsafe-inline'; img-src 'self' data: blob:; font-src 'self'; connect-src 'self'; frame-src 'self'; object-src 'self';"
        return response
    return block_external_middleware
 class PromptServer():
    def __init__(self, loop):
        PromptServer.instance = self
@ -192,6 +209,9 @@ class PromptServer():
        else:
            middlewares.append(create_origin_only_middleware())
        if args.disable_api_nodes:
            middlewares.append(create_block_external_middleware())
        max_upload_size = round(args.max_upload_size * 1024 * 1024)
        self.app = web.Application(client_max_size=max_upload_size, middlewares=middlewares)
        self.sockets = dict()
@ -733,6 +753,7 @@ class PromptServer():
                    for sensitive_val in execution.SENSITIVE_EXTRA_DATA_KEYS:
                        if sensitive_val in extra_data:
                            sensitive[sensitive_val] = extra_data.pop(sensitive_val)
                    extra_data["create_time"] = int(time.time() * 1000)  # timestamp in milliseconds
                    self.prompt_queue.put((number, prompt_id, prompt, extra_data, outputs_to_execute, sensitive))
                    response = {"prompt_id": prompt_id, "number": number, "node_errors": valid[3]}
                    return web.json_response(response)
@ -847,11 +868,31 @@ class PromptServer():
        for name, dir in nodes.EXTENSION_WEB_DIRS.items():
            self.app.add_routes([web.static('/extensions/' + name, dir)])
-        workflow_templates_path = FrontendManager.templates_path()
+        installed_templates_version = FrontendManager.get_installed_templates_version()
-        if workflow_templates_path:
+        use_legacy_templates = True
-            self.app.add_routes([
+        if installed_templates_version:
-                web.static('/templates', workflow_templates_path)
+            try:
-            ])
+                use_legacy_templates = (
                    parse_version(installed_templates_version)
                    < parse_version("0.3.0")
                )
            except Exception as exc:
                logging.warning(
                    "Unable to parse templates version '%s': %s",
                    installed_templates_version,
                    exc,
                )
        if use_legacy_templates:
            workflow_templates_path = FrontendManager.legacy_templates_path()
            if workflow_templates_path:
                self.app.add_routes([
                    web.static('/templates', workflow_templates_path)
                ])
        else:
            handler = FrontendManager.template_asset_handler()
            if handler:
                self.app.router.add_get("/templates/{path:.*}", handler)
        # Serve embedded documentation from the package
        embedded_docs_path = FrontendManager.embedded_docs_path()
--- a/tests-unit/comfy_quant/test_mixed_precision.py
+++ b/tests-unit/comfy_quant/test_mixed_precision.py
@ -37,11 +37,8 @@ class TestMixedPrecisionOps(unittest.TestCase):
    def test_all_layers_standard(self):
        """Test that model with no quantization works normally"""
        # Configure no quantization
        ops.MixedPrecisionOps._layer_quant_config = {}
        # Create model
-        model = SimpleModel(operations=ops.MixedPrecisionOps)
+        model = SimpleModel(operations=ops.mixed_precision_ops({}))
        # Initialize weights manually
        model.layer1.weight = torch.nn.Parameter(torch.randn(20, 10, dtype=torch.bfloat16))
@ -76,7 +73,6 @@ class TestMixedPrecisionOps(unittest.TestCase):
                "params": {}
            }
        }
        ops.MixedPrecisionOps._layer_quant_config = layer_quant_config
        # Create state dict with mixed precision
        fp8_weight1 = torch.randn(20, 10, dtype=torch.float32).to(torch.float8_e4m3fn)
@ -99,7 +95,7 @@ class TestMixedPrecisionOps(unittest.TestCase):
        }
        # Create model and load state dict (strict=False because custom loading pops keys)
-        model = SimpleModel(operations=ops.MixedPrecisionOps)
+        model = SimpleModel(operations=ops.mixed_precision_ops(layer_quant_config))
        model.load_state_dict(state_dict, strict=False)
        # Verify weights are wrapped in QuantizedTensor
@ -132,7 +128,6 @@ class TestMixedPrecisionOps(unittest.TestCase):
                "params": {}
            }
        }
        ops.MixedPrecisionOps._layer_quant_config = layer_quant_config
        # Create and load model
        fp8_weight = torch.randn(20, 10, dtype=torch.float32).to(torch.float8_e4m3fn)
@ -146,7 +141,7 @@ class TestMixedPrecisionOps(unittest.TestCase):
            "layer3.bias": torch.randn(40, dtype=torch.bfloat16),
        }
-        model = SimpleModel(operations=ops.MixedPrecisionOps)
+        model = SimpleModel(operations=ops.mixed_precision_ops(layer_quant_config))
        model.load_state_dict(state_dict1, strict=False)
        # Save state dict
@ -170,7 +165,6 @@ class TestMixedPrecisionOps(unittest.TestCase):
                "params": {}
            }
        }
        ops.MixedPrecisionOps._layer_quant_config = layer_quant_config
        # Create and load model
        fp8_weight = torch.randn(20, 10, dtype=torch.float32).to(torch.float8_e4m3fn)
@ -184,7 +178,7 @@ class TestMixedPrecisionOps(unittest.TestCase):
            "layer3.bias": torch.randn(40, dtype=torch.bfloat16),
        }
-        model = SimpleModel(operations=ops.MixedPrecisionOps)
+        model = SimpleModel(operations=ops.mixed_precision_ops(layer_quant_config))
        model.load_state_dict(state_dict, strict=False)
        # Add a weight function (simulating LoRA)
@ -210,7 +204,6 @@ class TestMixedPrecisionOps(unittest.TestCase):
                "params": {}
            }
        }
        ops.MixedPrecisionOps._layer_quant_config = layer_quant_config
        # Create state dict
        state_dict = {
@ -223,7 +216,7 @@ class TestMixedPrecisionOps(unittest.TestCase):
        }
        # Load should raise KeyError for unknown format in QUANT_FORMAT_MIXINS
-        model = SimpleModel(operations=ops.MixedPrecisionOps)
+        model = SimpleModel(operations=ops.mixed_precision_ops(layer_quant_config))
        with self.assertRaises(KeyError):
            model.load_state_dict(state_dict, strict=False)
--- a/tests/execution/test_public_api.py
+++ b/tests/execution/test_public_api.py
@ -0,0 +1,153 @@
 """
 Tests for public ComfyAPI and ComfyAPISync functions.
 These tests verify that the public API methods work correctly in both sync and async contexts,
 ensuring that the sync wrapper generation (via get_type_hints() in async_to_sync.py) correctly
 handles string annotations from 'from __future__ import annotations'.
 """
 import pytest
 import time
 import subprocess
 import torch
 from pytest import fixture
 from comfy_execution.graph_utils import GraphBuilder
 from tests.execution.test_execution import ComfyClient
@pytest.mark.execution
 class TestPublicAPI:
    """Test suite for public ComfyAPI and ComfyAPISync methods."""
    @fixture(scope="class", autouse=True)
    def _server(self, args_pytest):
        """Start ComfyUI server for testing."""
        pargs = [
            'python', 'main.py',
            '--output-directory', args_pytest["output_dir"],
            '--listen', args_pytest["listen"],
            '--port', str(args_pytest["port"]),
            '--extra-model-paths-config', 'tests/execution/extra_model_paths.yaml',
            '--cpu',
        ]
        # Class-scoped: one server subprocess is shared by every test in the class.
        p = subprocess.Popen(pargs)
        yield
        # Teardown: stop the server once the class's tests have finished.
        p.kill()
        torch.cuda.empty_cache()
    @fixture(scope="class", autouse=True)
    def shared_client(self, args_pytest, _server):
        """Create shared client with connection retry."""
        client = ComfyClient()
        n_tries = 5
        for i in range(n_tries):
            # Sleep before every attempt (including the first) to give the server time to start.
            time.sleep(4)
            try:
                client.connect(listen=args_pytest["listen"], port=args_pytest["port"])
                break
            except ConnectionRefusedError:
                # Re-raise only after the final attempt; otherwise retry.
                if i == n_tries - 1:
                    raise
        yield client
        del client
        torch.cuda.empty_cache()
    @fixture
    def client(self, shared_client, request):
        """Set test name for each test."""
        shared_client.set_test_name(f"public_api[{request.node.name}]")
        yield shared_client
    @fixture
    def builder(self, request):
        """Create GraphBuilder for each test."""
        # The test's node name is used as the GraphBuilder prefix.
        yield GraphBuilder(prefix=request.node.name)
    def test_sync_progress_update_executes(self, client: ComfyClient, builder: GraphBuilder):
        """Test that TestSyncProgressUpdate executes without errors.
        This test validates that api_sync.execution.set_progress() works correctly,
        which is the primary code path fixed by adding get_type_hints() to async_to_sync.py.
        """
        g = builder
        image = g.node("StubImage", content="BLACK", height=256, width=256, batch_size=1)
        # Use TestSyncProgressUpdate with short sleep
        progress_node = g.node("TestSyncProgressUpdate",
                              value=image.out(0),
                              sleep_seconds=0.5)
        output = g.node("SaveImage", images=progress_node.out(0))
        # Execute workflow
        result = client.run(g)
        # Verify execution
        assert result.did_run(progress_node), "Progress node should have executed"
        assert result.did_run(output), "Output node should have executed"
        # Verify output
        images = result.get_images(output)
        assert len(images) == 1, "Should have produced 1 image"
    def test_async_progress_update_executes(self, client: ComfyClient, builder: GraphBuilder):
        """Test that TestAsyncProgressUpdate executes without errors.
        This test validates that await api.execution.set_progress() works correctly
        in async contexts.
        """
        g = builder
        image = g.node("StubImage", content="WHITE", height=256, width=256, batch_size=1)
        # Use TestAsyncProgressUpdate with short sleep
        progress_node = g.node("TestAsyncProgressUpdate",
                              value=image.out(0),
                              sleep_seconds=0.5)
        output = g.node("SaveImage", images=progress_node.out(0))
        # Execute workflow
        result = client.run(g)
        # Verify execution
        assert result.did_run(progress_node), "Async progress node should have executed"
        assert result.did_run(output), "Output node should have executed"
        # Verify output
        images = result.get_images(output)
        assert len(images) == 1, "Should have produced 1 image"
    def test_sync_and_async_progress_together(self, client: ComfyClient, builder: GraphBuilder):
        """Test both sync and async progress updates in same workflow.
        This test ensures that both ComfyAPISync and ComfyAPI can coexist and work
        correctly in the same workflow execution.
        """
        g = builder
        # Two independent branches: one feeds the sync node, the other the async node.
        image1 = g.node("StubImage", content="BLACK", height=256, width=256, batch_size=1)
        image2 = g.node("StubImage", content="WHITE", height=256, width=256, batch_size=1)
        # Use both types of progress nodes
        sync_progress = g.node("TestSyncProgressUpdate",
                              value=image1.out(0),
                              sleep_seconds=0.3)
        async_progress = g.node("TestAsyncProgressUpdate",
                               value=image2.out(0),
                               sleep_seconds=0.3)
        # Create outputs
        output1 = g.node("SaveImage", images=sync_progress.out(0))
        output2 = g.node("SaveImage", images=async_progress.out(0))
        # Execute workflow
        result = client.run(g)
        # Both should execute successfully
        assert result.did_run(sync_progress), "Sync progress node should have executed"
        assert result.did_run(async_progress), "Async progress node should have executed"
        assert result.did_run(output1), "First output node should have executed"
        assert result.did_run(output2), "Second output node should have executed"
        # Verify outputs
        images1 = result.get_images(output1)
        images2 = result.get_images(output2)
        assert len(images1) == 1, "Should have produced 1 image from sync node"
        assert len(images2) == 1, "Should have produced 1 image from async node"