diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py
index 96cb37fa6..ee0226ec9 100644
--- a/comfy/ldm/lumina/model.py
+++ b/comfy/ldm/lumina/model.py
@@ -377,7 +377,6 @@ class NextDiT(nn.Module):
         z_image_modulation=False,
         time_scale=1.0,
         pad_tokens_multiple=None,
-        clip_text_dim=None,
         image_model=None,
         device=None,
         dtype=None,
@@ -448,31 +447,6 @@ class NextDiT(nn.Module):
             ),
         )
 
-        self.clip_text_pooled_proj = None
-
-        if clip_text_dim is not None:
-            self.clip_text_dim = clip_text_dim
-            self.clip_text_pooled_proj = nn.Sequential(
-                operation_settings.get("operations").RMSNorm(clip_text_dim, eps=norm_eps, elementwise_affine=True, device=operation_settings.get("device"), dtype=operation_settings.get("dtype")),
-                operation_settings.get("operations").Linear(
-                    clip_text_dim,
-                    clip_text_dim,
-                    bias=True,
-                    device=operation_settings.get("device"),
-                    dtype=operation_settings.get("dtype"),
-                ),
-            )
-            self.time_text_embed = nn.Sequential(
-                nn.SiLU(),
-                operation_settings.get("operations").Linear(
-                    min(dim, 1024) + clip_text_dim,
-                    min(dim, 1024),
-                    bias=True,
-                    device=operation_settings.get("device"),
-                    dtype=operation_settings.get("dtype"),
-                ),
-            )
-
         self.layers = nn.ModuleList(
             [
                 JointTransformerBlock(
@@ -620,15 +594,6 @@ class NextDiT(nn.Module):
 
         cap_feats = self.cap_embedder(cap_feats)  # (N, L, D)  # todo check if able to batchify w.o. redundant compute
 
-        if self.clip_text_pooled_proj is not None:
-            pooled = kwargs.get("clip_text_pooled", None)
-            if pooled is not None:
-                pooled = self.clip_text_pooled_proj(pooled)
-            else:
-                pooled = torch.zeros((1, self.clip_text_dim), device=x.device, dtype=x.dtype)
-
-            adaln_input = self.time_text_embed(torch.cat((t, pooled), dim=-1))
-
         patches = transformer_options.get("patches", {})
         x_is_tensor = isinstance(x, torch.Tensor)
         img, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(x, cap_feats, cap_mask, adaln_input, num_tokens, transformer_options=transformer_options)
diff --git a/comfy/ldm/newbie/components.py b/comfy/ldm/newbie/components.py
new file mode 100644
index 000000000..44bbd9250
--- /dev/null
+++ b/comfy/ldm/newbie/components.py
@@ -0,0 +1,54 @@
+import warnings
+
+import torch
+import torch.nn as nn
+
+try:
+    from apex.normalization import FusedRMSNorm as RMSNorm  # NOTE(review): apex's default eps may differ from the 1e-6 below — confirm
+except ImportError:
+    warnings.warn("Cannot import apex RMSNorm, switch to vanilla implementation")
+
+    class RMSNorm(torch.nn.Module):  # pure-PyTorch fallback, defined only when the apex import fails
+        def __init__(self, dim: int, eps: float = 1e-6):
+            """
+            Initialize the RMSNorm normalization layer.
+
+            Args:
+                dim (int): Length of the learnable ``weight`` vector, which starts as all ones.
+                eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
+
+            Attributes:
+                eps (float): A small value added to the denominator for numerical stability.
+                weight (nn.Parameter): Learnable scaling parameter of shape ``(dim,)``.
+
+            """
+            super().__init__()
+            self.eps = eps
+            self.weight = nn.Parameter(torch.ones(dim))
+
+        def _norm(self, x):
+            """
+            Divide ``x`` by its root mean square, computed over the last dimension.
+
+            Args:
+                x (torch.Tensor): The input tensor.
+
+            Returns:
+                torch.Tensor: ``x * rsqrt(mean(x ** 2, dim=-1, keepdim=True) + eps)``.
+
+            """
+            return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+        def forward(self, x):
+            """
+            Normalize ``x`` in float32, cast back to the dtype of ``x``, then scale by ``weight``.
+
+            Args:
+                x (torch.Tensor): The input tensor.
+
+            Returns:
+                torch.Tensor: The output tensor after applying RMSNorm.
+
+            """
+            output = self._norm(x.float()).type_as(x)
+            return output * self.weight
diff --git a/comfy/ldm/newbie/model.py b/comfy/ldm/newbie/model.py
new file mode 100644
index 000000000..0d5dd8ef8
--- /dev/null
+++ b/comfy/ldm/newbie/model.py
@@ -0,0 +1,195 @@
+from __future__ import annotations
+from typing import Optional, Any, Dict
+import torch
+import torch.nn as nn
+import comfy.ldm.common_dit as common_dit
+from comfy.ldm.lumina.model import NextDiT as NextDiTBase
+from .components import RMSNorm
+
+#######################################################
+#            Adds support for NewBie image            #
+#######################################################
+
+def _fallback_operations():  # returns comfy.ops.disable_weight_init, or None when comfy.ops cannot be imported
+    try:
+        import comfy.ops
+    except ImportError:  # narrowed from `except Exception` so unrelated failures are no longer hidden
+        return None
+    return comfy.ops.disable_weight_init
+
+def _pop_unexpected_kwargs(kwargs: Dict[str, Any]) -> None:
+    """Strip, in place, the keys listed below from ``kwargs``.
+    Keys that are absent are skipped silently; all other entries are left untouched.
+    """
+    discard = (
+        "model_type", "operation_settings", "unet_dtype",
+        "weight_dtype", "precision", "extra_model_config",
+    )
+    for name in discard:
+        kwargs.pop(name, None)
+
+class NewBieNextDiT_CLIP(NextDiTBase):
+    """NextDiT variant whose AdaLN input can be modulated by pooled CLIP text/image vectors."""
+
+    def __init__(
+        self,
+        *args,
+        clip_text_dim: int = 1024,
+        clip_img_dim: int = 1024,
+        device=None,
+        dtype=None,
+        operations=None,
+        **kwargs,
+    ):
+        """Build the base NextDiT, then attach the pooled-CLIP projection heads.
+        ``clip_text_dim`` / ``clip_img_dim`` are the input widths of the text / image heads.
+        Layers are taken from ``operations`` when it exposes ``Linear`` (and ``RMSNorm``); otherwise
+        plain ``torch.nn`` layers and the local ``RMSNorm`` are used. Other args go to the base class.
+        """
+        _pop_unexpected_kwargs(kwargs)
+        if operations is None:
+            operations = _fallback_operations()
+        super().__init__(*args, device=device, dtype=dtype, operations=operations, **kwargs)
+        self._nb_device = device
+        self._nb_dtype = dtype
+        self._nb_ops = operations
+        min_mod = min(int(getattr(self, "dim", 1024)), 1024)  # modulation width: self.dim capped at 1024
+        has_ops = operations is not None and hasattr(operations, "Linear")
+        Linear = operations.Linear if has_ops else nn.Linear
+        Norm = getattr(operations, "RMSNorm", None) if has_ops else None
+        if Norm is not None:
+            self.clip_text_pooled_proj = nn.Sequential(
+                Norm(clip_text_dim, eps=1e-5, elementwise_affine=True, device=device, dtype=dtype),
+                Linear(clip_text_dim, clip_text_dim, bias=True, device=device, dtype=dtype),
+            )
+        else:
+            self.clip_text_pooled_proj = nn.Sequential(
+                RMSNorm(clip_text_dim),  # NOTE(review): this fallback path does not pass device/dtype
+                nn.Linear(clip_text_dim, clip_text_dim, bias=True),
+            )
+        nn.init.normal_(self.clip_text_pooled_proj[1].weight, std=0.01)
+        nn.init.zeros_(self.clip_text_pooled_proj[1].bias)
+        self.time_text_embed = nn.Sequential(
+            nn.SiLU(),
+            Linear(min_mod + clip_text_dim, min_mod, bias=True, device=device, dtype=dtype),
+        )
+        nn.init.zeros_(self.time_text_embed[1].weight)
+        nn.init.zeros_(self.time_text_embed[1].bias)
+        if Norm is not None:
+            self.clip_img_pooled_embedder = nn.Sequential(
+                Norm(clip_img_dim, eps=1e-5, elementwise_affine=True, device=device, dtype=dtype),
+                Linear(clip_img_dim, min_mod, bias=True, device=device, dtype=dtype),
+            )
+        else:
+            self.clip_img_pooled_embedder = nn.Sequential(
+                RMSNorm(clip_img_dim),  # NOTE(review): this fallback path does not pass device/dtype
+                nn.Linear(clip_img_dim, min_mod, bias=True),
+            )
+        nn.init.normal_(self.clip_img_pooled_embedder[1].weight, std=0.01)
+        nn.init.zeros_(self.clip_img_pooled_embedder[1].bias)
+
+    @staticmethod
+    def _get_clip_from_kwargs(transformer_options: dict, kwargs: dict, key: str):
+        """Look ``key`` up in ``kwargs``, then ``transformer_options``, then its ``"extra_cond"`` dict; else None."""
+        if key in kwargs:
+            return kwargs[key]
+        options = transformer_options or {}
+        if key in options:
+            return options[key]
+        extra = options.get("extra_cond", None)
+        return extra.get(key) if isinstance(extra, dict) else None
+
+    def _forward(
+        self,
+        x: torch.Tensor,
+        timesteps: torch.Tensor,
+        context: torch.Tensor,
+        num_tokens: int,
+        attention_mask: Optional[torch.Tensor] = None,
+        transformer_options: dict = {},
+        **kwargs,
+    ):
+        """Denoise ``x``. Pooled ``clip_text_pooled`` / ``clip_img_pooled`` tensors, if found in
+        ``kwargs`` or ``transformer_options``, modulate the timestep embedding fed to every block.
+        The result is cropped to the height/width ``x`` had before patch-size padding."""
+        transformer_options = {} if transformer_options is None else transformer_options  # accept an explicit None, as _get_clip_from_kwargs does
+        _, _, h, w = x.shape  # pre-padding spatial size, used for the final crop
+        x = common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))
+        t_emb = self.t_embedder(timesteps, dtype=x.dtype)
+        adaln_input = t_emb  # used unchanged when no pooled CLIP text vector is supplied
+        clip_text_pooled = self._get_clip_from_kwargs(transformer_options, kwargs, "clip_text_pooled")
+        clip_img_pooled = self._get_clip_from_kwargs(transformer_options, kwargs, "clip_img_pooled")
+        if clip_text_pooled is not None:
+            if clip_text_pooled.dim() > 2:  # flatten everything after the batch dimension
+                clip_text_pooled = clip_text_pooled.view(clip_text_pooled.shape[0], -1)
+            clip_text_pooled = clip_text_pooled.to(device=t_emb.device, dtype=t_emb.dtype)
+            clip_emb = self.clip_text_pooled_proj(clip_text_pooled)
+            adaln_input = self.time_text_embed(torch.cat([t_emb, clip_emb], dim=-1))
+        if clip_img_pooled is not None:
+            if clip_img_pooled.dim() > 2:  # flatten everything after the batch dimension
+                clip_img_pooled = clip_img_pooled.view(clip_img_pooled.shape[0], -1)
+            clip_img_pooled = clip_img_pooled.to(device=t_emb.device, dtype=t_emb.dtype)
+            adaln_input = adaln_input + self.clip_img_pooled_embedder(clip_img_pooled)
+        if isinstance(context, torch.Tensor):
+            first_param = next(self.cap_embedder.parameters(), None)  # None when the embedder has no parameters
+            target_dtype = context.dtype if first_param is None else first_param.dtype
+            context = context.to(device=t_emb.device, dtype=target_dtype)
+        cap_feats = self.cap_embedder(context)
+        patches = transformer_options.get("patches", {})
+        img, mask, img_size, cap_size, freqs_cis = self.patchify_and_embed(
+            x, cap_feats, attention_mask, adaln_input, num_tokens, transformer_options=transformer_options
+        )
+        freqs_cis = freqs_cis.to(img.device)
+        for i, layer in enumerate(self.layers):
+            img = layer(img, mask, freqs_cis, adaln_input, transformer_options=transformer_options)
+            if "double_block" in patches:
+                for p in patches["double_block"]:
+                    out = p({
+                        "img": img[:, cap_size[0] :],
+                        "txt": img[:, : cap_size[0]],
+                        "pe": freqs_cis[:, cap_size[0] :],
+                        "vec": adaln_input,
+                        "x": x,
+                        "block_index": i,
+                        "transformer_options": transformer_options,
+                    })
+                    if isinstance(out, dict):  # a patch may return replacement img/txt token slices
+                        if "img" in out:
+                            img[:, cap_size[0] :] = out["img"]
+                        if "txt" in out:
+                            img[:, : cap_size[0]] = out["txt"]
+
+        img = self.final_layer(img, adaln_input)
+        img = self.unpatchify(img, img_size, cap_size, return_tensor=True)
+        img = img[:, :, :h, :w]
+        return img
+
+def NextDiT_3B_GQA_patch2_Adaln_Refiner_WHIT_CLIP(**kwargs):
+    """Build ``NewBieNextDiT_CLIP``; the values below apply only to keys the caller did not pass."""
+    _pop_unexpected_kwargs(kwargs)
+    defaults = {
+        "patch_size": 2, "in_channels": 16, "dim": 2304,
+        "n_layers": 36, "n_heads": 24, "n_kv_heads": 8,
+        "axes_dims": [32, 32, 32],
+        "axes_lens": [1024, 512, 512],
+    }
+    merged = {**defaults, **kwargs}  # caller-supplied entries win over the defaults
+    return NewBieNextDiT_CLIP(**merged)
+
+def NewBieNextDiT(*, device=None, dtype=None, operations=None, **kwargs):
+    """Create the NewBie DiT, resolving ``operations`` and ``dtype`` when they are None.
+    The default dtype is bfloat16 on a CUDA device reporting bf16 support, float16 on other
+    CUDA devices (CUDA must also be available), and float32 everywhere else.
+    """
+    _pop_unexpected_kwargs(kwargs)
+    if operations is None:
+        operations = _fallback_operations()
+    if dtype is None:
+        on_cuda = device is not None and str(device).startswith("cuda") and torch.cuda.is_available()
+        if not on_cuda:
+            dtype = torch.float32
+        elif hasattr(torch.cuda, "is_bf16_supported") and torch.cuda.is_bf16_supported():
+            dtype = torch.bfloat16
+        else:
+            dtype = torch.float16
+    return NextDiT_3B_GQA_patch2_Adaln_Refiner_WHIT_CLIP(device=device, dtype=dtype, operations=operations, **kwargs)
\ No newline at end of file
diff --git a/comfy/model_base.py b/comfy/model_base.py
index 6b8a8454d..6b663f90c 100644
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -928,6 +928,90 @@ class Flux2(Flux):
                 cross_attn = torch.nn.functional.pad(cross_attn, (0, 0, target_text_len - cross_attn.shape[1], 0))
             out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
         return out
+    
+class NewBieImage(BaseModel):  # BaseModel subclass whose diffusion model is built by comfy.ldm.newbie.model.NewBieNextDiT
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+        import comfy.ldm.newbie.model as nb  # imported inside __init__, i.e. only when this class is instantiated
+        super().__init__(model_config, model_type, device=device, unet_model=nb.NewBieNextDiT)
+
+    def extra_conds(self, **kwargs):  # wraps each optional conditioning value present in kwargs; None entries are skipped
+        out = super().extra_conds(**kwargs)
+        cross_attn = kwargs.get("cross_attn", None)
+        if cross_attn is not None:
+            out["c_crossattn"] = comfy.conds.CONDCrossAttn(cross_attn)
+        attention_mask = kwargs.get("attention_mask", None)
+        if attention_mask is not None:
+            out["attention_mask"] = comfy.conds.CONDRegular(attention_mask)
+        cap_feats = kwargs.get("cap_feats", None)
+        if cap_feats is not None:
+            out["cap_feats"] = comfy.conds.CONDRegular(cap_feats)
+        cap_mask = kwargs.get("cap_mask", None)
+        if cap_mask is not None:
+            out["cap_mask"] = comfy.conds.CONDRegular(cap_mask)
+        clip_text_pooled = kwargs.get("clip_text_pooled", None)
+        if clip_text_pooled is not None:
+            out["clip_text_pooled"] = comfy.conds.CONDRegular(clip_text_pooled)
+        clip_img_pooled = kwargs.get("clip_img_pooled", None)
+        if clip_img_pooled is not None:
+            out["clip_img_pooled"] = comfy.conds.CONDRegular(clip_img_pooled)
+        return out
+
+    def extra_conds_shapes(self, **kwargs):  # adds list(shape) for cap_feats and the pooled CLIP tensors that are present
+        out = super().extra_conds_shapes(**kwargs)
+        cap_feats = kwargs.get("cap_feats", None)
+        if cap_feats is not None:
+            out["cap_feats"] = list(cap_feats.shape)
+        clip_text_pooled = kwargs.get("clip_text_pooled", None)
+        if clip_text_pooled is not None:
+            out["clip_text_pooled"] = list(clip_text_pooled.shape)
+        clip_img_pooled = kwargs.get("clip_img_pooled", None)
+        if clip_img_pooled is not None:
+            out["clip_img_pooled"] = list(clip_img_pooled.shape)
+        return out
+
+    def apply_model(
+            self, x, t,
+            c_concat=None, c_crossattn=None,
+            control=None, transformer_options={}, **kwargs
+    ):  # runs one denoising evaluation and returns the denoised result on x's original device
+        sigma = t  # t is used as the noise level passed to model_sampling
+        try:
+            model_device = next(self.diffusion_model.parameters()).device  # device of the model's first parameter
+        except StopIteration:
+            model_device = x.device  # model has no parameters: stay on the input's device
+        x_in = x.to(device=model_device)
+        sigma_in = sigma.to(device=model_device)
+        xc = self.model_sampling.calculate_input(sigma_in, x_in)
+        if c_concat is not None:
+            xc = torch.cat([xc] + [c_concat.to(device=model_device)], dim=1)
+        dtype = self.get_dtype()
+        if self.manual_cast_dtype is not None:
+            dtype = self.manual_cast_dtype  # manual_cast_dtype, when set, overrides get_dtype()
+        xc = xc.to(dtype=dtype)
+        t_val = (1.0 - sigma_in).to(dtype=torch.float32)  # the model receives 1 - sigma, always as float32
+        cap_feats = kwargs.get("cap_feats", kwargs.get("cross_attn", c_crossattn))  # lookup order: cap_feats, cross_attn, c_crossattn
+        cap_mask = kwargs.get("cap_mask", kwargs.get("attention_mask"))  # lookup order: cap_mask, attention_mask
+        clip_text_pooled = kwargs.get("clip_text_pooled")
+        clip_img_pooled = kwargs.get("clip_img_pooled")
+        if cap_feats is not None:
+            cap_feats = cap_feats.to(device=model_device, dtype=dtype)
+        if cap_mask is None and cap_feats is not None:
+            cap_mask = torch.ones(cap_feats.shape[:2], dtype=torch.bool, device=model_device)  # no mask given: all-True over cap_feats' first two dims
+        elif cap_mask is not None:
+            cap_mask = cap_mask.to(device=model_device)
+            if cap_mask.dtype != torch.bool:
+                cap_mask = cap_mask != 0  # non-bool masks become bool: non-zero -> True
+        model_kwargs = {}  # NOTE(review): `control` and `transformer_options` are accepted above but never used or forwarded — confirm intended
+        if clip_text_pooled is not None:
+            model_kwargs["clip_text_pooled"] = clip_text_pooled.to(device=model_device, dtype=dtype)
+        if clip_img_pooled is not None:
+            model_kwargs["clip_img_pooled"] = clip_img_pooled.to(device=model_device, dtype=dtype)
+        model_output = self.diffusion_model(xc, t_val, cap_feats, cap_mask, **model_kwargs).float()  # NOTE(review): cap_mask is the 4th positional arg, but NewBieNextDiT_CLIP._forward names that slot `num_tokens` (attention_mask is 5th) — verify against the base forward()
+        model_output = -model_output  # sign is flipped before calculate_denoised
+        denoised = self.model_sampling.calculate_denoised(sigma_in, model_output, x_in)
+        if denoised.device != x.device:
+            denoised = denoised.to(device=x.device)  # hand the result back on the caller's device
+        return denoised
 
 class GenmoMochi(BaseModel):
     def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
@@ -1110,10 +1194,6 @@ class Lumina2(BaseModel):
             if 'num_tokens' not in out:
                 out['num_tokens'] = comfy.conds.CONDConstant(cross_attn.shape[1])
 
-        clip_text_pooled = kwargs["pooled_output"]  # Newbie
-        if clip_text_pooled is not None:
-            out['clip_text_pooled'] = comfy.conds.CONDRegular(clip_text_pooled)
-
         return out
 
 class WAN21(BaseModel):
diff --git a/comfy/model_detection.py b/comfy/model_detection.py
index dd6a703f6..9f38b7f9d 100644
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@@ -6,6 +6,26 @@ import math
 import logging
 import torch
 
+def is_newbie_unet_state_dict(state_dict, key_prefix):
+    """Return True when ``state_dict`` has the NewBie image DiT layout under ``key_prefix``.
+
+    Requires ``x_embedder.weight`` of shape (2304, 64), ``final_layer.linear.weight`` whose
+    first two dimensions are (64, 2304), and exactly 36 ``layers.N.`` blocks. A missing key
+    or a weight of unexpected rank yields False rather than raising.
+    """
+    try:
+        x_embed = state_dict[f"{key_prefix}x_embedder.weight"]
+        final = state_dict[f"{key_prefix}final_layer.linear.weight"]
+    except KeyError:
+        return False
+    dim, patch_dim = 2304, 64
+    if tuple(x_embed.shape) != (dim, patch_dim):
+        return False
+    # Slicing (instead of indexing shape[1]) keeps a 1-D weight from raising IndexError.
+    if tuple(final.shape[:2]) != (patch_dim, dim):
+        return False
+    return count_blocks(state_dict.keys(), f"{key_prefix}layers." + "{}.") == 36
+
 def count_blocks(state_dict_keys, prefix_string):
     count = 0
     while True:
@@ -411,7 +431,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
             dit_config["extra_per_block_abs_pos_emb_type"] = "learnable"
         return dit_config
 
-    if '{}cap_embedder.1.weight'.format(key_prefix) in state_dict_keys:  # Lumina 2
+    if '{}cap_embedder.1.weight'.format(key_prefix) in state_dict_keys:  # Lumina 2 / NewBie image
         dit_config = {}
         dit_config["image_model"] = "lumina2"
         dit_config["patch_size"] = 2
@@ -422,6 +442,16 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
         dit_config["n_layers"] = count_blocks(state_dict_keys, '{}layers.'.format(key_prefix) + '{}.')
         dit_config["qk_norm"] = True
 
+        if dit_config["dim"] == 2304 and is_newbie_unet_state_dict(state_dict, key_prefix):  # NewBie image
+            dit_config["n_heads"] = 24
+            dit_config["n_kv_heads"] = 8
+            dit_config["axes_dims"] = [32, 32, 32]
+            dit_config["axes_lens"] = [1024, 512, 512]
+            dit_config["rope_theta"] = 10000.0
+            dit_config["model_type"] = "newbie_dit"
+            dit_config["image_model"] = "NewBieImage"
+            return dit_config
+
         if dit_config["dim"] == 2304: # Original Lumina 2
             dit_config["n_heads"] = 24
             dit_config["n_kv_heads"] = 8
@@ -429,9 +459,6 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
             dit_config["axes_lens"] = [300, 512, 512]
             dit_config["rope_theta"] = 10000.0
             dit_config["ffn_dim_multiplier"] = 4.0
-            ctd_weight = state_dict.get('{}clip_text_pooled_proj.0.weight'.format(key_prefix), None)
-            if ctd_weight is not None:
-                dit_config["clip_text_dim"] = ctd_weight.shape[0]
         elif dit_config["dim"] == 3840:  # Z image
             dit_config["n_heads"] = 30
             dit_config["n_kv_heads"] = 30
diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index 834dfcffc..d9a4ba459 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -1,1534 +1,1557 @@
-import torch
-from . import model_base
-from . import utils
-
-from . import sd1_clip
-from . import sdxl_clip
-import comfy.text_encoders.sd2_clip
-import comfy.text_encoders.sd3_clip
-import comfy.text_encoders.sa_t5
-import comfy.text_encoders.aura_t5
-import comfy.text_encoders.pixart_t5
-import comfy.text_encoders.hydit
-import comfy.text_encoders.flux
-import comfy.text_encoders.genmo
-import comfy.text_encoders.lt
-import comfy.text_encoders.hunyuan_video
-import comfy.text_encoders.cosmos
-import comfy.text_encoders.lumina2
-import comfy.text_encoders.wan
-import comfy.text_encoders.ace
-import comfy.text_encoders.omnigen2
-import comfy.text_encoders.qwen_image
-import comfy.text_encoders.hunyuan_image
-import comfy.text_encoders.kandinsky5
-import comfy.text_encoders.z_image
-
-from . import supported_models_base
-from . import latent_formats
-
-from . import diffusers_convert
-
-class SD15(supported_models_base.BASE):
-    unet_config = {
-        "context_dim": 768,
-        "model_channels": 320,
-        "use_linear_in_transformer": False,
-        "adm_in_channels": None,
-        "use_temporal_attention": False,
-    }
-
-    unet_extra_config = {
-        "num_heads": 8,
-        "num_head_channels": -1,
-    }
-
-    latent_format = latent_formats.SD15
-    memory_usage_factor = 1.0
-
-    def process_clip_state_dict(self, state_dict):
-        k = list(state_dict.keys())
-        for x in k:
-            if x.startswith("cond_stage_model.transformer.") and not x.startswith("cond_stage_model.transformer.text_model."):
-                y = x.replace("cond_stage_model.transformer.", "cond_stage_model.transformer.text_model.")
-                state_dict[y] = state_dict.pop(x)
-
-        if 'cond_stage_model.transformer.text_model.embeddings.position_ids' in state_dict:
-            ids = state_dict['cond_stage_model.transformer.text_model.embeddings.position_ids']
-            if ids.dtype == torch.float32:
-                state_dict['cond_stage_model.transformer.text_model.embeddings.position_ids'] = ids.round()
-
-        replace_prefix = {}
-        replace_prefix["cond_stage_model."] = "clip_l."
-        state_dict = utils.state_dict_prefix_replace(state_dict, replace_prefix, filter_keys=True)
-        return state_dict
-
-    def process_clip_state_dict_for_saving(self, state_dict):
-        pop_keys = ["clip_l.transformer.text_projection.weight", "clip_l.logit_scale"]
-        for p in pop_keys:
-            if p in state_dict:
-                state_dict.pop(p)
-
-        replace_prefix = {"clip_l.": "cond_stage_model."}
-        return utils.state_dict_prefix_replace(state_dict, replace_prefix)
-
-    def clip_target(self, state_dict={}):
-        return supported_models_base.ClipTarget(sd1_clip.SD1Tokenizer, sd1_clip.SD1ClipModel)
-
-class SD20(supported_models_base.BASE):
-    unet_config = {
-        "context_dim": 1024,
-        "model_channels": 320,
-        "use_linear_in_transformer": True,
-        "adm_in_channels": None,
-        "use_temporal_attention": False,
-    }
-
-    unet_extra_config = {
-        "num_heads": -1,
-        "num_head_channels": 64,
-        "attn_precision": torch.float32,
-    }
-
-    latent_format = latent_formats.SD15
-    memory_usage_factor = 1.0
-
-    def model_type(self, state_dict, prefix=""):
-        if self.unet_config["in_channels"] == 4: #SD2.0 inpainting models are not v prediction
-            k = "{}output_blocks.11.1.transformer_blocks.0.norm1.bias".format(prefix)
-            out = state_dict.get(k, None)
-            if out is not None and torch.std(out, unbiased=False) > 0.09: # not sure how well this will actually work. I guess we will find out.
-                return model_base.ModelType.V_PREDICTION
-        return model_base.ModelType.EPS
-
-    def process_clip_state_dict(self, state_dict):
-        replace_prefix = {}
-        replace_prefix["conditioner.embedders.0.model."] = "clip_h." #SD2 in sgm format
-        replace_prefix["cond_stage_model.model."] = "clip_h."
-        state_dict = utils.state_dict_prefix_replace(state_dict, replace_prefix, filter_keys=True)
-        state_dict = utils.clip_text_transformers_convert(state_dict, "clip_h.", "clip_h.transformer.")
-        return state_dict
-
-    def process_clip_state_dict_for_saving(self, state_dict):
-        replace_prefix = {}
-        replace_prefix["clip_h"] = "cond_stage_model.model"
-        state_dict = utils.state_dict_prefix_replace(state_dict, replace_prefix)
-        state_dict = diffusers_convert.convert_text_enc_state_dict_v20(state_dict)
-        return state_dict
-
-    def clip_target(self, state_dict={}):
-        return supported_models_base.ClipTarget(comfy.text_encoders.sd2_clip.SD2Tokenizer, comfy.text_encoders.sd2_clip.SD2ClipModel)
-
-class SD21UnclipL(SD20):
-    unet_config = {
-        "context_dim": 1024,
-        "model_channels": 320,
-        "use_linear_in_transformer": True,
-        "adm_in_channels": 1536,
-        "use_temporal_attention": False,
-    }
-
-    clip_vision_prefix = "embedder.model.visual."
-    noise_aug_config = {"noise_schedule_config": {"timesteps": 1000, "beta_schedule": "squaredcos_cap_v2"}, "timestep_dim": 768}
-
-
-class SD21UnclipH(SD20):
-    unet_config = {
-        "context_dim": 1024,
-        "model_channels": 320,
-        "use_linear_in_transformer": True,
-        "adm_in_channels": 2048,
-        "use_temporal_attention": False,
-    }
-
-    clip_vision_prefix = "embedder.model.visual."
-    noise_aug_config = {"noise_schedule_config": {"timesteps": 1000, "beta_schedule": "squaredcos_cap_v2"}, "timestep_dim": 1024}
-
-class SDXLRefiner(supported_models_base.BASE):
-    unet_config = {
-        "model_channels": 384,
-        "use_linear_in_transformer": True,
-        "context_dim": 1280,
-        "adm_in_channels": 2560,
-        "transformer_depth": [0, 0, 4, 4, 4, 4, 0, 0],
-        "use_temporal_attention": False,
-    }
-
-    latent_format = latent_formats.SDXL
-    memory_usage_factor = 1.0
-
-    def get_model(self, state_dict, prefix="", device=None):
-        return model_base.SDXLRefiner(self, device=device)
-
-    def process_clip_state_dict(self, state_dict):
-        keys_to_replace = {}
-        replace_prefix = {}
-        replace_prefix["conditioner.embedders.0.model."] = "clip_g."
-        state_dict = utils.state_dict_prefix_replace(state_dict, replace_prefix, filter_keys=True)
-
-        state_dict = utils.clip_text_transformers_convert(state_dict, "clip_g.", "clip_g.transformer.")
-        state_dict = utils.state_dict_key_replace(state_dict, keys_to_replace)
-        return state_dict
-
-    def process_clip_state_dict_for_saving(self, state_dict):
-        replace_prefix = {}
-        state_dict_g = diffusers_convert.convert_text_enc_state_dict_v20(state_dict, "clip_g")
-        if "clip_g.transformer.text_model.embeddings.position_ids" in state_dict_g:
-            state_dict_g.pop("clip_g.transformer.text_model.embeddings.position_ids")
-        replace_prefix["clip_g"] = "conditioner.embedders.0.model"
-        state_dict_g = utils.state_dict_prefix_replace(state_dict_g, replace_prefix)
-        return state_dict_g
-
-    def clip_target(self, state_dict={}):
-        return supported_models_base.ClipTarget(sdxl_clip.SDXLTokenizer, sdxl_clip.SDXLRefinerClipModel)
-
-class SDXL(supported_models_base.BASE):
-    unet_config = {
-        "model_channels": 320,
-        "use_linear_in_transformer": True,
-        "transformer_depth": [0, 0, 2, 2, 10, 10],
-        "context_dim": 2048,
-        "adm_in_channels": 2816,
-        "use_temporal_attention": False,
-    }
-
-    latent_format = latent_formats.SDXL
-
-    memory_usage_factor = 0.8
-
-    def model_type(self, state_dict, prefix=""):
-        if 'edm_mean' in state_dict and 'edm_std' in state_dict: #Playground V2.5
-            self.latent_format = latent_formats.SDXL_Playground_2_5()
-            self.sampling_settings["sigma_data"] = 0.5
-            self.sampling_settings["sigma_max"] = 80.0
-            self.sampling_settings["sigma_min"] = 0.002
-            return model_base.ModelType.EDM
-        elif "edm_vpred.sigma_max" in state_dict:
-            self.sampling_settings["sigma_max"] = float(state_dict["edm_vpred.sigma_max"].item())
-            if "edm_vpred.sigma_min" in state_dict:
-                self.sampling_settings["sigma_min"] = float(state_dict["edm_vpred.sigma_min"].item())
-            return model_base.ModelType.V_PREDICTION_EDM
-        elif "v_pred" in state_dict:
-            if "ztsnr" in state_dict: #Some zsnr anime checkpoints
-                self.sampling_settings["zsnr"] = True
-            return model_base.ModelType.V_PREDICTION
-        else:
-            return model_base.ModelType.EPS
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.SDXL(self, model_type=self.model_type(state_dict, prefix), device=device)
-        if self.inpaint_model():
-            out.set_inpaint()
-        return out
-
-    def process_clip_state_dict(self, state_dict):
-        keys_to_replace = {}
-        replace_prefix = {}
-
-        replace_prefix["conditioner.embedders.0.transformer.text_model"] = "clip_l.transformer.text_model"
-        replace_prefix["conditioner.embedders.1.model."] = "clip_g."
-        state_dict = utils.state_dict_prefix_replace(state_dict, replace_prefix, filter_keys=True)
-
-        state_dict = utils.state_dict_key_replace(state_dict, keys_to_replace)
-        state_dict = utils.clip_text_transformers_convert(state_dict, "clip_g.", "clip_g.transformer.")
-        return state_dict
-
-    def process_clip_state_dict_for_saving(self, state_dict):
-        replace_prefix = {}
-        state_dict_g = diffusers_convert.convert_text_enc_state_dict_v20(state_dict, "clip_g")
-        for k in state_dict:
-            if k.startswith("clip_l"):
-                state_dict_g[k] = state_dict[k]
-
-        state_dict_g["clip_l.transformer.text_model.embeddings.position_ids"] = torch.arange(77).expand((1, -1))
-        pop_keys = ["clip_l.transformer.text_projection.weight", "clip_l.logit_scale"]
-        for p in pop_keys:
-            if p in state_dict_g:
-                state_dict_g.pop(p)
-
-        replace_prefix["clip_g"] = "conditioner.embedders.1.model"
-        replace_prefix["clip_l"] = "conditioner.embedders.0"
-        state_dict_g = utils.state_dict_prefix_replace(state_dict_g, replace_prefix)
-        return state_dict_g
-
-    def clip_target(self, state_dict={}):
-        return supported_models_base.ClipTarget(sdxl_clip.SDXLTokenizer, sdxl_clip.SDXLClipModel)
-
-class SSD1B(SDXL):
-    unet_config = {
-        "model_channels": 320,
-        "use_linear_in_transformer": True,
-        "transformer_depth": [0, 0, 2, 2, 4, 4],
-        "context_dim": 2048,
-        "adm_in_channels": 2816,
-        "use_temporal_attention": False,
-    }
-
-class Segmind_Vega(SDXL):
-    unet_config = {
-        "model_channels": 320,
-        "use_linear_in_transformer": True,
-        "transformer_depth": [0, 0, 1, 1, 2, 2],
-        "context_dim": 2048,
-        "adm_in_channels": 2816,
-        "use_temporal_attention": False,
-    }
-
-class KOALA_700M(SDXL):
-    unet_config = {
-        "model_channels": 320,
-        "use_linear_in_transformer": True,
-        "transformer_depth": [0, 2, 5],
-        "context_dim": 2048,
-        "adm_in_channels": 2816,
-        "use_temporal_attention": False,
-    }
-
-class KOALA_1B(SDXL):
-    unet_config = {
-        "model_channels": 320,
-        "use_linear_in_transformer": True,
-        "transformer_depth": [0, 2, 6],
-        "context_dim": 2048,
-        "adm_in_channels": 2816,
-        "use_temporal_attention": False,
-    }
-
-class SVD_img2vid(supported_models_base.BASE):
-    unet_config = {
-        "model_channels": 320,
-        "in_channels": 8,
-        "use_linear_in_transformer": True,
-        "transformer_depth": [1, 1, 1, 1, 1, 1, 0, 0],
-        "context_dim": 1024,
-        "adm_in_channels": 768,
-        "use_temporal_attention": True,
-        "use_temporal_resblock": True
-    }
-
-    unet_extra_config = {
-        "num_heads": -1,
-        "num_head_channels": 64,
-        "attn_precision": torch.float32,
-    }
-
-    clip_vision_prefix = "conditioner.embedders.0.open_clip.model.visual."
-
-    latent_format = latent_formats.SD15
-
-    sampling_settings = {"sigma_max": 700.0, "sigma_min": 0.002}
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.SVD_img2vid(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        return None
-
-class SV3D_u(SVD_img2vid):
-    unet_config = {
-        "model_channels": 320,
-        "in_channels": 8,
-        "use_linear_in_transformer": True,
-        "transformer_depth": [1, 1, 1, 1, 1, 1, 0, 0],
-        "context_dim": 1024,
-        "adm_in_channels": 256,
-        "use_temporal_attention": True,
-        "use_temporal_resblock": True
-    }
-
-    vae_key_prefix = ["conditioner.embedders.1.encoder."]
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.SV3D_u(self, device=device)
-        return out
-
-class SV3D_p(SV3D_u):
-    unet_config = {
-        "model_channels": 320,
-        "in_channels": 8,
-        "use_linear_in_transformer": True,
-        "transformer_depth": [1, 1, 1, 1, 1, 1, 0, 0],
-        "context_dim": 1024,
-        "adm_in_channels": 1280,
-        "use_temporal_attention": True,
-        "use_temporal_resblock": True
-    }
-
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.SV3D_p(self, device=device)
-        return out
-
-class Stable_Zero123(supported_models_base.BASE):
-    unet_config = {
-        "context_dim": 768,
-        "model_channels": 320,
-        "use_linear_in_transformer": False,
-        "adm_in_channels": None,
-        "use_temporal_attention": False,
-        "in_channels": 8,
-    }
-
-    unet_extra_config = {
-        "num_heads": 8,
-        "num_head_channels": -1,
-    }
-
-    required_keys = {
-        "cc_projection.weight": None,
-        "cc_projection.bias": None,
-    }
-
-    clip_vision_prefix = "cond_stage_model.model.visual."
-
-    latent_format = latent_formats.SD15
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.Stable_Zero123(self, device=device, cc_projection_weight=state_dict["cc_projection.weight"], cc_projection_bias=state_dict["cc_projection.bias"])
-        return out
-
-    def clip_target(self, state_dict={}):
-        return None
-
-class SD_X4Upscaler(SD20):
-    unet_config = {
-        "context_dim": 1024,
-        "model_channels": 256,
-        'in_channels': 7,
-        "use_linear_in_transformer": True,
-        "adm_in_channels": None,
-        "use_temporal_attention": False,
-    }
-
-    unet_extra_config = {
-        "disable_self_attentions": [True, True, True, False],
-        "num_classes": 1000,
-        "num_heads": 8,
-        "num_head_channels": -1,
-    }
-
-    latent_format = latent_formats.SD_X4
-
-    sampling_settings = {
-        "linear_start": 0.0001,
-        "linear_end": 0.02,
-    }
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.SD_X4Upscaler(self, device=device)
-        return out
-
-class Stable_Cascade_C(supported_models_base.BASE):
-    unet_config = {
-        "stable_cascade_stage": 'c',
-    }
-
-    unet_extra_config = {}
-
-    latent_format = latent_formats.SC_Prior
-    supported_inference_dtypes = [torch.bfloat16, torch.float32]
-
-    sampling_settings = {
-        "shift": 2.0,
-    }
-
-    vae_key_prefix = ["vae."]
-    text_encoder_key_prefix = ["text_encoder."]
-    clip_vision_prefix = "clip_l_vision."
-
-    def process_unet_state_dict(self, state_dict):
-        key_list = list(state_dict.keys())
-        for y in ["weight", "bias"]:
-            suffix = "in_proj_{}".format(y)
-            keys = filter(lambda a: a.endswith(suffix), key_list)
-            for k_from in keys:
-                weights = state_dict.pop(k_from)
-                prefix = k_from[:-(len(suffix) + 1)]
-                shape_from = weights.shape[0] // 3
-                for x in range(3):
-                    p = ["to_q", "to_k", "to_v"]
-                    k_to = "{}.{}.{}".format(prefix, p[x], y)
-                    state_dict[k_to] = weights[shape_from*x:shape_from*(x + 1)]
-        return state_dict
-
-    def process_clip_state_dict(self, state_dict):
-        state_dict = utils.state_dict_prefix_replace(state_dict, {k: "" for k in self.text_encoder_key_prefix}, filter_keys=True)
-        if "clip_g.text_projection" in state_dict:
-            state_dict["clip_g.transformer.text_projection.weight"] = state_dict.pop("clip_g.text_projection").transpose(0, 1)
-        return state_dict
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.StableCascade_C(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        return supported_models_base.ClipTarget(sdxl_clip.StableCascadeTokenizer, sdxl_clip.StableCascadeClipModel)
-
-class Stable_Cascade_B(Stable_Cascade_C):
-    unet_config = {
-        "stable_cascade_stage": 'b',
-    }
-
-    unet_extra_config = {}
-
-    latent_format = latent_formats.SC_B
-    supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]
-
-    sampling_settings = {
-        "shift": 1.0,
-    }
-
-    clip_vision_prefix = None
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.StableCascade_B(self, device=device)
-        return out
-
-class SD15_instructpix2pix(SD15):
-    unet_config = {
-        "context_dim": 768,
-        "model_channels": 320,
-        "use_linear_in_transformer": False,
-        "adm_in_channels": None,
-        "use_temporal_attention": False,
-        "in_channels": 8,
-    }
-
-    def get_model(self, state_dict, prefix="", device=None):
-        return model_base.SD15_instructpix2pix(self, device=device)
-
-class SDXL_instructpix2pix(SDXL):
-    unet_config = {
-        "model_channels": 320,
-        "use_linear_in_transformer": True,
-        "transformer_depth": [0, 0, 2, 2, 10, 10],
-        "context_dim": 2048,
-        "adm_in_channels": 2816,
-        "use_temporal_attention": False,
-        "in_channels": 8,
-    }
-
-    def get_model(self, state_dict, prefix="", device=None):
-        return model_base.SDXL_instructpix2pix(self, model_type=self.model_type(state_dict, prefix), device=device)
-
-class LotusD(SD20):
-    unet_config = {
-        "model_channels": 320,
-        "use_linear_in_transformer": True,
-        "use_temporal_attention": False,
-        "adm_in_channels": 4,
-        "in_channels": 4,
-    }
-
-    unet_extra_config = {
-        "num_classes": 'sequential'
-    }
-
-    def get_model(self, state_dict, prefix="", device=None):
-        return model_base.Lotus(self, device=device)
-
-class SD3(supported_models_base.BASE):
-    unet_config = {
-        "in_channels": 16,
-        "pos_embed_scaling_factor": None,
-    }
-
-    sampling_settings = {
-        "shift": 3.0,
-    }
-
-    unet_extra_config = {}
-    latent_format = latent_formats.SD3
-
-    memory_usage_factor = 1.6
-
-    text_encoder_key_prefix = ["text_encoders."]
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.SD3(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        clip_l = False
-        clip_g = False
-        t5 = False
-        pref = self.text_encoder_key_prefix[0]
-        if "{}clip_l.transformer.text_model.final_layer_norm.weight".format(pref) in state_dict:
-            clip_l = True
-        if "{}clip_g.transformer.text_model.final_layer_norm.weight".format(pref) in state_dict:
-            clip_g = True
-        t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
-        if "dtype_t5" in t5_detect:
-            t5 = True
-
-        return supported_models_base.ClipTarget(comfy.text_encoders.sd3_clip.SD3Tokenizer, comfy.text_encoders.sd3_clip.sd3_clip(clip_l=clip_l, clip_g=clip_g, t5=t5, **t5_detect))
-
-class StableAudio(supported_models_base.BASE):
-    unet_config = {
-        "audio_model": "dit1.0",
-    }
-
-    sampling_settings = {"sigma_max": 500.0, "sigma_min": 0.03}
-
-    unet_extra_config = {}
-    latent_format = latent_formats.StableAudio1
-
-    text_encoder_key_prefix = ["text_encoders."]
-    vae_key_prefix = ["pretransform.model."]
-
-    def get_model(self, state_dict, prefix="", device=None):
-        seconds_start_sd = utils.state_dict_prefix_replace(state_dict, {"conditioner.conditioners.seconds_start.": ""}, filter_keys=True)
-        seconds_total_sd = utils.state_dict_prefix_replace(state_dict, {"conditioner.conditioners.seconds_total.": ""}, filter_keys=True)
-        return model_base.StableAudio1(self, seconds_start_embedder_weights=seconds_start_sd, seconds_total_embedder_weights=seconds_total_sd, device=device)
-
-    def process_unet_state_dict(self, state_dict):
-        for k in list(state_dict.keys()):
-            if k.endswith(".cross_attend_norm.beta") or k.endswith(".ff_norm.beta") or k.endswith(".pre_norm.beta"): #These weights are all zero
-                state_dict.pop(k)
-        return state_dict
-
-    def process_unet_state_dict_for_saving(self, state_dict):
-        replace_prefix = {"": "model.model."}
-        return utils.state_dict_prefix_replace(state_dict, replace_prefix)
-
-    def clip_target(self, state_dict={}):
-        return supported_models_base.ClipTarget(comfy.text_encoders.sa_t5.SAT5Tokenizer, comfy.text_encoders.sa_t5.SAT5Model)
-
-class AuraFlow(supported_models_base.BASE):
-    unet_config = {
-        "cond_seq_dim": 2048,
-    }
-
-    sampling_settings = {
-        "multiplier": 1.0,
-        "shift": 1.73,
-    }
-
-    unet_extra_config = {}
-    latent_format = latent_formats.SDXL
-
-    vae_key_prefix = ["vae."]
-    text_encoder_key_prefix = ["text_encoders."]
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.AuraFlow(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        return supported_models_base.ClipTarget(comfy.text_encoders.aura_t5.AuraT5Tokenizer, comfy.text_encoders.aura_t5.AuraT5Model)
-
-class PixArtAlpha(supported_models_base.BASE):
-    unet_config = {
-        "image_model": "pixart_alpha",
-    }
-
-    sampling_settings = {
-        "beta_schedule" : "sqrt_linear",
-        "linear_start"  : 0.0001,
-        "linear_end"    : 0.02,
-        "timesteps"     : 1000,
-    }
-
-    unet_extra_config = {}
-    latent_format = latent_formats.SD15
-
-    memory_usage_factor = 0.5
-
-    vae_key_prefix = ["vae."]
-    text_encoder_key_prefix = ["text_encoders."]
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.PixArt(self, device=device)
-        return out.eval()
-
-    def clip_target(self, state_dict={}):
-        return supported_models_base.ClipTarget(comfy.text_encoders.pixart_t5.PixArtTokenizer, comfy.text_encoders.pixart_t5.PixArtT5XXL)
-
-class PixArtSigma(PixArtAlpha):
-    unet_config = {
-        "image_model": "pixart_sigma",
-    }
-    latent_format = latent_formats.SDXL
-
-class HunyuanDiT(supported_models_base.BASE):
-    unet_config = {
-        "image_model": "hydit",
-    }
-
-    unet_extra_config = {
-        "attn_precision": torch.float32,
-    }
-
-    sampling_settings = {
-        "linear_start": 0.00085,
-        "linear_end": 0.018,
-    }
-
-    latent_format = latent_formats.SDXL
-
-    memory_usage_factor = 1.3
-
-    vae_key_prefix = ["vae."]
-    text_encoder_key_prefix = ["text_encoders."]
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.HunyuanDiT(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        return supported_models_base.ClipTarget(comfy.text_encoders.hydit.HyditTokenizer, comfy.text_encoders.hydit.HyditModel)
-
-class HunyuanDiT1(HunyuanDiT):
-    unet_config = {
-        "image_model": "hydit1",
-    }
-
-    unet_extra_config = {}
-
-    sampling_settings = {
-        "linear_start" : 0.00085,
-        "linear_end" : 0.03,
-    }
-
-class Flux(supported_models_base.BASE):
-    unet_config = {
-        "image_model": "flux",
-        "guidance_embed": True,
-    }
-
-    sampling_settings = {
-    }
-
-    unet_extra_config = {}
-    latent_format = latent_formats.Flux
-
-    memory_usage_factor = 3.1 # TODO: debug why flux mem usage is so weird on windows.
-
-    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
-
-    vae_key_prefix = ["vae."]
-    text_encoder_key_prefix = ["text_encoders."]
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.Flux(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        pref = self.text_encoder_key_prefix[0]
-        t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.flux.FluxTokenizer, comfy.text_encoders.flux.flux_clip(**t5_detect))
-
-class FluxInpaint(Flux):
-    unet_config = {
-        "image_model": "flux",
-        "guidance_embed": True,
-        "in_channels": 96,
-    }
-
-    supported_inference_dtypes = [torch.bfloat16, torch.float32]
-
-class FluxSchnell(Flux):
-    unet_config = {
-        "image_model": "flux",
-        "guidance_embed": False,
-    }
-
-    sampling_settings = {
-        "multiplier": 1.0,
-        "shift": 1.0,
-    }
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.Flux(self, model_type=model_base.ModelType.FLOW, device=device)
-        return out
-
-class Flux2(Flux):
-    unet_config = {
-        "image_model": "flux2",
-    }
-
-    sampling_settings = {
-        "shift": 2.02,
-    }
-
-    unet_extra_config = {}
-    latent_format = latent_formats.Flux2
-
-    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
-
-    vae_key_prefix = ["vae."]
-    text_encoder_key_prefix = ["text_encoders."]
-
-    def __init__(self, unet_config):
-        super().__init__(unet_config)
-        self.memory_usage_factor = self.memory_usage_factor * (2.0 * 2.0) * 2.36
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.Flux2(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        return None # TODO
-        pref = self.text_encoder_key_prefix[0]
-        t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.flux.FluxTokenizer, comfy.text_encoders.flux.flux_clip(**t5_detect))
-
-class GenmoMochi(supported_models_base.BASE):
-    unet_config = {
-        "image_model": "mochi_preview",
-    }
-
-    sampling_settings = {
-        "multiplier": 1.0,
-        "shift": 6.0,
-    }
-
-    unet_extra_config = {}
-    latent_format = latent_formats.Mochi
-
-    memory_usage_factor = 2.0 #TODO
-
-    supported_inference_dtypes = [torch.bfloat16, torch.float32]
-
-    vae_key_prefix = ["vae."]
-    text_encoder_key_prefix = ["text_encoders."]
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.GenmoMochi(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        pref = self.text_encoder_key_prefix[0]
-        t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.genmo.MochiT5Tokenizer, comfy.text_encoders.genmo.mochi_te(**t5_detect))
-
-class LTXV(supported_models_base.BASE):
-    unet_config = {
-        "image_model": "ltxv",
-    }
-
-    sampling_settings = {
-        "shift": 2.37,
-    }
-
-    unet_extra_config = {}
-    latent_format = latent_formats.LTXV
-
-    memory_usage_factor = 5.5 # TODO: img2vid is about 2x vs txt2vid
-
-    supported_inference_dtypes = [torch.bfloat16, torch.float32]
-
-    vae_key_prefix = ["vae."]
-    text_encoder_key_prefix = ["text_encoders."]
-
-    def __init__(self, unet_config):
-        super().__init__(unet_config)
-        self.memory_usage_factor = (unet_config.get("cross_attention_dim", 2048) / 2048) * 5.5
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.LTXV(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        pref = self.text_encoder_key_prefix[0]
-        t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.lt.LTXVT5Tokenizer, comfy.text_encoders.lt.ltxv_te(**t5_detect))
-
-class HunyuanVideo(supported_models_base.BASE):
-    unet_config = {
-        "image_model": "hunyuan_video",
-    }
-
-    sampling_settings = {
-        "shift": 7.0,
-    }
-
-    unet_extra_config = {}
-    latent_format = latent_formats.HunyuanVideo
-
-    memory_usage_factor = 1.8 #TODO
-
-    supported_inference_dtypes = [torch.bfloat16, torch.float32]
-
-    vae_key_prefix = ["vae."]
-    text_encoder_key_prefix = ["text_encoders."]
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.HunyuanVideo(self, device=device)
-        return out
-
-    def process_unet_state_dict(self, state_dict):
-        out_sd = {}
-        for k in list(state_dict.keys()):
-            key_out = k
-            key_out = key_out.replace("txt_in.t_embedder.mlp.0.", "txt_in.t_embedder.in_layer.").replace("txt_in.t_embedder.mlp.2.", "txt_in.t_embedder.out_layer.")
-            key_out = key_out.replace("txt_in.c_embedder.linear_1.", "txt_in.c_embedder.in_layer.").replace("txt_in.c_embedder.linear_2.", "txt_in.c_embedder.out_layer.")
-            key_out = key_out.replace("_mod.linear.", "_mod.lin.").replace("_attn_qkv.", "_attn.qkv.")
-            key_out = key_out.replace("mlp.fc1.", "mlp.0.").replace("mlp.fc2.", "mlp.2.")
-            key_out = key_out.replace("_attn_q_norm.weight", "_attn.norm.query_norm.scale").replace("_attn_k_norm.weight", "_attn.norm.key_norm.scale")
-            key_out = key_out.replace(".q_norm.weight", ".norm.query_norm.scale").replace(".k_norm.weight", ".norm.key_norm.scale")
-            key_out = key_out.replace("_attn_proj.", "_attn.proj.")
-            key_out = key_out.replace(".modulation.linear.", ".modulation.lin.")
-            key_out = key_out.replace("_in.mlp.2.", "_in.out_layer.").replace("_in.mlp.0.", "_in.in_layer.")
-            out_sd[key_out] = state_dict[k]
-        return out_sd
-
-    def process_unet_state_dict_for_saving(self, state_dict):
-        replace_prefix = {"": "model.model."}
-        return utils.state_dict_prefix_replace(state_dict, replace_prefix)
-
-    def clip_target(self, state_dict={}):
-        pref = self.text_encoder_key_prefix[0]
-        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}llama.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideoTokenizer, comfy.text_encoders.hunyuan_video.hunyuan_video_clip(**hunyuan_detect))
-
-class HunyuanVideoI2V(HunyuanVideo):
-    unet_config = {
-        "image_model": "hunyuan_video",
-        "in_channels": 33,
-    }
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.HunyuanVideoI2V(self, device=device)
-        return out
-
-class HunyuanVideoSkyreelsI2V(HunyuanVideo):
-    unet_config = {
-        "image_model": "hunyuan_video",
-        "in_channels": 32,
-    }
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.HunyuanVideoSkyreelsI2V(self, device=device)
-        return out
-
-class CosmosT2V(supported_models_base.BASE):
-    unet_config = {
-        "image_model": "cosmos",
-        "in_channels": 16,
-    }
-
-    sampling_settings = {
-        "sigma_data": 0.5,
-        "sigma_max": 80.0,
-        "sigma_min": 0.002,
-    }
-
-    unet_extra_config = {}
-    latent_format = latent_formats.Cosmos1CV8x8x8
-
-    memory_usage_factor = 1.6 #TODO
-
-    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32] #TODO
-
-    vae_key_prefix = ["vae."]
-    text_encoder_key_prefix = ["text_encoders."]
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.CosmosVideo(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        pref = self.text_encoder_key_prefix[0]
-        t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.cosmos.CosmosT5Tokenizer, comfy.text_encoders.cosmos.te(**t5_detect))
-
-class CosmosI2V(CosmosT2V):
-    unet_config = {
-        "image_model": "cosmos",
-        "in_channels": 17,
-    }
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.CosmosVideo(self, image_to_video=True, device=device)
-        return out
-
-class CosmosT2IPredict2(supported_models_base.BASE):
-    unet_config = {
-        "image_model": "cosmos_predict2",
-        "in_channels": 16,
-    }
-
-    sampling_settings = {
-        "sigma_data": 1.0,
-        "sigma_max": 80.0,
-        "sigma_min": 0.002,
-    }
-
-    unet_extra_config = {}
-    latent_format = latent_formats.Wan21
-
-    memory_usage_factor = 1.0
-
-    supported_inference_dtypes = [torch.bfloat16, torch.float32]
-
-    def __init__(self, unet_config):
-        super().__init__(unet_config)
-        self.memory_usage_factor = (unet_config.get("model_channels", 2048) / 2048) * 0.95
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.CosmosPredict2(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        pref = self.text_encoder_key_prefix[0]
-        t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.cosmos.CosmosT5Tokenizer, comfy.text_encoders.cosmos.te(**t5_detect))
-
-class CosmosI2VPredict2(CosmosT2IPredict2):
-    unet_config = {
-        "image_model": "cosmos_predict2",
-        "in_channels": 17,
-    }
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.CosmosPredict2(self, image_to_video=True, device=device)
-        return out
-
-class Lumina2(supported_models_base.BASE):
-    unet_config = {
-        "image_model": "lumina2",
-    }
-
-    sampling_settings = {
-        "multiplier": 1.0,
-        "shift": 6.0,
-    }
-
-    memory_usage_factor = 1.4
-
-    unet_extra_config = {}
-    latent_format = latent_formats.Flux
-
-    supported_inference_dtypes = [torch.bfloat16, torch.float32]
-
-    vae_key_prefix = ["vae."]
-    text_encoder_key_prefix = ["text_encoders."]
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.Lumina2(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        pref = self.text_encoder_key_prefix[0]
-        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}gemma2_2b.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.lumina2.LuminaTokenizer, comfy.text_encoders.lumina2.te(**hunyuan_detect))
-
-class ZImage(Lumina2):
-    unet_config = {
-        "image_model": "lumina2",
-        "dim": 3840,
-    }
-
-    sampling_settings = {
-        "multiplier": 1.0,
-        "shift": 3.0,
-    }
-
-    memory_usage_factor = 2.0
-
-    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
-
-    def clip_target(self, state_dict={}):
-        pref = self.text_encoder_key_prefix[0]
-        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen3_4b.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.z_image.ZImageTokenizer, comfy.text_encoders.z_image.te(**hunyuan_detect))
-
-class WAN21_T2V(supported_models_base.BASE):
-    unet_config = {
-        "image_model": "wan2.1",
-        "model_type": "t2v",
-    }
-
-    sampling_settings = {
-        "shift": 8.0,
-    }
-
-    unet_extra_config = {}
-    latent_format = latent_formats.Wan21
-
-    memory_usage_factor = 0.9
-
-    supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]
-
-    vae_key_prefix = ["vae."]
-    text_encoder_key_prefix = ["text_encoders."]
-
-    def __init__(self, unet_config):
-        super().__init__(unet_config)
-        self.memory_usage_factor = self.unet_config.get("dim", 2000) / 2222
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.WAN21(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        pref = self.text_encoder_key_prefix[0]
-        t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}umt5xxl.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.wan.WanT5Tokenizer, comfy.text_encoders.wan.te(**t5_detect))
-
-class WAN21_I2V(WAN21_T2V):
-    unet_config = {
-        "image_model": "wan2.1",
-        "model_type": "i2v",
-        "in_dim": 36,
-    }
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.WAN21(self, image_to_video=True, device=device)
-        return out
-
-class WAN21_FunControl2V(WAN21_T2V):
-    unet_config = {
-        "image_model": "wan2.1",
-        "model_type": "i2v",
-        "in_dim": 48,
-    }
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.WAN21(self, image_to_video=False, device=device)
-        return out
-
-class WAN21_Camera(WAN21_T2V):
-    unet_config = {
-        "image_model": "wan2.1",
-        "model_type": "camera",
-        "in_dim": 32,
-    }
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.WAN21_Camera(self, image_to_video=False, device=device)
-        return out
-
-class WAN22_Camera(WAN21_T2V):
-    unet_config = {
-        "image_model": "wan2.1",
-        "model_type": "camera_2.2",
-        "in_dim": 36,
-    }
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.WAN21_Camera(self, image_to_video=False, device=device)
-        return out
-
-class WAN21_Vace(WAN21_T2V):
-    unet_config = {
-        "image_model": "wan2.1",
-        "model_type": "vace",
-    }
-
-    def __init__(self, unet_config):
-        super().__init__(unet_config)
-        self.memory_usage_factor = 1.2 * self.memory_usage_factor
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.WAN21_Vace(self, image_to_video=False, device=device)
-        return out
-
-class WAN21_HuMo(WAN21_T2V):
-    unet_config = {
-        "image_model": "wan2.1",
-        "model_type": "humo",
-    }
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.WAN21_HuMo(self, image_to_video=False, device=device)
-        return out
-
-class WAN22_S2V(WAN21_T2V):
-    unet_config = {
-        "image_model": "wan2.1",
-        "model_type": "s2v",
-    }
-
-    def __init__(self, unet_config):
-        super().__init__(unet_config)
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.WAN22_S2V(self, device=device)
-        return out
-
-class WAN22_Animate(WAN21_T2V):
-    unet_config = {
-        "image_model": "wan2.1",
-        "model_type": "animate",
-    }
-
-    def __init__(self, unet_config):
-        super().__init__(unet_config)
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.WAN22_Animate(self, device=device)
-        return out
-
-class WAN22_T2V(WAN21_T2V):
-    unet_config = {
-        "image_model": "wan2.1",
-        "model_type": "t2v",
-        "out_dim": 48,
-    }
-
-    latent_format = latent_formats.Wan22
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.WAN22(self, image_to_video=True, device=device)
-        return out
-
-class Hunyuan3Dv2(supported_models_base.BASE):
-    unet_config = {
-        "image_model": "hunyuan3d2",
-    }
-
-    unet_extra_config = {}
-
-    sampling_settings = {
-        "multiplier": 1.0,
-        "shift": 1.0,
-    }
-
-    memory_usage_factor = 3.5
-
-    clip_vision_prefix = "conditioner.main_image_encoder.model."
-    vae_key_prefix = ["vae."]
-
-    latent_format = latent_formats.Hunyuan3Dv2
-
-    def process_unet_state_dict_for_saving(self, state_dict):
-        replace_prefix = {"": "model."}
-        return utils.state_dict_prefix_replace(state_dict, replace_prefix)
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.Hunyuan3Dv2(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        return None
-
-class Hunyuan3Dv2_1(Hunyuan3Dv2):
-    unet_config = {
-        "image_model": "hunyuan3d2_1",
-    }
-
-    latent_format = latent_formats.Hunyuan3Dv2_1
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.Hunyuan3Dv2_1(self, device = device)
-        return out
-
-class Hunyuan3Dv2mini(Hunyuan3Dv2):
-    unet_config = {
-        "image_model": "hunyuan3d2",
-        "depth": 8,
-    }
-
-    latent_format = latent_formats.Hunyuan3Dv2mini
-
-class HiDream(supported_models_base.BASE):
-    unet_config = {
-        "image_model": "hidream",
-    }
-
-    sampling_settings = {
-        "shift": 3.0,
-    }
-
-    sampling_settings = {
-    }
-
-    # memory_usage_factor = 1.2 # TODO
-
-    unet_extra_config = {}
-    latent_format = latent_formats.Flux
-
-    supported_inference_dtypes = [torch.bfloat16, torch.float32]
-
-    vae_key_prefix = ["vae."]
-    text_encoder_key_prefix = ["text_encoders."]
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.HiDream(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        return None #  TODO
-
-class Chroma(supported_models_base.BASE):
-    unet_config = {
-        "image_model": "chroma",
-    }
-
-    unet_extra_config = {
-    }
-
-    sampling_settings = {
-        "multiplier": 1.0,
-    }
-
-    latent_format = comfy.latent_formats.Flux
-
-    memory_usage_factor = 3.2
-
-    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
-
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.Chroma(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        pref = self.text_encoder_key_prefix[0]
-        t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.pixart_t5.PixArtTokenizer, comfy.text_encoders.pixart_t5.pixart_te(**t5_detect))
-
-class ChromaRadiance(Chroma):
-    unet_config = {
-        "image_model": "chroma_radiance",
-    }
-
-    latent_format = comfy.latent_formats.ChromaRadiance
-
-    # Pixel-space model, no spatial compression for model input.
-    memory_usage_factor = 0.044
-
-    def get_model(self, state_dict, prefix="", device=None):
-        return model_base.ChromaRadiance(self, device=device)
-
-class ACEStep(supported_models_base.BASE):
-    unet_config = {
-        "audio_model": "ace",
-    }
-
-    unet_extra_config = {
-    }
-
-    sampling_settings = {
-        "shift": 3.0,
-    }
-
-    latent_format = comfy.latent_formats.ACEAudio
-
-    memory_usage_factor = 0.5
-
-    supported_inference_dtypes = [torch.bfloat16, torch.float32]
-
-    vae_key_prefix = ["vae."]
-    text_encoder_key_prefix = ["text_encoders."]
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.ACEStep(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        return supported_models_base.ClipTarget(comfy.text_encoders.ace.AceT5Tokenizer, comfy.text_encoders.ace.AceT5Model)
-
-class Omnigen2(supported_models_base.BASE):
-    unet_config = {
-        "image_model": "omnigen2",
-    }
-
-    sampling_settings = {
-        "multiplier": 1.0,
-        "shift": 2.6,
-    }
-
-    memory_usage_factor = 1.95 #TODO
-
-    unet_extra_config = {}
-    latent_format = latent_formats.Flux
-
-    supported_inference_dtypes = [torch.bfloat16, torch.float32]
-
-    vae_key_prefix = ["vae."]
-    text_encoder_key_prefix = ["text_encoders."]
-
-    def __init__(self, unet_config):
-        super().__init__(unet_config)
-        if comfy.model_management.extended_fp16_support():
-            self.supported_inference_dtypes = [torch.float16] + self.supported_inference_dtypes
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.Omnigen2(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        pref = self.text_encoder_key_prefix[0]
-        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_3b.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.omnigen2.Omnigen2Tokenizer, comfy.text_encoders.omnigen2.te(**hunyuan_detect))
-
-class QwenImage(supported_models_base.BASE):
-    unet_config = {
-        "image_model": "qwen_image",
-    }
-
-    sampling_settings = {
-        "multiplier": 1.0,
-        "shift": 1.15,
-    }
-
-    memory_usage_factor = 1.8 #TODO
-
-    unet_extra_config = {}
-    latent_format = latent_formats.Wan21
-
-    supported_inference_dtypes = [torch.bfloat16, torch.float32]
-
-    vae_key_prefix = ["vae."]
-    text_encoder_key_prefix = ["text_encoders."]
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.QwenImage(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        pref = self.text_encoder_key_prefix[0]
-        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.qwen_image.QwenImageTokenizer, comfy.text_encoders.qwen_image.te(**hunyuan_detect))
-
-class HunyuanImage21(HunyuanVideo):
-    unet_config = {
-        "image_model": "hunyuan_video",
-        "vec_in_dim": None,
-    }
-
-    sampling_settings = {
-        "shift": 5.0,
-    }
-
-    latent_format = latent_formats.HunyuanImage21
-
-    memory_usage_factor = 8.7
-
-    supported_inference_dtypes = [torch.bfloat16, torch.float32]
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.HunyuanImage21(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        pref = self.text_encoder_key_prefix[0]
-        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImageTokenizer, comfy.text_encoders.hunyuan_image.te(**hunyuan_detect))
-
-class HunyuanImage21Refiner(HunyuanVideo):
-    unet_config = {
-        "image_model": "hunyuan_video",
-        "patch_size": [1, 1, 1],
-        "vec_in_dim": None,
-    }
-
-    sampling_settings = {
-        "shift": 4.0,
-    }
-
-    latent_format = latent_formats.HunyuanImage21Refiner
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.HunyuanImage21Refiner(self, device=device)
-        return out
-
-class HunyuanVideo15(HunyuanVideo):
-    unet_config = {
-        "image_model": "hunyuan_video",
-        "vision_in_dim": 1152,
-    }
-
-    sampling_settings = {
-        "shift": 7.0,
-    }
-    memory_usage_factor = 4.0 #TODO
-    supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]
-
-    latent_format = latent_formats.HunyuanVideo15
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.HunyuanVideo15(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        pref = self.text_encoder_key_prefix[0]
-        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer, comfy.text_encoders.hunyuan_image.te(**hunyuan_detect))
-
-
-class HunyuanVideo15_SR_Distilled(HunyuanVideo):
-    unet_config = {
-        "image_model": "hunyuan_video",
-        "vision_in_dim": 1152,
-        "in_channels": 98,
-    }
-
-    sampling_settings = {
-        "shift": 2.0,
-    }
-    memory_usage_factor = 4.0 #TODO
-    supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]
-
-    latent_format = latent_formats.HunyuanVideo15
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.HunyuanVideo15_SR_Distilled(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        pref = self.text_encoder_key_prefix[0]
-        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer, comfy.text_encoders.hunyuan_image.te(**hunyuan_detect))
-
-
-class Kandinsky5(supported_models_base.BASE):
-    unet_config = {
-        "image_model": "kandinsky5",
-    }
-
-    sampling_settings = {
-        "shift": 10.0,
-    }
-
-    unet_extra_config = {}
-    latent_format = latent_formats.HunyuanVideo
-
-    memory_usage_factor = 1.25 #TODO
-
-    supported_inference_dtypes = [torch.bfloat16, torch.float32]
-
-    vae_key_prefix = ["vae."]
-    text_encoder_key_prefix = ["text_encoders."]
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.Kandinsky5(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        pref = self.text_encoder_key_prefix[0]
-        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.kandinsky5.Kandinsky5Tokenizer, comfy.text_encoders.kandinsky5.te(**hunyuan_detect))
-
-
-class Kandinsky5Image(Kandinsky5):
-    unet_config = {
-        "image_model": "kandinsky5",
-        "model_dim": 2560,
-        "visual_embed_dim": 64,
-    }
-
-    sampling_settings = {
-        "shift": 3.0,
-    }
-
-    latent_format = latent_formats.Flux
-    memory_usage_factor = 1.25 #TODO
-
-    def get_model(self, state_dict, prefix="", device=None):
-        out = model_base.Kandinsky5Image(self, device=device)
-        return out
-
-    def clip_target(self, state_dict={}):
-        pref = self.text_encoder_key_prefix[0]
-        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
-        return supported_models_base.ClipTarget(comfy.text_encoders.kandinsky5.Kandinsky5TokenizerImage, comfy.text_encoders.kandinsky5.te(**hunyuan_detect))
-
-
-models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5]
-
-models += [SVD_img2vid]
+import torch
+from . import model_base
+from . import utils
+
+from . import sd1_clip
+from . import sdxl_clip
+import comfy.text_encoders.sd2_clip
+import comfy.text_encoders.sd3_clip
+import comfy.text_encoders.sa_t5
+import comfy.text_encoders.aura_t5
+import comfy.text_encoders.pixart_t5
+import comfy.text_encoders.hydit
+import comfy.text_encoders.flux
+import comfy.text_encoders.genmo
+import comfy.text_encoders.lt
+import comfy.text_encoders.hunyuan_video
+import comfy.text_encoders.cosmos
+import comfy.text_encoders.lumina2
+import comfy.text_encoders.wan
+import comfy.text_encoders.ace
+import comfy.text_encoders.omnigen2
+import comfy.text_encoders.qwen_image
+import comfy.text_encoders.hunyuan_image
+import comfy.text_encoders.kandinsky5
+import comfy.text_encoders.z_image
+
+from . import supported_models_base
+from . import latent_formats
+
+from . import diffusers_convert
+
+class SD15(supported_models_base.BASE):
+    unet_config = {
+        "context_dim": 768,
+        "model_channels": 320,
+        "use_linear_in_transformer": False,
+        "adm_in_channels": None,
+        "use_temporal_attention": False,
+    }
+
+    unet_extra_config = {
+        "num_heads": 8,
+        "num_head_channels": -1,
+    }
+
+    latent_format = latent_formats.SD15
+    memory_usage_factor = 1.0
+
+    def process_clip_state_dict(self, state_dict):
+        k = list(state_dict.keys())
+        for x in k:
+            if x.startswith("cond_stage_model.transformer.") and not x.startswith("cond_stage_model.transformer.text_model."):
+                y = x.replace("cond_stage_model.transformer.", "cond_stage_model.transformer.text_model.")
+                state_dict[y] = state_dict.pop(x)
+
+        if 'cond_stage_model.transformer.text_model.embeddings.position_ids' in state_dict:
+            ids = state_dict['cond_stage_model.transformer.text_model.embeddings.position_ids']
+            if ids.dtype == torch.float32:
+                state_dict['cond_stage_model.transformer.text_model.embeddings.position_ids'] = ids.round()
+
+        replace_prefix = {}
+        replace_prefix["cond_stage_model."] = "clip_l."
+        state_dict = utils.state_dict_prefix_replace(state_dict, replace_prefix, filter_keys=True)
+        return state_dict
+
+    def process_clip_state_dict_for_saving(self, state_dict):
+        pop_keys = ["clip_l.transformer.text_projection.weight", "clip_l.logit_scale"]
+        for p in pop_keys:
+            if p in state_dict:
+                state_dict.pop(p)
+
+        replace_prefix = {"clip_l.": "cond_stage_model."}
+        return utils.state_dict_prefix_replace(state_dict, replace_prefix)
+
+    def clip_target(self, state_dict={}):
+        return supported_models_base.ClipTarget(sd1_clip.SD1Tokenizer, sd1_clip.SD1ClipModel)
+
+class SD20(supported_models_base.BASE):
+    unet_config = {
+        "context_dim": 1024,
+        "model_channels": 320,
+        "use_linear_in_transformer": True,
+        "adm_in_channels": None,
+        "use_temporal_attention": False,
+    }
+
+    unet_extra_config = {
+        "num_heads": -1,
+        "num_head_channels": 64,
+        "attn_precision": torch.float32,
+    }
+
+    latent_format = latent_formats.SD15
+    memory_usage_factor = 1.0
+
+    def model_type(self, state_dict, prefix=""):
+        if self.unet_config["in_channels"] == 4: #SD2.0 inpainting models are not v prediction
+            k = "{}output_blocks.11.1.transformer_blocks.0.norm1.bias".format(prefix)
+            out = state_dict.get(k, None)
+            if out is not None and torch.std(out, unbiased=False) > 0.09: # not sure how well this will actually work. I guess we will find out.
+                return model_base.ModelType.V_PREDICTION
+        return model_base.ModelType.EPS
+
+    def process_clip_state_dict(self, state_dict):
+        replace_prefix = {}
+        replace_prefix["conditioner.embedders.0.model."] = "clip_h." #SD2 in sgm format
+        replace_prefix["cond_stage_model.model."] = "clip_h."
+        state_dict = utils.state_dict_prefix_replace(state_dict, replace_prefix, filter_keys=True)
+        state_dict = utils.clip_text_transformers_convert(state_dict, "clip_h.", "clip_h.transformer.")
+        return state_dict
+
+    def process_clip_state_dict_for_saving(self, state_dict):
+        replace_prefix = {}
+        replace_prefix["clip_h"] = "cond_stage_model.model"
+        state_dict = utils.state_dict_prefix_replace(state_dict, replace_prefix)
+        state_dict = diffusers_convert.convert_text_enc_state_dict_v20(state_dict)
+        return state_dict
+
+    def clip_target(self, state_dict={}):
+        return supported_models_base.ClipTarget(comfy.text_encoders.sd2_clip.SD2Tokenizer, comfy.text_encoders.sd2_clip.SD2ClipModel)
+
+class SD21UnclipL(SD20):
+    unet_config = {
+        "context_dim": 1024,
+        "model_channels": 320,
+        "use_linear_in_transformer": True,
+        "adm_in_channels": 1536,
+        "use_temporal_attention": False,
+    }
+
+    clip_vision_prefix = "embedder.model.visual."
+    noise_aug_config = {"noise_schedule_config": {"timesteps": 1000, "beta_schedule": "squaredcos_cap_v2"}, "timestep_dim": 768}
+
+
+class SD21UnclipH(SD20):
+    unet_config = {
+        "context_dim": 1024,
+        "model_channels": 320,
+        "use_linear_in_transformer": True,
+        "adm_in_channels": 2048,
+        "use_temporal_attention": False,
+    }
+
+    clip_vision_prefix = "embedder.model.visual."
+    noise_aug_config = {"noise_schedule_config": {"timesteps": 1000, "beta_schedule": "squaredcos_cap_v2"}, "timestep_dim": 1024}
+
+class SDXLRefiner(supported_models_base.BASE):
+    unet_config = {
+        "model_channels": 384,
+        "use_linear_in_transformer": True,
+        "context_dim": 1280,
+        "adm_in_channels": 2560,
+        "transformer_depth": [0, 0, 4, 4, 4, 4, 0, 0],
+        "use_temporal_attention": False,
+    }
+
+    latent_format = latent_formats.SDXL
+    memory_usage_factor = 1.0
+
+    def get_model(self, state_dict, prefix="", device=None):
+        return model_base.SDXLRefiner(self, device=device)
+
+    def process_clip_state_dict(self, state_dict):
+        keys_to_replace = {}
+        replace_prefix = {}
+        replace_prefix["conditioner.embedders.0.model."] = "clip_g."
+        state_dict = utils.state_dict_prefix_replace(state_dict, replace_prefix, filter_keys=True)
+
+        state_dict = utils.clip_text_transformers_convert(state_dict, "clip_g.", "clip_g.transformer.")
+        state_dict = utils.state_dict_key_replace(state_dict, keys_to_replace)
+        return state_dict
+
+    def process_clip_state_dict_for_saving(self, state_dict):
+        replace_prefix = {}
+        state_dict_g = diffusers_convert.convert_text_enc_state_dict_v20(state_dict, "clip_g")
+        if "clip_g.transformer.text_model.embeddings.position_ids" in state_dict_g:
+            state_dict_g.pop("clip_g.transformer.text_model.embeddings.position_ids")
+        replace_prefix["clip_g"] = "conditioner.embedders.0.model"
+        state_dict_g = utils.state_dict_prefix_replace(state_dict_g, replace_prefix)
+        return state_dict_g
+
+    def clip_target(self, state_dict={}):
+        return supported_models_base.ClipTarget(sdxl_clip.SDXLTokenizer, sdxl_clip.SDXLRefinerClipModel)
+
+class SDXL(supported_models_base.BASE):
+    unet_config = {
+        "model_channels": 320,
+        "use_linear_in_transformer": True,
+        "transformer_depth": [0, 0, 2, 2, 10, 10],
+        "context_dim": 2048,
+        "adm_in_channels": 2816,
+        "use_temporal_attention": False,
+    }
+
+    latent_format = latent_formats.SDXL
+
+    memory_usage_factor = 0.8
+
+    def model_type(self, state_dict, prefix=""):
+        if 'edm_mean' in state_dict and 'edm_std' in state_dict: #Playground V2.5
+            self.latent_format = latent_formats.SDXL_Playground_2_5()
+            self.sampling_settings["sigma_data"] = 0.5
+            self.sampling_settings["sigma_max"] = 80.0
+            self.sampling_settings["sigma_min"] = 0.002
+            return model_base.ModelType.EDM
+        elif "edm_vpred.sigma_max" in state_dict:
+            self.sampling_settings["sigma_max"] = float(state_dict["edm_vpred.sigma_max"].item())
+            if "edm_vpred.sigma_min" in state_dict:
+                self.sampling_settings["sigma_min"] = float(state_dict["edm_vpred.sigma_min"].item())
+            return model_base.ModelType.V_PREDICTION_EDM
+        elif "v_pred" in state_dict:
+            if "ztsnr" in state_dict: #Some zsnr anime checkpoints
+                self.sampling_settings["zsnr"] = True
+            return model_base.ModelType.V_PREDICTION
+        else:
+            return model_base.ModelType.EPS
+
+    def get_model(self, state_dict, prefix="", device=None):
+        out = model_base.SDXL(self, model_type=self.model_type(state_dict, prefix), device=device)
+        if self.inpaint_model():
+            out.set_inpaint()
+        return out
+
+    def process_clip_state_dict(self, state_dict):
+        keys_to_replace = {}
+        replace_prefix = {}
+
+        replace_prefix["conditioner.embedders.0.transformer.text_model"] = "clip_l.transformer.text_model"
+        replace_prefix["conditioner.embedders.1.model."] = "clip_g."
+        state_dict = utils.state_dict_prefix_replace(state_dict, replace_prefix, filter_keys=True)
+
+        state_dict = utils.state_dict_key_replace(state_dict, keys_to_replace)
+        state_dict = utils.clip_text_transformers_convert(state_dict, "clip_g.", "clip_g.transformer.")
+        return state_dict
+
+    def process_clip_state_dict_for_saving(self, state_dict):
+        replace_prefix = {}
+        state_dict_g = diffusers_convert.convert_text_enc_state_dict_v20(state_dict, "clip_g")
+        for k in state_dict:
+            if k.startswith("clip_l"):
+                state_dict_g[k] = state_dict[k]
+
+        state_dict_g["clip_l.transformer.text_model.embeddings.position_ids"] = torch.arange(77).expand((1, -1))
+        pop_keys = ["clip_l.transformer.text_projection.weight", "clip_l.logit_scale"]
+        for p in pop_keys:
+            if p in state_dict_g:
+                state_dict_g.pop(p)
+
+        replace_prefix["clip_g"] = "conditioner.embedders.1.model"
+        replace_prefix["clip_l"] = "conditioner.embedders.0"
+        state_dict_g = utils.state_dict_prefix_replace(state_dict_g, replace_prefix)
+        return state_dict_g
+
+    def clip_target(self, state_dict={}):
+        return supported_models_base.ClipTarget(sdxl_clip.SDXLTokenizer, sdxl_clip.SDXLClipModel)
+
+class SSD1B(SDXL):
+    unet_config = {
+        "model_channels": 320,
+        "use_linear_in_transformer": True,
+        "transformer_depth": [0, 0, 2, 2, 4, 4],
+        "context_dim": 2048,
+        "adm_in_channels": 2816,
+        "use_temporal_attention": False,
+    }
+
+class Segmind_Vega(SDXL):
+    unet_config = {
+        "model_channels": 320,
+        "use_linear_in_transformer": True,
+        "transformer_depth": [0, 0, 1, 1, 2, 2],
+        "context_dim": 2048,
+        "adm_in_channels": 2816,
+        "use_temporal_attention": False,
+    }
+
+class KOALA_700M(SDXL):
+    unet_config = {
+        "model_channels": 320,
+        "use_linear_in_transformer": True,
+        "transformer_depth": [0, 2, 5],
+        "context_dim": 2048,
+        "adm_in_channels": 2816,
+        "use_temporal_attention": False,
+    }
+
+class KOALA_1B(SDXL):
+    unet_config = {
+        "model_channels": 320,
+        "use_linear_in_transformer": True,
+        "transformer_depth": [0, 2, 6],
+        "context_dim": 2048,
+        "adm_in_channels": 2816,
+        "use_temporal_attention": False,
+    }
+
+class SVD_img2vid(supported_models_base.BASE):
+    unet_config = {
+        "model_channels": 320,
+        "in_channels": 8,
+        "use_linear_in_transformer": True,
+        "transformer_depth": [1, 1, 1, 1, 1, 1, 0, 0],
+        "context_dim": 1024,
+        "adm_in_channels": 768,
+        "use_temporal_attention": True,
+        "use_temporal_resblock": True
+    }
+
+    unet_extra_config = {
+        "num_heads": -1,
+        "num_head_channels": 64,
+        "attn_precision": torch.float32,
+    }
+
+    clip_vision_prefix = "conditioner.embedders.0.open_clip.model.visual."
+
+    latent_format = latent_formats.SD15
+
+    sampling_settings = {"sigma_max": 700.0, "sigma_min": 0.002}
+
+    def get_model(self, state_dict, prefix="", device=None):
+        out = model_base.SVD_img2vid(self, device=device)
+        return out
+
+    def clip_target(self, state_dict={}):
+        return None
+
+class SV3D_u(SVD_img2vid):
+    unet_config = {
+        "model_channels": 320,
+        "in_channels": 8,
+        "use_linear_in_transformer": True,
+        "transformer_depth": [1, 1, 1, 1, 1, 1, 0, 0],
+        "context_dim": 1024,
+        "adm_in_channels": 256,
+        "use_temporal_attention": True,
+        "use_temporal_resblock": True
+    }
+
+    vae_key_prefix = ["conditioner.embedders.1.encoder."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        out = model_base.SV3D_u(self, device=device)
+        return out
+
+class SV3D_p(SV3D_u):
+    unet_config = {
+        "model_channels": 320,
+        "in_channels": 8,
+        "use_linear_in_transformer": True,
+        "transformer_depth": [1, 1, 1, 1, 1, 1, 0, 0],
+        "context_dim": 1024,
+        "adm_in_channels": 1280,
+        "use_temporal_attention": True,
+        "use_temporal_resblock": True
+    }
+
+
+    def get_model(self, state_dict, prefix="", device=None):
+        out = model_base.SV3D_p(self, device=device)
+        return out
+
+class Stable_Zero123(supported_models_base.BASE):
+    unet_config = {
+        "context_dim": 768,
+        "model_channels": 320,
+        "use_linear_in_transformer": False,
+        "adm_in_channels": None,
+        "use_temporal_attention": False,
+        "in_channels": 8,
+    }
+
+    unet_extra_config = {
+        "num_heads": 8,
+        "num_head_channels": -1,
+    }
+
+    required_keys = {
+        "cc_projection.weight": None,
+        "cc_projection.bias": None,
+    }
+
+    clip_vision_prefix = "cond_stage_model.model.visual."
+
+    latent_format = latent_formats.SD15
+
+    def get_model(self, state_dict, prefix="", device=None):
+        out = model_base.Stable_Zero123(self, device=device, cc_projection_weight=state_dict["cc_projection.weight"], cc_projection_bias=state_dict["cc_projection.bias"])
+        return out
+
+    def clip_target(self, state_dict={}):
+        return None
+
+class SD_X4Upscaler(SD20):
+    unet_config = {
+        "context_dim": 1024,
+        "model_channels": 256,
+        'in_channels': 7,
+        "use_linear_in_transformer": True,
+        "adm_in_channels": None,
+        "use_temporal_attention": False,
+    }
+
+    unet_extra_config = {
+        "disable_self_attentions": [True, True, True, False],
+        "num_classes": 1000,
+        "num_heads": 8,
+        "num_head_channels": -1,
+    }
+
+    latent_format = latent_formats.SD_X4
+
+    sampling_settings = {
+        "linear_start": 0.0001,
+        "linear_end": 0.02,
+    }
+
+    def get_model(self, state_dict, prefix="", device=None):
+        out = model_base.SD_X4Upscaler(self, device=device)
+        return out
+
+class Stable_Cascade_C(supported_models_base.BASE):
+    unet_config = {
+        "stable_cascade_stage": 'c',
+    }
+
+    unet_extra_config = {}
+
+    latent_format = latent_formats.SC_Prior
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+    sampling_settings = {
+        "shift": 2.0,
+    }
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoder."]
+    clip_vision_prefix = "clip_l_vision."
+
+    def process_unet_state_dict(self, state_dict):
+        key_list = list(state_dict.keys())
+        for y in ["weight", "bias"]:
+            suffix = "in_proj_{}".format(y)
+            keys = filter(lambda a: a.endswith(suffix), key_list)
+            for k_from in keys:
+                weights = state_dict.pop(k_from)
+                prefix = k_from[:-(len(suffix) + 1)]
+                shape_from = weights.shape[0] // 3
+                for x in range(3):
+                    p = ["to_q", "to_k", "to_v"]
+                    k_to = "{}.{}.{}".format(prefix, p[x], y)
+                    state_dict[k_to] = weights[shape_from*x:shape_from*(x + 1)]
+        return state_dict
+
+    def process_clip_state_dict(self, state_dict):
+        state_dict = utils.state_dict_prefix_replace(state_dict, {k: "" for k in self.text_encoder_key_prefix}, filter_keys=True)
+        if "clip_g.text_projection" in state_dict:
+            state_dict["clip_g.transformer.text_projection.weight"] = state_dict.pop("clip_g.text_projection").transpose(0, 1)
+        return state_dict
+
+    def get_model(self, state_dict, prefix="", device=None):
+        out = model_base.StableCascade_C(self, device=device)
+        return out
+
+    def clip_target(self, state_dict={}):
+        return supported_models_base.ClipTarget(sdxl_clip.StableCascadeTokenizer, sdxl_clip.StableCascadeClipModel)
+
+class Stable_Cascade_B(Stable_Cascade_C):
+    unet_config = {
+        "stable_cascade_stage": 'b',
+    }
+
+    unet_extra_config = {}
+
+    latent_format = latent_formats.SC_B
+    supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]
+
+    sampling_settings = {
+        "shift": 1.0,
+    }
+
+    clip_vision_prefix = None
+
+    def get_model(self, state_dict, prefix="", device=None):
+        out = model_base.StableCascade_B(self, device=device)
+        return out
+
+class SD15_instructpix2pix(SD15):
+    unet_config = {
+        "context_dim": 768,
+        "model_channels": 320,
+        "use_linear_in_transformer": False,
+        "adm_in_channels": None,
+        "use_temporal_attention": False,
+        "in_channels": 8,
+    }
+
+    def get_model(self, state_dict, prefix="", device=None):
+        return model_base.SD15_instructpix2pix(self, device=device)
+
+class SDXL_instructpix2pix(SDXL):
+    unet_config = {
+        "model_channels": 320,
+        "use_linear_in_transformer": True,
+        "transformer_depth": [0, 0, 2, 2, 10, 10],
+        "context_dim": 2048,
+        "adm_in_channels": 2816,
+        "use_temporal_attention": False,
+        "in_channels": 8,
+    }
+
+    def get_model(self, state_dict, prefix="", device=None):
+        return model_base.SDXL_instructpix2pix(self, model_type=self.model_type(state_dict, prefix), device=device)
+
+class LotusD(SD20):
+    unet_config = {
+        "model_channels": 320,
+        "use_linear_in_transformer": True,
+        "use_temporal_attention": False,
+        "adm_in_channels": 4,
+        "in_channels": 4,
+    }
+
+    unet_extra_config = {
+        "num_classes": 'sequential'
+    }
+
+    def get_model(self, state_dict, prefix="", device=None):
+        return model_base.Lotus(self, device=device)
+
+class SD3(supported_models_base.BASE):
+    unet_config = {
+        "in_channels": 16,
+        "pos_embed_scaling_factor": None,
+    }
+
+    sampling_settings = {
+        "shift": 3.0,
+    }
+
+    unet_extra_config = {}
+    latent_format = latent_formats.SD3
+
+    memory_usage_factor = 1.6
+
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        out = model_base.SD3(self, device=device)
+        return out
+
+    def clip_target(self, state_dict={}):
+        clip_l = False
+        clip_g = False
+        t5 = False
+        pref = self.text_encoder_key_prefix[0]
+        if "{}clip_l.transformer.text_model.final_layer_norm.weight".format(pref) in state_dict:
+            clip_l = True
+        if "{}clip_g.transformer.text_model.final_layer_norm.weight".format(pref) in state_dict:
+            clip_g = True
+        t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref))
+        if "dtype_t5" in t5_detect:
+            t5 = True
+
+        return supported_models_base.ClipTarget(comfy.text_encoders.sd3_clip.SD3Tokenizer, comfy.text_encoders.sd3_clip.sd3_clip(clip_l=clip_l, clip_g=clip_g, t5=t5, **t5_detect))
+
+class StableAudio(supported_models_base.BASE):
+    """Config entry for the `dit1.0` audio model; builds `model_base.StableAudio1`."""
+
+    unet_config = {"audio_model": "dit1.0"}
+
+    sampling_settings = {
+        "sigma_max": 500.0,
+        "sigma_min": 0.03,
+    }
+
+    unet_extra_config = {}
+    latent_format = latent_formats.StableAudio1
+
+    text_encoder_key_prefix = ["text_encoders."]
+    vae_key_prefix = ["pretransform.model."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Create the StableAudio1 model together with its two `seconds_*` conditioner weight sets.
+
+        Each weight set is produced by `utils.state_dict_prefix_replace` with the
+        matching `conditioner.conditioners.seconds_*.` prefix mapped to "" and
+        `filter_keys=True`. `prefix` is not used here.
+        """
+        start_weights = utils.state_dict_prefix_replace(state_dict, {"conditioner.conditioners.seconds_start.": ""}, filter_keys=True)
+        total_weights = utils.state_dict_prefix_replace(state_dict, {"conditioner.conditioners.seconds_total.": ""}, filter_keys=True)
+        return model_base.StableAudio1(
+            self,
+            seconds_start_embedder_weights=start_weights,
+            seconds_total_embedder_weights=total_weights,
+            device=device,
+        )
+
+    def process_unet_state_dict(self, state_dict):
+        """Remove the norm `.beta` entries from `state_dict` in place and return it."""
+        # These weights are all zero
+        zero_beta_suffixes = (".cross_attend_norm.beta", ".ff_norm.beta", ".pre_norm.beta")
+        doomed = [name for name in state_dict.keys() if name.endswith(zero_beta_suffixes)]
+        for name in doomed:
+            state_dict.pop(name)
+        return state_dict
+
+    def process_unet_state_dict_for_saving(self, state_dict):
+        """Return `state_dict` with every key prefixed by `model.model.` for saving."""
+        return utils.state_dict_prefix_replace(state_dict, {"": "model.model."})
+
+    def clip_target(self, state_dict={}):
+        """Return the ClipTarget pairing `SAT5Tokenizer` with `SAT5Model`; `state_dict` is not inspected."""
+        sa_t5 = comfy.text_encoders.sa_t5
+        return supported_models_base.ClipTarget(sa_t5.SAT5Tokenizer, sa_t5.SAT5Model)
+
+class AuraFlow(supported_models_base.BASE):
+    """Config entry keyed on `cond_seq_dim` 2048; builds `model_base.AuraFlow`."""
+
+    unet_config = {"cond_seq_dim": 2048}
+
+    sampling_settings = {"multiplier": 1.0, "shift": 1.73}
+
+    unet_extra_config = {}
+    latent_format = latent_formats.SDXL
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.AuraFlow` for this config; `state_dict` and `prefix` are unused."""
+        return model_base.AuraFlow(self, device=device)
+
+    def clip_target(self, state_dict={}):
+        """Return the ClipTarget pairing `AuraT5Tokenizer` with `AuraT5Model`."""
+        aura_t5 = comfy.text_encoders.aura_t5
+        return supported_models_base.ClipTarget(aura_t5.AuraT5Tokenizer, aura_t5.AuraT5Model)
+
+class PixArtAlpha(supported_models_base.BASE):
+    """Config entry for `image_model` "pixart_alpha"; builds `model_base.PixArt` in eval mode."""
+
+    unet_config = {"image_model": "pixart_alpha"}
+
+    sampling_settings = {
+        "beta_schedule": "sqrt_linear",
+        "linear_start": 0.0001,
+        "linear_end": 0.02,
+        "timesteps": 1000,
+    }
+
+    unet_extra_config = {}
+    latent_format = latent_formats.SD15
+
+    memory_usage_factor = 0.5
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.PixArt` and return the result of calling `.eval()` on it."""
+        pixart = model_base.PixArt(self, device=device)
+        return pixart.eval()
+
+    def clip_target(self, state_dict={}):
+        """Return the ClipTarget pairing `PixArtTokenizer` with `PixArtT5XXL`."""
+        pixart_t5 = comfy.text_encoders.pixart_t5
+        return supported_models_base.ClipTarget(pixart_t5.PixArtTokenizer, pixart_t5.PixArtT5XXL)
+
+class PixArtSigma(PixArtAlpha):
+    """PixArtAlpha variant for `image_model` "pixart_sigma" that uses the SDXL latent format."""
+
+    unet_config = {"image_model": "pixart_sigma"}
+    latent_format = latent_formats.SDXL
+
+class HunyuanDiT(supported_models_base.BASE):
+    """Config entry for `image_model` "hydit"; builds `model_base.HunyuanDiT`."""
+
+    unet_config = {"image_model": "hydit"}
+
+    # Extra UNet option: attention precision pinned to float32.
+    unet_extra_config = {"attn_precision": torch.float32}
+
+    sampling_settings = {"linear_start": 0.00085, "linear_end": 0.018}
+
+    latent_format = latent_formats.SDXL
+
+    memory_usage_factor = 1.3
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.HunyuanDiT` for this config; `state_dict` and `prefix` are unused."""
+        return model_base.HunyuanDiT(self, device=device)
+
+    def clip_target(self, state_dict={}):
+        """Return the ClipTarget pairing `HyditTokenizer` with `HyditModel`."""
+        hydit = comfy.text_encoders.hydit
+        return supported_models_base.ClipTarget(hydit.HyditTokenizer, hydit.HyditModel)
+
+class HunyuanDiT1(HunyuanDiT):
+    """HunyuanDiT variant for `image_model` "hydit1": no extra UNet config and `linear_end` 0.03."""
+
+    unet_config = {"image_model": "hydit1"}
+
+    unet_extra_config = {}
+
+    sampling_settings = {"linear_start": 0.00085, "linear_end": 0.03}
+
+class Flux(supported_models_base.BASE):
+    """Config entry for `image_model` "flux" with `guidance_embed` enabled; builds `model_base.Flux`."""
+
+    unet_config = {"image_model": "flux", "guidance_embed": True}
+
+    sampling_settings = {}
+
+    unet_extra_config = {}
+    latent_format = latent_formats.Flux
+
+    memory_usage_factor = 3.1 # TODO: debug why flux mem usage is so weird on windows.
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.Flux` for this config; `state_dict` and `prefix` are unused."""
+        return model_base.Flux(self, device=device)
+
+    def clip_target(self, state_dict={}):
+        """Return the Flux ClipTarget, configuring the text encoder from T5 settings detected in `state_dict`."""
+        t5_prefix = f"{self.text_encoder_key_prefix[0]}t5xxl.transformer."
+        t5_kwargs = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, t5_prefix)
+        text_encoder = comfy.text_encoders.flux.flux_clip(**t5_kwargs)
+        return supported_models_base.ClipTarget(comfy.text_encoders.flux.FluxTokenizer, text_encoder)
+
+class FluxInpaint(Flux):
+    """Flux variant whose config sets `in_channels` to 96; float16 is not in its supported dtypes."""
+
+    unet_config = {"image_model": "flux", "guidance_embed": True, "in_channels": 96}
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+class FluxSchnell(Flux):
+    """Flux variant with `guidance_embed` disabled; its model is built with `ModelType.FLOW`."""
+
+    unet_config = {"image_model": "flux", "guidance_embed": False}
+
+    sampling_settings = {"multiplier": 1.0, "shift": 1.0}
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.Flux` with `model_type=ModelType.FLOW`; `state_dict` and `prefix` are unused."""
+        flow_type = model_base.ModelType.FLOW
+        return model_base.Flux(self, model_type=flow_type, device=device)
+
+class Flux2(Flux):
+    """Flux subclass for `image_model` "flux2"; builds `model_base.Flux2`.
+
+    The instance-level `memory_usage_factor` is scaled up in `__init__`, and no
+    text-encoder target is provided yet (`clip_target` returns None).
+    """
+
+    unet_config = {
+        "image_model": "flux2",
+    }
+
+    sampling_settings = {
+        "shift": 2.02,
+    }
+
+    unet_extra_config = {}
+    latent_format = latent_formats.Flux2
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def __init__(self, unet_config):
+        """Initialise via the parent class, then scale the memory usage factor for this model."""
+        super().__init__(unet_config)
+        # Multiplies whatever factor is visible on the instance after the parent
+        # initialiser by (2.0 * 2.0) * 2.36.
+        self.memory_usage_factor = self.memory_usage_factor * (2.0 * 2.0) * 2.36
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.Flux2` for this config; `state_dict` and `prefix` are unused."""
+        out = model_base.Flux2(self, device=device)
+        return out
+
+    def clip_target(self, state_dict={}):
+        """Return None: no tokenizer/text-encoder pair is defined for Flux2 here.
+
+        This overrides the inherited Flux implementation, so `state_dict` is not
+        inspected.
+        """
+        return None # TODO
+
+class GenmoMochi(supported_models_base.BASE):
+    """Config entry for `image_model` "mochi_preview"; builds `model_base.GenmoMochi`."""
+
+    unet_config = {"image_model": "mochi_preview"}
+
+    sampling_settings = {"multiplier": 1.0, "shift": 6.0}
+
+    unet_extra_config = {}
+    latent_format = latent_formats.Mochi
+
+    memory_usage_factor = 2.0 #TODO
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.GenmoMochi` for this config; `state_dict` and `prefix` are unused."""
+        return model_base.GenmoMochi(self, device=device)
+
+    def clip_target(self, state_dict={}):
+        """Return the Mochi ClipTarget, configuring the text encoder from T5 settings detected in `state_dict`."""
+        t5_prefix = f"{self.text_encoder_key_prefix[0]}t5xxl.transformer."
+        t5_kwargs = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, t5_prefix)
+        text_encoder = comfy.text_encoders.genmo.mochi_te(**t5_kwargs)
+        return supported_models_base.ClipTarget(comfy.text_encoders.genmo.MochiT5Tokenizer, text_encoder)
+
+class LTXV(supported_models_base.BASE):
+    """Config entry for `image_model` "ltxv"; builds `model_base.LTXV`."""
+
+    unet_config = {"image_model": "ltxv"}
+
+    sampling_settings = {"shift": 2.37}
+
+    unet_extra_config = {}
+    latent_format = latent_formats.LTXV
+
+    memory_usage_factor = 5.5 # TODO: img2vid is about 2x vs txt2vid
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def __init__(self, unet_config):
+        """Initialise via the base class, then derive the memory factor from `cross_attention_dim`."""
+        super().__init__(unet_config)
+        # 5.5 at a cross-attention width of 2048 (also the fallback), scaled linearly.
+        cross_dim = unet_config.get("cross_attention_dim", 2048)
+        self.memory_usage_factor = (cross_dim / 2048) * 5.5
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.LTXV` for this config; `state_dict` and `prefix` are unused."""
+        return model_base.LTXV(self, device=device)
+
+    def clip_target(self, state_dict={}):
+        """Return the LTXV ClipTarget, configuring the text encoder from T5 settings detected in `state_dict`."""
+        t5_prefix = f"{self.text_encoder_key_prefix[0]}t5xxl.transformer."
+        t5_kwargs = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, t5_prefix)
+        text_encoder = comfy.text_encoders.lt.ltxv_te(**t5_kwargs)
+        return supported_models_base.ClipTarget(comfy.text_encoders.lt.LTXVT5Tokenizer, text_encoder)
+
+class HunyuanVideo(supported_models_base.BASE):
+    """Config entry for `image_model` "hunyuan_video"; builds `model_base.HunyuanVideo`."""
+
+    unet_config = {"image_model": "hunyuan_video"}
+
+    sampling_settings = {"shift": 7.0}
+
+    unet_extra_config = {}
+    latent_format = latent_formats.HunyuanVideo
+
+    memory_usage_factor = 1.8 #TODO
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.HunyuanVideo` for this config; `state_dict` and `prefix` are unused."""
+        return model_base.HunyuanVideo(self, device=device)
+
+    def process_unet_state_dict(self, state_dict):
+        """Return a new dict with the keys of `state_dict` renamed by a fixed list of substring substitutions.
+
+        The substitutions run one after another on each key, so later entries see
+        the result of earlier ones; values are carried over unchanged.
+        """
+        # (old substring, new substring), applied strictly in this order.
+        renames = (
+            ("txt_in.t_embedder.mlp.0.", "txt_in.t_embedder.in_layer."),
+            ("txt_in.t_embedder.mlp.2.", "txt_in.t_embedder.out_layer."),
+            ("txt_in.c_embedder.linear_1.", "txt_in.c_embedder.in_layer."),
+            ("txt_in.c_embedder.linear_2.", "txt_in.c_embedder.out_layer."),
+            ("_mod.linear.", "_mod.lin."),
+            ("_attn_qkv.", "_attn.qkv."),
+            ("mlp.fc1.", "mlp.0."),
+            ("mlp.fc2.", "mlp.2."),
+            ("_attn_q_norm.weight", "_attn.norm.query_norm.scale"),
+            ("_attn_k_norm.weight", "_attn.norm.key_norm.scale"),
+            (".q_norm.weight", ".norm.query_norm.scale"),
+            (".k_norm.weight", ".norm.key_norm.scale"),
+            ("_attn_proj.", "_attn.proj."),
+            (".modulation.linear.", ".modulation.lin."),
+            ("_in.mlp.2.", "_in.out_layer."),
+            ("_in.mlp.0.", "_in.in_layer."),
+        )
+        converted = {}
+        for source_key in list(state_dict.keys()):
+            target_key = source_key
+            for old, new in renames:
+                target_key = target_key.replace(old, new)
+            converted[target_key] = state_dict[source_key]
+        return converted
+
+    def process_unet_state_dict_for_saving(self, state_dict):
+        """Return `state_dict` with every key prefixed by `model.model.` for saving."""
+        return utils.state_dict_prefix_replace(state_dict, {"": "model.model."})
+
+    def clip_target(self, state_dict={}):
+        """Return the HunyuanVideo ClipTarget, configuring the text encoder from `llama_detect` on `state_dict`."""
+        llama_prefix = f"{self.text_encoder_key_prefix[0]}llama.transformer."
+        llama_kwargs = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, llama_prefix)
+        text_encoder = comfy.text_encoders.hunyuan_video.hunyuan_video_clip(**llama_kwargs)
+        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideoTokenizer, text_encoder)
+
+class HunyuanVideoI2V(HunyuanVideo):
+    """HunyuanVideo variant with `in_channels` 33; builds `model_base.HunyuanVideoI2V`."""
+
+    unet_config = {"image_model": "hunyuan_video", "in_channels": 33}
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.HunyuanVideoI2V`; `state_dict` and `prefix` are unused."""
+        return model_base.HunyuanVideoI2V(self, device=device)
+
+class HunyuanVideoSkyreelsI2V(HunyuanVideo):
+    """HunyuanVideo variant with `in_channels` 32; builds `model_base.HunyuanVideoSkyreelsI2V`."""
+
+    unet_config = {"image_model": "hunyuan_video", "in_channels": 32}
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.HunyuanVideoSkyreelsI2V`; `state_dict` and `prefix` are unused."""
+        return model_base.HunyuanVideoSkyreelsI2V(self, device=device)
+
+class CosmosT2V(supported_models_base.BASE):
+    """Config entry for `image_model` "cosmos" with 16 input channels; builds `model_base.CosmosVideo`."""
+
+    unet_config = {"image_model": "cosmos", "in_channels": 16}
+
+    sampling_settings = {"sigma_data": 0.5, "sigma_max": 80.0, "sigma_min": 0.002}
+
+    unet_extra_config = {}
+    latent_format = latent_formats.Cosmos1CV8x8x8
+
+    memory_usage_factor = 1.6 #TODO
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32] #TODO
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.CosmosVideo` for this config; `state_dict` and `prefix` are unused."""
+        return model_base.CosmosVideo(self, device=device)
+
+    def clip_target(self, state_dict={}):
+        """Return the Cosmos ClipTarget, configuring the text encoder from T5 settings detected in `state_dict`."""
+        t5_prefix = f"{self.text_encoder_key_prefix[0]}t5xxl.transformer."
+        t5_kwargs = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, t5_prefix)
+        text_encoder = comfy.text_encoders.cosmos.te(**t5_kwargs)
+        return supported_models_base.ClipTarget(comfy.text_encoders.cosmos.CosmosT5Tokenizer, text_encoder)
+
+class CosmosI2V(CosmosT2V):
+    """CosmosT2V variant with 17 input channels; builds `CosmosVideo` with `image_to_video=True`."""
+
+    unet_config = {"image_model": "cosmos", "in_channels": 17}
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.CosmosVideo` in image-to-video mode; `state_dict` and `prefix` are unused."""
+        return model_base.CosmosVideo(self, image_to_video=True, device=device)
+
+class CosmosT2IPredict2(supported_models_base.BASE):
+    """Config entry for `image_model` "cosmos_predict2" with 16 input channels; builds `model_base.CosmosPredict2`."""
+
+    unet_config = {"image_model": "cosmos_predict2", "in_channels": 16}
+
+    sampling_settings = {"sigma_data": 1.0, "sigma_max": 80.0, "sigma_min": 0.002}
+
+    unet_extra_config = {}
+    latent_format = latent_formats.Wan21
+
+    memory_usage_factor = 1.0
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+    def __init__(self, unet_config):
+        """Initialise via the base class, then derive the memory factor from `model_channels`."""
+        super().__init__(unet_config)
+        # 0.95 at 2048 model channels (also the fallback), scaled linearly.
+        channels = unet_config.get("model_channels", 2048)
+        self.memory_usage_factor = (channels / 2048) * 0.95
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.CosmosPredict2` for this config; `state_dict` and `prefix` are unused."""
+        return model_base.CosmosPredict2(self, device=device)
+
+    def clip_target(self, state_dict={}):
+        """Return the Cosmos ClipTarget, configuring the text encoder from T5 settings detected in `state_dict`."""
+        t5_prefix = f"{self.text_encoder_key_prefix[0]}t5xxl.transformer."
+        t5_kwargs = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, t5_prefix)
+        text_encoder = comfy.text_encoders.cosmos.te(**t5_kwargs)
+        return supported_models_base.ClipTarget(comfy.text_encoders.cosmos.CosmosT5Tokenizer, text_encoder)
+
+class CosmosI2VPredict2(CosmosT2IPredict2):
+    """CosmosT2IPredict2 variant with 17 input channels; builds `CosmosPredict2` with `image_to_video=True`."""
+
+    unet_config = {"image_model": "cosmos_predict2", "in_channels": 17}
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.CosmosPredict2` in image-to-video mode; `state_dict` and `prefix` are unused."""
+        return model_base.CosmosPredict2(self, image_to_video=True, device=device)
+
+class Lumina2(supported_models_base.BASE):
+    """Config entry for `image_model` "lumina2"; builds `model_base.Lumina2`."""
+
+    unet_config = {"image_model": "lumina2"}
+
+    sampling_settings = {"multiplier": 1.0, "shift": 6.0}
+
+    memory_usage_factor = 1.4
+
+    unet_extra_config = {}
+    latent_format = latent_formats.Flux
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.Lumina2` for this config; `state_dict` and `prefix` are unused."""
+        return model_base.Lumina2(self, device=device)
+
+    def clip_target(self, state_dict={}):
+        """Return the Lumina ClipTarget, configuring the text encoder from `llama_detect` under `gemma2_2b.transformer.`."""
+        llm_prefix = f"{self.text_encoder_key_prefix[0]}gemma2_2b.transformer."
+        llm_kwargs = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, llm_prefix)
+        text_encoder = comfy.text_encoders.lumina2.te(**llm_kwargs)
+        return supported_models_base.ClipTarget(comfy.text_encoders.lumina2.LuminaTokenizer, text_encoder)
+
+class ZImage(Lumina2):
+    """Lumina2 variant whose config sets `dim` to 3840 and whose text encoder lives under `qwen3_4b.transformer.`."""
+
+    unet_config = {"image_model": "lumina2", "dim": 3840}
+
+    sampling_settings = {"multiplier": 1.0, "shift": 3.0}
+
+    memory_usage_factor = 2.0
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
+
+    def clip_target(self, state_dict={}):
+        """Return the ZImage ClipTarget, configuring the text encoder from `llama_detect` on `state_dict`."""
+        llm_prefix = f"{self.text_encoder_key_prefix[0]}qwen3_4b.transformer."
+        llm_kwargs = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, llm_prefix)
+        text_encoder = comfy.text_encoders.z_image.te(**llm_kwargs)
+        return supported_models_base.ClipTarget(comfy.text_encoders.z_image.ZImageTokenizer, text_encoder)
+
+class NewBieImageModel(supported_models_base.BASE):
+    """Config entry for `image_model` "NewBieImage" / `model_type` "newbie_dit"; builds `model_base.NewBieImage`."""
+
+    unet_config = {"image_model": "NewBieImage", "model_type": "newbie_dit"}
+    sampling_settings = {"multiplier": 1.0, "shift": 6.0}
+    memory_usage_factor = 1.5
+    unet_extra_config = {}
+    latent_format = latent_formats.Flux
+    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.NewBieImage` for this config; `state_dict` and `prefix` are unused."""
+        return model_base.NewBieImage(self, device=device)
+
+    def clip_target(self, state_dict={}):
+        """Return None: this config defines no tokenizer/text-encoder pair."""
+        return None
+
+class WAN21_T2V(supported_models_base.BASE):
+    """Config entry for `image_model` "wan2.1" with `model_type` "t2v"; builds `model_base.WAN21`."""
+
+    unet_config = {"image_model": "wan2.1", "model_type": "t2v"}
+
+    sampling_settings = {"shift": 8.0}
+
+    unet_extra_config = {}
+    latent_format = latent_formats.Wan21
+
+    memory_usage_factor = 0.9
+
+    supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def __init__(self, unet_config):
+        """Initialise via the base class, then derive the memory factor from the config's `dim`."""
+        super().__init__(unet_config)
+        # Reads `self.unet_config` (not the argument); falls back to 2000 when `dim` is absent.
+        model_dim = self.unet_config.get("dim", 2000)
+        self.memory_usage_factor = model_dim / 2222
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.WAN21` for this config; `state_dict` and `prefix` are unused."""
+        return model_base.WAN21(self, device=device)
+
+    def clip_target(self, state_dict={}):
+        """Return the Wan ClipTarget, configuring the text encoder from T5 settings detected under `umt5xxl.transformer.`."""
+        t5_prefix = f"{self.text_encoder_key_prefix[0]}umt5xxl.transformer."
+        t5_kwargs = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, t5_prefix)
+        text_encoder = comfy.text_encoders.wan.te(**t5_kwargs)
+        return supported_models_base.ClipTarget(comfy.text_encoders.wan.WanT5Tokenizer, text_encoder)
+
+class WAN21_I2V(WAN21_T2V):
+    """WAN 2.1 variant with `model_type` "i2v" and `in_dim` 36; builds `WAN21` with `image_to_video=True`."""
+
+    unet_config = {"image_model": "wan2.1", "model_type": "i2v", "in_dim": 36}
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.WAN21` in image-to-video mode; `state_dict` and `prefix` are unused."""
+        return model_base.WAN21(self, image_to_video=True, device=device)
+
+class WAN21_FunControl2V(WAN21_T2V):
+    """WAN 2.1 variant with `model_type` "i2v" and `in_dim` 48; builds `WAN21` with `image_to_video=False`."""
+
+    unet_config = {"image_model": "wan2.1", "model_type": "i2v", "in_dim": 48}
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.WAN21` with `image_to_video=False`; `state_dict` and `prefix` are unused."""
+        return model_base.WAN21(self, image_to_video=False, device=device)
+
+class WAN21_Camera(WAN21_T2V):
+    """WAN 2.1 variant with `model_type` "camera" and `in_dim` 32; builds `model_base.WAN21_Camera`."""
+
+    unet_config = {"image_model": "wan2.1", "model_type": "camera", "in_dim": 32}
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.WAN21_Camera` with `image_to_video=False`; `state_dict` and `prefix` are unused."""
+        return model_base.WAN21_Camera(self, image_to_video=False, device=device)
+
+class WAN22_Camera(WAN21_T2V):
+    """WAN variant with `model_type` "camera_2.2" and `in_dim` 36; builds `model_base.WAN21_Camera`."""
+
+    unet_config = {"image_model": "wan2.1", "model_type": "camera_2.2", "in_dim": 36}
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.WAN21_Camera` with `image_to_video=False`; `state_dict` and `prefix` are unused."""
+        return model_base.WAN21_Camera(self, image_to_video=False, device=device)
+
+class WAN21_Vace(WAN21_T2V):
+    """WAN 2.1 variant with `model_type` "vace"; builds `model_base.WAN21_Vace` and raises the memory factor by 20%."""
+
+    unet_config = {"image_model": "wan2.1", "model_type": "vace"}
+
+    def __init__(self, unet_config):
+        """Initialise via WAN21_T2V, then multiply the resulting memory usage factor by 1.2."""
+        super().__init__(unet_config)
+        parent_factor = self.memory_usage_factor
+        self.memory_usage_factor = 1.2 * parent_factor
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.WAN21_Vace` with `image_to_video=False`; `state_dict` and `prefix` are unused."""
+        return model_base.WAN21_Vace(self, image_to_video=False, device=device)
+
+class WAN21_HuMo(WAN21_T2V):
+    """WAN 2.1 variant with `model_type` "humo"; builds `model_base.WAN21_HuMo`."""
+
+    unet_config = {"image_model": "wan2.1", "model_type": "humo"}
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.WAN21_HuMo` with `image_to_video=False`; `state_dict` and `prefix` are unused."""
+        return model_base.WAN21_HuMo(self, image_to_video=False, device=device)
+
+class WAN22_S2V(WAN21_T2V):
+    """WAN variant with `model_type` "s2v"; builds `model_base.WAN22_S2V`.
+
+    Construction is inherited unchanged from WAN21_T2V, so no `__init__`
+    override is needed.
+    """
+
+    unet_config = {
+        "image_model": "wan2.1",
+        "model_type": "s2v",
+    }
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.WAN22_S2V` for this config; `state_dict` and `prefix` are unused."""
+        out = model_base.WAN22_S2V(self, device=device)
+        return out
+
+class WAN22_Animate(WAN21_T2V):
+    """WAN variant with `model_type` "animate"; builds `model_base.WAN22_Animate`.
+
+    Construction is inherited unchanged from WAN21_T2V, so no `__init__`
+    override is needed.
+    """
+
+    unet_config = {
+        "image_model": "wan2.1",
+        "model_type": "animate",
+    }
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.WAN22_Animate` for this config; `state_dict` and `prefix` are unused."""
+        out = model_base.WAN22_Animate(self, device=device)
+        return out
+
+class WAN22_T2V(WAN21_T2V):
+    """WAN variant with `model_type` "t2v" and `out_dim` 48; uses the Wan22 latent format and builds `model_base.WAN22`."""
+
+    unet_config = {"image_model": "wan2.1", "model_type": "t2v", "out_dim": 48}
+
+    latent_format = latent_formats.Wan22
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.WAN22` with `image_to_video=True`; `state_dict` and `prefix` are unused."""
+        return model_base.WAN22(self, image_to_video=True, device=device)
+
+class Hunyuan3Dv2(supported_models_base.BASE):
+    """Config entry for `image_model` "hunyuan3d2"; builds `model_base.Hunyuan3Dv2` and has no text encoder."""
+
+    unet_config = {"image_model": "hunyuan3d2"}
+
+    unet_extra_config = {}
+
+    sampling_settings = {"multiplier": 1.0, "shift": 1.0}
+
+    memory_usage_factor = 3.5
+
+    clip_vision_prefix = "conditioner.main_image_encoder.model."
+    vae_key_prefix = ["vae."]
+
+    latent_format = latent_formats.Hunyuan3Dv2
+
+    def process_unet_state_dict_for_saving(self, state_dict):
+        """Return `state_dict` with every key prefixed by `model.` for saving."""
+        return utils.state_dict_prefix_replace(state_dict, {"": "model."})
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.Hunyuan3Dv2` for this config; `state_dict` and `prefix` are unused."""
+        return model_base.Hunyuan3Dv2(self, device=device)
+
+    def clip_target(self, state_dict={}):
+        """Return None: this config defines no tokenizer/text-encoder pair."""
+        return None
+
+class Hunyuan3Dv2_1(Hunyuan3Dv2):
+    """Hunyuan3Dv2 variant for `image_model` "hunyuan3d2_1" with its own latent format and model class."""
+
+    unet_config = {"image_model": "hunyuan3d2_1"}
+
+    latent_format = latent_formats.Hunyuan3Dv2_1
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.Hunyuan3Dv2_1` for this config; `state_dict` and `prefix` are unused."""
+        return model_base.Hunyuan3Dv2_1(self, device=device)
+
+class Hunyuan3Dv2mini(Hunyuan3Dv2):
+    """Hunyuan3Dv2 variant whose config sets `depth` to 8 and which uses the Hunyuan3Dv2mini latent format."""
+
+    unet_config = {"image_model": "hunyuan3d2", "depth": 8}
+
+    latent_format = latent_formats.Hunyuan3Dv2mini
+
+class HiDream(supported_models_base.BASE):
+    """Config entry for `image_model` "hidream"; builds `model_base.HiDream`.
+
+    No tokenizer/text-encoder pair is defined yet (`clip_target` returns None).
+    """
+
+    unet_config = {
+        "image_model": "hidream",
+    }
+
+    # The class body used to assign `sampling_settings` twice in a row; only the
+    # second, empty dict ever took effect, so the shadowed `{"shift": 3.0}` was dropped.
+    # NOTE(review): confirm that empty settings (and not shift 3.0) are intended.
+    sampling_settings = {
+    }
+
+    # memory_usage_factor = 1.2 # TODO
+
+    unet_extra_config = {}
+    latent_format = latent_formats.Flux
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.HiDream` for this config; `state_dict` and `prefix` are unused."""
+        out = model_base.HiDream(self, device=device)
+        return out
+
+    def clip_target(self, state_dict={}):
+        """Return None: no tokenizer/text-encoder pair is defined for HiDream here."""
+        return None #  TODO
+
+class Chroma(supported_models_base.BASE):
+    """Config entry for `image_model` "chroma"; builds `model_base.Chroma`."""
+
+    unet_config = {"image_model": "chroma"}
+
+    unet_extra_config = {}
+
+    sampling_settings = {"multiplier": 1.0}
+
+    latent_format = comfy.latent_formats.Flux
+
+    memory_usage_factor = 3.2
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.Chroma` for this config; `state_dict` and `prefix` are unused."""
+        return model_base.Chroma(self, device=device)
+
+    def clip_target(self, state_dict={}):
+        """Return a ClipTarget using the PixArt tokenizer and `pixart_te`, configured from T5 settings detected in `state_dict`."""
+        t5_prefix = f"{self.text_encoder_key_prefix[0]}t5xxl.transformer."
+        t5_kwargs = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, t5_prefix)
+        text_encoder = comfy.text_encoders.pixart_t5.pixart_te(**t5_kwargs)
+        return supported_models_base.ClipTarget(comfy.text_encoders.pixart_t5.PixArtTokenizer, text_encoder)
+
+class ChromaRadiance(Chroma):
+    """Chroma variant for `image_model` "chroma_radiance"; builds `model_base.ChromaRadiance`."""
+
+    unet_config = {"image_model": "chroma_radiance"}
+
+    latent_format = comfy.latent_formats.ChromaRadiance
+
+    # Pixel-space model, no spatial compression for model input.
+    memory_usage_factor = 0.044
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.ChromaRadiance` for this config; `state_dict` and `prefix` are unused."""
+        radiance = model_base.ChromaRadiance(self, device=device)
+        return radiance
+
+class ACEStep(supported_models_base.BASE):
+    """Config entry for `audio_model` "ace"; builds `model_base.ACEStep`."""
+
+    unet_config = {"audio_model": "ace"}
+
+    unet_extra_config = {}
+
+    sampling_settings = {"shift": 3.0}
+
+    latent_format = comfy.latent_formats.ACEAudio
+
+    memory_usage_factor = 0.5
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.ACEStep` for this config; `state_dict` and `prefix` are unused."""
+        return model_base.ACEStep(self, device=device)
+
+    def clip_target(self, state_dict={}):
+        """Return the ClipTarget pairing `AceT5Tokenizer` with `AceT5Model`; `state_dict` is not inspected."""
+        ace = comfy.text_encoders.ace
+        return supported_models_base.ClipTarget(ace.AceT5Tokenizer, ace.AceT5Model)
+
+class Omnigen2(supported_models_base.BASE):
+    """Config entry for `image_model` "omnigen2"; builds `model_base.Omnigen2`."""
+
+    unet_config = {"image_model": "omnigen2"}
+
+    sampling_settings = {"multiplier": 1.0, "shift": 2.6}
+
+    memory_usage_factor = 1.95 #TODO
+
+    unet_extra_config = {}
+    latent_format = latent_formats.Flux
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def __init__(self, unet_config):
+        """Initialise via the base class; put float16 first in the dtype list when extended fp16 support is reported."""
+        super().__init__(unet_config)
+        if not comfy.model_management.extended_fp16_support():
+            return
+        # Build a new per-instance list so the class-level list stays untouched.
+        self.supported_inference_dtypes = [torch.float16, *self.supported_inference_dtypes]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.Omnigen2` for this config; `state_dict` and `prefix` are unused."""
+        return model_base.Omnigen2(self, device=device)
+
+    def clip_target(self, state_dict={}):
+        """Return the Omnigen2 ClipTarget, configuring the text encoder from `llama_detect` under `qwen25_3b.transformer.`."""
+        llm_prefix = f"{self.text_encoder_key_prefix[0]}qwen25_3b.transformer."
+        llm_kwargs = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, llm_prefix)
+        text_encoder = comfy.text_encoders.omnigen2.te(**llm_kwargs)
+        return supported_models_base.ClipTarget(comfy.text_encoders.omnigen2.Omnigen2Tokenizer, text_encoder)
+
+class QwenImage(supported_models_base.BASE):
+    """Config entry for `image_model` "qwen_image"; builds `model_base.QwenImage`."""
+
+    unet_config = {"image_model": "qwen_image"}
+
+    sampling_settings = {"multiplier": 1.0, "shift": 1.15}
+
+    memory_usage_factor = 1.8 #TODO
+
+    unet_extra_config = {}
+    latent_format = latent_formats.Wan21
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.QwenImage` for this config; `state_dict` and `prefix` are unused."""
+        return model_base.QwenImage(self, device=device)
+
+    def clip_target(self, state_dict={}):
+        """Return the QwenImage ClipTarget, configuring the text encoder from `llama_detect` under `qwen25_7b.transformer.`."""
+        llm_prefix = f"{self.text_encoder_key_prefix[0]}qwen25_7b.transformer."
+        llm_kwargs = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, llm_prefix)
+        text_encoder = comfy.text_encoders.qwen_image.te(**llm_kwargs)
+        return supported_models_base.ClipTarget(comfy.text_encoders.qwen_image.QwenImageTokenizer, text_encoder)
+
+class HunyuanImage21(HunyuanVideo):
+    """HunyuanVideo variant whose config sets `vec_in_dim` to None; builds `model_base.HunyuanImage21`."""
+
+    unet_config = {"image_model": "hunyuan_video", "vec_in_dim": None}
+
+    sampling_settings = {"shift": 5.0}
+
+    latent_format = latent_formats.HunyuanImage21
+
+    memory_usage_factor = 8.7
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.HunyuanImage21` for this config; `state_dict` and `prefix` are unused."""
+        return model_base.HunyuanImage21(self, device=device)
+
+    def clip_target(self, state_dict={}):
+        """Return the HunyuanImage ClipTarget, configuring the text encoder from `llama_detect` under `qwen25_7b.transformer.`."""
+        llm_prefix = f"{self.text_encoder_key_prefix[0]}qwen25_7b.transformer."
+        llm_kwargs = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, llm_prefix)
+        text_encoder = comfy.text_encoders.hunyuan_image.te(**llm_kwargs)
+        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_image.HunyuanImageTokenizer, text_encoder)
+
+class HunyuanImage21Refiner(HunyuanVideo):
+    """HunyuanVideo variant with `patch_size` [1, 1, 1] and `vec_in_dim` None; builds `model_base.HunyuanImage21Refiner`."""
+
+    unet_config = {
+        "image_model": "hunyuan_video",
+        "patch_size": [1, 1, 1],
+        "vec_in_dim": None,
+    }
+
+    sampling_settings = {"shift": 4.0}
+
+    latent_format = latent_formats.HunyuanImage21Refiner
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.HunyuanImage21Refiner` for this config; `state_dict` and `prefix` are unused."""
+        return model_base.HunyuanImage21Refiner(self, device=device)
+
+class HunyuanVideo15(HunyuanVideo):
+    """HunyuanVideo variant whose config sets `vision_in_dim` to 1152; builds `model_base.HunyuanVideo15`."""
+
+    unet_config = {"image_model": "hunyuan_video", "vision_in_dim": 1152}
+
+    sampling_settings = {"shift": 7.0}
+    memory_usage_factor = 4.0 #TODO
+    supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]
+
+    latent_format = latent_formats.HunyuanVideo15
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.HunyuanVideo15` for this config; `state_dict` and `prefix` are unused."""
+        return model_base.HunyuanVideo15(self, device=device)
+
+    def clip_target(self, state_dict={}):
+        """Return a ClipTarget pairing `HunyuanVideo15Tokenizer` with `hunyuan_image.te`, configured by `llama_detect`."""
+        llm_prefix = f"{self.text_encoder_key_prefix[0]}qwen25_7b.transformer."
+        llm_kwargs = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, llm_prefix)
+        text_encoder = comfy.text_encoders.hunyuan_image.te(**llm_kwargs)
+        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer, text_encoder)
+
+
+class HunyuanVideo15_SR_Distilled(HunyuanVideo):
+    """HunyuanVideo variant with `vision_in_dim` 1152 and `in_channels` 98; builds `model_base.HunyuanVideo15_SR_Distilled`."""
+
+    unet_config = {
+        "image_model": "hunyuan_video",
+        "vision_in_dim": 1152,
+        "in_channels": 98,
+    }
+
+    sampling_settings = {"shift": 2.0}
+    memory_usage_factor = 4.0 #TODO
+    supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]
+
+    latent_format = latent_formats.HunyuanVideo15
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.HunyuanVideo15_SR_Distilled`; `state_dict` and `prefix` are unused."""
+        return model_base.HunyuanVideo15_SR_Distilled(self, device=device)
+
+    def clip_target(self, state_dict={}):
+        """Return a ClipTarget pairing `HunyuanVideo15Tokenizer` with `hunyuan_image.te`, configured by `llama_detect`."""
+        llm_prefix = f"{self.text_encoder_key_prefix[0]}qwen25_7b.transformer."
+        llm_kwargs = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, llm_prefix)
+        text_encoder = comfy.text_encoders.hunyuan_image.te(**llm_kwargs)
+        return supported_models_base.ClipTarget(comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer, text_encoder)
+
+
+class Kandinsky5(supported_models_base.BASE):
+    """Config entry for `image_model` "kandinsky5"; builds `model_base.Kandinsky5`."""
+
+    unet_config = {"image_model": "kandinsky5"}
+
+    sampling_settings = {"shift": 10.0}
+
+    unet_extra_config = {}
+    latent_format = latent_formats.HunyuanVideo
+
+    memory_usage_factor = 1.25 #TODO
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.Kandinsky5` for this config; `state_dict` and `prefix` are unused."""
+        return model_base.Kandinsky5(self, device=device)
+
+    def clip_target(self, state_dict={}):
+        """Return the Kandinsky5 ClipTarget, configuring the text encoder from `llama_detect` under `qwen25_7b.transformer.`."""
+        llm_prefix = f"{self.text_encoder_key_prefix[0]}qwen25_7b.transformer."
+        llm_kwargs = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, llm_prefix)
+        text_encoder = comfy.text_encoders.kandinsky5.te(**llm_kwargs)
+        return supported_models_base.ClipTarget(comfy.text_encoders.kandinsky5.Kandinsky5Tokenizer, text_encoder)
+
+
+class Kandinsky5Image(Kandinsky5):
+    """Kandinsky5 variant with `model_dim` 2560 and `visual_embed_dim` 64; builds `model_base.Kandinsky5Image`."""
+
+    unet_config = {
+        "image_model": "kandinsky5",
+        "model_dim": 2560,
+        "visual_embed_dim": 64,
+    }
+
+    sampling_settings = {"shift": 3.0}
+
+    latent_format = latent_formats.Flux
+    memory_usage_factor = 1.25 #TODO
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Instantiate `model_base.Kandinsky5Image` for this config; `state_dict` and `prefix` are unused."""
+        return model_base.Kandinsky5Image(self, device=device)
+
+    def clip_target(self, state_dict={}):
+        """Return a ClipTarget pairing `Kandinsky5TokenizerImage` with `kandinsky5.te`, configured by `llama_detect`."""
+        llm_prefix = f"{self.text_encoder_key_prefix[0]}qwen25_7b.transformer."
+        llm_kwargs = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, llm_prefix)
+        text_encoder = comfy.text_encoders.kandinsky5.te(**llm_kwargs)
+        return supported_models_base.ClipTarget(comfy.text_encoders.kandinsky5.Kandinsky5TokenizerImage, text_encoder)
+
+
+# Module-level list of every model config class, followed by SVD_img2vid.
+# NOTE(review): the order looks deliberate (e.g. FluxInpaint precedes Flux and the
+# HunyuanVideo subclasses precede HunyuanVideo), presumably because consumers take
+# the first matching entry; confirm before reordering.
+models = [
+    LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH,
+    SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega,
+    SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3,
+    StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1,
+    FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV,
+    HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21,
+    HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo,
+    CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2,
+    NewBieImageModel, ZImage, Lumina2,
+    WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera,
+    WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate,
+    Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1,
+    HiDream, Chroma, ChromaRadiance, ACEStep, Omnigen2, QwenImage, Flux2,
+    Kandinsky5Image, Kandinsky5,
+]
+
+models.append(SVD_img2vid)