mirror of https://github.com/comfyanonymous/ComfyUI.git (synced 2026-01-07 21:00:49 +08:00)

full inference tests, group offloading is required for hunyuan image's refiner

This commit is contained in: parent 85772d450d, commit 397fb62df5
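The commit message notes that the Hunyuan Image refiner only fits when its weights are group-offloaded; in the test workflow further down, a GroupOffload node ("66:66") sits between the refiner's UNETLoader ("66:60") and the refiner KSampler ("66:63"). As a rough illustration of the general technique only (a minimal sketch in plain PyTorch, not the repository's GroupOffload implementation), group offloading keeps each block on the CPU and streams it to the GPU just for its own forward pass:

```python
import torch

def attach_group_offload(blocks, device="cuda"):
    """Hedged sketch: keep each block on CPU, move it to `device` only around its forward call."""
    for block in blocks:
        block.to("cpu")

        def load(module, args):
            module.to(device)   # stream this group's weights in just before use

        def unload(module, args, output):
            module.to("cpu")    # free GPU memory before the next group runs
            return output

        block.register_forward_pre_hook(load)
        block.register_forward_hook(unload)
```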
@@ -1,18 +1,15 @@
# Based on Flux code because of weird hunyuan video code license.

import torch

from ..modules.attention import optimized_attention

from ...patcher_extension import WrapperExecutor, get_all_wrappers, WrappersMP

from dataclasses import dataclass
from einops import repeat

import torch
from einops import repeat
from torch import Tensor, nn

from ..flux.layers import DoubleStreamBlock, EmbedND, LastLayer, MLPEmbedder, SingleStreamBlock, timestep_embedding
from ..modules.attention import optimized_attention
from ..modules.diffusionmodules.mmdit import PatchEmbed
from ...patcher_extension import WrapperExecutor, get_all_wrappers, WrappersMP


@dataclass
@@ -74,9 +71,7 @@ class TokenRefinerBlock(nn.Module):
operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device),
)

def forward(self, x, c, mask, transformer_options=None):
if transformer_options is None:
transformer_options = {}
def forward(self, x, c, mask, transformer_options={}):
mod1, mod2 = self.adaLN_modulation(c).chunk(2, dim=1)

norm_x = self.norm1(x)
@@ -113,9 +108,7 @@ class IndividualTokenRefiner(nn.Module):
]
)

def forward(self, x, c, mask, transformer_options=None):
if transformer_options is None:
transformer_options = {}
def forward(self, x, c, mask, transformer_options={}):
m = None
if mask is not None:
m = mask.view(mask.shape[0], 1, 1, mask.shape[1]).repeat(1, 1, mask.shape[1], 1)
@@ -149,10 +142,8 @@ class TokenRefiner(nn.Module):
x,
timesteps,
mask,
transformer_options=None,
transformer_options={},
):
if transformer_options is None:
transformer_options = {}
t = self.t_embedder(timestep_embedding(timesteps, 256, time_factor=1.0).to(x.dtype))
# m = mask.float().unsqueeze(-1)
# c = (x.float() * m).sum(dim=1) / m.sum(dim=1) #TODO: the following works when the x.shape is the same length as the tokens but might break otherwise
@@ -295,15 +286,14 @@ class HunyuanVideo(nn.Module):
timesteps: Tensor,
y: Tensor = None,
txt_byt5=None,
clip_fea=None,guidance: Tensor = None,
clip_fea=None,
guidance: Tensor = None,
guiding_frame_index=None,
ref_latent=None,
disable_time_r=False,
control=None,
transformer_options=None,
transformer_options={},
) -> Tensor:
if transformer_options is None:
transformer_options = {}
patches_replace = transformer_options.get("patches_replace", {})

initial_shape = list(img.shape)
@@ -362,7 +352,7 @@ class HunyuanVideo(nn.Module):
if self.cond_type_embedding is not None:
cond_emb = self.cond_type_embedding(torch.ones_like(txt_byt5[:, :, 0], device=txt_byt5.device, dtype=torch.long))
txt_byt5 = txt_byt5 + cond_emb.to(txt_byt5.dtype)
txt = torch.cat((txt_byt5, txt), dim=1) # byt5 first for HunyuanVideo1.5
txt = torch.cat((txt_byt5, txt), dim=1) # byt5 first for HunyuanVideo1.5
else:
txt = torch.cat((txt, txt_byt5), dim=1)
txt_byt5_ids = torch.zeros((txt_ids.shape[0], txt_byt5.shape[1], txt_ids.shape[-1]), device=txt_ids.device, dtype=txt_ids.dtype)
@@ -476,18 +466,14 @@ class HunyuanVideo(nn.Module):
img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0)
return repeat(img_ids, "h w c -> b (h w) c", b=bs)

def forward(self, x, timestep, context, y=None, txt_byt5=None, clip_fea=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options=None, **kwargs):
if transformer_options is None:
transformer_options = {}
def forward(self, x, timestep, context, y=None, txt_byt5=None, clip_fea=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs):
return WrapperExecutor.new_class_executor(
self._forward,
self,
get_all_wrappers(WrappersMP.DIFFUSION_MODEL, transformer_options)
).execute(x, timestep, context, y, txt_byt5, clip_fea, guidance, attention_mask, guiding_frame_index, ref_latent, disable_time_r, control, transformer_options, **kwargs)

def _forward(self, x, timestep, context, y=None, txt_byt5=None, clip_fea=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options=None, **kwargs):
if transformer_options is None:
transformer_options = {}
def _forward(self, x, timestep, context, y=None, txt_byt5=None, clip_fea=None, guidance=None, attention_mask=None, guiding_frame_index=None, ref_latent=None, disable_time_r=False, control=None, transformer_options={}, **kwargs):
bs = x.shape[0]
if len(self.patch_size) == 3:
img_ids = self.img_ids(x)
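Throughout this hunk the forward signatures flip between a `None` default with an explicit guard and a `{}` default for `transformer_options`. A minimal, self-contained illustration of the difference between the two styles (not repository code): a mutable default dict is created once at function definition time, so mutations persist across calls, while the `None` sentinel rebuilds a fresh dict on every call:

```python
# Illustration only: how the two default-argument styles behave when the dict is mutated.
def forward_mutable(x, transformer_options={}):
    transformer_options.setdefault("calls", 0)
    transformer_options["calls"] += 1
    return transformer_options["calls"]

def forward_sentinel(x, transformer_options=None):
    if transformer_options is None:
        transformer_options = {}
    transformer_options.setdefault("calls", 0)
    transformer_options["calls"] += 1
    return transformer_options["calls"]

assert forward_mutable(1) == 1 and forward_mutable(2) == 2    # state leaks between calls
assert forward_sentinel(1) == 1 and forward_sentinel(2) == 1  # fresh dict each call
```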
@@ -1019,7 +1019,7 @@ class LTXV(BaseModel):


class HunyuanVideo(BaseModel):
def __init__(self, model_config, model_type=ModelType.FLOW, device=None, image_model=None):
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
super().__init__(model_config, model_type, device=device, unet_model=HunyuanVideoModel)

def encode_adm(self, **kwargs):
@@ -1552,7 +1552,7 @@ class QwenImage(BaseModel):

class HunyuanImage21(BaseModel):
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
super().__init__(model_config, model_type, device=device, unet_model=HunyuanVideo)
super().__init__(model_config, model_type, device=device, unet_model=HunyuanVideoModel)

def extra_conds(self, **kwargs):
out = super().extra_conds(**kwargs)
@@ -590,6 +590,7 @@ KNOWN_HUGGINGFACE_MODEL_REPOS: Final[Set[str]] = {
'appmana/Cosmos-1.0-Prompt-Upsampler-12B-Text2World-hf',
'llava-hf/llava-onevision-qwen2-7b-si-hf',
'llava-hf/llama3-llava-next-8b-hf',
'PromptEnhancer/PromptEnhancer-32B',
}

KNOWN_UNET_MODELS: Final[KnownDownloadables] = KnownDownloadables([
@@ -699,6 +700,7 @@ KNOWN_CLIP_MODELS: Final[KnownDownloadables] = KnownDownloadables([
# Hunyuan Image
HuggingFile("Comfy-Org/HunyuanImage_2.1_ComfyUI", "split_files/text_encoders/byt5_small_glyphxl_fp16.safetensors"),
HuggingFile("Comfy-Org/HunyuanImage_2.1_ComfyUI", "split_files/text_encoders/qwen_2.5_vl_7b.safetensors"),
HuggingFile("Comfy-Org/Ovis-Image", "split_files/text_encoders/ovis_2.5.safetensors"),
], folder_names=["clip", "text_encoders"])

KNOWN_STYLE_MODELS: Final[KnownDownloadables] = KnownDownloadables([
@@ -731,7 +733,7 @@ def _get_known_models_for_folder_name(folder_name: str) -> List[Downloadable]:
return list(chain.from_iterable([candidate for candidate in _known_models_db if folder_name in candidate]))


def add_known_models(folder_name: str, known_models: Optional[List[Downloadable]] | Downloadable = None, *models: Downloadable) -> MutableSequence[Downloadable]:
def add_known_models(folder_name: str, known_models: KnownDownloadables | Optional[List[Downloadable]] | Downloadable = None, *models: Downloadable) -> MutableSequence[Downloadable]:
if isinstance(known_models, Downloadable):
models = [known_models] + list(models) or []
known_models = None
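The widened signature lets the second argument be a `KnownDownloadables` collection, a plain list, or a single `Downloadable`. A hedged usage sketch mirroring the calls in the test fixture further down (the import locations are assumptions, not shown in this diff):

```python
# Assumed import paths; the diff does not show where these names live.
from comfy.model_downloader import add_known_models, KNOWN_LORAS
from comfy.model_downloader_types import CivitFile, HuggingFile

# Pass an existing KnownDownloadables collection plus an extra model to register...
add_known_models("loras", KNOWN_LORAS, CivitFile(13941, 16576, "epi_noiseoffset2.safetensors"))
# ...or pass a single Downloadable and let the folder's default collection be used.
add_known_models("checkpoints", HuggingFile("autismanon/modeldump", "cardosAnime_v20.safetensors"))
```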
@@ -8,11 +8,11 @@ RMSNorm = None
logger = logging.getLogger(__name__)

try:
rms_norm_torch = torch.nn.functional.rms_norm # pylint: disable=no-member
RMSNorm = torch.nn.RMSNorm # pylint: disable=no-member
rms_norm_torch = torch.nn.functional.rms_norm
RMSNorm = torch.nn.RMSNorm
except:
rms_norm_torch = None
logger.debug("Please update pytorch to use native RMSNorm")
logger.warning("Please update pytorch to use native RMSNorm")


def rms_norm(x, weight=None, eps=1e-6):
@@ -22,7 +22,7 @@ def rms_norm(x, weight=None, eps=1e-6):
else:
return rms_norm_torch(x, weight.shape, weight=cast_to(weight, dtype=x.dtype, device=x.device), eps=eps)
else:
r = x * torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps)
r = x * torch.rsqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + eps)
if weight is None:
return r
else:
@@ -32,12 +32,12 @@ def rms_norm(x, weight=None, eps=1e-6):
if RMSNorm is None:
class RMSNorm(torch.nn.Module):
def __init__(
self,
normalized_shape,
eps=1e-6,
elementwise_affine=True,
device=None,
dtype=None,
self,
normalized_shape,
eps=1e-6,
elementwise_affine=True,
device=None,
dtype=None,
):
factory_kwargs = {"device": device, "dtype": dtype}
super().__init__()
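The fallback branch implements RMSNorm by hand as x * rsqrt(mean(x^2, dim=-1) + eps), optionally scaled by the weight. A small self-check of that formula against the native op (a sketch that assumes a PyTorch build new enough to ship torch.nn.functional.rms_norm, roughly 2.4 and later):

```python
import torch

def rms_norm_fallback(x, weight=None, eps=1e-6):
    # Manual RMSNorm, matching the fallback branch above.
    r = x * torch.rsqrt(torch.mean(x ** 2, dim=-1, keepdim=True) + eps)
    return r if weight is None else r * weight

x = torch.randn(2, 8)
w = torch.randn(8)
if hasattr(torch.nn.functional, "rms_norm"):
    native = torch.nn.functional.rms_norm(x, w.shape, weight=w, eps=1e-6)
    assert torch.allclose(native, rms_norm_fallback(x, w, eps=1e-6), atol=1e-5)
```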
@@ -13,7 +13,7 @@ class ByT5SmallTokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data=None):
if tokenizer_data is None:
tokenizer_data = {}
tokenizer_path = files.get_package_as_path("byt5_tokenizer")
tokenizer_path = files.get_package_as_path(f"{__package__}.byt5_tokenizer")
super().__init__(tokenizer_path, embedding_directory=None, pad_with_end=False, embedding_size=1472, embedding_key='byt5_small', tokenizer_class=ByT5Tokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_data=tokenizer_data)
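The tokenizer directory is now resolved relative to the defining package (f"{__package__}.byt5_tokenizer") rather than a bare top-level name. A rough standard-library stand-in to show the kind of lookup involved (get_package_as_path is the project's own helper and may behave differently):

```python
import importlib.resources

def package_as_path(dotted_name: str) -> str:
    # Resolve an importable package (e.g. f"{__package__}.byt5_tokenizer"
    # from inside the tokenizer module) to a directory on disk.
    return str(importlib.resources.files(dotted_name))
```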
@@ -19,6 +19,8 @@ from __future__ import annotations

import contextlib
import contextvars
import pickle

import itertools
import json
import logging
@@ -31,7 +33,7 @@ import threading
import warnings
from contextlib import contextmanager
from pathlib import Path
from pickle import UnpicklingError
from pickle import UnpicklingError, PickleError
from typing import Optional, Any, Literal, Generator

import numpy as np
@@ -134,6 +136,11 @@ def load_torch_file(ckpt: str, safe_load=False, device=None, return_metadata=Fal
raise ValueError(f"{message} (File path: {ckpt} The safetensors file is corrupt or invalid. Make sure this is actually a safetensors file and not a ckpt or pt or other filetype.")
if "MetadataIncompleteBuffer" in message or "InvalidHeaderDeserialization" in message:
raise ValueError(f"{message} (File path: {ckpt} The safetensors file is corrupt/incomplete. Check the file size and make sure you have copied/downloaded it correctly.")
try:
assert pickle.dumps(e) is not None
except PickleError as serialization_failed:
logger.debug("serialization failed", exc_info=serialization_failed)
e = RuntimeError(repr(e))
raise e
elif ckpt.lower().endswith("index.json"):
# from accelerate
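The new block checks whether the caught safetensors exception can be pickled and, if not, re-raises it as a plain RuntimeError carrying repr(e); one plausible motivation is that an unpicklable exception cannot be propagated across process or executor boundaries. A minimal illustration of that guard (not the repository's code path):

```python
import pickle

def ensure_picklable(e: BaseException) -> BaseException:
    """Mirror of the guard above: substitute RuntimeError(repr(e)) when e cannot be pickled."""
    try:
        pickle.dumps(e)
    except pickle.PickleError:
        return RuntimeError(repr(e))
    return e

assert isinstance(ensure_picklable(ValueError("fine")), ValueError)
# An exception argument pickle cannot serialize by reference, e.g. a lambda:
assert isinstance(ensure_picklable(ValueError(lambda: None)), RuntimeError)
```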
@@ -156,7 +156,8 @@ async def client(tmp_path_factory, request) -> AsyncGenerator[Any, Any]:


def _prepare_for_workflows() -> dict[str, Traversable]:
add_known_models("loras", KNOWN_LORAS, CivitFile(13941, 16576, "epi_noiseoffset2.safetensors"))

add_known_models("loras", HuggingFile("artificialguybr/pixelartredmond-1-5v-pixel-art-loras-for-sd-1-5", "PixelArtRedmond15V-PixelArt-PIXARFK.safetensors"))
add_known_models("checkpoints", HuggingFile("autismanon/modeldump", "cardosAnime_v20.safetensors"))

return {f.name: f for f in importlib.resources.files(workflows).iterdir() if f.is_file() and f.name.endswith(".json")}
@@ -169,7 +170,7 @@ async def test_workflow(workflow_name: str, workflow_file: Traversable, has_gpu:
pytest.skip("requires gpu")

if "compile" in workflow_name:
pytest.skip("compilation has regressed in 0.4.0 because upcast weights are now permitted to be compiled, causing OOM errors in most cases")
pytest.skip("compilation has regressed in 0.4.0 and later because upcast weights are now permitted to be compiled, causing OOM errors in most cases")
return

workflow = json.loads(workflow_file.read_text(encoding="utf8"))
@@ -1,164 +1,276 @@
{
"6": {
  "inputs": {
    "text": "cute anime girl with gigantic fennec ears and a big fluffy fox tail with long wavy blonde hair and large blue eyes blonde colored eyelashes wearing a pink sweater a large oversized gold trimmed black winter coat and a long blue maxi skirt and a red scarf, she is happy while singing on stage like an idol while holding a microphone, there are colorful lights, it is a postcard held by a hand in front of a beautiful city at sunset and there is cursive writing that says \"Hunyuan Image\"",
    "clip": ["38", 0]
  },
  "class_type": "CLIPTextEncode",
  "_meta": {"title": "CLIP Text Encode (Positive Prompt)"}
"3": {
  "inputs": {
    "seed": 215668140279030,
    "steps": 1,
    "cfg": 3.5,
    "sampler_name": "euler",
    "scheduler": "simple",
    "denoise": 1,
    "model": ["13", 0],
    "positive": ["6", 0],
    "negative": ["7", 0],
    "latent_image": ["29", 0]
  },
"8": {
  "inputs": {"samples": ["13", 0], "vae": ["10", 0]},
  "class_type": "VAEDecode",
  "_meta": {"title": "VAE Decode"}
},
"9": {
  "inputs": {"filename_prefix": "ComfyUI", "images": ["8", 0]},
  "class_type": "SaveImage",
  "_meta": {"title": "Save Image"}
},
"10": {
  "inputs": {"vae_name": "hunyuan_image_2.1_vae_fp16.safetensors"},
  "class_type": "VAELoader",
  "_meta": {"title": "Load VAE"}
},
"12": {
  "inputs": {"unet_name": "hunyuanimage2.1_bf16.safetensors", "weight_dtype": "default"},
  "class_type": "UNETLoader",
  "_meta": {"title": "Load Diffusion Model"}
},
"13": {
  "inputs": {"noise": ["25", 0], "guider": ["22", 0], "sampler": ["16", 0], "sigmas": ["17", 0], "latent_image": ["27", 0]},
  "class_type": "SamplerCustomAdvanced",
  "_meta": {"title": "SamplerCustomAdvanced"}
},
"16": {
  "inputs": {"sampler_name": "euler"},
  "class_type": "KSamplerSelect",
  "_meta": {"title": "KSamplerSelect"}
},
"17": {
  "inputs": {"scheduler": "simple", "steps": 20, "denoise": 1.0, "model": ["12", 0]},
  "class_type": "BasicScheduler",
  "_meta": {"title": "BasicScheduler"}
},
"22": {
  "inputs": {"model": ["12", 0], "conditioning": ["6", 0]},
  "class_type": "BasicGuider",
  "_meta": {"title": "BasicGuider"}
},
"25": {
  "inputs": {"noise_seed": 435922656034510},
  "class_type": "RandomNoise",
  "_meta": {"title": "RandomNoise"}
},
"27": {
  "inputs": {"width": 1024, "height": 1024, "batch_size": 1, "color": 0},
  "class_type": "EmptyLatentImage",
  "_meta": {"title": "Empty Latent Image"}
},
"38": {
  "inputs": {"clip_name1": "qwen_2.5_vl_7b.safetensors", "clip_name2": "byt5_small_glyphxl_fp16.safetensors", "type": "sdxl", "device": "default"},
  "class_type": "DualCLIPLoader",
  "_meta": {"title": "DualCLIPLoader"}
  "class_type": "KSampler",
  "_meta": {"title": "KSampler"}
},
"6": {
  "inputs": {
    "text": "cute anime girl with massive fennec ears and a big fluffy fox tail with long wavy blonde hair between eyes and large blue eyes blonde colored eyelashes chubby wearing oversized clothes summer uniform large black coat long blue maxi skirt muddy clothes happy sitting on the side of the road in a run down dark gritty cyberpunk city with neon and a crumbling skyscraper in the rain at night while dipping her feet in a river of water she is holding a sign that says \"ComfyUI is the best\" and another one that says \"The Future is Comfy\"",
    "clip": ["26", 0]
  },
  "class_type": "CLIPTextEncode",
  "_meta": {"title": "CLIP Text Encode (Prompt)"}
},
"7": {
  "inputs": {
    "text": "low quality, bad anatomy, extra digits, missing digits, extra limbs, missing limbs",
    "clip": ["26", 0]
  },
  "class_type": "CLIPTextEncode",
  "_meta": {"title": "CLIP Text Encode (Prompt)"}
},
"8": {
  "inputs": {"samples": ["3", 0], "vae": ["15", 0]},
  "class_type": "VAEDecode",
  "_meta": {"title": "VAE Decode"}
},
"13": {
  "inputs": {"unet_name": "hunyuanimage2.1_bf16.safetensors", "weight_dtype": "default"},
  "class_type": "UNETLoader",
  "_meta": {"title": "Load Diffusion Model"}
},
"15": {
  "inputs": {"vae_name": "hunyuan_image_2.1_vae_fp16.safetensors"},
  "class_type": "VAELoader",
  "_meta": {"title": "Load VAE"}
},
"26": {
  "inputs": {"clip_name1": "qwen_2.5_vl_7b.safetensors", "clip_name2": "byt5_small_glyphxl_fp16.safetensors", "type": "hunyuan_image", "device": "default"},
  "class_type": "DualCLIPLoader",
  "_meta": {"title": "DualCLIPLoader"}
},
"29": {
  "inputs": {"width": 2048, "height": 2048, "batch_size": 1},
  "class_type": "EmptyHunyuanImageLatent",
  "_meta": {"title": "EmptyHunyuanImageLatent"}
},
"41": {
  "inputs": {"filename_prefix": "ComfyUI", "images": ["66:61", 0]},
  "class_type": "SaveImage",
  "_meta": {"title": "Save Image"}
},
"67": {
  "inputs": {},
  "class_type": "GroupOffload",
  "_meta": {"title": "GroupOffload"}
},
"66:58": {
  "inputs": {"vae_name": "hunyuan_image_refiner_vae_fp16.safetensors"},
  "class_type": "VAELoader",
  "_meta": {"title": "Load VAE"}
},
"66:59": {
  "inputs": {"pixels": ["8", 0], "vae": ["66:58", 0]},
  "class_type": "VAEEncode",
  "_meta": {"title": "VAE Encode"}
},
"66:61": {
  "inputs": {"samples": ["66:63", 0], "vae": ["66:58", 0]},
  "class_type": "VAEDecode",
  "_meta": {"title": "VAE Decode"}
},
"66:62": {
  "inputs": {"noise_augmentation": 0.1, "positive": ["66:64", 0], "negative": ["66:65", 0], "latent": ["66:59", 0]},
  "class_type": "HunyuanRefinerLatent",
  "_meta": {"title": "HunyuanRefinerLatent"}
},
"66:64": {
  "inputs": {
    "text": "<|start_header_id|>system<|end_header_id|>\n\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nMake the image high quality\n<|eot_id|>",
    "clip": ["26", 0]
  },
  "class_type": "CLIPTextEncode",
  "_meta": {"title": "CLIP Text Encode (Prompt)"}
},
"66:65": {
  "inputs": {
    "text": "<|start_header_id|>system<|end_header_id|>\n\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n\n<|eot_id|>",
    "clip": ["26", 0]
  },
  "class_type": "CLIPTextEncode",
  "_meta": {"title": "CLIP Text Encode (Prompt)"}
},
"66:60": {
  "inputs": {"unet_name": "hunyuanimage2.1_refiner_bf16.safetensors", "weight_dtype": "default"},
  "class_type": "UNETLoader",
  "_meta": {"title": "Load Diffusion Model"}
},
"66:66": {
  "inputs": {"model": ["66:60", 0]},
  "class_type": "GroupOffload",
  "_meta": {"title": "GroupOffload"}
},
"66:63": {
  "inputs": {
    "seed": 770039891896361,
    "steps": 1,
    "cfg": 1,
    "sampler_name": "euler",
    "scheduler": "simple",
    "denoise": 1,
    "model": ["66:66", 0],
    "positive": ["66:62", 0],
    "negative": ["66:62", 1],
    "latent_image": ["66:62", 2]
  },
  "class_type": "KSampler",
  "_meta": {"title": "KSampler"}
}
}
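These workflow files are in ComfyUI's API (prompt) format, a flat mapping of node id to {"inputs", "class_type"}. A hedged sketch of how such a file can be queued against a running ComfyUI server (assumes the default listen address 127.0.0.1:8188; not how the test suite itself drives the server):

```python
import json
import urllib.request

def queue_workflow(path: str, host: str = "127.0.0.1:8188") -> dict:
    # Load an API-format workflow and POST it to the server's /prompt endpoint.
    with open(path, "r", encoding="utf8") as f:
        workflow = json.load(f)
    req = urllib.request.Request(
        f"http://{host}/prompt",
        data=json.dumps({"prompt": workflow}).encode("utf8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())
```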
@@ -1,175 +1,141 @@
{
"1": {
  "inputs": {"noise": ["2", 0], "guider": ["3", 0], "sampler": ["6", 0], "sigmas": ["7", 0], "latent_image": ["9", 0]},
  "class_type": "SamplerCustomAdvanced",
  "_meta": {"title": "SamplerCustomAdvanced"}
"9": {
  "inputs": {"filename_prefix": "Ovis_2.5", "images": ["57:43", 0]},
"2": {
  "inputs": {"noise_seed": 1038979},
  "class_type": "RandomNoise",
  "_meta": {"title": "RandomNoise"}
},
"3": {
  "inputs": {"model": ["12", 0], "conditioning": ["4", 0]},
  "class_type": "BasicGuider",
  "_meta": {"title": "BasicGuider"}
},
"4": {
  "inputs": {"guidance": 3, "conditioning": ["13", 0]},
  "class_type": "FluxGuidance",
  "_meta": {"title": "FluxGuidance"}
},
"6": {
  "inputs": {"sampler_name": "euler"},
  "class_type": "KSamplerSelect",
  "_meta": {"title": "KSamplerSelect"}
},
"7": {
  "inputs": {"scheduler": "ddim_uniform", "steps": 1, "denoise": 1, "model": ["12", 0]},
  "class_type": "BasicScheduler",
  "_meta": {"title": "BasicScheduler"}
},
"9": {
  "inputs": {"width": 1024, "height": 1024, "batch_size": 1},
  "class_type": "EmptySD3LatentImage",
  "_meta": {"title": "EmptySD3LatentImage"}
},
"10": {
  "inputs": {"samples": ["1", 0], "vae": ["11", 0]},
  "class_type": "VAEDecode",
  "_meta": {"title": "VAE Decode"}
},
"11": {
  "inputs": {"vae_name": "ae.safetensors"},
  "class_type": "VAELoader",
  "_meta": {"title": "Load VAE"}
},
"12": {
  "inputs": {"unet_name": "ovis_image_bf16.safetensors", "weight_dtype": "default"},
  "class_type": "UNETLoader",
  "_meta": {"title": "Load Diffusion Model"}
},
"13": {
  "inputs": {"text": "A photograph of a sheep (Ovis aries) valid for testing.", "clip": ["15", 0]},
  "class_type": "CLIPTextEncode",
  "_meta": {"title": "CLIP Text Encode (Prompt)"}
},
"15": {
  "inputs": {"clip_name1": "clip_l.safetensors", "clip_name2": "t5xxl_fp16.safetensors", "type": "flux"},
  "class_type": "DualCLIPLoader",
  "_meta": {"title": "DualCLIPLoader"}
},
"16": {
  "inputs": {"filename_prefix": "ComfyUI", "images": ["10", 0]},
  "class_type": "SaveImage",
  "_meta": {"title": "Save Image"}
  "class_type": "SaveImage",
  "_meta": {"title": "Save Image"}
},
"57:43": {
  "inputs": {"samples": ["57:44", 0], "vae": ["57:40", 0]},
  "class_type": "VAEDecode",
  "_meta": {"title": "VAE Decode"}
},
"57:46": {
  "inputs": {"unet_name": "ovis_image_bf16.safetensors", "weight_dtype": "default"},
  "class_type": "UNETLoader",
  "_meta": {"title": "Load Diffusion Model"}
},
"57:39": {
  "inputs": {"clip_name": "ovis_2.5.safetensors", "type": "ovis", "device": "default"},
  "class_type": "CLIPLoader",
  "_meta": {"title": "Load CLIP"}
},
"57:41": {
  "inputs": {"width": 1024, "height": 1024, "batch_size": 1},
  "class_type": "EmptySD3LatentImage",
  "_meta": {"title": "EmptySD3LatentImage"}
},
"57:49": {
  "inputs": {"text": "", "clip": ["57:39", 0]},
  "class_type": "CLIPTextEncode",
  "_meta": {"title": "CLIP Text Encode (Prompt)"}
},
"57:47": {
  "inputs": {"shift": 3, "model": ["57:46", 0]},
  "class_type": "ModelSamplingAuraFlow",
  "_meta": {"title": "ModelSamplingAuraFlow"}
},
"57:45": {
  "inputs": {
    "text": "Child lying on bed with balloons, streamers, and plush toys, surreal dreamlike scene. Close-up, centered on the child and surrounding objects. Soft, diffused lighting.",
    "clip": ["57:39", 0]
  },
  "class_type": "CLIPTextEncode",
  "_meta": {"title": "CLIP Text Encode (Prompt)"}
},
"57:44": {
  "inputs": {
    "seed": 775709856082329,
    "steps": 1,
    "cfg": 5,
    "sampler_name": "euler",
    "scheduler": "simple",
    "denoise": 1,
    "model": ["57:47", 0],
    "positive": ["57:45", 0],
    "negative": ["57:49", 0],
    "latent_image": ["57:41", 0]
  },
  "class_type": "KSampler",
  "_meta": {"title": "KSampler"}
},
"57:40": {
  "inputs": {"vae_name": "ae.safetensors"},
  "class_type": "VAELoader",
  "_meta": {"title": "Load VAE"}
}
}
@@ -106,7 +106,7 @@
},
"10": {
  "inputs": {
    "lora_name": "epi_noiseoffset2.safetensors",
    "lora_name": "PixelArtRedmond15V-PixelArt-PIXARFK.safetensors",
    "strength_model": 1,
    "strength_clip": 1,
    "model": [