Merge branch 'master' into rename-mahiro

2026-07-20 13:28:19 +08:00 · 2026-02-28 20:48:49 -08:00 · 2026-02-28 20:48:49 -08:00 · fa6c7eb86f
commit fa6c7eb86f
parent 8c41e2393b 1080bd442a
24 changed files with 747 additions and 108 deletions
--- a/app/frontend_management.py
+++ b/app/frontend_management.py
@ -17,7 +17,7 @@ from importlib.metadata import version
 import requests
 from typing_extensions import NotRequired
-from utils.install_util import get_missing_requirements_message, requirements_path
+from utils.install_util import get_missing_requirements_message, get_required_packages_versions
 from comfy.cli_args import DEFAULT_VERSION_STRING
 import app.logger
@ -45,25 +45,7 @@ def get_installed_frontend_version():
 def get_required_frontend_version():
-    """Get the required frontend version from requirements.txt."""
+    return get_required_packages_versions().get("comfyui-frontend-package", None)
    try:
        with open(requirements_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line.startswith("comfyui-frontend-package=="):
                    version_str = line.split("==")[-1]
                    if not is_valid_version(version_str):
                        logging.error(f"Invalid version format in requirements.txt: {version_str}")
                        return None
                    return version_str
            logging.error("comfyui-frontend-package not found in requirements.txt")
            return None
    except FileNotFoundError:
        logging.error("requirements.txt not found. Cannot determine required frontend version.")
        return None
    except Exception as e:
        logging.error(f"Error reading requirements.txt: {e}")
        return None
 def check_frontend_version():
@ -217,25 +199,7 @@ class FrontendManager:
    @classmethod
    def get_required_templates_version(cls) -> str:
-        """Get the required workflow templates version from requirements.txt."""
+        return get_required_packages_versions().get("comfyui-workflow-templates", None)
        try:
            with open(requirements_path, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if line.startswith("comfyui-workflow-templates=="):
                        version_str = line.split("==")[-1]
                        if not is_valid_version(version_str):
                            logging.error(f"Invalid templates version format in requirements.txt: {version_str}")
                            return None
                        return version_str
                logging.error("comfyui-workflow-templates not found in requirements.txt")
                return None
        except FileNotFoundError:
            logging.error("requirements.txt not found. Cannot determine required templates version.")
            return None
        except Exception as e:
            logging.error(f"Error reading requirements.txt: {e}")
            return None
    @classmethod
    def default_frontend_path(cls) -> str:
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@ -146,6 +146,7 @@ parser.add_argument("--reserve-vram", type=float, default=None, help="Set the am
 parser.add_argument("--async-offload", nargs='?', const=2, type=int, default=None, metavar="NUM_STREAMS", help="Use async weight offloading. An optional argument controls the amount of offload streams. Default is 2. Enabled by default on Nvidia.")
 parser.add_argument("--disable-async-offload", action="store_true", help="Disable async weight offloading.")
 parser.add_argument("--disable-dynamic-vram", action="store_true", help="Disable dynamic VRAM and use estimate based model loading.")
 parser.add_argument("--force-non-blocking", action="store_true", help="Force ComfyUI to use non-blocking operations for all applicable tensors. This may improve performance on some non-Nvidia systems but can cause issues with some workflows.")
@ -159,7 +160,6 @@ class PerformanceFeature(enum.Enum):
    Fp8MatrixMultiplication = "fp8_matrix_mult"
    CublasOps = "cublas_ops"
    AutoTune = "autotune"
    DynamicVRAM = "dynamic_vram"
 parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. This is used to test new features so using it might crash your comfyui. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: {}".format(" ".join(map(lambda c: c.value, PerformanceFeature))))
@ -260,4 +260,4 @@ else:
    args.fast = set(args.fast)
 def enables_dynamic_vram():
-    return PerformanceFeature.DynamicVRAM in args.fast and not args.highvram and not args.gpu_only
+    return not args.disable_dynamic_vram and not args.highvram and not args.gpu_only and not args.novram and not args.cpu
--- a/comfy/ldm/wan/model.py
+++ b/comfy/ldm/wan/model.py
@ -1621,3 +1621,118 @@ class HumoWanModel(WanModel):
        # unpatchify
        x = self.unpatchify(x, grid_sizes)
        return x
 class SCAILWanModel(WanModel):
    """WanModel subclass with an extra patch embedding for pose latents.

    Pose latents are patch-embedded by a dedicated Conv3d, appended to the
    main token sequence for the transformer blocks, and removed again after
    the head. An optional reference latent is prepended to the input along
    dim 2 and removed from the output after unpatchify.
    """
    def __init__(self, model_type="scail", patch_size=(1, 2, 2), in_dim=20, dim=5120, operations=None, device=None, dtype=None, **kwargs):
        """Build the parent model and add the pose patch embedding.

        NOTE: the `model_type` argument is not forwarded; the parent is
        always constructed with model_type='i2v'.
        """
        super().__init__(model_type='i2v', patch_size=patch_size, in_dim=in_dim, dim=dim, operations=operations, device=device, dtype=dtype, **kwargs)
        # Separate embedding used only for pose latents; it is created in float32 regardless of `dtype`.
        self.patch_embedding_pose = operations.Conv3d(in_dim, dim, kernel_size=patch_size, stride=patch_size, device=device, dtype=torch.float32)
    def forward_orig(self, x, t, context, clip_fea=None, freqs=None, transformer_options={}, pose_latents=None, reference_latent=None, **kwargs):
        """Embed the inputs, run every transformer block, and unpatchify the result.

        Args:
            x: input latent; patch-embedded after the optional reference concat.
            t: timesteps used for the sinusoidal time embedding.
            context: text conditioning passed through `self.text_embedding`.
            clip_fea: optional image features; embedded with `self.img_emb` when that is set.
            freqs: positional frequencies handed to every block.
            transformer_options: options dict; this method writes several keys into it.
            pose_latents: optional pose latents appended as extra tokens.
            reference_latent: optional latent prepended to `x` along dim 2.

        Returns:
            The unpatchified output with pose tokens and reference entries removed.
        """
        if reference_latent is not None:
            # Prepend the reference along dim 2; the matching slice is removed before returning.
            x = torch.cat((reference_latent, x), dim=2)
        # embeddings
        x = self.patch_embedding(x.float()).to(x.dtype)
        grid_sizes = x.shape[2:]
        # Stored in the dict supplied by the caller (no copy is made here).
        transformer_options["grid_sizes"] = grid_sizes
        x = x.flatten(2).transpose(1, 2)
        scail_pose_seq_len = 0
        if pose_latents is not None:
            scail_x = self.patch_embedding_pose(pose_latents.float()).to(x.dtype)
            scail_x = scail_x.flatten(2).transpose(1, 2)
            # Remember how many pose tokens are appended so they can be cut off after the head.
            scail_pose_seq_len = scail_x.shape[1]
            x = torch.cat([x, scail_x], dim=1)
            del scail_x
        # time embeddings
        e = self.time_embedding(sinusoidal_embedding_1d(self.freq_dim, t.flatten()).to(dtype=x[0].dtype))
        e = e.reshape(t.shape[0], -1, e.shape[-1])
        e0 = self.time_projection(e).unflatten(2, (6, self.dim))
        # context
        context = self.text_embedding(context)
        context_img_len = None
        if clip_fea is not None:
            if self.img_emb is not None:
                context_clip = self.img_emb(clip_fea)  # bs x 257 x dim
                context = torch.cat([context_clip, context], dim=1)
            # Set whenever clip_fea is given, even if img_emb is None.
            context_img_len = clip_fea.shape[-2]
        patches_replace = transformer_options.get("patches_replace", {})
        blocks_replace = patches_replace.get("dit", {})
        transformer_options["total_blocks"] = len(self.blocks)
        transformer_options["block_type"] = "double"
        for i, block in enumerate(self.blocks):
            transformer_options["block_index"] = i
            if ("double_block", i) in blocks_replace:
                # A replacement is registered for this block: hand it the inputs as a dict
                # plus a wrapper that calls the original block with that dict's entries.
                def block_wrap(args):
                    out = {}
                    out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len, transformer_options=args["transformer_options"])
                    return out
                out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs, "transformer_options": transformer_options}, {"original_block": block_wrap})
                x = out["img"]
            else:
                x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len, transformer_options=transformer_options)
        # head
        x = self.head(x, e)
        if scail_pose_seq_len > 0:
            # Drop the pose tokens that were appended at the end of the sequence.
            x = x[:, :-scail_pose_seq_len]
        # unpatchify
        x = self.unpatchify(x, grid_sizes)
        if reference_latent is not None:
            # Drop the leading entries along dim 2 that correspond to the prepended reference latent.
            x = x[:, :, reference_latent.shape[2]:]
        return x
    def rope_encode(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, device=None, dtype=None, pose_latents=None, reference_latent=None, transformer_options={}):
        """Return RoPE frequencies for the main tokens, followed by the pose tokens if present.

        Without `pose_latents` this returns the parent's result unchanged. Otherwise a
        second set of frequencies is computed for the pose grid and concatenated along dim 1.
        """
        main_freqs = super().rope_encode(t, h, w, t_start=t_start, steps_t=steps_t, steps_h=steps_h, steps_w=steps_w, device=device, dtype=dtype, transformer_options=transformer_options)
        if pose_latents is None:
            return main_freqs
        ref_t_patches = 0
        if reference_latent is not None:
            # Reference length along dim 2 divided by the temporal patch size,
            # adding half a patch before the floor division.
            ref_t_patches = (reference_latent.shape[2] + (self.patch_size[0] // 2)) // self.patch_size[0]
        F_pose, H_pose, W_pose = pose_latents.shape[-3], pose_latents.shape[-2], pose_latents.shape[-1]
        # if pose is at half resolution, scale_y/scale_x=2 stretches the position range to cover the same RoPE extent as the main frames
        h_scale = h / H_pose
        w_scale = w / W_pose
        # 120 w-offset and shift 0.5 to place positions at midpoints (0.5, 2.5, ...) to match the original code
        h_shift = (h_scale - 1) / 2
        w_shift = (w_scale - 1) / 2
        # The pose call uses its own rope_options; the caller's transformer_options are not forwarded to it.
        pose_transformer_options = {"rope_options": {"shift_y": h_shift, "shift_x": 120.0 + w_shift, "scale_y": h_scale, "scale_x": w_scale}}
        # Pose positions start after the reference patches on the temporal axis.
        pose_freqs = super().rope_encode(F_pose, H_pose, W_pose, t_start=t_start+ref_t_patches, device=device, dtype=dtype, transformer_options=pose_transformer_options)
        return torch.cat([main_freqs, pose_freqs], dim=1)
    def _forward(self, x, timestep, context, clip_fea=None, time_dim_concat=None, transformer_options={}, pose_latents=None, **kwargs):
        """Pad the inputs to the patch size, build RoPE frequencies, and call `forward_orig`.

        `reference_latent` is taken from `kwargs` when present. The result is cropped
        back to the original (t, h, w) of `x` as measured before padding.
        """
        bs, c, t, h, w = x.shape
        x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size)
        if pose_latents is not None:
            pose_latents = comfy.ldm.common_dit.pad_to_patch_size(pose_latents, self.patch_size)
        # Temporal length used for RoPE; grows with time_dim_concat and reference_latent.
        t_len = t
        if time_dim_concat is not None:
            time_dim_concat = comfy.ldm.common_dit.pad_to_patch_size(time_dim_concat, self.patch_size)
            x = torch.cat([x, time_dim_concat], dim=2)
            t_len = x.shape[2]
        reference_latent = None
        if "reference_latent" in kwargs:
            # Popped so it is passed explicitly below rather than a second time through **kwargs.
            reference_latent = comfy.ldm.common_dit.pad_to_patch_size(kwargs.pop("reference_latent"), self.patch_size)
            t_len += reference_latent.shape[2]
        freqs = self.rope_encode(t_len, h, w, device=x.device, dtype=x.dtype, transformer_options=transformer_options, pose_latents=pose_latents, reference_latent=reference_latent)
        return self.forward_orig(x, timestep, context, clip_fea=clip_fea, freqs=freqs, transformer_options=transformer_options, pose_latents=pose_latents, reference_latent=reference_latent, **kwargs)[:, :, :t, :h, :w]
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@ -925,6 +925,25 @@ class Flux(BaseModel):
            out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()[2:]), ref_latents))])
        return out
 class LongCatImage(Flux):
    """Flux subclass that supplies default RoPE shift options and removes the guidance cond."""
    def _apply_model(self, x, t, c_concat=None, c_crossattn=None, control=None, transformer_options={}, **kwargs):
        """Call the parent `_apply_model` with default rope shifts filled in.

        Both the options dict and its nested "rope_options" are copied first,
        so the caller's objects are left unmodified. Values already present in
        "rope_options" take precedence over the defaults.
        """
        options = transformer_options.copy()
        rope = dict(options.get("rope_options", {}))
        for key, default in (("shift_t", 1.0), ("shift_y", 512.0), ("shift_x", 512.0)):
            rope.setdefault(key, default)
        options["rope_options"] = rope
        return super()._apply_model(x, t, c_concat, c_crossattn, control, options, **kwargs)
    def encode_adm(self, **kwargs):
        """Always return None."""
        return None
    def extra_conds(self, **kwargs):
        """Return the parent's extra conds with any 'guidance' entry removed."""
        conds = super().extra_conds(**kwargs)
        conds.pop('guidance', None)
        return conds
 class Flux2(Flux):
    def extra_conds(self, **kwargs):
        out = super().extra_conds(**kwargs)
@ -1483,6 +1502,44 @@ class WAN21_FlowRVS(WAN21):
        super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel)
        self.image_to_video = image_to_video
 class WAN21_SCAIL(WAN21):
    """WAN21 variant that uses SCAILWanModel and adds reference and pose latent conds."""
    def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None):
        """Initialise with SCAILWanModel as the unet and register the extra cond names."""
        # super(WAN21, self) starts the lookup after WAN21, so WAN21.__init__ itself is not run.
        super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.SCAILWanModel)
        self.image_to_video = image_to_video
        self.memory_usage_factor_conds = ("reference_latent", "pose_latents")
        self.memory_usage_shape_process = {"pose_latents": lambda shape: [shape[0], shape[1], 1.5, shape[-2], shape[-1]]}
    def extra_conds(self, **kwargs):
        """Extend the parent's conds with 'reference_latent' and 'pose_latents' when provided.

        Each latent goes through `process_latent_in` and then gets a block of
        ones, shaped like its first four channels, concatenated along dim 1.
        """
        conds = super().extra_conds(**kwargs)
        references = kwargs.get("reference_latents", None)
        if references is not None:
            # Only the last reference latent in the list is used.
            latent = self.process_latent_in(references[-1])
            latent = torch.cat([latent, torch.ones_like(latent[:, :4])], dim=1)
            conds['reference_latent'] = comfy.conds.CONDRegular(latent)
        pose = kwargs.get("pose_video_latent", None)
        if pose is not None:
            pose = self.process_latent_in(pose)
            pose = torch.cat([pose, torch.ones_like(pose[:, :4])], dim=1)
            conds['pose_latents'] = comfy.conds.CONDRegular(pose)
        return conds
    def extra_conds_shapes(self, **kwargs):
        """Return the shapes reported for the reference and pose conds that are present."""
        shapes = {}
        references = kwargs.get("reference_latents", None)
        if references is not None:
            element_count = sum(math.prod(ref.size()) for ref in references)
            shapes['reference_latent'] = [1, 20, element_count // 16]
        pose = kwargs.get("pose_video_latent", None)
        if pose is not None:
            shapes['pose_latents'] = [pose.shape[0], 20, *pose.shape[2:]]
        return shapes
 class Hunyuan3Dv2(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hunyuan3d.model.Hunyuan3Dv2)
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@ -279,6 +279,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
            dit_config["txt_norm"] = any_suffix_in(state_dict_keys, key_prefix, 'txt_norm.', ["weight", "scale"])
            if dit_config["yak_mlp"] and dit_config["txt_norm"]:  # Ovis model
                dit_config["txt_ids_dims"] = [1, 2]
            if dit_config.get("context_in_dim") == 3584 and dit_config["vec_in_dim"] is None:  # LongCat-Image
                dit_config["txt_ids_dims"] = [1, 2]
        return dit_config
@ -496,6 +498,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
            dit_config["model_type"] = "humo"
        elif '{}face_adapter.fuser_blocks.0.k_norm.weight'.format(key_prefix) in state_dict_keys:
            dit_config["model_type"] = "animate"
        elif '{}patch_embedding_pose.weight'.format(key_prefix) in state_dict_keys:
            dit_config["model_type"] = "scail"
        else:
            if '{}img_emb.proj.0.bias'.format(key_prefix) in state_dict_keys:
                dit_config["model_type"] = "i2v"
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@ -180,6 +180,14 @@ def is_ixuca():
        return True
    return False
 def is_wsl():
    """Return True when the kernel release string ends with a known WSL suffix."""
    release = platform.uname().release
    return release.endswith(("-Microsoft", "microsoft-standard-WSL2"))
 def get_torch_device():
    global directml_enabled
    global cpu_state
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@ -308,15 +308,22 @@ class ModelPatcher:
    def get_free_memory(self, device):
        """Return the result of comfy.model_management.get_free_memory for `device`."""
        available = comfy.model_management.get_free_memory(device)
        return available
-    def clone(self, disable_dynamic=False):
+    def get_clone_model_override(self):
        return self.model, (self.backup, self.object_patches_backup, self.pinned)
    def clone(self, disable_dynamic=False, model_override=None):
        class_ = self.__class__
        model = self.model
        if self.is_dynamic() and disable_dynamic:
            class_ = ModelPatcher
-            temp_model_patcher = self.cached_patcher_init[0](*self.cached_patcher_init[1], disable_dynamic=True)
+            if model_override is None:
-            model = temp_model_patcher.model
+                if self.cached_patcher_init is None:
                    raise RuntimeError("Cannot create non-dynamic delegate: cached_patcher_init is not initialized.")
                temp_model_patcher = self.cached_patcher_init[0](*self.cached_patcher_init[1], disable_dynamic=True)
                model_override = temp_model_patcher.get_clone_model_override()
        if model_override is None:
            model_override = self.get_clone_model_override()
-        n = class_(model, self.load_device, self.offload_device, self.model_size(), weight_inplace_update=self.weight_inplace_update)
+        n = class_(model_override[0], self.load_device, self.offload_device, self.model_size(), weight_inplace_update=self.weight_inplace_update)
        n.patches = {}
        for k in self.patches:
            n.patches[k] = self.patches[k][:]
@ -325,13 +332,12 @@ class ModelPatcher:
        n.object_patches = self.object_patches.copy()
        n.weight_wrapper_patches = self.weight_wrapper_patches.copy()
        n.model_options = comfy.utils.deepcopy_list_dict(self.model_options)
        n.backup = self.backup
        n.object_patches_backup = self.object_patches_backup
        n.parent = self
        n.pinned = self.pinned
        n.force_cast_weights = self.force_cast_weights
        n.backup, n.object_patches_backup, n.pinned = model_override[1]
        # attachments
        n.attachments = {}
        for k in self.attachments:
@ -1435,6 +1441,7 @@ class ModelPatcherDynamic(ModelPatcher):
            del self.model.model_loaded_weight_memory
        if not hasattr(self.model, "dynamic_vbars"):
            self.model.dynamic_vbars = {}
        self.non_dynamic_delegate_model = None
        assert load_device is not None
    def is_dynamic(self):
@ -1669,4 +1676,10 @@ class ModelPatcherDynamic(ModelPatcher):
    def unpatch_hooks(self, whitelist_keys_set: set[str]=None) -> None:
        """No-op: this implementation ignores `whitelist_keys_set` and does nothing."""
        return None
    def get_non_dynamic_delegate(self):
        """Return a clone made with disable_dynamic=True, caching its model override.

        The value stored in `self.non_dynamic_delegate_model` is passed to
        `clone` as `model_override` and is then replaced by the override
        reported by the new clone.
        """
        delegate = self.clone(disable_dynamic=True, model_override=self.non_dynamic_delegate_model)
        self.non_dynamic_delegate_model = delegate.get_clone_model_override()
        return delegate
 CoreModelPatcher = ModelPatcher
--- a/comfy/sampler_helpers.py
+++ b/comfy/sampler_helpers.py
@ -66,6 +66,18 @@ def convert_cond(cond):
        out.append(temp)
    return out
 def cond_has_hooks(cond):
    """Return True if any entry of `cond` carries hooks.

    The second element of each entry is inspected: a "hooks" key counts
    directly, and a "control" value counts when its `get_extra_hooks()`
    result is non-empty.
    """
    for entry in cond:
        options = entry[1]
        if "hooks" in options:
            return True
        if "control" not in options:
            continue
        if len(options["control"].get_extra_hooks()) > 0:
            return True
    return False
 def get_additional_models(conds, dtype):
    """loads additional models in conditioning"""
    cnets: list[ControlBase] = []
--- a/comfy/samplers.py
+++ b/comfy/samplers.py
@ -946,6 +946,8 @@ class CFGGuider:
    def inner_set_conds(self, conds):
        """Convert each cond and store it in `self.original_conds` under the same key.

        While the current model patcher is dynamic, a cond that has hooks
        makes this guider switch to the patcher's non-dynamic delegate.
        """
        for key in conds:
            cond = conds[key]
            patcher = self.model_patcher
            if patcher.is_dynamic() and comfy.sampler_helpers.cond_has_hooks(cond):
                self.model_patcher = patcher.get_non_dynamic_delegate()
            self.original_conds[key] = comfy.sampler_helpers.convert_cond(cond)
    def __call__(self, *args, **kwargs):
--- a/comfy/sd.py
+++ b/comfy/sd.py
@ -60,6 +60,7 @@ import comfy.text_encoders.jina_clip_2
 import comfy.text_encoders.newbie
 import comfy.text_encoders.anima
 import comfy.text_encoders.ace15
 import comfy.text_encoders.longcat_image
 import comfy.model_patcher
 import comfy.lora
@ -203,7 +204,7 @@ def load_bypass_lora_for_models(model, clip, lora, strength_model, strength_clip
 class CLIP:
-    def __init__(self, target=None, embedding_directory=None, no_init=False, tokenizer_data={}, parameters=0, state_dict=[], model_options={}):
+    def __init__(self, target=None, embedding_directory=None, no_init=False, tokenizer_data={}, parameters=0, state_dict=[], model_options={}, disable_dynamic=False):
        if no_init:
            return
        params = target.params.copy()
@ -232,7 +233,8 @@ class CLIP:
        model_management.archive_model_dtypes(self.cond_stage_model)
        self.tokenizer = tokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
-        self.patcher = comfy.model_patcher.CoreModelPatcher(self.cond_stage_model, load_device=load_device, offload_device=offload_device)
+        ModelPatcher = comfy.model_patcher.ModelPatcher if disable_dynamic else comfy.model_patcher.CoreModelPatcher
        self.patcher = ModelPatcher(self.cond_stage_model, load_device=load_device, offload_device=offload_device)
        #Match torch.float32 hardcode upcast in TE implemention
        self.patcher.set_model_compute_dtype(torch.float32)
        self.patcher.hook_mode = comfy.hooks.EnumHookMode.MinVram
@ -266,9 +268,9 @@ class CLIP:
        logging.info("CLIP/text encoder model load device: {}, offload device: {}, current: {}, dtype: {}".format(load_device, offload_device, params['device'], dtype))
        self.tokenizer_options = {}
-    def clone(self):
+    def clone(self, disable_dynamic=False):
        n = CLIP(no_init=True)
-        n.patcher = self.patcher.clone()
+        n.patcher = self.patcher.clone(disable_dynamic=disable_dynamic)
        n.cond_stage_model = self.cond_stage_model
        n.tokenizer = self.tokenizer
        n.layer_idx = self.layer_idx
@ -1160,16 +1162,24 @@ class CLIPType(Enum):
    KANDINSKY5_IMAGE = 23
    NEWBIE = 24
    FLUX2 = 25
    LONGCAT_IMAGE = 26
-def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
+
 def load_clip_model_patcher(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}, disable_dynamic=False):
    """Load a text encoder via `load_clip` and return only its model patcher."""
    return load_clip(ckpt_paths, embedding_directory, clip_type, model_options, disable_dynamic).patcher
 def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}, disable_dynamic=False):
    clip_data = []
    for p in ckpt_paths:
        sd, metadata = comfy.utils.load_torch_file(p, safe_load=True, return_metadata=True)
        if model_options.get("custom_operations", None) is None:
            sd, metadata = comfy.utils.convert_old_quants(sd, model_prefix="", metadata=metadata)
        clip_data.append(sd)
-    return load_text_encoder_state_dicts(clip_data, embedding_directory=embedding_directory, clip_type=clip_type, model_options=model_options)
+    clip = load_text_encoder_state_dicts(clip_data, embedding_directory=embedding_directory, clip_type=clip_type, model_options=model_options, disable_dynamic=disable_dynamic)
    clip.patcher.cached_patcher_init = (load_clip_model_patcher, (ckpt_paths, embedding_directory, clip_type, model_options))
    return clip
 class TEModel(Enum):
@ -1274,7 +1284,7 @@ def llama_detect(clip_data):
    return {}
-def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
+def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}, disable_dynamic=False):
    clip_data = state_dicts
    class EmptyClass:
@ -1372,6 +1382,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
            if clip_type == CLIPType.HUNYUAN_IMAGE:
                clip_target.clip = comfy.text_encoders.hunyuan_image.te(byt5=False, **llama_detect(clip_data))
                clip_target.tokenizer = comfy.text_encoders.hunyuan_image.HunyuanImageTokenizer
            elif clip_type == CLIPType.LONGCAT_IMAGE:
                clip_target.clip = comfy.text_encoders.longcat_image.te(**llama_detect(clip_data))
                clip_target.tokenizer = comfy.text_encoders.longcat_image.LongCatImageTokenizer
            else:
                clip_target.clip = comfy.text_encoders.qwen_image.te(**llama_detect(clip_data))
                clip_target.tokenizer = comfy.text_encoders.qwen_image.QwenImageTokenizer
@ -1491,7 +1504,7 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
        parameters += comfy.utils.calculate_parameters(c)
        tokenizer_data, model_options = comfy.text_encoders.long_clipl.model_options_long_clip(c, tokenizer_data, model_options)
-    clip = CLIP(clip_target, embedding_directory=embedding_directory, parameters=parameters, tokenizer_data=tokenizer_data, state_dict=clip_data, model_options=model_options)
+    clip = CLIP(clip_target, embedding_directory=embedding_directory, parameters=parameters, tokenizer_data=tokenizer_data, state_dict=clip_data, model_options=model_options, disable_dynamic=disable_dynamic)
    return clip
 def load_gligen(ckpt_path):
@ -1536,8 +1549,10 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o
    out = load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options=te_model_options, metadata=metadata, disable_dynamic=disable_dynamic)
    if out is None:
        raise RuntimeError("ERROR: Could not detect model type of: {}\n{}".format(ckpt_path, model_detection_error_hint(ckpt_path, sd)))
-    if output_model:
+    if output_model and out[0] is not None:
        out[0].cached_patcher_init = (load_checkpoint_guess_config_model_only, (ckpt_path, embedding_directory, model_options, te_model_options))
    if output_clip and out[1] is not None:
        out[1].patcher.cached_patcher_init = (load_checkpoint_guess_config_clip_only, (ckpt_path, embedding_directory, model_options, te_model_options))
    return out
 def load_checkpoint_guess_config_model_only(ckpt_path, embedding_directory=None, model_options={}, te_model_options={}, disable_dynamic=False):
@ -1548,6 +1563,14 @@ def load_checkpoint_guess_config_model_only(ckpt_path, embedding_directory=None,
            disable_dynamic=disable_dynamic)
    return model
 def load_checkpoint_guess_config_clip_only(ckpt_path, embedding_directory=None, model_options={}, te_model_options={}, disable_dynamic=False):
    """Load a checkpoint with output_model=False and return the patcher of the resulting CLIP.

    The three positional flags (False, True, False) are forwarded unchanged to
    `load_checkpoint_guess_config`.
    """
    loaded = load_checkpoint_guess_config(
        ckpt_path, False, True, False,
        embedding_directory=embedding_directory,
        output_model=False,
        model_options=model_options,
        te_model_options=te_model_options,
        disable_dynamic=disable_dynamic,
    )
    # The CLIP object is the second element of the returned sequence.
    return loaded[1].patcher
 def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_clipvision=False, embedding_directory=None, output_model=True, model_options={}, te_model_options={}, metadata=None, disable_dynamic=False):
    clip = None
    clipvision = None
@ -1633,7 +1656,7 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
            clip_sd = model_config.process_clip_state_dict(sd)
            if len(clip_sd) > 0:
                parameters = comfy.utils.calculate_parameters(clip_sd)
-                clip = CLIP(clip_target, embedding_directory=embedding_directory, tokenizer_data=clip_sd, parameters=parameters, state_dict=clip_sd, model_options=te_model_options)
+                clip = CLIP(clip_target, embedding_directory=embedding_directory, tokenizer_data=clip_sd, parameters=parameters, state_dict=clip_sd, model_options=te_model_options, disable_dynamic=disable_dynamic)
            else:
                logging.warning("no CLIP/text encoder weights in checkpoint, the text encoder model will not be loaded.")
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@ -25,6 +25,7 @@ import comfy.text_encoders.kandinsky5
 import comfy.text_encoders.z_image
 import comfy.text_encoders.anima
 import comfy.text_encoders.ace15
 import comfy.text_encoders.longcat_image
 from . import supported_models_base
 from . import latent_formats
@ -1267,6 +1268,16 @@ class WAN21_FlowRVS(WAN21_T2V):
        out = model_base.WAN21_FlowRVS(self, image_to_video=True, device=device)
        return out
 class WAN21_SCAIL(WAN21_T2V):
    """Supported-model entry whose unet_config is image_model "wan2.1" with model_type "scail"."""
    unet_config = dict(image_model="wan2.1", model_type="scail")
    def get_model(self, state_dict, prefix="", device=None):
        """Build a model_base.WAN21_SCAIL with image_to_video disabled."""
        return model_base.WAN21_SCAIL(self, image_to_video=False, device=device)
 class Hunyuan3Dv2(supported_models_base.BASE):
    unet_config = {
        "image_model": "hunyuan3d2",
@ -1678,6 +1689,37 @@ class ACEStep15(supported_models_base.BASE):
        return supported_models_base.ClipTarget(comfy.text_encoders.ace15.ACE15Tokenizer, comfy.text_encoders.ace15.te(**detect))
-models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima]
+class LongCatImage(supported_models_base.BASE):
    unet_config = {
        "image_model": "flux",
        "guidance_embed": False,
        "vec_in_dim": None,
        "context_in_dim": 3584,
        "txt_ids_dims": [1, 2],
    }
    sampling_settings = {
    }
    unet_extra_config = {}
    latent_format = latent_formats.Flux
    memory_usage_factor = 2.5
    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
    vae_key_prefix = ["vae."]
    text_encoder_key_prefix = ["text_encoders."]
    def get_model(self, state_dict, prefix="", device=None):
        out = model_base.LongCatImage(self, device=device)
        return out
    def clip_target(self, state_dict={}):
        pref = self.text_encoder_key_prefix[0]
        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_7b.transformer.".format(pref))
        return supported_models_base.ClipTarget(comfy.text_encoders.longcat_image.LongCatImageTokenizer, comfy.text_encoders.longcat_image.te(**hunyuan_detect))
 models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, WAN21_SCAIL, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima]
 models += [SVD_img2vid]
--- a/comfy/text_encoders/ace15.py
+++ b/comfy/text_encoders/ace15.py
@ -328,14 +328,14 @@ class ACE15TEModel(torch.nn.Module):
                return getattr(self, self.lm_model).load_sd(sd)
    def memory_estimation_function(self, token_weight_pairs, device=None):
-        lm_metadata = token_weight_pairs["lm_metadata"]
+        lm_metadata = token_weight_pairs.get("lm_metadata", {})
        constant = self.constant
        if comfy.model_management.should_use_bf16(device):
            constant *= 0.5
        token_weight_pairs = token_weight_pairs.get("lm_prompt", [])
        num_tokens = sum(map(lambda a: len(a), token_weight_pairs))
-        num_tokens += lm_metadata['min_tokens']
+        num_tokens += lm_metadata.get("min_tokens", 0)
        return num_tokens * constant * 1024 * 1024
 def te(dtype_llama=None, llama_quantization_metadata=None, lm_model="qwen3_2b"):
--- a/comfy/text_encoders/longcat_image.py
+++ b/comfy/text_encoders/longcat_image.py
@ -0,0 +1,184 @@
 import re
 import numbers
 import torch
 from comfy import sd1_clip
 from comfy.text_encoders.qwen_image import Qwen25_7BVLITokenizer, Qwen25_7BVLIModel
 import logging
 logger = logging.getLogger(__name__)
 # Opening/closing quote characters that delimit "quoted" prompt text.
 QUOTE_PAIRS = [("'", "'"), ('"', '"'), ("\u2018", "\u2019"), ("\u201c", "\u201d")]
 # One alternative per quote pair: opening quote, the shortest run of characters
 # that are neither quote of the pair, closing quote.
 QUOTE_PATTERN = "|".join(
    [
        re.escape(q1) + r"[^" + re.escape(q1 + q2) + r"]*?" + re.escape(q2)
        for q1, q2 in QUOTE_PAIRS
    ]
 )
 # Apostrophes inside a word (e.g. "don't") must not be treated as quote marks.
 WORD_INTERNAL_QUOTE_RE = re.compile(r"[a-zA-Z]+'[a-zA-Z]+")
 def split_quotation(prompt):
    """Split ``prompt`` into ``(text, is_quoted)`` segments.

    Text enclosed in one of ``QUOTE_PAIRS`` becomes a segment with
    ``is_quoted=True`` (quotes included); everything else is returned with
    ``is_quoted=False``. Empty segments are dropped. Words containing an
    internal apostrophe are temporarily replaced by placeholders so their
    apostrophe cannot open or close a quotation.

    Args:
        prompt: The prompt text to split.

    Returns:
        A list of ``(segment, is_quoted)`` tuples in prompt order.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so placeholder
    # assignment is deterministic (iteration order of a set of str is not).
    words = list(dict.fromkeys(WORD_INTERNAL_QUOTE_RE.findall(prompt)))
    mapping = []
    for i, word_src in enumerate(words):
        # Index-tagged placeholders are never substrings of one another, so
        # restoring one cannot corrupt another (repeating a single placeholder
        # string i+1 times did: restoring "X" also rewrote the inside of "XX").
        word_tgt = f"longcat_$##$_{i}_longcat"
        prompt = prompt.replace(word_src, word_tgt)
        mapping.append((word_src, word_tgt))
    # With a single capturing group, re.split alternates plain text (even
    # indices) and the captured quoted spans (odd indices).
    parts = re.split(f"({QUOTE_PATTERN})", prompt)
    result = []
    for index, part in enumerate(parts):
        if not part:
            continue
        # Decide from the split position, not by re-matching the restored text:
        # restored apostrophes could otherwise make plain text look quoted.
        is_quoted = index % 2 == 1
        for word_src, word_tgt in mapping:
            part = part.replace(word_tgt, word_src)
        result.append((part, is_quoted))
    return result
 class LongCatImageBaseTokenizer(Qwen25_7BVLITokenizer):
    """Qwen2.5-VL tokenizer variant that tokenizes quoted text one character at a time."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Prompts are truncated / padded to this many tokens.
        self.max_length = 512

    def tokenize_with_weights(self, text, return_word_ids=False, **kwargs):
        """Tokenize ``text`` into a single list of ``(token_id, 1.0)`` pairs.

        Quoted segments (per ``split_quotation``) are fed to the tokenizer
        character by character; unquoted segments are tokenized whole. The
        result is cut to ``max_length`` tokens (with a warning) and then passed
        to ``pad_tokens`` with the remaining length. ``return_word_ids`` and
        extra keyword arguments are accepted but not used.
        """
        def encode(piece):
            return self.tokenizer(piece, add_special_tokens=False)["input_ids"]

        token_ids = []
        for segment, quoted in split_quotation(text):
            # Iterating a str yields its characters; a 1-tuple yields the whole segment.
            pieces = segment if quoted else (segment,)
            for piece in pieces:
                token_ids.extend(encode(piece))

        if len(token_ids) > self.max_length:
            token_ids = token_ids[: self.max_length]
            logger.warning(f"Truncated prompt to {self.max_length} tokens")

        pairs = [(token_id, 1.0) for token_id in token_ids]
        self.pad_tokens(pairs, self.max_length - len(pairs))
        return [pairs]
 class LongCatImageTokenizer(sd1_clip.SD1Tokenizer):
    """SD1-style tokenizer wrapper that surrounds prompts with the LongCat chat template."""

    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(
            embedding_directory=embedding_directory,
            tokenizer_data=tokenizer_data,
            name="qwen25_7b",
            tokenizer=LongCatImageBaseTokenizer,
        )
        # Chat-template text placed before / after the user prompt.
        self.longcat_template_prefix = "<|im_start|>system\nAs an image captioning expert, generate a descriptive text prompt based on an image content, suitable for input to a text-to-image model.<|im_end|>\n<|im_start|>user\n"
        self.longcat_template_suffix = "<|im_end|>\n<|im_start|>assistant\n"

    def tokenize_with_weights(self, text, return_word_ids=False, **kwargs):
        """Tokenize ``text``, adding the template unless the text already carries one.

        Text starting with ``<|im_start|>`` or ``<|start_header_id|>`` is
        handed to the parent tokenizer with ``disable_weights=True``. Otherwise
        the template prefix and suffix are tokenized separately and joined
        around the base tokenizer's output. An empty prompt is replaced by a
        single space first.

        Returns:
            The parent's result on the pre-templated path, otherwise
            ``{"qwen25_7b": [prefix + prompt + suffix pairs]}``.
        """
        skip_template = text.startswith(("<|im_start|>", "<|start_header_id|>"))
        if text == "":
            text = " "
        base_tok = self.qwen25_7b

        if skip_template:
            return super().tokenize_with_weights(
                text, return_word_ids=return_word_ids, disable_weights=True, **kwargs
            )

        def template_ids(template):
            return base_tok.tokenizer(template, add_special_tokens=False)["input_ids"]

        prefix_ids = template_ids(self.longcat_template_prefix)
        suffix_ids = template_ids(self.longcat_template_suffix)
        prompt_pairs = base_tok.tokenize_with_weights(
            text, return_word_ids=return_word_ids, **kwargs
        )[0]
        combined = (
            [(token_id, 1.0) for token_id in prefix_ids]
            + prompt_pairs
            + [(token_id, 1.0) for token_id in suffix_ids]
        )
        return {"qwen25_7b": [combined]}
 class LongCatImageTEModel(sd1_clip.SD1ClipModel):
    """Text-encoder wrapper that drops the chat-template positions from the encoded output."""
    def __init__(self, device="cpu", dtype=None, model_options={}):
        super().__init__(
            device=device,
            dtype=dtype,
            name="qwen25_7b",
            clip_model=Qwen25_7BVLIModel,
            model_options=model_options,
        )
    def encode_token_weights(self, token_weight_pairs, template_end=-1):
        """Encode the tokens, then slice template prefix/suffix off dimension 1 of the output.

        Args:
            token_weight_pairs: Mapping whose ``"qwen25_7b"`` entry holds a list
                of token/weight pair sequences; only the first sequence is inspected.
            template_end: Index at which the kept output starts. ``-1`` means
                "detect it from the token ids".

        Returns:
            ``(out, pooled, extra)`` from the parent encoder, with ``out`` (and
            ``extra["attention_mask"]`` when present) trimmed along dimension 1.
        """
        out, pooled, extra = super().encode_token_weights(token_weight_pairs)
        tok_pairs = token_weight_pairs["qwen25_7b"][0]
        count_im_start = 0
        if template_end == -1:
            # Record the position of the 1st, then the 2nd, occurrence of id 151644
            # (presumably <|im_start|>, per the counter name — confirm against the vocab).
            for i, v in enumerate(tok_pairs):
                elem = v[0]
                if not torch.is_tensor(elem):
                    if isinstance(elem, numbers.Integral):
                        if elem == 151644 and count_im_start < 2:
                            template_end = i
                            count_im_start += 1
        # If the marker is followed by ids 872 and 198, skip those three positions too.
        # NOTE(review): likely the "user" + newline tokens of the template — confirm.
        if out.shape[1] > (template_end + 3):
            if tok_pairs[template_end + 1][0] == 872:
                if tok_pairs[template_end + 2][0] == 198:
                    template_end += 3
        if template_end == -1:
            template_end = 0
        # Find the last occurrence of id 151645 (presumably <|im_end|> — confirm);
        # everything from there to the end is treated as the template suffix.
        suffix_start = None
        for i in range(len(tok_pairs) - 1, -1, -1):
            elem = tok_pairs[i][0]
            if not torch.is_tensor(elem) and isinstance(elem, numbers.Integral):
                if elem == 151645:
                    suffix_start = i
                    break
        out = out[:, template_end:]
        if "attention_mask" in extra:
            extra["attention_mask"] = extra["attention_mask"][:, template_end:]
            # A mask whose entries sum to its element count is dropped entirely.
            if extra["attention_mask"].sum() == torch.numel(extra["attention_mask"]):
                extra.pop("attention_mask")
        if suffix_start is not None:
            # Suffix length is measured on the token list, then removed from the end of
            # the already prefix-trimmed output (only if something would remain).
            suffix_len = len(tok_pairs) - suffix_start
            if suffix_len > 0 and out.shape[1] > suffix_len:
                out = out[:, :-suffix_len]
                if "attention_mask" in extra:
                    extra["attention_mask"] = extra["attention_mask"][:, :-suffix_len]
                    if extra["attention_mask"].sum() == torch.numel(
                        extra["attention_mask"]
                    ):
                        extra.pop("attention_mask")
        return out, pooled, extra
 def te(dtype_llama=None, llama_quantization_metadata=None):
    """Build a ``LongCatImageTEModel`` subclass with dtype / quantization overrides baked in.

    Args:
        dtype_llama: When not ``None``, replaces the ``dtype`` passed to the model.
        llama_quantization_metadata: When not ``None``, stored under
            ``model_options["quantization_metadata"]`` on a copy of the options.

    Returns:
        The configured model class (not an instance).
    """
    class LongCatImageTEModel_(LongCatImageTEModel):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            effective_dtype = dtype if dtype_llama is None else dtype_llama
            if llama_quantization_metadata is not None:
                # Build a new dict so the caller's options are left untouched.
                model_options = {**model_options, "quantization_metadata": llama_quantization_metadata}
            super().__init__(device=device, dtype=effective_dtype, model_options=model_options)

    return LongCatImageTEModel_
--- a/comfy_extras/nodes_hooks.py
+++ b/comfy_extras/nodes_hooks.py
@ -248,7 +248,7 @@ class SetClipHooks:
    def apply_hooks(self, clip: CLIP, schedule_clip: bool, apply_to_conds: bool, hooks: comfy.hooks.HookGroup=None):
        if hooks is not None:
-            clip = clip.clone()
+            clip = clip.clone(disable_dynamic=True)
            if apply_to_conds:
                clip.apply_hooks_to_conds = hooks
            clip.patcher.forced_hooks = hooks.clone()
--- a/comfy_extras/nodes_images.py
+++ b/comfy_extras/nodes_images.py
@ -706,8 +706,8 @@ class SplitImageToTileList(IO.ComfyNode):
    @staticmethod
    def get_grid_coords(width, height, tile_width, tile_height, overlap):
        coords = []
-        stride_x = max(1, tile_width - overlap)
+        stride_x = round(max(tile_width * 0.25, tile_width - overlap))
-        stride_y = max(1, tile_height - overlap)
+        stride_y = round(max(tile_width * 0.25, tile_height - overlap))
        y = 0
        while y < height:
@ -764,34 +764,6 @@ class ImageMergeTileList(IO.ComfyNode):
            ],
        )
    @staticmethod
    def get_grid_coords(width, height, tile_width, tile_height, overlap):
        coords = []
        stride_x = max(1, tile_width - overlap)
        stride_y = max(1, tile_height - overlap)
        y = 0
        while y < height:
            x = 0
            y_end = min(y + tile_height, height)
            y_start = max(0, y_end - tile_height)
            while x < width:
                x_end = min(x + tile_width, width)
                x_start = max(0, x_end - tile_width)
                coords.append((x_start, y_start, x_end, y_end))
                if x_end >= width:
                    break
                x += stride_x
            if y_end >= height:
                break
            y += stride_y
        return coords
    @classmethod
    def execute(cls, image_list, final_width, final_height, overlap):
        w = final_width[0]
@ -804,7 +776,7 @@ class ImageMergeTileList(IO.ComfyNode):
        device = first_tile.device
        dtype = first_tile.dtype
-        coords = cls.get_grid_coords(w, h, t_w, t_h, ovlp)
+        coords = SplitImageToTileList.get_grid_coords(w, h, t_w, t_h, ovlp)
        canvas = torch.zeros((b, h, w, c), device=device, dtype=dtype)
        weights = torch.zeros((b, h, w, 1), device=device, dtype=dtype)
--- a/comfy_extras/nodes_resolution.py
+++ b/comfy_extras/nodes_resolution.py
@ -16,15 +16,15 @@ class AspectRatio(str, Enum):
    WIDESCREEN_V = "9:16 (Portrait Widescreen)"
-ASPECT_RATIOS: dict[str, tuple[int, int]] = {
+ASPECT_RATIOS: dict[AspectRatio, tuple[int, int]] = {
-    "1:1 (Square)": (1, 1),
+    AspectRatio.SQUARE: (1, 1),
-    "3:2 (Photo)": (3, 2),
+    AspectRatio.PHOTO_H: (3, 2),
-    "4:3 (Standard)": (4, 3),
+    AspectRatio.STANDARD_H: (4, 3),
-    "16:9 (Widescreen)": (16, 9),
+    AspectRatio.WIDESCREEN_H: (16, 9),
-    "21:9 (Ultrawide)": (21, 9),
+    AspectRatio.ULTRAWIDE_H: (21, 9),
-    "2:3 (Portrait Photo)": (2, 3),
+    AspectRatio.PHOTO_V: (2, 3),
-    "3:4 (Portrait Standard)": (3, 4),
+    AspectRatio.STANDARD_V: (3, 4),
-    "9:16 (Portrait Widescreen)": (9, 16),
+    AspectRatio.WIDESCREEN_V: (9, 16),
 }
@ -55,8 +55,12 @@ class ResolutionSelector(io.ComfyNode):
                ),
            ],
            outputs=[
-                io.Int.Output("width", tooltip="Calculated width in pixels (multiple of 8)."),
+                io.Int.Output(
-                io.Int.Output("height", tooltip="Calculated height in pixels (multiple of 8)."),
+                    "width", tooltip="Calculated width in pixels (multiple of 8)."
                ),
                io.Int.Output(
                    "height", tooltip="Calculated height in pixels (multiple of 8)."
                ),
            ],
        )
--- a/comfy_extras/nodes_wan.py
+++ b/comfy_extras/nodes_wan.py
@ -1456,6 +1456,63 @@ class WanInfiniteTalkToVideo(io.ComfyNode):
        return io.NodeOutput(model_patched, positive, negative, out_latent, trim_image)
 class WanSCAILToVideo(io.ComfyNode):
    """Experimental node that attaches reference / pose conditioning and emits an empty latent."""

    @classmethod
    def define_schema(cls):
        """Describe the node's id, category, inputs and outputs."""
        inputs = [
            io.Conditioning.Input("positive"),
            io.Conditioning.Input("negative"),
            io.Vae.Input("vae"),
            io.Int.Input("width", default=512, min=32, max=nodes.MAX_RESOLUTION, step=32),
            io.Int.Input("height", default=896, min=32, max=nodes.MAX_RESOLUTION, step=32),
            io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
            io.Int.Input("batch_size", default=1, min=1, max=4096),
            io.ClipVisionOutput.Input("clip_vision_output", optional=True),
            io.Image.Input("reference_image", optional=True),
            io.Image.Input("pose_video", optional=True, tooltip="Video used for pose conditioning. Will be downscaled to half the resolution of the main video."),
            io.Float.Input("pose_strength", default=1.0, min=0.0, max=10.0, step=0.01, tooltip="Strength of the pose latent."),
            io.Float.Input("pose_start", default=0.0, min=0.0, max=1.0, step=0.01, tooltip="Start step to use pose conditioning."),
            io.Float.Input("pose_end", default=1.0, min=0.0, max=1.0, step=0.01, tooltip="End step to use pose conditioning."),
        ]
        outputs = [
            io.Conditioning.Output(display_name="positive"),
            io.Conditioning.Output(display_name="negative"),
            io.Latent.Output(display_name="latent", tooltip="Empty latent of the generation size."),
        ]
        return io.Schema(
            node_id="WanSCAILToVideo",
            category="conditioning/video_models",
            inputs=inputs,
            outputs=outputs,
            is_experimental=True,
        )

    @classmethod
    def execute(cls, positive, negative, vae, width, height, length, batch_size, pose_strength, pose_start, pose_end, reference_image=None, clip_vision_output=None, pose_video=None) -> io.NodeOutput:
        """Attach the optional conditioning inputs and return them with a zero latent.

        The latent has 16 channels, ``((length - 1) // 4) + 1`` frames and
        1/8 of the requested height/width. Reference latents are appended to
        both conditionings (zeros for the negative); the pose video is resized
        to half resolution, encoded, scaled by ``pose_strength`` and applied
        only within ``[pose_start, pose_end]``.
        """
        latent_frames = ((length - 1) // 4) + 1
        latent = torch.zeros([batch_size, 16, latent_frames, height // 8, width // 8], device=comfy.model_management.intermediate_device())

        if reference_image is not None:
            # Only the first reference frame is used; channels-first for resizing.
            first_frame = reference_image[:1].movedim(-1, 1)
            reference_image = comfy.utils.common_upscale(first_frame, width, height, "bilinear", "center").movedim(1, -1)
            ref_latent = vae.encode(reference_image[:, :, :, :3])
            if ref_latent is not None:
                positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True)
                negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [torch.zeros_like(ref_latent)]}, append=True)

        if clip_vision_output is not None:
            vision_values = {"clip_vision_output": clip_vision_output}
            positive = node_helpers.conditioning_set_values(positive, vision_values)
            negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output})

        if pose_video is not None:
            pose_frames = pose_video[:length].movedim(-1, 1)
            pose_video = comfy.utils.common_upscale(pose_frames, width // 2, height // 2, "area", "center").movedim(1, -1)
            pose_video_latent = vae.encode(pose_video[:, :, :, :3]) * pose_strength
            positive = node_helpers.conditioning_set_values_with_timestep_range(positive, {"pose_video_latent": pose_video_latent}, pose_start, pose_end)
            negative = node_helpers.conditioning_set_values_with_timestep_range(negative, {"pose_video_latent": pose_video_latent}, pose_start, pose_end)

        return io.NodeOutput(positive, negative, {"samples": latent})
 class WanExtension(ComfyExtension):
    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
@ -1476,6 +1533,7 @@ class WanExtension(ComfyExtension):
            WanAnimateToVideo,
            Wan22ImageToVideoLatent,
            WanInfiniteTalkToVideo,
            WanSCAILToVideo,
        ]
 async def comfy_entrypoint() -> WanExtension:
--- a/main.py
+++ b/main.py
@ -192,7 +192,7 @@ import hook_breaker_ac10a0
 import comfy.memory_management
 import comfy.model_patcher
-if enables_dynamic_vram():
+if enables_dynamic_vram() and comfy.model_management.is_nvidia() and not comfy.model_management.is_wsl():
    if comfy.model_management.torch_version_numeric < (2, 8):
        logging.warning("Unsupported Pytorch detected. DynamicVRAM support requires Pytorch version 2.8 or later. Falling back to legacy ModelPatcher. VRAM estimates may be unreliable especially on Windows")
    elif comfy_aimdo.control.init_device(comfy.model_management.get_torch_device().index):
--- a/node_helpers.py
+++ b/node_helpers.py
@ -1,5 +1,6 @@
 import hashlib
 import torch
 import logging
 from comfy.cli_args import args
@ -21,6 +22,36 @@ def conditioning_set_values(conditioning, values={}, append=False):
    return c
 def conditioning_set_values_with_timestep_range(conditioning, values={}, start_percent=0.0, end_percent=1.0):
    """
    Apply ``values`` to each conditioning entry only inside [start_percent, end_percent].

    Every entry is intersected with the requested range using its own
    "start_percent"/"end_percent" (defaults 0.0 and 1.0). Entries that do not
    overlap are passed through unchanged; overlapping entries are split into up
    to three pieces (before, inside with ``values``, after). An inverted range
    logs a warning and returns ``conditioning`` as-is.
    """
    if start_percent > end_percent:
        logging.warning(f"start_percent ({start_percent}) must be <= end_percent ({end_percent})")
        return conditioning

    # The sampler gates entries with strict > / <, so neighbouring pieces are
    # nudged apart to keep exactly one of them active per timestep.
    EPS = 1e-5
    result = []
    for entry in conditioning:
        options = entry[1]
        entry_start = options.get("start_percent", 0.0)
        entry_end = options.get("end_percent", 1.0)
        overlap_start = max(start_percent, entry_start)
        overlap_end = min(end_percent, entry_end)

        if overlap_start >= overlap_end:
            result.append(entry)
            continue

        pieces = []
        if overlap_start > entry_start:
            pieces.append({"start_percent": entry_start, "end_percent": overlap_start - EPS})
        pieces.append({**values, "start_percent": overlap_start, "end_percent": overlap_end})
        if overlap_end < entry_end:
            pieces.append({"start_percent": overlap_end + EPS, "end_percent": entry_end})

        for piece in pieces:
            result.extend(conditioning_set_values([entry], piece))
    return result
 def pillow(fn, arg):
    prev_value = None
    try:
--- a/nodes.py
+++ b/nodes.py
@ -976,7 +976,7 @@ class CLIPLoader:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ),
-                              "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis"], ),
+                              "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image"], ),
                              },
                "optional": {
                              "device": (["default", "cpu"], {"advanced": True}),
--- a/requirements.txt
+++ b/requirements.txt
@ -31,5 +31,4 @@ spandrel
 pydantic~=2.0
 pydantic-settings~=2.0
 PyOpenGL
 PyOpenGL-accelerate
 glfw
--- a/tests-unit/app_test/frontend_manager_test.py
+++ b/tests-unit/app_test/frontend_manager_test.py
@ -49,6 +49,12 @@ def mock_provider(mock_releases):
    return provider
@pytest.fixture(autouse=True)
def clear_cache():
    """Reset the cached requirement versions before each test (autouse)."""
    from utils import install_util
    install_util.PACKAGE_VERSIONS = dict()
 def test_get_release(mock_provider, mock_releases):
    version = "1.0.0"
    release = mock_provider.get_release(version)
--- a/tests-unit/comfy_test/model_detection_test.py
+++ b/tests-unit/comfy_test/model_detection_test.py
@ -0,0 +1,112 @@
 import torch
 from comfy.model_detection import detect_unet_config, model_config_from_unet_config
 import comfy.supported_models
 def _make_longcat_comfyui_sd():
    """Minimal ComfyUI-format state dict for pre-converted LongCat-Image weights."""
    sd = {}
    H = 32  # Reduce hidden state dimension to reduce memory usage
    C_IN = 16
    C_CTX = 3584
    sd["img_in.weight"] = torch.empty(H, C_IN * 4)
    sd["img_in.bias"] = torch.empty(H)
    sd["txt_in.weight"] = torch.empty(H, C_CTX)
    sd["txt_in.bias"] = torch.empty(H)
    sd["time_in.in_layer.weight"] = torch.empty(H, 256)
    sd["time_in.in_layer.bias"] = torch.empty(H)
    sd["time_in.out_layer.weight"] = torch.empty(H, H)
    sd["time_in.out_layer.bias"] = torch.empty(H)
    sd["final_layer.adaLN_modulation.1.weight"] = torch.empty(2 * H, H)
    sd["final_layer.adaLN_modulation.1.bias"] = torch.empty(2 * H)
    sd["final_layer.linear.weight"] = torch.empty(C_IN * 4, H)
    sd["final_layer.linear.bias"] = torch.empty(C_IN * 4)
    for i in range(19):
        sd[f"double_blocks.{i}.img_attn.norm.key_norm.weight"] = torch.empty(128)
        sd[f"double_blocks.{i}.img_attn.qkv.weight"] = torch.empty(3 * H, H)
        sd[f"double_blocks.{i}.img_mod.lin.weight"] = torch.empty(H, H)
    for i in range(38):
        sd[f"single_blocks.{i}.modulation.lin.weight"] = torch.empty(H, H)
    return sd
 def _make_flux_schnell_comfyui_sd():
    """Minimal ComfyUI-format state dict for standard Flux Schnell."""
    sd = {}
    H = 32  # Reduce hidden state dimension to reduce memory usage
    C_IN = 16
    sd["img_in.weight"] = torch.empty(H, C_IN * 4)
    sd["img_in.bias"] = torch.empty(H)
    sd["txt_in.weight"] = torch.empty(H, 4096)
    sd["txt_in.bias"] = torch.empty(H)
    sd["double_blocks.0.img_attn.norm.key_norm.weight"] = torch.empty(128)
    sd["double_blocks.0.img_attn.qkv.weight"] = torch.empty(3 * H, H)
    sd["double_blocks.0.img_mod.lin.weight"] = torch.empty(H, H)
    for i in range(19):
        sd[f"double_blocks.{i}.img_attn.norm.key_norm.weight"] = torch.empty(128)
    for i in range(38):
        sd[f"single_blocks.{i}.modulation.lin.weight"] = torch.empty(H, H)
    return sd
 class TestModelDetection:
    """Check that first-match model detection resolves to the right model class,
    which depends on both list ordering and how specific each unet_config is."""

    def test_longcat_before_schnell_in_models_list(self):
        """LongCatImage must appear before FluxSchnell in the models list."""
        models = comfy.supported_models.models

        def position_of(class_name):
            return next(i for i, m in enumerate(models) if m.__name__ == class_name)

        longcat_idx = position_of("LongCatImage")
        schnell_idx = position_of("FluxSchnell")
        assert longcat_idx < schnell_idx, (
            f"LongCatImage (index {longcat_idx}) must come before "
            f"FluxSchnell (index {schnell_idx}) in the models list"
        )

    def test_longcat_comfyui_detected_as_longcat(self):
        """A LongCat-shaped state dict yields the LongCat config values and class."""
        state_dict = _make_longcat_comfyui_sd()
        detected = detect_unet_config(state_dict, "")
        assert detected is not None
        for key, expected in (("image_model", "flux"), ("context_in_dim", 3584)):
            assert detected[key] == expected
        assert detected["vec_in_dim"] is None
        assert detected["guidance_embed"] is False
        assert detected["txt_ids_dims"] == [1, 2]

        config = model_config_from_unet_config(detected, state_dict)
        assert config is not None
        assert type(config).__name__ == "LongCatImage"

    def test_longcat_comfyui_keys_pass_through_unchanged(self):
        """Pre-converted weights should not be transformed by process_unet_state_dict."""
        state_dict = _make_longcat_comfyui_sd()
        detected = detect_unet_config(state_dict, "")
        config = model_config_from_unet_config(detected, state_dict)
        processed = config.process_unet_state_dict(dict(state_dict))
        for key in (
            "img_in.weight",
            "txt_in.weight",
            "time_in.in_layer.weight",
            "final_layer.linear.weight",
        ):
            assert key in processed

    def test_flux_schnell_comfyui_detected_as_flux_schnell(self):
        """A Flux-Schnell-shaped state dict still resolves to FluxSchnell."""
        state_dict = _make_flux_schnell_comfyui_sd()
        detected = detect_unet_config(state_dict, "")
        assert detected is not None
        for key, expected in (("image_model", "flux"), ("context_in_dim", 4096), ("txt_ids_dims", [])):
            assert detected[key] == expected

        config = model_config_from_unet_config(detected, state_dict)
        assert config is not None
        assert type(config).__name__ == "FluxSchnell"
--- a/utils/install_util.py
+++ b/utils/install_util.py
@ -1,5 +1,7 @@
 from pathlib import Path
 import sys
 import logging
 import re
 # The path to the requirements.txt file
 requirements_path = Path(__file__).parents[1] / "requirements.txt"
@ -16,3 +18,34 @@ Please install the updated requirements.txt file by running:
 {sys.executable} {extra}-m pip install -r {requirements_path}
 If you are on the portable package you can run: update\\update_comfyui.bat to solve this problem.
 """.strip()
 def is_valid_version(version: str) -> bool:
    """Validate if a string is a valid semantic version (X.Y.Z format).

    Args:
        version: Candidate version string.

    Returns:
        True only when the whole string is three dot-separated runs of digits.
    """
    # fullmatch instead of match + "$": "$" also matches just before a trailing
    # newline, which let strings such as "1.2.3\n" through.
    pattern = r"(\d+)\.(\d+)\.(\d+)"
    return re.fullmatch(pattern, version) is not None
 # Module-level cache of {package name: pinned version}; filled on first successful read.
 PACKAGE_VERSIONS = {}
 def get_required_packages_versions():
    """Return the ``X.Y.Z`` versions pinned in requirements.txt, keyed by package name.

    Lines of the form ``name==X.Y.Z`` or ``name>=X.Y.Z`` are recognised; other
    lines are ignored, and entries whose version is not ``X.Y.Z`` are logged
    and skipped. The result is cached in ``PACKAGE_VERSIONS``.

    Returns:
        A new dict the caller may modify freely. It is empty when
        requirements.txt is missing or cannot be read, so callers can chain
        ``.get(...)`` on the result without a ``None`` check.
    """
    if PACKAGE_VERSIONS:
        return PACKAGE_VERSIONS.copy()

    # Parse into a local dict first so that a failure part-way through the file
    # never leaves a half-filled cache behind for later calls.
    parsed = {}
    try:
        with open(requirements_path, "r", encoding="utf-8") as f:
            for line in f:
                # Treat ">=" like "==" so both forms share one split.
                fields = line.strip().replace(">=", "==").split("==")
                if len(fields) != 2:
                    continue
                name, version_str = fields
                if not is_valid_version(version_str):
                    logging.error(f"Invalid version format in requirements.txt: {version_str}")
                    continue
                parsed[name] = version_str
    except FileNotFoundError:
        logging.error("requirements.txt not found.")
        return {}
    except Exception as e:
        logging.error(f"Error reading requirements.txt: {e}")
        return {}

    PACKAGE_VERSIONS.update(parsed)
    return parsed