Compare commits


1 Commit

Author: Sam Pullara · SHA1: 2e461ed9cc · Message: Merge f030d2c425 into be518db5a7 · Date: 2026-01-14 23:56:27 +01:00
10 changed files with 18 additions and 211 deletions

View File

@@ -1014,7 +1014,6 @@ class CLIPType(Enum):
KANDINSKY5 = 22
KANDINSKY5_IMAGE = 23
NEWBIE = 24
FLUX2 = 25
def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
@@ -1047,7 +1046,6 @@ class TEModel(Enum):
QWEN3_2B = 17
GEMMA_3_12B = 18
JINA_CLIP_2 = 19
QWEN3_8B = 20
def detect_te_model(sd):
@@ -1091,8 +1089,6 @@ def detect_te_model(sd):
return TEModel.QWEN3_4B
elif weight.shape[0] == 2048:
return TEModel.QWEN3_2B
elif weight.shape[0] == 4096:
return TEModel.QWEN3_8B
if weight.shape[0] == 5120:
if "model.layers.39.post_attention_layernorm.weight" in sd:
return TEModel.MISTRAL3_24B
@@ -1218,18 +1214,11 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
clip_target.tokenizer = comfy.text_encoders.flux.Flux2Tokenizer
tokenizer_data["tekken_model"] = clip_data[0].get("tekken_model", None)
elif te_model == TEModel.QWEN3_4B:
if clip_type == CLIPType.FLUX or clip_type == CLIPType.FLUX2:
clip_target.clip = comfy.text_encoders.flux.klein_te(**llama_detect(clip_data), model_type="qwen3_4b")
clip_target.tokenizer = comfy.text_encoders.flux.KleinTokenizer
else:
clip_target.clip = comfy.text_encoders.z_image.te(**llama_detect(clip_data))
clip_target.tokenizer = comfy.text_encoders.z_image.ZImageTokenizer
clip_target.clip = comfy.text_encoders.z_image.te(**llama_detect(clip_data))
clip_target.tokenizer = comfy.text_encoders.z_image.ZImageTokenizer
elif te_model == TEModel.QWEN3_2B:
clip_target.clip = comfy.text_encoders.ovis.te(**llama_detect(clip_data))
clip_target.tokenizer = comfy.text_encoders.ovis.OvisTokenizer
elif te_model == TEModel.QWEN3_8B:
clip_target.clip = comfy.text_encoders.flux.klein_te(**llama_detect(clip_data), model_type="qwen3_8b")
clip_target.tokenizer = comfy.text_encoders.flux.KleinTokenizer8B
elif te_model == TEModel.JINA_CLIP_2:
clip_target.clip = comfy.text_encoders.jina_clip_2.JinaClip2TextModelWrapper
clip_target.tokenizer = comfy.text_encoders.jina_clip_2.JinaClip2TokenizerWrapper
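
Note: the detect_te_model hunk above picks the text-encoder variant from the width (weight.shape[0]) of a reference tensor in the state dict. A minimal sketch of that dispatch, using only the widths visible in this diff; which key the tensor is read from is not shown in the hunk and is left out here:

    def guess_te_variant(width: int) -> str:
        # Widths taken from the hunk; 4096 is the QWEN3_8B branch this compare removes.
        if width == 2048:
            return "QWEN3_2B"
        if width == 4096:
            return "QWEN3_8B"
        if width == 5120:
            # The real code additionally checks for a layer-39 key before
            # deciding between MISTRAL3_24B and other 5120-wide models.
            return "5120-wide family"
        return "unknown"

    print(guess_te_variant(4096))  # -> QWEN3_8B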

View File

@@ -763,7 +763,7 @@ class Flux2(Flux):
def __init__(self, unet_config):
super().__init__(unet_config)
self.memory_usage_factor = self.memory_usage_factor * (2.0 * 2.0) * (unet_config['hidden_size'] / 2604)
self.memory_usage_factor = self.memory_usage_factor * (2.0 * 2.0) * 2.36
def get_model(self, state_dict, prefix="", device=None):
out = model_base.Flux2(self, device=device)
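
Note: the two memory_usage_factor lines in this hunk agree when hidden_size is roughly 6145 (2.36 * 2604 ≈ 6145); the parameterised form only changes the factor for models with a different width. A quick check of that arithmetic, treating 6144 as an assumed full-size hidden_size rather than a value confirmed by this diff:

    base = 1.0
    fixed = base * (2.0 * 2.0) * 2.36                    # constant form
    parameterised = base * (2.0 * 2.0) * (6144 / 2604)   # assumed hidden_size = 6144
    print(fixed, parameterised)                          # 9.44 vs ~9.438 -- effectively equal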

View File

@@ -3,7 +3,7 @@ import comfy.text_encoders.t5
import comfy.text_encoders.sd3_clip
import comfy.text_encoders.llama
import comfy.model_management
from transformers import T5TokenizerFast, LlamaTokenizerFast, Qwen2Tokenizer
from transformers import T5TokenizerFast, LlamaTokenizerFast
import torch
import os
import json
@@ -172,60 +172,3 @@ def flux2_te(dtype_llama=None, llama_quantization_metadata=None, pruned=False):
model_options["num_layers"] = 30
super().__init__(device=device, dtype=dtype, model_options=model_options)
return Flux2TEModel_
class Qwen3Tokenizer(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "qwen25_tokenizer")
super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2560, embedding_key='qwen3_4b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=512, pad_token=151643, tokenizer_data=tokenizer_data)
class Qwen3Tokenizer8B(sd1_clip.SDTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}):
tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "qwen25_tokenizer")
super().__init__(tokenizer_path, pad_with_end=False, embedding_size=4096, embedding_key='qwen3_8b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=512, pad_token=151643, tokenizer_data=tokenizer_data)
class KleinTokenizer(sd1_clip.SD1Tokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}, name="qwen3_4b"):
if name == "qwen3_4b":
tokenizer = Qwen3Tokenizer
elif name == "qwen3_8b":
tokenizer = Qwen3Tokenizer8B
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name=name, tokenizer=tokenizer)
self.llama_template = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, **kwargs):
if llama_template is None:
llama_text = self.llama_template.format(text)
else:
llama_text = llama_template.format(text)
tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs)
return tokens
class KleinTokenizer8B(KleinTokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}, name="qwen3_8b"):
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name=name)
class Qwen3_4BModel(sd1_clip.SDClipModel):
def __init__(self, device="cpu", layer=[9, 18, 27], layer_idx=None, dtype=None, attention_mask=True, model_options={}):
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Qwen3_4B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
class Qwen3_8BModel(sd1_clip.SDClipModel):
def __init__(self, device="cpu", layer=[9, 18, 27], layer_idx=None, dtype=None, attention_mask=True, model_options={}):
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Qwen3_8B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
def klein_te(dtype_llama=None, llama_quantization_metadata=None, model_type="qwen3_4b"):
if model_type == "qwen3_4b":
model = Qwen3_4BModel
elif model_type == "qwen3_8b":
model = Qwen3_8BModel
class Flux2TEModel_(Flux2TEModel):
def __init__(self, device="cpu", dtype=None, model_options={}):
if llama_quantization_metadata is not None:
model_options = model_options.copy()
model_options["quantization_metadata"] = llama_quantization_metadata
if dtype_llama is not None:
dtype = dtype_llama
super().__init__(device=device, dtype=dtype, name=model_type, model_options=model_options, clip_model=model)
return Flux2TEModel_
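
Note: KleinTokenizer above wraps every prompt in a fixed Qwen chat template before tokenizing (tokenize_with_weights formats the text into llama_template and disables prompt weighting). A small illustration of what the wrapped text looks like, using the template string verbatim from the removed code:

    llama_template = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
    print(llama_template.format("a photo of a cat"))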

View File

@@ -99,28 +99,6 @@ class Qwen3_4BConfig:
rope_scale = None
final_norm: bool = True
@dataclass
class Qwen3_8BConfig:
vocab_size: int = 151936
hidden_size: int = 4096
intermediate_size: int = 12288
num_hidden_layers: int = 36
num_attention_heads: int = 32
num_key_value_heads: int = 8
max_position_embeddings: int = 40960
rms_norm_eps: float = 1e-6
rope_theta: float = 1000000.0
transformer_type: str = "llama"
head_dim = 128
rms_norm_add = False
mlp_activation = "silu"
qkv_bias = False
rope_dims = None
q_norm = "gemma3"
k_norm = "gemma3"
rope_scale = None
final_norm: bool = True
@dataclass
class Ovis25_2BConfig:
vocab_size: int = 151936
@@ -650,15 +628,6 @@ class Qwen3_4B(BaseLlama, torch.nn.Module):
self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
self.dtype = dtype
class Qwen3_8B(BaseLlama, torch.nn.Module):
def __init__(self, config_dict, dtype, device, operations):
super().__init__()
config = Qwen3_8BConfig(**config_dict)
self.num_layers = config.num_hidden_layers
self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
self.dtype = dtype
class Ovis25_2B(BaseLlama, torch.nn.Module):
def __init__(self, config_dict, dtype, device, operations):
super().__init__()
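
Note: a back-of-the-envelope parameter count from the Qwen3_8BConfig values above lands near 8B, matching the class name. This sketch ignores norm weights and assumes an untied lm_head, so treat the total as approximate:

    hidden, inter, layers = 4096, 12288, 36
    heads, kv_heads, head_dim = 32, 8, 128
    vocab = 151936

    attn = hidden * heads * head_dim          # q_proj
    attn += 2 * hidden * kv_heads * head_dim  # k_proj + v_proj
    attn += heads * head_dim * hidden         # o_proj
    mlp = 3 * hidden * inter                  # gate, up and down projections
    embed = 2 * vocab * hidden                # embed_tokens + lm_head (assumed untied)

    total = layers * (attn + mlp) + embed
    print(f"~{total / 1e9:.1f}B parameters")  # ~8.2B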

View File

@@ -65,13 +65,11 @@ class TaskImageContent(BaseModel):
class Text2VideoTaskCreationRequest(BaseModel):
model: str = Field(...)
content: list[TaskTextContent] = Field(..., min_length=1)
generate_audio: bool | None = Field(...)
class Image2VideoTaskCreationRequest(BaseModel):
model: str = Field(...)
content: list[TaskTextContent | TaskImageContent] = Field(..., min_length=2)
generate_audio: bool | None = Field(...)
class TaskCreationResponse(BaseModel):
@@ -143,9 +141,4 @@ VIDEO_TASKS_EXECUTION_TIME = {
"720p": 65,
"1080p": 100,
},
"seedance-1-5-pro-251215": {
"480p": 80,
"720p": 100,
"1080p": 150,
},
}
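
Note: the execution-time table above is consumed by the video nodes later in this compare as estimated_duration = max(1, ceil(base_seconds * duration / 10)). A small worked example; the model key here is a placeholder, since the table is only partially visible in this hunk:

    import math

    VIDEO_TASKS_EXECUTION_TIME = {"some-model": {"720p": 65, "1080p": 100}}  # values from the hunk

    def estimated_duration(model: str, resolution: str, duration: float) -> int:
        return max(1, math.ceil(VIDEO_TASKS_EXECUTION_TIME[model][resolution] * (duration / 10.0)))

    print(estimated_duration("some-model", "720p", 5))    # ceil(65 * 0.5) = 33
    print(estimated_duration("some-model", "1080p", 10))  # 100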

View File

@@ -477,12 +477,7 @@ class ByteDanceTextToVideoNode(IO.ComfyNode):
inputs=[
IO.Combo.Input(
"model",
options=[
"seedance-1-5-pro-251215",
"seedance-1-0-pro-250528",
"seedance-1-0-lite-t2v-250428",
"seedance-1-0-pro-fast-251015",
],
options=["seedance-1-0-pro-250528", "seedance-1-0-lite-t2v-250428", "seedance-1-0-pro-fast-251015"],
default="seedance-1-0-pro-fast-251015",
),
IO.String.Input(
@@ -533,12 +528,6 @@ class ByteDanceTextToVideoNode(IO.ComfyNode):
tooltip='Whether to add an "AI generated" watermark to the video.',
optional=True,
),
IO.Boolean.Input(
"generate_audio",
default=False,
tooltip="This parameter is ignored for any model except seedance-1-5-pro.",
optional=True,
),
],
outputs=[
IO.Video.Output(),
@@ -563,10 +552,7 @@ class ByteDanceTextToVideoNode(IO.ComfyNode):
seed: int,
camera_fixed: bool,
watermark: bool,
generate_audio: bool = False,
) -> IO.NodeOutput:
if model == "seedance-1-5-pro-251215" and duration < 4:
raise ValueError("Minimum supported duration for Seedance 1.5 Pro is 4 seconds.")
validate_string(prompt, strip_whitespace=True, min_length=1)
raise_if_text_params(prompt, ["resolution", "ratio", "duration", "seed", "camerafixed", "watermark"])
@@ -581,11 +567,7 @@ class ByteDanceTextToVideoNode(IO.ComfyNode):
)
return await process_video_task(
cls,
payload=Text2VideoTaskCreationRequest(
model=model,
content=[TaskTextContent(text=prompt)],
generate_audio=generate_audio if model == "seedance-1-5-pro-251215" else None,
),
payload=Text2VideoTaskCreationRequest(model=model, content=[TaskTextContent(text=prompt)]),
estimated_duration=max(1, math.ceil(VIDEO_TASKS_EXECUTION_TIME[model][resolution] * (duration / 10.0))),
)
@@ -602,12 +584,7 @@ class ByteDanceImageToVideoNode(IO.ComfyNode):
inputs=[
IO.Combo.Input(
"model",
options=[
"seedance-1-5-pro-251215",
"seedance-1-0-pro-250528",
"seedance-1-0-lite-i2v-250428",
"seedance-1-0-pro-fast-251015",
],
options=["seedance-1-0-pro-250528", "seedance-1-0-lite-t2v-250428", "seedance-1-0-pro-fast-251015"],
default="seedance-1-0-pro-fast-251015",
),
IO.String.Input(
@@ -662,12 +639,6 @@ class ByteDanceImageToVideoNode(IO.ComfyNode):
tooltip='Whether to add an "AI generated" watermark to the video.',
optional=True,
),
IO.Boolean.Input(
"generate_audio",
default=False,
tooltip="This parameter is ignored for any model except seedance-1-5-pro.",
optional=True,
),
],
outputs=[
IO.Video.Output(),
@@ -693,10 +664,7 @@ class ByteDanceImageToVideoNode(IO.ComfyNode):
seed: int,
camera_fixed: bool,
watermark: bool,
generate_audio: bool = False,
) -> IO.NodeOutput:
if model == "seedance-1-5-pro-251215" and duration < 4:
raise ValueError("Minimum supported duration for Seedance 1.5 Pro is 4 seconds.")
validate_string(prompt, strip_whitespace=True, min_length=1)
raise_if_text_params(prompt, ["resolution", "ratio", "duration", "seed", "camerafixed", "watermark"])
validate_image_dimensions(image, min_width=300, min_height=300, max_width=6000, max_height=6000)
@@ -718,7 +686,6 @@ class ByteDanceImageToVideoNode(IO.ComfyNode):
payload=Image2VideoTaskCreationRequest(
model=model,
content=[TaskTextContent(text=prompt), TaskImageContent(image_url=TaskImageContentUrl(url=image_url))],
generate_audio=generate_audio if model == "seedance-1-5-pro-251215" else None,
),
estimated_duration=max(1, math.ceil(VIDEO_TASKS_EXECUTION_TIME[model][resolution] * (duration / 10.0))),
)
@@ -736,7 +703,7 @@ class ByteDanceFirstLastFrameNode(IO.ComfyNode):
inputs=[
IO.Combo.Input(
"model",
options=["seedance-1-5-pro-251215", "seedance-1-0-pro-250528", "seedance-1-0-lite-i2v-250428"],
options=["seedance-1-0-pro-250528", "seedance-1-0-lite-i2v-250428"],
default="seedance-1-0-lite-i2v-250428",
),
IO.String.Input(
@@ -795,12 +762,6 @@ class ByteDanceFirstLastFrameNode(IO.ComfyNode):
tooltip='Whether to add an "AI generated" watermark to the video.',
optional=True,
),
IO.Boolean.Input(
"generate_audio",
default=False,
tooltip="This parameter is ignored for any model except seedance-1-5-pro.",
optional=True,
),
],
outputs=[
IO.Video.Output(),
@@ -827,10 +788,7 @@ class ByteDanceFirstLastFrameNode(IO.ComfyNode):
seed: int,
camera_fixed: bool,
watermark: bool,
generate_audio: bool = False,
) -> IO.NodeOutput:
if model == "seedance-1-5-pro-251215" and duration < 4:
raise ValueError("Minimum supported duration for Seedance 1.5 Pro is 4 seconds.")
validate_string(prompt, strip_whitespace=True, min_length=1)
raise_if_text_params(prompt, ["resolution", "ratio", "duration", "seed", "camerafixed", "watermark"])
for i in (first_frame, last_frame):
@@ -863,7 +821,6 @@ class ByteDanceFirstLastFrameNode(IO.ComfyNode):
TaskImageContent(image_url=TaskImageContentUrl(url=str(download_urls[0])), role="first_frame"),
TaskImageContent(image_url=TaskImageContentUrl(url=str(download_urls[1])), role="last_frame"),
],
generate_audio=generate_audio if model == "seedance-1-5-pro-251215" else None,
),
estimated_duration=max(1, math.ceil(VIDEO_TASKS_EXECUTION_TIME[model][resolution] * (duration / 10.0))),
)
@@ -939,41 +896,7 @@ class ByteDanceImageReferenceNode(IO.ComfyNode):
IO.Hidden.unique_id,
],
is_api_node=True,
price_badge=IO.PriceBadge(
depends_on=IO.PriceBadgeDepends(widgets=["model", "duration", "resolution"]),
expr="""
(
$priceByModel := {
"seedance-1-0-pro": {
"480p":[0.23,0.24],
"720p":[0.51,0.56]
},
"seedance-1-0-lite": {
"480p":[0.17,0.18],
"720p":[0.37,0.41]
}
};
$model := widgets.model;
$modelKey :=
$contains($model, "seedance-1-0-pro") ? "seedance-1-0-pro" :
"seedance-1-0-lite";
$resolution := widgets.resolution;
$resKey :=
$contains($resolution, "720") ? "720p" :
"480p";
$modelPrices := $lookup($priceByModel, $modelKey);
$baseRange := $lookup($modelPrices, $resKey);
$min10s := $baseRange[0];
$max10s := $baseRange[1];
$scale := widgets.duration / 10;
$minCost := $min10s * $scale;
$maxCost := $max10s * $scale;
($minCost = $maxCost)
? {"type":"usd","usd": $minCost}
: {"type":"range_usd","min_usd": $minCost, "max_usd": $maxCost}
)
""",
),
price_badge=PRICE_BADGE_VIDEO,
)
@classmethod
@@ -1044,15 +967,10 @@ def raise_if_text_params(prompt: str, text_params: list[str]) -> None:
PRICE_BADGE_VIDEO = IO.PriceBadge(
depends_on=IO.PriceBadgeDepends(widgets=["model", "duration", "resolution", "generate_audio"]),
depends_on=IO.PriceBadgeDepends(widgets=["model", "duration", "resolution"]),
expr="""
(
$priceByModel := {
"seedance-1-5-pro": {
"480p":[0.12,0.12],
"720p":[0.26,0.26],
"1080p":[0.58,0.59]
},
"seedance-1-0-pro": {
"480p":[0.23,0.24],
"720p":[0.51,0.56],
@@ -1071,7 +989,6 @@ PRICE_BADGE_VIDEO = IO.PriceBadge(
};
$model := widgets.model;
$modelKey :=
$contains($model, "seedance-1-5-pro") ? "seedance-1-5-pro" :
$contains($model, "seedance-1-0-pro-fast") ? "seedance-1-0-pro-fast" :
$contains($model, "seedance-1-0-pro") ? "seedance-1-0-pro" :
"seedance-1-0-lite";
@@ -1085,12 +1002,11 @@ PRICE_BADGE_VIDEO = IO.PriceBadge(
$min10s := $baseRange[0];
$max10s := $baseRange[1];
$scale := widgets.duration / 10;
$audioMultiplier := ($modelKey = "seedance-1-5-pro" and widgets.generate_audio) ? 2 : 1;
$minCost := $min10s * $scale * $audioMultiplier;
$maxCost := $max10s * $scale * $audioMultiplier;
$minCost := $min10s * $scale;
$maxCost := $max10s * $scale;
($minCost = $maxCost)
? {"type":"usd","usd": $minCost, "format": { "approximate": true }}
: {"type":"range_usd","min_usd": $minCost, "max_usd": $maxCost, "format": { "approximate": true }}
? {"type":"usd","usd": $minCost}
: {"type":"range_usd","min_usd": $minCost, "max_usd": $maxCost}
)
""",
)
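
Note: PRICE_BADGE_VIDEO is a JSONata expression evaluated for the node's price badge; the following Python mirror of its arithmetic is a reading aid, not part of the change. It only includes the seedance-1-0-pro and seedance-1-0-lite rows visible in this compare and omits the pro-fast tier, whose prices are not shown:

    def price_range_usd(model: str, resolution: str, duration: float) -> tuple[float, float]:
        price_by_model = {
            "seedance-1-0-pro": {"480p": (0.23, 0.24), "720p": (0.51, 0.56)},
            "seedance-1-0-lite": {"480p": (0.17, 0.18), "720p": (0.37, 0.41)},
        }
        key = "seedance-1-0-pro" if "seedance-1-0-pro" in model else "seedance-1-0-lite"
        res_key = "720p" if "720" in resolution else "480p"
        lo, hi = price_by_model[key][res_key]
        scale = duration / 10
        return lo * scale, hi * scale

    print(price_range_usd("seedance-1-0-pro-250528", "720p", 5))  # roughly (0.255, 0.28) USD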

View File

@@ -1,3 +1,3 @@
# This file is automatically generated by the build process when version is
# updated in pyproject.toml.
__version__ = "0.9.2"
__version__ = "0.9.1"

View File

@@ -1,6 +1,6 @@
[project]
name = "ComfyUI"
version = "0.9.2"
version = "0.9.1"
readme = "README.md"
license = { file = "LICENSE" }
requires-python = ">=3.10"

View File

@@ -1,5 +1,5 @@
comfyui-frontend-package==1.36.14
comfyui-workflow-templates==0.8.10
comfyui-workflow-templates==0.8.4
comfyui-embedded-docs==0.4.0
torch
torchsde

View File

@@ -686,10 +686,7 @@ class PromptServer():
@routes.get("/object_info")
async def get_object_info(request):
try:
seed_assets(["models"])
except Exception as e:
logging.error(f"Failed to seed assets: {e}")
seed_assets(["models"])
with folder_paths.cache_helper:
out = {}
for x in nodes.NODE_CLASS_MAPPINGS: