Fix with ideogram4

2026-06-26 09:49:26 +08:00 · 2026-06-05 18:16:05 +03:00 · 2026-06-05 18:16:05 +03:00 · d6015910a9
commit d6015910a9
parent f6f5d2d7eb
2 changed files with 50 additions and 4 deletions
--- a/comfy/sd.py
+++ b/comfy/sd.py
@ -1618,6 +1618,11 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
            clip_target.clip = comfy.text_encoders.qwen35.te(**llama_detect(clip_data), model_type=qwen35_type)
            clip_target.tokenizer = comfy.text_encoders.qwen35.tokenizer(model_type=qwen35_type)
        elif te_model in (TEModel.QWEN3VL_4B, TEModel.QWEN3VL_8B):
            if clip_type == CLIPType.IDEOGRAM4 and te_model == TEModel.QWEN3VL_8B:  # Ideogram4 reuses the full Qwen3-VL-8B (13-layer tap for conditioning + multimodal generate).
                clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."})
                clip_target.clip = comfy.text_encoders.ideogram4.te_qwen3vl(**llama_detect(clip_data))
                clip_target.tokenizer = comfy.text_encoders.ideogram4.Ideogram4Qwen3VLTokenizer
            else:
                clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."})
                qwen3vl_type = {TEModel.QWEN3VL_4B: "qwen3vl_4b", TEModel.QWEN3VL_8B: "qwen3vl_8b"}[te_model]
                clip_target.clip = comfy.text_encoders.qwen3vl.te(**llama_detect(clip_data), model_type=qwen3vl_type)
--- a/comfy/text_encoders/ideogram4.py
+++ b/comfy/text_encoders/ideogram4.py
@ -9,6 +9,7 @@ import os
 from transformers import Qwen2Tokenizer
 import comfy.text_encoders.llama
 import comfy.text_encoders.qwen3vl
 from comfy import sd1_clip
 # Reference taps outputs of layers (0,3,...,35); comfy captures layer inputs, offset by +1.
@ -75,3 +76,43 @@ def te(dtype_llama=None, llama_quantization_metadata=None):
                model_options["quantization_metadata"] = llama_quantization_metadata
            super().__init__(device=device, dtype=dtype, model_options=model_options)
    return Ideogram4TEModel_
 # Full Qwen3-VL-8B variant with vision
 class Ideogram4Qwen3VLClipModel(comfy.text_encoders.qwen3vl.Qwen3VLClipModel):
    def __init__(self, device="cpu", dtype=None, attention_mask=True, model_options={}):
        super().__init__(device=device, layer=IDEOGRAM4_TAP_LAYERS, layer_idx=None, dtype=dtype,
                         attention_mask=attention_mask, model_options=model_options, model_type="qwen3vl_8b")
 class Ideogram4Qwen3VLTEModel(sd1_clip.SD1ClipModel):
    def __init__(self, device="cpu", dtype=None, model_options={}):
        super().__init__(device=device, dtype=dtype, name="qwen3vl_8b", clip_model=Ideogram4Qwen3VLClipModel, model_options=model_options)
    def encode_token_weights(self, token_weight_pairs):
        out, pooled, extra = super().encode_token_weights(token_weight_pairs)
        b, n, seq, h = out.shape  # (B, n_taps=13, seq, 4096), ascending layer order.
        out = out.permute(0, 2, 3, 1).reshape(b, seq, h * n)  # (B, seq, 4096*13 = 53248).
        return out, pooled, extra
 class Ideogram4Qwen3VLTokenizer(comfy.text_encoders.qwen3vl.Qwen3VLTokenizer):
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, model_type="qwen3vl_8b")
    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, thinking=True, **kwargs):
        # Ideogram 4 conditions on the no-think template; default thinking=True drops the empty think block qwen3vl adds.
        return super().tokenize_with_weights(text, return_word_ids=return_word_ids, llama_template=llama_template, images=images, prevent_empty_text=prevent_empty_text, thinking=thinking, **kwargs)
 def te_qwen3vl(dtype_llama=None, llama_quantization_metadata=None):
    class Ideogram4Qwen3VLTEModel_(Ideogram4Qwen3VLTEModel):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            if dtype_llama is not None:
                dtype = dtype_llama
            if llama_quantization_metadata is not None:
                model_options = model_options.copy()
                model_options["quantization_metadata"] = llama_quantization_metadata
            super().__init__(device=device, dtype=dtype, model_options=model_options)
    return Ideogram4Qwen3VLTEModel_