mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-07-03 13:19:23 +08:00
Inherit TE from Qwen3VL
This commit is contained in:
parent
fd5252096a
commit
2f0e8842f7
@ -1630,7 +1630,8 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
||||
clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."})
|
||||
clip_target.clip = comfy.text_encoders.boogu.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.boogu.BooguTokenizer
|
||||
elif clip_type == CLIPType.KREA2: # Krea2: Qwen3-VL-4B LM (12-layer tap)
|
||||
elif clip_type == CLIPType.KREA2 and te_model == TEModel.QWEN3VL_4B: # Krea2: full Qwen3-VL-4B (12-layer tap for conditioning + multimodal generate).
|
||||
clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."})
|
||||
clip_target.clip = comfy.text_encoders.krea2.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.krea2.Krea2Tokenizer
|
||||
elif clip_type in (CLIPType.FLUX, CLIPType.FLUX2): # Flux2 Klein reuses the Qwen3-VL LM (3-layer tap -> 12288); visual unused.
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
"""Krea 2 (K2) text encoder: Qwen3-VL-4B language model, 12-layer tap.
|
||||
"""Krea 2 (K2) text encoder: Qwen3-VL-4B, 12-layer tap.
|
||||
|
||||
K2 conditions on a stack of hidden states from 12 layers of Qwen3-VL-4B
|
||||
(reference taps ``hidden_states[2,5,8,...,35]``), kept as a ``(B, 12, seq, 2560)`` tensor and
|
||||
@ -6,61 +6,39 @@ consumed by the DiT's internal ``txtfusion`` adapter. Comfy carries conditioning
|
||||
so the 12-layer stack is flattened to ``(B, seq, 12*2560)`` here and unpacked inside the model.
|
||||
"""
|
||||
|
||||
import os
|
||||
import numbers
|
||||
|
||||
import torch
|
||||
from transformers import Qwen2Tokenizer
|
||||
|
||||
import comfy.text_encoders.llama
|
||||
import comfy.text_encoders.qwen3vl
|
||||
from comfy import sd1_clip
|
||||
|
||||
# tap k == hidden_states[k] (no offset).
|
||||
KREA2_TAP_LAYERS = [2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 32, 35]
|
||||
QWEN3VL_4B_CONFIG = {"rope_theta": 5000000.0, "final_norm": False, "lm_head": False}
|
||||
|
||||
# Identical system template to Qwen-Image; Krea2 strips the system+user-opening prefix.
|
||||
KREA2_TEMPLATE = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
|
||||
|
||||
|
||||
class Qwen3VL4BTokenizer(sd1_clip.SDTokenizer):
|
||||
class Krea2Tokenizer(comfy.text_encoders.qwen3vl.Qwen3VLTokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "qwen25_tokenizer")
|
||||
super().__init__(tokenizer_path, pad_with_end=False, embedding_directory=embedding_directory,
|
||||
embedding_size=2560, embedding_key='qwen3vl_4b', tokenizer_class=Qwen2Tokenizer,
|
||||
has_start_token=False, has_end_token=False, pad_to_max_length=False,
|
||||
max_length=99999999, min_length=1, pad_token=151643, tokenizer_data=tokenizer_data)
|
||||
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, model_type="qwen3vl_4b")
|
||||
self.llama_template = KREA2_TEMPLATE # conditioning template; image text-gen uses qwen3vl's default image template.
|
||||
|
||||
def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, thinking=True, **kwargs):
|
||||
# Krea2 conditions on the no-think template; thinking=True drops the empty <think> block qwen3vl adds.
|
||||
return super().tokenize_with_weights(text, return_word_ids=return_word_ids, llama_template=llama_template, images=images, prevent_empty_text=prevent_empty_text, thinking=thinking, **kwargs)
|
||||
|
||||
|
||||
class Krea2Tokenizer(sd1_clip.SD1Tokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data,
|
||||
name="qwen3vl_4b", tokenizer=Qwen3VL4BTokenizer)
|
||||
self.llama_template = KREA2_TEMPLATE
|
||||
|
||||
def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, **kwargs):
|
||||
if text.startswith('<|im_start|>'):
|
||||
llama_text = text
|
||||
elif llama_template is None:
|
||||
llama_text = self.llama_template.format(text)
|
||||
else:
|
||||
llama_text = llama_template.format(text)
|
||||
return super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs)
|
||||
|
||||
|
||||
class Qwen3VL4BModel(sd1_clip.SDClipModel):
|
||||
def __init__(self, device="cpu", layer="hidden", layer_idx=None, dtype=None, attention_mask=True, model_options={}):
|
||||
super().__init__(device=device, layer=KREA2_TAP_LAYERS, layer_idx=None,
|
||||
textmodel_json_config=dict(QWEN3VL_4B_CONFIG),
|
||||
dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False,
|
||||
model_class=comfy.text_encoders.llama.Qwen3_4B,
|
||||
enable_attention_masks=attention_mask, return_attention_masks=attention_mask,
|
||||
model_options=model_options)
|
||||
class Krea2Qwen3VLClipModel(comfy.text_encoders.qwen3vl.Qwen3VLClipModel):
|
||||
def __init__(self, device="cpu", dtype=None, attention_mask=True, model_options={}):
|
||||
super().__init__(device=device, layer=KREA2_TAP_LAYERS, layer_idx=None, dtype=dtype,
|
||||
attention_mask=attention_mask, model_options=model_options, model_type="qwen3vl_4b")
|
||||
|
||||
|
||||
class Krea2TEModel(sd1_clip.SD1ClipModel):
|
||||
def __init__(self, device="cpu", dtype=None, model_options={}):
|
||||
super().__init__(device=device, dtype=dtype, name="qwen3vl_4b", clip_model=Qwen3VL4BModel, model_options=model_options)
|
||||
super().__init__(device=device, dtype=dtype, name="qwen3vl_4b", clip_model=Krea2Qwen3VLClipModel, model_options=model_options)
|
||||
|
||||
def encode_token_weights(self, token_weight_pairs, template_end=-1):
|
||||
out, pooled, extra = super().encode_token_weights(token_weight_pairs) # out: (B, 12, seq, 2560)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user