diff --git a/comfy/ldm/qwen_image/__init__.py b/comfy/ldm/qwen_image/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py
index c15ab8e40..fc8fd0739 100644
--- a/comfy/ldm/qwen_image/model.py
+++ b/comfy/ldm/qwen_image/model.py
@@ -2,13 +2,14 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from typing import Optional, Tuple
 from einops import repeat
+from typing import Optional, Tuple
+
+from ..common_dit import pad_to_patch_size
+from ..flux.layers import EmbedND
+from ..lightricks.model import TimestepEmbedding, Timesteps
+from ..modules.attention import optimized_attention_masked
-from comfy.ldm.lightricks.model import TimestepEmbedding, Timesteps
-from comfy.ldm.modules.attention import optimized_attention_masked
-from comfy.ldm.flux.layers import EmbedND
-import comfy.ldm.common_dit
 
 
 class GELU(nn.Module):
     def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True, dtype=None, device=None, operations=None):
@@ -24,14 +25,14 @@ class GELU(nn.Module):
 
 class FeedForward(nn.Module):
     def __init__(
-        self,
-        dim: int,
-        dim_out: Optional[int] = None,
-        mult: int = 4,
-        dropout: float = 0.0,
-        inner_dim=None,
-        bias: bool = True,
-        dtype=None, device=None, operations=None
+            self,
+            dim: int,
+            dim_out: Optional[int] = None,
+            mult: int = 4,
+            dropout: float = 0.0,
+            inner_dim=None,
+            bias: bool = True,
+            dtype=None, device=None, operations=None
     ):
         super().__init__()
         if inner_dim is None:
@@ -78,19 +79,19 @@ class QwenTimestepProjEmbeddings(nn.Module):
 
 class Attention(nn.Module):
     def __init__(
-        self,
-        query_dim: int,
-        dim_head: int = 64,
-        heads: int = 8,
-        dropout: float = 0.0,
-        bias: bool = False,
-        eps: float = 1e-5,
-        out_bias: bool = True,
-        out_dim: int = None,
-        out_context_dim: int = None,
-        dtype=None,
-        device=None,
-        operations=None
+            self,
+            query_dim: int,
+            dim_head: int = 64,
+            heads: int = 8,
+            dropout: float = 0.0,
+            bias: bool = False,
+            eps: float = 1e-5,
+            out_bias: bool = True,
+            out_dim: int = None,
+            out_context_dim: int = None,
+            dtype=None,
+            device=None,
+            operations=None
     ):
         super().__init__()
         self.inner_dim = out_dim if out_dim is not None else dim_head * heads
@@ -125,12 +126,12 @@ class Attention(nn.Module):
         self.to_add_out = operations.Linear(self.inner_dim, self.out_context_dim, bias=out_bias, dtype=dtype, device=device)
 
     def forward(
-        self,
-        hidden_states: torch.FloatTensor,  # Image stream
-        encoder_hidden_states: torch.FloatTensor = None,  # Text stream
-        encoder_hidden_states_mask: torch.FloatTensor = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        image_rotary_emb: Optional[torch.Tensor] = None,
+            self,
+            hidden_states: torch.FloatTensor,  # Image stream
+            encoder_hidden_states: torch.FloatTensor = None,  # Text stream
+            encoder_hidden_states_mask: torch.FloatTensor = None,
+            attention_mask: Optional[torch.FloatTensor] = None,
+            image_rotary_emb: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         seq_txt = encoder_hidden_states.shape[1]
 
@@ -172,14 +173,14 @@ class Attention(nn.Module):
 
 class QwenImageTransformerBlock(nn.Module):
     def __init__(
-        self,
-        dim: int,
-        num_attention_heads: int,
-        attention_head_dim: int,
-        eps: float = 1e-6,
-        dtype=None,
-        device=None,
-        operations=None
+            self,
+            dim: int,
+            num_attention_heads: int,
+            attention_head_dim: int,
+            eps: float = 1e-6,
+            dtype=None,
+            device=None,
+            operations=None
     ):
         super().__init__()
         self.dim = dim
@@ -219,12 +220,12 @@ class QwenImageTransformerBlock(nn.Module):
         return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1), gate.unsqueeze(1)
 
     def forward(
-        self,
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: torch.Tensor,
-        encoder_hidden_states_mask: torch.Tensor,
-        temb: torch.Tensor,
-        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+            self,
+            hidden_states: torch.Tensor,
+            encoder_hidden_states: torch.Tensor,
+            encoder_hidden_states_mask: torch.Tensor,
+            temb: torch.Tensor,
+            image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         img_mod_params = self.img_mod(temb)
         txt_mod_params = self.txt_mod(temb)
@@ -259,13 +260,13 @@ class QwenImageTransformerBlock(nn.Module):
 
 class LastLayer(nn.Module):
     def __init__(
-        self,
-        embedding_dim: int,
-        conditioning_embedding_dim: int,
-        elementwise_affine=False,
-        eps=1e-6,
-        bias=True,
-        dtype=None, device=None, operations=None
+            self,
+            embedding_dim: int,
+            conditioning_embedding_dim: int,
+            elementwise_affine=False,
+            eps=1e-6,
+            bias=True,
+            dtype=None, device=None, operations=None
     ):
         super().__init__()
         self.silu = nn.SiLU()
@@ -281,21 +282,21 @@ class LastLayer(nn.Module):
 
 class QwenImageTransformer2DModel(nn.Module):
     def __init__(
-        self,
-        patch_size: int = 2,
-        in_channels: int = 64,
-        out_channels: Optional[int] = 16,
-        num_layers: int = 60,
-        attention_head_dim: int = 128,
-        num_attention_heads: int = 24,
-        joint_attention_dim: int = 3584,
-        pooled_projection_dim: int = 768,
-        guidance_embeds: bool = False,
-        axes_dims_rope: Tuple[int, int, int] = (16, 56, 56),
-        image_model=None,
-        dtype=None,
-        device=None,
-        operations=None,
+            self,
+            patch_size: int = 2,
+            in_channels: int = 64,
+            out_channels: Optional[int] = 16,
+            num_layers: int = 60,
+            attention_head_dim: int = 128,
+            num_attention_heads: int = 24,
+            joint_attention_dim: int = 3584,
+            pooled_projection_dim: int = 768,
+            guidance_embeds: bool = False,
+            axes_dims_rope: Tuple[int, int, int] = (16, 56, 56),
+            image_model=None,
+            dtype=None,
+            device=None,
+            operations=None,
     ):
         super().__init__()
         self.dtype = dtype
@@ -350,13 +351,13 @@ class QwenImageTransformer2DModel(nn.Module):
         return self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype)
 
     def forward(
-        self,
-        x,
-        timesteps,
-        context,
-        attention_mask=None,
-        guidance: torch.Tensor = None,
-        **kwargs
+            self,
+            x,
+            timesteps,
+            context,
+            attention_mask=None,
+            guidance: torch.Tensor = None,
+            **kwargs
     ):
         timestep = timesteps
         encoder_hidden_states = context
@@ -364,7 +365,7 @@ class QwenImageTransformer2DModel(nn.Module):
 
         image_rotary_emb = self.pos_embeds(x, context)
 
-        hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (1, self.patch_size, self.patch_size))
+        hidden_states = pad_to_patch_size(x, (1, self.patch_size, self.patch_size))
         orig_shape = hidden_states.shape
         hidden_states = hidden_states.view(orig_shape[0], orig_shape[1], orig_shape[-2] // 2, 2, orig_shape[-1] // 2, 2)
         hidden_states = hidden_states.permute(0, 2, 4, 1, 3, 5)
diff --git a/comfy/lora.py b/comfy/lora.py
index 41a6918d4..6afdc46e2 100644
--- a/comfy/lora.py
+++ b/comfy/lora.py
@@ -305,7 +305,7 @@ def model_lora_keys_unet(model, key_map=None):
                 key_lora = k[len("diffusion_model."):-len(".weight")]
                 key_map["{}".format(key_lora)] = k
 
-    if isinstance(model, comfy.model_base.QwenImage):
+    if isinstance(model, model_base.QwenImage):
        for k in sdk:
            if k.startswith("diffusion_model.") and k.endswith(".weight"): #QwenImage lora format
                key_lora = k[len("diffusion_model."):-len(".weight")]
diff --git a/comfy/model_base.py b/comfy/model_base.py
index c8eb369bd..43365b12e 100644
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -1370,5 +1370,5 @@ class QwenImage(BaseModel):
         out = super().extra_conds(**kwargs)
         cross_attn = kwargs.get("cross_attn", None)
         if cross_attn is not None:
-            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
+            out['c_crossattn'] = conds.CONDRegular(cross_attn)
         return out
diff --git a/comfy/model_downloader.py b/comfy/model_downloader.py
index fe4f367f4..96cc22adf 100644
--- a/comfy/model_downloader.py
+++ b/comfy/model_downloader.py
@@ -505,6 +505,7 @@ KNOWN_VAES: Final[KnownDownloadables] = KnownDownloadables([
     HuggingFile("Comfy-Org/Lumina_Image_2.0_Repackaged", "split_files/vae/ae.safetensors", save_with_filename="lumina_image_2.0-ae.safetensors"),
     HuggingFile("Comfy-Org/Wan_2.1_ComfyUI_repackaged", "split_files/vae/wan_2.1_vae.safetensors"),
     HuggingFile("Comfy-Org/Wan_2.2_ComfyUI_Repackaged", "split_files/vae/wan2.2_vae.safetensors"),
+    HuggingFile("Comfy-Org/Qwen-Image_ComfyUI", "split_files/vae/qwen_image_vae.safetensors"),
 ], folder_name="vae")
 
 KNOWN_HUGGINGFACE_MODEL_REPOS: Final[Set[str]] = {
@@ -579,6 +580,12 @@ KNOWN_UNET_MODELS: Final[KnownDownloadables] = KnownDownloadables([
     HuggingFile("QuantStack/Wan2.2-T2V-A14B-GGUF", "HighNoise/Wan2.2-T2V-A14B-HighNoise-Q4_K_M.gguf"),
     HuggingFile("QuantStack/Wan2.2-T2V-A14B-GGUF", "LowNoise/Wan2.2-T2V-A14B-LowNoise-Q8_0.gguf"),
     HuggingFile("QuantStack/Wan2.2-T2V-A14B-GGUF", "LowNoise/Wan2.2-T2V-A14B-LowNoise-Q4_K_M.gguf"),
+    HuggingFile("city96/Qwen-Image-gguf", "qwen-image-Q4_K_M.gguf"),
+    HuggingFile("city96/Qwen-Image-gguf", "qwen-image-Q8_0.gguf"),
+    HuggingFile("Comfy-Org/Qwen-Image_ComfyUI", "split_files/diffusion_models/qwen_image_bf16.safetensors"),
+    HuggingFile("Comfy-Org/Qwen-Image_ComfyUI", "split_files/diffusion_models/qwen_image_fp8_e4m3fn.safetensors"),
+    HuggingFile("Comfy-Org/Qwen-Image_ComfyUI", "non_official/diffusion_models/qwen_image_distill_full_bf16.safetensors"),
+    HuggingFile("Comfy-Org/Qwen-Image_ComfyUI", "non_official/diffusion_models/qwen_image_distill_full_fp8_e4m3fn.safetensors"),
 ], folder_names=["diffusion_models", "unet"])
 
 KNOWN_CLIP_MODELS: Final[KnownDownloadables] = KnownDownloadables([
@@ -601,6 +608,8 @@ KNOWN_CLIP_MODELS: Final[KnownDownloadables] = KnownDownloadables([
     HuggingFile("Comfy-Org/HiDream-I1_ComfyUI", "split_files/text_encoders/clip_l_hidream.safetensors"),
     HuggingFile("Comfy-Org/HiDream-I1_ComfyUI", "split_files/text_encoders/clip_g_hidream.safetensors"),
     HuggingFile("Comfy-Org/HiDream-I1_ComfyUI", "split_files/text_encoders/llama_3.1_8b_instruct_fp8_scaled.safetensors"),
+    HuggingFile("Comfy-Org/Qwen-Image_ComfyUI", "split_files/text_encoders/qwen_2.5_vl_7b.safetensors"),
+    HuggingFile("Comfy-Org/Qwen-Image_ComfyUI", "split_files/text_encoders/qwen_2.5_vl_7b_fp8_scaled.safetensors"),
 ], folder_names=["clip", "text_encoders"])
 
 KNOWN_STYLE_MODELS: Final[KnownDownloadables] = KnownDownloadables([
diff --git a/comfy/text_encoders/qwen25_tokenizer/__init__.py b/comfy/text_encoders/qwen25_tokenizer/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/comfy/text_encoders/qwen_image.py b/comfy/text_encoders/qwen_image.py
index ce5c98097..ef01f1d17 100644
--- a/comfy/text_encoders/qwen_image.py
+++ b/comfy/text_encoders/qwen_image.py
@@ -1,22 +1,28 @@
-from transformers import Qwen2Tokenizer
-from comfy import sd1_clip
-import comfy.text_encoders.llama
-import os
-import torch
 import numbers
+import torch
+from transformers import Qwen2Tokenizer
+
+from .llama import Qwen25_7BVLI
+from .. import sd1_clip
+from ..component_model import files
+
 
 class Qwen25_7BVLITokenizer(sd1_clip.SDTokenizer):
-    def __init__(self, embedding_directory=None, tokenizer_data={}):
-        tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "qwen25_tokenizer")
+    def __init__(self, embedding_directory=None, tokenizer_data=None):
+        if tokenizer_data is None:
+            tokenizer_data = {}
+        tokenizer_path = files.get_package_as_path("comfy.text_encoders.qwen25_tokenizer")
         super().__init__(tokenizer_path, pad_with_end=False, embedding_size=3584, embedding_key='qwen25_7b', tokenizer_class=Qwen2Tokenizer, has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_token=151643, tokenizer_data=tokenizer_data)
 
 
 class QwenImageTokenizer(sd1_clip.SD1Tokenizer):
-    def __init__(self, embedding_directory=None, tokenizer_data={}):
+    def __init__(self, embedding_directory=None, tokenizer_data=None):
+        if tokenizer_data is None:
+            tokenizer_data = {}
         super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="qwen25_7b", tokenizer=Qwen25_7BVLITokenizer)
         self.llama_template = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
 
-    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None,**kwargs):
+    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, **kwargs):
         if llama_template is None:
             llama_text = self.llama_template.format(text)
         else:
@@ -25,18 +31,23 @@ class QwenImageTokenizer(sd1_clip.SD1Tokenizer):
 
 
 class Qwen25_7BVLIModel(sd1_clip.SDClipModel):
-    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}):
-        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Qwen25_7BVLI, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
+    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options=None):
+        if model_options is None:
+            model_options = {}
+        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"pad": 151643}, layer_norm_hidden_state=False, model_class=Qwen25_7BVLI, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
 
 
 class QwenImageTEModel(sd1_clip.SD1ClipModel):
-    def __init__(self, device="cpu", dtype=None, model_options={}):
+    def __init__(self, device="cpu", dtype=None, model_options=None):
+        if model_options is None:
+            model_options = {}
         super().__init__(device=device, dtype=dtype, name="qwen25_7b", clip_model=Qwen25_7BVLIModel, model_options=model_options)
 
     def encode_token_weights(self, token_weight_pairs):
         out, pooled, extra = super().encode_token_weights(token_weight_pairs)
         tok_pairs = token_weight_pairs["qwen25_7b"][0]
         count_im_start = 0
+        template_end = 0
         for i, v in enumerate(tok_pairs):
             elem = v[0]
             if not torch.is_tensor(elem):
@@ -61,11 +72,14 @@ class QwenImageTEModel(sd1_clip.SD1ClipModel):
 
 def te(dtype_llama=None, llama_scaled_fp8=None):
     class QwenImageTEModel_(QwenImageTEModel):
-        def __init__(self, device="cpu", dtype=None, model_options={}):
+        def __init__(self, device="cpu", dtype=None, model_options=None):
+            if model_options is None:
+                model_options = {}
             if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options:
                 model_options = model_options.copy()
                 model_options["scaled_fp8"] = llama_scaled_fp8
             if dtype_llama is not None:
                 dtype = dtype_llama
             super().__init__(device=device, dtype=dtype, model_options=model_options)
-    return QwenImageTEModel_
+
+    return QwenImageTEModel_
\ No newline at end of file
diff --git a/comfy_extras/nodes/nodes_model_merging_model_specific.py b/comfy_extras/nodes/nodes_model_merging_model_specific.py
index 681078012..577d4c98f 100644
--- a/comfy_extras/nodes/nodes_model_merging_model_specific.py
+++ b/comfy_extras/nodes/nodes_model_merging_model_specific.py
@@ -322,7 +322,7 @@ class ModelMergeCosmosPredict2_14B(nodes_model_merging.ModelMergeBlocks):
         return {"required": arg_dict}
 
 
-class ModelMergeQwenImage(comfy_extras.nodes_model_merging.ModelMergeBlocks):
+class ModelMergeQwenImage(nodes_model_merging.ModelMergeBlocks):
     CATEGORY = "advanced/model_merging/model_specific"
 
     @classmethod