diff --git a/README.md b/README.md index c75353d36..bcec86377 100644 --- a/README.md +++ b/README.md @@ -140,7 +140,7 @@ ComfyUI follows a weekly release cycle targeting Monday but this regularly chang - Commits outside of the stable release tags may be very unstable and break many custom nodes. - Serves as the foundation for the desktop release -2. **[ComfyUI Desktop](https://github.com/Comfy-Org/Comfy-Desktop)** +2. **[Comfy Desktop](https://github.com/Comfy-Org/Comfy-Desktop)** - Builds a new release using the latest stable core version 3. **[ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend)** diff --git a/comfy/ldm/boogu/model.py b/comfy/ldm/boogu/model.py new file mode 100644 index 000000000..966f3c583 --- /dev/null +++ b/comfy/ldm/boogu/model.py @@ -0,0 +1,321 @@ +# Boogu-Image-0.1 transformer +# Architecture is an OmniGen2 derivative (see comfy/ldm/omnigen/omnigen2.py) with an +# added dual-stream ("double_stream") stage before the single-stream layers, conditioned +# by a Qwen3-VL multimodal LLM. Reuses the OmniGen2/Lumina building blocks and the Flux +# RoPE core, the only new component is the double-stream block + the hybrid forward order. + +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from einops import rearrange + +import comfy.ldm.common_dit +import comfy.ldm.omnigen.omnigen2 +from comfy.ldm.modules.attention import optimized_attention_masked +from comfy.ldm.omnigen.omnigen2 import ( + OmniGen2RotaryPosEmbed, + Lumina2CombinedTimestepCaptionEmbedding, + LuminaRMSNormZero, + LuminaLayerNormContinuous, + LuminaFeedForward, + Attention, + OmniGen2TransformerBlock, + apply_rotary_emb, +) + +class BooguDoubleStreamProcessor(nn.Module): + # Joint attention over [instruct ; img] with separate per-stream q/k/v and output projections. 
+ def __init__(self, dim, head_dim, heads, kv_heads, dtype=None, device=None, operations=None): + super().__init__() + query_dim = head_dim * heads + kv_dim = head_dim * kv_heads + + self.img_to_q = operations.Linear(query_dim, query_dim, bias=False, dtype=dtype, device=device) + self.img_to_k = operations.Linear(query_dim, kv_dim, bias=False, dtype=dtype, device=device) + self.img_to_v = operations.Linear(query_dim, kv_dim, bias=False, dtype=dtype, device=device) + + self.instruct_to_q = operations.Linear(query_dim, query_dim, bias=False, dtype=dtype, device=device) + self.instruct_to_k = operations.Linear(query_dim, kv_dim, bias=False, dtype=dtype, device=device) + self.instruct_to_v = operations.Linear(query_dim, kv_dim, bias=False, dtype=dtype, device=device) + + self.instruct_out = operations.Linear(query_dim, query_dim, bias=False, dtype=dtype, device=device) + self.img_out = operations.Linear(query_dim, query_dim, bias=False, dtype=dtype, device=device) + + def forward(self, attn, img_hidden_states, instruct_hidden_states, rotary_emb, attention_mask=None, transformer_options={}): + batch_size = img_hidden_states.shape[0] + L_instruct = instruct_hidden_states.shape[1] + + img_q = self.img_to_q(img_hidden_states) + img_k = self.img_to_k(img_hidden_states) + img_v = self.img_to_v(img_hidden_states) + + instruct_q = self.instruct_to_q(instruct_hidden_states) + instruct_k = self.instruct_to_k(instruct_hidden_states) + instruct_v = self.instruct_to_v(instruct_hidden_states) + + # Concatenate instruction first, then image (matches reference processor order). 
+ query = torch.cat([instruct_q, img_q], dim=1) + key = torch.cat([instruct_k, img_k], dim=1) + value = torch.cat([instruct_v, img_v], dim=1) + + query = query.view(batch_size, -1, attn.heads, attn.dim_head) + key = key.view(batch_size, -1, attn.kv_heads, attn.dim_head) + value = value.view(batch_size, -1, attn.kv_heads, attn.dim_head) + + query = attn.norm_q(query) + key = attn.norm_k(key) + + if rotary_emb is not None: + query = apply_rotary_emb(query, rotary_emb) + key = apply_rotary_emb(key, rotary_emb) + + query = query.transpose(1, 2) + key = key.transpose(1, 2) + value = value.transpose(1, 2) + + if attn.kv_heads < attn.heads: + key = key.repeat_interleave(attn.heads // attn.kv_heads, dim=1) + value = value.repeat_interleave(attn.heads // attn.kv_heads, dim=1) + + hidden_states = optimized_attention_masked(query, key, value, attn.heads, attention_mask, skip_reshape=True, transformer_options=transformer_options) + + # Split back to instruction/image, apply per-stream output projections, recombine. 
+ instruct_hidden_states = self.instruct_out(hidden_states[:, :L_instruct]) + img_hidden_states = self.img_out(hidden_states[:, L_instruct:]) + hidden_states = torch.cat([instruct_hidden_states, img_hidden_states], dim=1) + + hidden_states = attn.to_out[0](hidden_states) + return hidden_states + + +class BooguJointAttention(nn.Module): + # Holds the shared q/k RMSNorm + final output projection + def __init__(self, dim, head_dim, heads, kv_heads, eps=1e-5, dtype=None, device=None, operations=None): + super().__init__() + self.heads = heads + self.kv_heads = kv_heads + self.dim_head = head_dim + self.scale = head_dim ** -0.5 + + self.norm_q = operations.RMSNorm(head_dim, eps=eps, dtype=dtype, device=device) + self.norm_k = operations.RMSNorm(head_dim, eps=eps, dtype=dtype, device=device) + self.to_out = nn.Sequential( + operations.Linear(heads * head_dim, dim, bias=False, dtype=dtype, device=device), + nn.Dropout(0.0), + ) + self.processor = BooguDoubleStreamProcessor(dim, head_dim, heads, kv_heads, dtype=dtype, device=device, operations=operations) + + def forward(self, img_hidden_states, instruct_hidden_states, rotary_emb, attention_mask=None, transformer_options={}): + return self.processor(self, img_hidden_states, instruct_hidden_states, rotary_emb, attention_mask, transformer_options=transformer_options) + + +class BooguDoubleStreamBlock(nn.Module): + # Dual-stream block: joint attention over [instruct ; img] + image self-attention, each stream with its own modulation/MLP. 
+ def __init__(self, dim, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, dtype=None, device=None, operations=None): + super().__init__() + head_dim = dim // num_attention_heads + + self.img_instruct_attn = BooguJointAttention(dim, head_dim, num_attention_heads, num_kv_heads, eps=1e-5, dtype=dtype, device=device, operations=operations) + self.img_self_attn = Attention( + query_dim=dim, dim_head=head_dim, heads=num_attention_heads, kv_heads=num_kv_heads, + eps=1e-5, bias=False, dtype=dtype, device=device, operations=operations, + ) + + self.img_feed_forward = LuminaFeedForward(dim=dim, inner_dim=4 * dim, multiple_of=multiple_of, dtype=dtype, device=device, operations=operations) + self.instruct_feed_forward = LuminaFeedForward(dim=dim, inner_dim=4 * dim, multiple_of=multiple_of, dtype=dtype, device=device, operations=operations) + + self.img_norm1 = LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, dtype=dtype, device=device, operations=operations) + self.img_norm2 = LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, dtype=dtype, device=device, operations=operations) + self.img_norm3 = LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, dtype=dtype, device=device, operations=operations) + self.instruct_norm1 = LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, dtype=dtype, device=device, operations=operations) + self.instruct_norm2 = LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, dtype=dtype, device=device, operations=operations) + + self.img_attn_norm = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device) + self.img_self_attn_norm = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device) + self.img_ffn_norm1 = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device) + self.img_ffn_norm2 = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device) + + self.instruct_attn_norm = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device) + self.instruct_ffn_norm1 = 
operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device) + self.instruct_ffn_norm2 = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device) + + def forward(self, img_hidden_states, instruct_hidden_states, joint_rotary_emb, img_rotary_emb, temb, joint_attention_mask=None, img_attention_mask=None, transformer_options={}): + L_instruct = instruct_hidden_states.shape[1] + + img_norm1_out, img_gate_msa, img_scale_mlp, img_gate_mlp = self.img_norm1(img_hidden_states, temb) + img_norm2_out, img_shift_mlp, _, _ = self.img_norm2(img_hidden_states, temb) + img_norm3_out, img_gate_self, _, _ = self.img_norm3(img_hidden_states, temb) + + instruct_norm1_out, instruct_gate_msa, instruct_scale_mlp, instruct_gate_mlp = self.instruct_norm1(instruct_hidden_states, temb) + instruct_norm2_out, instruct_shift_mlp, _, _ = self.instruct_norm2(instruct_hidden_states, temb) + + joint_attn_out = self.img_instruct_attn(img_norm1_out, instruct_norm1_out, joint_rotary_emb, joint_attention_mask, transformer_options=transformer_options) + instruct_attn_out = joint_attn_out[:, :L_instruct] + img_attn_out = joint_attn_out[:, L_instruct:] + + img_self_attn_out = self.img_self_attn(img_norm3_out, img_norm3_out, img_attention_mask, img_rotary_emb, transformer_options=transformer_options) + + img_hidden_states = img_hidden_states + img_gate_msa.unsqueeze(1).tanh() * self.img_attn_norm(img_attn_out) + img_hidden_states = img_hidden_states + img_gate_self.unsqueeze(1).tanh() * self.img_self_attn_norm(img_self_attn_out) + img_mlp_input = (1 + img_scale_mlp.unsqueeze(1)) * img_norm2_out + img_shift_mlp.unsqueeze(1) + img_mlp_out = self.img_feed_forward(self.img_ffn_norm1(img_mlp_input)) + img_hidden_states = img_hidden_states + img_gate_mlp.unsqueeze(1).tanh() * self.img_ffn_norm2(img_mlp_out) + + instruct_hidden_states = instruct_hidden_states + instruct_gate_msa.unsqueeze(1).tanh() * self.instruct_attn_norm(instruct_attn_out) + instruct_mlp_input = (1 + 
instruct_scale_mlp.unsqueeze(1)) * instruct_norm2_out + instruct_shift_mlp.unsqueeze(1) + instruct_mlp_out = self.instruct_feed_forward(self.instruct_ffn_norm1(instruct_mlp_input)) + instruct_hidden_states = instruct_hidden_states + instruct_gate_mlp.unsqueeze(1).tanh() * self.instruct_ffn_norm2(instruct_mlp_out) + + return img_hidden_states, instruct_hidden_states + + +class BooguTransformer2DModel(nn.Module): + def __init__( + self, + patch_size: int = 2, + in_channels: int = 16, + out_channels: Optional[int] = None, + hidden_size: int = 3360, + num_layers: int = 32, + num_double_stream_layers: int = 8, + num_refiner_layers: int = 2, + num_attention_heads: int = 28, + num_kv_heads: int = 7, + multiple_of: int = 256, + ffn_dim_multiplier: Optional[float] = None, + norm_eps: float = 1e-5, + axes_dim_rope: Tuple[int, int, int] = (40, 40, 40), + axes_lens: Tuple[int, int, int] = (2048, 1664, 1664), + instruction_feat_dim: int = 4096, + timestep_scale: float = 1000.0, + image_model=None, + device=None, dtype=None, operations=None, + ): + super().__init__() + + self.patch_size = patch_size + self.out_channels = out_channels or in_channels + self.hidden_size = hidden_size + self.dtype = dtype + + self.rope_embedder = OmniGen2RotaryPosEmbed( + theta=10000, + axes_dim=axes_dim_rope, + axes_lens=axes_lens, + patch_size=patch_size, + ) + + self.x_embedder = operations.Linear(patch_size * patch_size * in_channels, hidden_size, dtype=dtype, device=device) + self.ref_image_patch_embedder = operations.Linear(patch_size * patch_size * in_channels, hidden_size, dtype=dtype, device=device) + + self.time_caption_embed = Lumina2CombinedTimestepCaptionEmbedding( + hidden_size=hidden_size, + text_feat_dim=instruction_feat_dim, + norm_eps=norm_eps, + timestep_scale=timestep_scale, dtype=dtype, device=device, operations=operations + ) + + self.noise_refiner = nn.ModuleList([ + OmniGen2TransformerBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, 
norm_eps, modulation=True, dtype=dtype, device=device, operations=operations) + for _ in range(num_refiner_layers) + ]) + + self.ref_image_refiner = nn.ModuleList([ + OmniGen2TransformerBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, modulation=True, dtype=dtype, device=device, operations=operations) + for _ in range(num_refiner_layers) + ]) + + self.context_refiner = nn.ModuleList([ + OmniGen2TransformerBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, modulation=False, dtype=dtype, device=device, operations=operations) + for _ in range(num_refiner_layers) + ]) + + self.double_stream_layers = nn.ModuleList([ + BooguDoubleStreamBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, dtype=dtype, device=device, operations=operations) + for _ in range(num_double_stream_layers) + ]) + + self.single_stream_layers = nn.ModuleList([ + OmniGen2TransformerBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, modulation=True, dtype=dtype, device=device, operations=operations) + for _ in range(num_layers) + ]) + + self.norm_out = LuminaLayerNormContinuous( + embedding_dim=hidden_size, + conditioning_embedding_dim=min(hidden_size, 1024), + elementwise_affine=False, + eps=1e-6, + out_dim=patch_size * patch_size * self.out_channels, dtype=dtype, device=device, operations=operations + ) + + self.image_index_embedding = nn.Parameter(torch.empty(5, hidden_size, device=device, dtype=dtype)) + + # Patchify/refine helpers are identical to OmniGen2; reuse via bound methods. 
+ flat_and_pad_to_seq = comfy.ldm.omnigen.omnigen2.OmniGen2Transformer2DModel.flat_and_pad_to_seq + img_patch_embed_and_refine = comfy.ldm.omnigen.omnigen2.OmniGen2Transformer2DModel.img_patch_embed_and_refine + + def forward(self, x, timesteps, context, num_tokens, ref_latents=None, attention_mask=None, transformer_options={}, **kwargs): + B, C, H, W = x.shape + hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size)) + _, _, H_padded, W_padded = hidden_states.shape + timestep = 1.0 - timesteps + text_hidden_states = context + text_attention_mask = attention_mask + ref_image_hidden_states = ref_latents + device = hidden_states.device + + temb, text_hidden_states = self.time_caption_embed(timestep, text_hidden_states, hidden_states[0].dtype) + + ( + hidden_states, ref_image_hidden_states, + img_mask, ref_img_mask, + l_effective_ref_img_len, l_effective_img_len, + ref_img_sizes, img_sizes, + ) = self.flat_and_pad_to_seq(hidden_states, ref_image_hidden_states) + + ( + context_rotary_emb, ref_img_rotary_emb, noise_rotary_emb, + rotary_emb, encoder_seq_lengths, seq_lengths, + ) = self.rope_embedder( + hidden_states.shape[0], text_hidden_states.shape[1], [num_tokens] * text_hidden_states.shape[0], + l_effective_ref_img_len, l_effective_img_len, + ref_img_sizes, img_sizes, device, + ) + + for layer in self.context_refiner: + text_hidden_states = layer(text_hidden_states, text_attention_mask, context_rotary_emb, transformer_options=transformer_options) + + img_len = hidden_states.shape[1] + combined_img_hidden_states = self.img_patch_embed_and_refine( + hidden_states, ref_image_hidden_states, + img_mask, ref_img_mask, + noise_rotary_emb, ref_img_rotary_emb, + l_effective_ref_img_len, l_effective_img_len, + temb, + transformer_options=transformer_options, + ) + + # Double-stream stage: the image self-attention only sees the [ref ; noise] tokens, + # which sit after the instruction tokens in the joint rope. 
+ L_instruct = text_hidden_states.shape[1] + combined_img_rotary_emb = rotary_emb[:, L_instruct:] + for layer in self.double_stream_layers: + combined_img_hidden_states, text_hidden_states = layer( + combined_img_hidden_states, text_hidden_states, + rotary_emb, combined_img_rotary_emb, temb, + joint_attention_mask=None, img_attention_mask=None, + transformer_options=transformer_options, + ) + + hidden_states = torch.cat([text_hidden_states, combined_img_hidden_states], dim=1) + + for layer in self.single_stream_layers: + hidden_states = layer(hidden_states, None, rotary_emb, temb, transformer_options=transformer_options) + + hidden_states = self.norm_out(hidden_states, temb) + + p = self.patch_size + output = rearrange(hidden_states[:, -img_len:], 'b (h w) (p1 p2 c) -> b c (h p1) (w p2)', h=H_padded // p, w=W_padded // p, p1=p, p2=p)[:, :, :H, :W] + + return -output diff --git a/comfy/ldm/omnigen/omnigen2.py b/comfy/ldm/omnigen/omnigen2.py index e9ca5229d..b8da4cf39 100644 --- a/comfy/ldm/omnigen/omnigen2.py +++ b/comfy/ldm/omnigen/omnigen2.py @@ -22,7 +22,7 @@ def apply_rotary_emb(x, freqs_cis): def swiglu(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: - return F.silu(x) * y + return F.silu(x, inplace=True).mul_(y) class TimestepEmbedding(nn.Module): diff --git a/comfy/model_base.py b/comfy/model_base.py index d143dc06f..f49da50ae 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -54,6 +54,7 @@ import comfy.ldm.pixeldit.model import comfy.ldm.pixeldit.pid import comfy.ldm.ace.model import comfy.ldm.omnigen.omnigen2 +import comfy.ldm.boogu.model import comfy.ldm.qwen_image.model import comfy.ldm.ideogram4.model import comfy.ldm.kandinsky5.model @@ -2103,6 +2104,11 @@ class Omnigen2(BaseModel): out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16]) return out +class Boogu(Omnigen2): + def __init__(self, model_config, model_type=ModelType.FLOW, device=None): + super(Omnigen2, self).__init__(model_config, 
model_type, device=device, unet_model=comfy.ldm.boogu.model.BooguTransformer2DModel) + self.memory_usage_factor_conds = ("ref_latents",) + class QwenImage(BaseModel): def __init__(self, model_config, model_type=ModelType.FLUX, device=None): super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.qwen_image.model.QwenImageTransformer2DModel) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 7d0cab308..b773f0393 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -761,6 +761,16 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): return dit_config + if '{}double_stream_layers.0.img_instruct_attn.processor.img_to_q.weight'.format(key_prefix) in state_dict_keys: # Boogu-Image (OmniGen2 derivative + dual-stream stage) + dit_config = {} + dit_config["image_model"] = "boogu" + dit_config["hidden_size"] = state_dict['{}x_embedder.weight'.format(key_prefix)].shape[0] + dit_config["num_layers"] = count_blocks(state_dict_keys, '{}single_stream_layers.'.format(key_prefix) + '{}.') + dit_config["num_double_stream_layers"] = count_blocks(state_dict_keys, '{}double_stream_layers.'.format(key_prefix) + '{}.') + dit_config["num_refiner_layers"] = count_blocks(state_dict_keys, '{}noise_refiner.'.format(key_prefix) + '{}.') + dit_config["instruction_feat_dim"] = state_dict['{}time_caption_embed.caption_embedder.0.weight'.format(key_prefix)].shape[0] + return dit_config + if '{}time_caption_embed.timestep_embedder.linear_1.bias'.format(key_prefix) in state_dict_keys: # Omnigen2 dit_config = {} dit_config["image_model"] = "omnigen2" diff --git a/comfy/sd.py b/comfy/sd.py index 348fe4958..d9b1c0553 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -68,6 +68,7 @@ import comfy.text_encoders.ace15 import comfy.text_encoders.longcat_image import comfy.text_encoders.qwen35 import comfy.text_encoders.qwen3vl +import comfy.text_encoders.boogu import comfy.text_encoders.ernie import comfy.text_encoders.gemma4 import 
comfy.text_encoders.cogvideo @@ -1301,6 +1302,7 @@ class CLIPType(Enum): LENS = 28 PIXELDIT = 29 IDEOGRAM4 = 30 + BOOGU = 31 @@ -1622,6 +1624,10 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."}) clip_target.clip = comfy.text_encoders.ideogram4.te_qwen3vl(**llama_detect(clip_data)) clip_target.tokenizer = comfy.text_encoders.ideogram4.Ideogram4Qwen3VLTokenizer + elif clip_type == CLIPType.BOOGU and te_model == TEModel.QWEN3VL_8B: # Boogu-Image: full Qwen3-VL-8B, last hidden state, no-think template. + clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."}) + clip_target.clip = comfy.text_encoders.boogu.te(**llama_detect(clip_data)) + clip_target.tokenizer = comfy.text_encoders.boogu.BooguTokenizer elif clip_type in (CLIPType.FLUX, CLIPType.FLUX2): # Flux2 Klein reuses the Qwen3-VL LM (3-layer tap -> 12288); visual unused. 
klein_model_type = "qwen3_8b" if te_model == TEModel.QWEN3VL_8B else "qwen3_4b" clip_target.clip = comfy.text_encoders.flux.klein_te(**llama_detect(clip_data), model_type=klein_model_type) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 3be935577..cc05908ee 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -25,6 +25,7 @@ import comfy.text_encoders.hunyuan_image import comfy.text_encoders.kandinsky5 import comfy.text_encoders.z_image import comfy.text_encoders.ideogram4 +import comfy.text_encoders.boogu import comfy.text_encoders.anima import comfy.text_encoders.ace15 import comfy.text_encoders.longcat_image @@ -1758,6 +1759,27 @@ class Omnigen2(supported_models_base.BASE): hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_3b.transformer.".format(pref)) return supported_models_base.ClipTarget(comfy.text_encoders.omnigen2.Omnigen2Tokenizer, comfy.text_encoders.omnigen2.te(**hunyuan_detect)) +class Boogu(Omnigen2): + unet_config = { + "image_model": "boogu", + } + + sampling_settings = { + "multiplier": 1.0, + "shift": 3.16, + } + + memory_usage_factor = 2.15 + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.Boogu(self, device=device) + return out + + def clip_target(self, state_dict={}): + pref = self.text_encoder_key_prefix[0] + hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen3vl_8b.transformer.".format(pref)) + return supported_models_base.ClipTarget(comfy.text_encoders.boogu.BooguTokenizer, comfy.text_encoders.boogu.te(**hunyuan_detect)) + class Ideogram4(supported_models_base.BASE): unet_config = { "image_model": "ideogram4", @@ -2300,6 +2322,7 @@ models = [ ACEStep, ACEStep15, Omnigen2, + Boogu, QwenImage, Ideogram4, Flux2, diff --git a/comfy/text_encoders/boogu.py b/comfy/text_encoders/boogu.py new file mode 100644 index 000000000..d9de92f10 --- /dev/null +++ b/comfy/text_encoders/boogu.py @@ -0,0 +1,58 @@ 
+"""Boogu-Image text encoder: full Qwen3-VL-8B, last hidden state (4096-dim). + +Boogu uses the final hidden state of Qwen3-VL as the per-token instruction feature +(num_instruction_feature_layers=1, reduce_type=mean -> just the last layer). +The model itself is the standard Qwen3-VL TE, only the chat template differs +(a fixed system prompt and no think block). +""" + +import comfy.text_encoders.qwen3vl +from comfy import sd1_clip + + +# System prompts from the reference pipeline (pipeline_boogu.py). +# T2I (non-empty instruction, no image) uses the helpful-assistant prompt +# everything else (the CFG negative / "drop" condition, and any image case) uses the TI2I "describe" prompt. +BOOGU_T2I_SYSTEM = "You are a helpful assistant that generates high-quality images based on user instructions. The instructions are as follows." +BOOGU_DROP_SYSTEM = "Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate." + + +class BooguTokenizer(comfy.text_encoders.qwen3vl.Qwen3VLTokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, model_type="qwen3vl_8b") + # apply_chat_template without add_generation_prompt + self.llama_template = "<|im_start|>system\n" + BOOGU_T2I_SYSTEM + "<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n" + self.llama_template_images = "<|im_start|>system\n" + BOOGU_DROP_SYSTEM + "<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n" + # Reference SYSTEM_PROMPT_DROP: used for the empty negative/uncond instruction. 
+ self.llama_template_drop = "<|im_start|>system\n" + BOOGU_DROP_SYSTEM + "<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n" + + def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, thinking=True, **kwargs): + if llama_template is None and len(images) == 0 and text.strip() == "": + llama_template = self.llama_template_drop + # Boogu conditions on the no-think template; thinking=True drops the empty think block qwen3vl adds by default. + return super().tokenize_with_weights(text, return_word_ids=return_word_ids, llama_template=llama_template, images=images, prevent_empty_text=prevent_empty_text, thinking=thinking, **kwargs) + + +class BooguQwen3VLClipModel(comfy.text_encoders.qwen3vl.Qwen3VLClipModel): + def __init__(self, device="cpu", dtype=None, attention_mask=True, model_options={}, model_type="qwen3vl_8b"): + super().__init__(device=device, dtype=dtype, attention_mask=attention_mask, model_options=model_options, model_type=model_type) + # apply the final RMSNorm to the tapped last layer + self.layer_norm_hidden_state = True + + +class BooguTEModel(sd1_clip.SD1ClipModel): + def __init__(self, device="cpu", dtype=None, model_options={}): + clip_model = lambda **kw: BooguQwen3VLClipModel(**kw, model_type="qwen3vl_8b") + super().__init__(device=device, dtype=dtype, name="qwen3vl_8b", clip_model=clip_model, model_options=model_options) + + +def te(dtype_llama=None, llama_quantization_metadata=None): + class BooguTEModel_(BooguTEModel): + def __init__(self, device="cpu", dtype=None, model_options={}): + if dtype_llama is not None: + dtype = dtype_llama + if llama_quantization_metadata is not None: + model_options = model_options.copy() + model_options["quantization_metadata"] = llama_quantization_metadata + super().__init__(device=device, dtype=dtype, model_options=model_options) + return BooguTEModel_ diff --git a/comfy_api/feature_flags.py b/comfy_api/feature_flags.py index adb5a3144..0f30608a9 100644 --- 
a/comfy_api/feature_flags.py +++ b/comfy_api/feature_flags.py @@ -25,6 +25,11 @@ CLI_FEATURE_FLAG_REGISTRY: dict[str, FeatureFlagInfo] = { "default": False, "description": "Show the sign-in button in the frontend even when not signed in", }, + "enable_telemetry": { + "type": "bool", + "default": False, + "description": "Signal the frontend that telemetry collection is enabled", + }, } diff --git a/comfy_api_nodes/apis/kling.py b/comfy_api_nodes/apis/kling.py index fe0f97cb3..2c98c23b7 100644 --- a/comfy_api_nodes/apis/kling.py +++ b/comfy_api_nodes/apis/kling.py @@ -149,3 +149,59 @@ class MotionControlRequest(BaseModel): character_orientation: str = Field(...) mode: str = Field(..., description="'pro' or 'std'") model_name: str = Field(...) + + +class Kling3TurboSettings(BaseModel): + resolution: str = Field("720p", description="'720p' or '1080p'") + aspect_ratio: str | None = Field(None, description="'16:9'/'9:16'/'1:1'; text-to-video only") + duration: int = Field(5, description="3-15 second") + + +class Kling3TurboText2VideoRequest(BaseModel): + prompt: str = Field(..., description="<=3072 chars; may use multi-shot 'shot n, m, words; ...'") + settings: Kling3TurboSettings | None = Field(None) + + +class Kling3TurboContent(BaseModel): + type: str = Field(..., description="'prompt' or 'first_frame'") + text: str | None = Field(None, description="for type=prompt; <=2500 chars") + url: str | None = Field(None, description="for type=first_frame") + + +class Kling3TurboImage2VideoRequest(BaseModel): + contents: list[Kling3TurboContent] = Field(..., description="prompt + first_frame materials") + settings: Kling3TurboSettings | None = Field(None) + + +class Kling3TurboCreateData(BaseModel): + id: str | None = Field(None, description="Task ID") + status: str | None = Field(None) + message: str | None = Field(None) + + +class Kling3TurboCreateResponse(BaseModel): + code: int | None = Field(None) + message: str | None = Field(None) + request_id: str | None = Field(None) + 
data: Kling3TurboCreateData | None = Field(None) + + +class Kling3TurboOutput(BaseModel): + type: str | None = Field(None, description="'video', 'image', 'audio', ...") + id: str | None = Field(None) + url: str | None = Field(None) + duration: str | None = Field(None) + + +class Kling3TurboTaskData(BaseModel): + id: str | None = Field(None) + status: str | None = Field(None, description="submitted | processing | succeeded | failed") + message: str | None = Field(None) + outputs: list[Kling3TurboOutput] | None = Field(None) + + +class Kling3TurboQueryResponse(BaseModel): + code: int | None = Field(None) + message: str | None = Field(None) + request_id: str | None = Field(None) + data: list[Kling3TurboTaskData] | None = Field(None) diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index c81d3503d..b27de2549 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -60,6 +60,12 @@ from comfy_api_nodes.apis.kling import ( OmniProImageRequest, OmniProReferences2VideoRequest, OmniProText2VideoRequest, + Kling3TurboSettings, + Kling3TurboText2VideoRequest, + Kling3TurboContent, + Kling3TurboImage2VideoRequest, + Kling3TurboCreateResponse, + Kling3TurboQueryResponse, TaskStatusResponse, TextToVideoWithAudioRequest, ) @@ -2847,6 +2853,67 @@ class MotionControl(IO.ComfyNode): return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url)) +def build_turbo_shot_prompt(multi_prompt: list[MultiPromptEntry]) -> str: + """Render storyboard entries into the Turbo multi-shot prompt 'shot n, m, words; ...'.""" + return "; ".join(f"shot {i}, {int(e.duration)}, {e.prompt}" for i, e in enumerate(multi_prompt, 1)) + ";" + + +def _turbo_video_url(response: Kling3TurboQueryResponse) -> str: + """Extract the result video URL from a /tasks response (data[].outputs[] where type == 'video').""" + task = response.data[0] if response.data else None + if task and task.outputs: + for output in 
task.outputs: + if output.type == "video" and output.url: + return output.url + raise RuntimeError(f"Kling 3.0 Turbo task finished without a video output: {response.model_dump()}") + + +async def execute_kling_turbo( + cls: type[IO.ComfyNode], + *, + prompt: str, + resolution: str, + aspect_ratio: str, + duration: int, + start_frame: torch.Tensor | None, +) -> IO.NodeOutput: + """Create + poll a Kling 3.0 Turbo task. Image-to-video when start_frame is given, else text-to-video.""" + if start_frame is not None: + validate_image_dimensions(start_frame, min_width=300, min_height=300) + validate_image_aspect_ratio(start_frame, (1, 2.5), (2.5, 1)) + contents = [Kling3TurboContent(type="first_frame", url=tensor_to_base64_string(start_frame))] + if prompt: + contents.insert(0, Kling3TurboContent(type="prompt", text=prompt)) + create = await sync_op( + cls, + ApiEndpoint(path="/proxy/kling/image-to-video/kling-3.0-turbo", method="POST"), + response_model=Kling3TurboCreateResponse, + data=Kling3TurboImage2VideoRequest( + contents=contents, + settings=Kling3TurboSettings(resolution=resolution, duration=duration), # i2v: no aspect_ratio + ), + ) + else: + create = await sync_op( + cls, + ApiEndpoint(path="/proxy/kling/text-to-video/kling-3.0-turbo", method="POST"), + response_model=Kling3TurboCreateResponse, + data=Kling3TurboText2VideoRequest( + prompt=prompt, + settings=Kling3TurboSettings(resolution=resolution, aspect_ratio=aspect_ratio, duration=duration), + ), + ) + if not (create.data and create.data.id): + raise RuntimeError(f"Kling 3.0 Turbo create failed. 
Code: {create.code}, Message: {create.message}") + final_response = await poll_op( + cls, + ApiEndpoint(path="/proxy/kling/tasks", query_params={"task_ids": create.data.id}), + response_model=Kling3TurboQueryResponse, + status_extractor=lambda r: (r.data[0].status if r.data else None), + ) + return IO.NodeOutput(await download_url_to_video_output(_turbo_video_url(final_response))) + + class KlingVideoNode(IO.ComfyNode): @classmethod @@ -2884,7 +2951,11 @@ class KlingVideoNode(IO.ComfyNode): ], tooltip="Generate a series of video segments with individual prompts and durations.", ), - IO.Boolean.Input("generate_audio", default=True), + IO.Boolean.Input( + "generate_audio", + default=True, + tooltip="'kling-3.0-turbo' always generates native audio, so the audio toggle is ignored.", + ), IO.DynamicCombo.Input( "model", options=[ @@ -2899,6 +2970,17 @@ class KlingVideoNode(IO.ComfyNode): ), ], ), + IO.DynamicCombo.Option( + "kling-3.0-turbo", + [ + IO.Combo.Input("resolution", options=["1080p", "720p"], default="720p"), + IO.Combo.Input( + "aspect_ratio", + options=["16:9", "9:16", "1:1"], + tooltip="Ignored in image-to-video mode.", + ), + ], + ), ], tooltip="Model and generation settings.", ), @@ -2930,6 +3012,7 @@ class KlingVideoNode(IO.ComfyNode): price_badge=IO.PriceBadge( depends_on=IO.PriceBadgeDepends( widgets=[ + "model", "model.resolution", "generate_audio", "multi_shot", @@ -2944,14 +3027,7 @@ class KlingVideoNode(IO.ComfyNode): ), expr=""" ( - $rates := { - "4k": {"off": 0.42, "on": 0.42}, - "1080p": {"off": 0.112, "on": 0.168}, - "720p": {"off": 0.084, "on": 0.126} - }; $res := $lookup(widgets, "model.resolution"); - $audio := widgets.generate_audio ? "on" : "off"; - $rate := $lookup($lookup($rates, $res), $audio); $ms := widgets.multi_shot; $isSb := $ms != "disabled"; $n := $isSb ? $number($substring($ms, 0, 1)) : 0; @@ -2962,7 +3038,18 @@ class KlingVideoNode(IO.ComfyNode): $d5 := $n >= 5 ? 
$lookup(widgets, "multi_shot.storyboard_5_duration") : 0; $d6 := $n >= 6 ? $lookup(widgets, "multi_shot.storyboard_6_duration") : 0; $dur := $isSb ? $d1 + $d2 + $d3 + $d4 + $d5 + $d6 : $lookup(widgets, "multi_shot.duration"); - {"type":"usd","usd": $rate * $dur} + widgets.model = "kling-3.0-turbo" + ? {"type":"usd","usd": ($res = "1080p" ? 0.14 : 0.112) * $dur} + : ( + $rates := { + "4k": {"off": 0.42, "on": 0.42}, + "1080p": {"off": 0.112, "on": 0.168}, + "720p": {"off": 0.084, "on": 0.126} + }; + $audio := widgets.generate_audio ? "on" : "off"; + $rate := $lookup($lookup($rates, $res), $audio); + {"type":"usd","usd": $rate * $dur} + ) ) """, ), @@ -3015,6 +3102,17 @@ class KlingVideoNode(IO.ComfyNode): duration = multi_shot["duration"] validate_string(multi_shot["prompt"], min_length=1, max_length=2500) + if model["model"] == "kling-3.0-turbo": + turbo_prompt = build_turbo_shot_prompt(multi_prompt_list) if custom_multi_shot else multi_shot["prompt"] + return await execute_kling_turbo( + cls, + prompt=turbo_prompt, + resolution=model["resolution"], + aspect_ratio=model["aspect_ratio"], + duration=duration, + start_frame=start_frame, + ) + if start_frame is not None: validate_image_dimensions(start_frame, min_width=300, min_height=300) validate_image_aspect_ratio(start_frame, (1, 2.5), (2.5, 1)) diff --git a/comfy_extras/nodes_boogu.py b/comfy_extras/nodes_boogu.py new file mode 100644 index 000000000..f3951c290 --- /dev/null +++ b/comfy_extras/nodes_boogu.py @@ -0,0 +1,97 @@ +import math + +import node_helpers +import comfy.utils +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io + + +class TextEncodeBooguEdit(io.ComfyNode): + """Boogu-Image Edit conditioning. 
+ + The edit image is used twice, matching the reference pipeline: + - Qwen3-VL vision tokens (instruction understanding) -> positive only + - VAE reference latent (image identity) -> positive and negative + The ref latent is in both conds so it cancels under CFG (identity preserved); + the vision tokens are only in the positive so CFG amplifies the instruction. + The tokenizer selects the right system prompt automatically (image -> TI2I, + empty negative -> DROP), so no template plumbing is needed here. + """ + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="TextEncodeBooguEdit", + category="model/conditioning/boogu", + inputs=[ + io.Clip.Input("clip"), + io.String.Input("prompt", multiline=True, dynamic_prompts=True), + io.String.Input("negative_prompt", multiline=True, dynamic_prompts=True, advanced=True), + io.Vae.Input("vae"), + io.Autogrow.Input( + "images", + template=io.Autogrow.TemplateNames( + io.Image.Input("image"), + names=[f"image_{i}" for i in range(1, 17)], + min=0, + ), + tooltip="Reference image(s) to edit. Boogu focuses on one reference per sample; more are allowed.", + ), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + ], + ) + + @classmethod + def execute(cls, clip, prompt, negative_prompt, vae=None, images: io.Autogrow.Type = None) -> io.NodeOutput: + ref_latents = [] + images_vl = [] + + images = images or {} + for name in sorted(images, key=lambda n: int(n.rsplit("_", 1)[-1])): + image = images[name] + if image is None: + continue + samples = image.movedim(-1, 1) + + # Vision tower input: the reference caps the VLM image at 384x384 + # (max_vlm_input_pil_pixels in pipeline_boogu.py). 
+ total = int(384 * 384) + scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2])) + width = round(samples.shape[3] * scale_by) + height = round(samples.shape[2] * scale_by) + s = comfy.utils.common_upscale(samples, width, height, "area", "disabled") + images_vl.append(s.movedim(1, -1)[:, :, :, :3]) + + # Reference latent: align to 16 px (VAE /8 * patch_size 2). + if vae is not None: + total = int(1024 * 1024) + scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2])) + width = round(samples.shape[3] * scale_by / 16.0) * 16 + height = round(samples.shape[2] * scale_by / 16.0) * 16 + s = comfy.utils.common_upscale(samples, width, height, "area", "disabled") + ref_latents.append(vae.encode(s.movedim(1, -1)[:, :, :, :3])) + + # positive: instruction + vision tokens; negative: empty (no vision). Ref latent on both. + positive = clip.encode_from_tokens_scheduled(clip.tokenize(prompt, images=images_vl)) + negative = clip.encode_from_tokens_scheduled(clip.tokenize(negative_prompt)) + + if len(ref_latents) > 0: + positive = node_helpers.conditioning_set_values(positive, {"reference_latents": ref_latents}, append=True) + negative = node_helpers.conditioning_set_values(negative, {"reference_latents": ref_latents}, append=True) + + return io.NodeOutput(positive, negative) + + +class BooguExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + TextEncodeBooguEdit, + ] + + +async def comfy_entrypoint() -> BooguExtension: + return BooguExtension() diff --git a/nodes.py b/nodes.py index bb4649478..0b3fdab63 100644 --- a/nodes.py +++ b/nodes.py @@ -969,7 +969,7 @@ class CLIPLoader: @classmethod def INPUT_TYPES(s): return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ), - "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", 
"flux2", "ovis", "longcat_image", "cogvideox", "lens", "pixeldit", "ideogram4"], ), + "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image", "cogvideox", "lens", "pixeldit", "ideogram4", "boogu"], ), }, "optional": { "device": (["default", "cpu"], {"advanced": True}), @@ -2425,6 +2425,7 @@ async def init_builtin_extra_nodes(): "nodes_tcfg.py", "nodes_context_windows.py", "nodes_qwen.py", + "nodes_boogu.py", "nodes_chroma_radiance.py", "nodes_pid.py", "nodes_model_patch.py", diff --git a/openapi.yaml b/openapi.yaml index 5193e1773..2446e64e4 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -673,46 +673,32 @@ components: - created_at - updated_at type: object - JobsBatchCancelNotFoundResponse: - description: | - Returned with 404 from POST /api/jobs/cancel when one or more - requested job ids are unknown. The batch is fail-fast, so no job - was cancelled. - properties: - error: - description: Human-readable error message - type: string - unknown_ids: - description: The subset of requested job ids that were not found - items: - type: string - type: array - required: - - error - - unknown_ids - type: object - JobsBatchCancelRequest: + JobsCancelRequest: additionalProperties: false - description: Request body for batch job cancellation + description: Request to cancel multiple jobs by ID. properties: job_ids: - description: Ids (UUIDs) of the jobs to cancel + description: Job identifiers (UUIDs) to cancel. items: format: uuid type: string + maxItems: 100 + minItems: 1 type: array required: - job_ids type: object - JobsBatchCancelResponse: - description: Response for POST /api/jobs/cancel when all requested jobs were known. + JobsCancelResponse: + description: Response for POST /api/jobs/cancel. 
properties: cancelled: description: | - True when a cancel event was dispatched for at least one job in - the batch. False when every requested job was already in a - terminal state (the call is still 200 — idempotent). - type: boolean + Job IDs for which a cancel event was successfully dispatched by this + call. Jobs already in a terminal or cancelling state are idempotently + skipped and will not appear here. + items: + type: string + type: array required: - cancelled type: object @@ -1049,7 +1035,7 @@ components: description: If true, clear all pending jobs from the queue type: boolean delete: - description: Array of PENDING job IDs to cancel + description: Array of job IDs to cancel; pending and running jobs transition to cancelled items: type: string type: array @@ -1865,6 +1851,83 @@ paths: summary: Update asset metadata tags: - file + /api/assets/{id}/content: + get: + description: | + Returns the binary content of an asset by ID. + + The contract is the same across runtimes — "GET this path and you + receive the asset's bytes" — but the mechanism differs: + - **Local ComfyUI** streams the bytes directly (`200`, + `application/octet-stream`). + - **Cloud** does not proxy large files; it responds `302` with a + `Location` redirect to a short-lived signed storage URL. Clients that + follow redirects (browsers, `fetch`/XHR, ``/`