feat: Support Boogu-Image (CORE-308) (#14523)

2026-06-18 22:09:38 +08:00 · 2026-06-18 00:22:36 +03:00 · 2026-06-18 00:22:36 +03:00 · e25c391888
commit e25c391888
parent ca3dbe206c
9 changed files with 523 additions and 2 deletions
--- a/comfy/ldm/boogu/model.py
+++ b/comfy/ldm/boogu/model.py
@ -0,0 +1,321 @@
+# Boogu-Image-0.1 transformer
+# Architecture is an OmniGen2 derivative (see comfy/ldm/omnigen/omnigen2.py) with an
+# added dual-stream ("double_stream") stage before the single-stream layers, conditioned
+# by a Qwen3-VL multimodal LLM. It reuses the OmniGen2/Lumina building blocks and the Flux
+# RoPE core; the only new components are the double-stream block and the hybrid forward order.
+
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+from einops import rearrange
+
+import comfy.ldm.common_dit
+import comfy.ldm.omnigen.omnigen2
+from comfy.ldm.modules.attention import optimized_attention_masked
+from comfy.ldm.omnigen.omnigen2 import (
+    OmniGen2RotaryPosEmbed,
+    Lumina2CombinedTimestepCaptionEmbedding,
+    LuminaRMSNormZero,
+    LuminaLayerNormContinuous,
+    LuminaFeedForward,
+    Attention,
+    OmniGen2TransformerBlock,
+    apply_rotary_emb,
+)
+
+class BooguDoubleStreamProcessor(nn.Module):
+    # Joint attention over [instruct ; img] with separate per-stream q/k/v and output projections.
+    # The head layout, the q/k norms and the shared final projection are read from the `attn`
+    # module handed to forward(); this class only owns the per-stream Linear layers.
+    def __init__(self, dim, head_dim, heads, kv_heads, dtype=None, device=None, operations=None):
+        super().__init__()
+        # NOTE: `dim` is accepted but not used here; every projection is sized from
+        # head_dim * heads (queries / outputs) or head_dim * kv_heads (keys / values).
+        query_dim = head_dim * heads
+        kv_dim = head_dim * kv_heads
+
+        # Image-stream q/k/v projections.
+        self.img_to_q = operations.Linear(query_dim, query_dim, bias=False, dtype=dtype, device=device)
+        self.img_to_k = operations.Linear(query_dim, kv_dim, bias=False, dtype=dtype, device=device)
+        self.img_to_v = operations.Linear(query_dim, kv_dim, bias=False, dtype=dtype, device=device)
+
+        # Instruction-stream q/k/v projections.
+        self.instruct_to_q = operations.Linear(query_dim, query_dim, bias=False, dtype=dtype, device=device)
+        self.instruct_to_k = operations.Linear(query_dim, kv_dim, bias=False, dtype=dtype, device=device)
+        self.instruct_to_v = operations.Linear(query_dim, kv_dim, bias=False, dtype=dtype, device=device)
+
+        # Per-stream output projections, applied after the joint attention and before attn.to_out.
+        self.instruct_out = operations.Linear(query_dim, query_dim, bias=False, dtype=dtype, device=device)
+        self.img_out = operations.Linear(query_dim, query_dim, bias=False, dtype=dtype, device=device)
+
+    def forward(self, attn, img_hidden_states, instruct_hidden_states, rotary_emb, attention_mask=None, transformer_options={}):
+        # attn: module providing heads, kv_heads, dim_head, norm_q, norm_k and to_out.
+        # img_hidden_states / instruct_hidden_states: per-stream tokens, batch on dim 0 and tokens on dim 1.
+        # rotary_emb: rotary embedding for the concatenated [instruct ; img] sequence, or None to skip RoPE.
+        # Returns the projected joint sequence, instruction tokens first, then image tokens.
+        batch_size = img_hidden_states.shape[0]
+        L_instruct = instruct_hidden_states.shape[1]
+
+        img_q = self.img_to_q(img_hidden_states)
+        img_k = self.img_to_k(img_hidden_states)
+        img_v = self.img_to_v(img_hidden_states)
+
+        instruct_q = self.instruct_to_q(instruct_hidden_states)
+        instruct_k = self.instruct_to_k(instruct_hidden_states)
+        instruct_v = self.instruct_to_v(instruct_hidden_states)
+
+        # Concatenate instruction first, then image (matches reference processor order).
+        query = torch.cat([instruct_q, img_q], dim=1)
+        key = torch.cat([instruct_k, img_k], dim=1)
+        value = torch.cat([instruct_v, img_v], dim=1)
+
+        # Split the channel axis into (heads, dim_head) so the per-head norms and RoPE can be applied.
+        query = query.view(batch_size, -1, attn.heads, attn.dim_head)
+        key = key.view(batch_size, -1, attn.kv_heads, attn.dim_head)
+        value = value.view(batch_size, -1, attn.kv_heads, attn.dim_head)
+
+        # The q/k norms live on the parent attention module and are shared by both streams.
+        query = attn.norm_q(query)
+        key = attn.norm_k(key)
+
+        if rotary_emb is not None:
+            query = apply_rotary_emb(query, rotary_emb)
+            key = apply_rotary_emb(key, rotary_emb)
+
+        # Move heads in front of tokens: (batch, heads, tokens, dim_head).
+        # Presumably the layout optimized_attention_masked expects with skip_reshape=True — TODO confirm.
+        query = query.transpose(1, 2)
+        key = key.transpose(1, 2)
+        value = value.transpose(1, 2)
+
+        # Grouped-query attention: repeat each kv head so it serves heads // kv_heads query heads.
+        if attn.kv_heads < attn.heads:
+            key = key.repeat_interleave(attn.heads // attn.kv_heads, dim=1)
+            value = value.repeat_interleave(attn.heads // attn.kv_heads, dim=1)
+
+        # No mask is built here; `attention_mask` is forwarded unchanged.
+        hidden_states = optimized_attention_masked(query, key, value, attn.heads, attention_mask, skip_reshape=True, transformer_options=transformer_options)
+
+        # Split back to instruction/image, apply per-stream output projections, recombine.
+        # Assumes the attention output is (batch, tokens, heads * dim_head); the dim-1 slicing
+        # and the Linear sizes rely on it — TODO confirm.
+        instruct_hidden_states = self.instruct_out(hidden_states[:, :L_instruct])
+        img_hidden_states = self.img_out(hidden_states[:, L_instruct:])
+        hidden_states = torch.cat([instruct_hidden_states, img_hidden_states], dim=1)
+
+        # Shared final projection: only element 0 of attn.to_out is applied here.
+        hidden_states = attn.to_out[0](hidden_states)
+        return hidden_states
+
+
+class BooguJointAttention(nn.Module):
+    # Holds the shared q/k RMSNorm + final output projection
+    # The attention computation itself is delegated to `self.processor`, which receives this
+    # module so it can read heads / kv_heads / dim_head, the norms and `to_out`.
+    def __init__(self, dim, head_dim, heads, kv_heads, eps=1e-5, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.heads = heads
+        self.kv_heads = kv_heads
+        self.dim_head = head_dim
+        # Stored for reference; the processor in this file does not read it.
+        self.scale = head_dim ** -0.5
+
+        # Per-head norms: they normalise over the head_dim axis.
+        self.norm_q = operations.RMSNorm(head_dim, eps=eps, dtype=dtype, device=device)
+        self.norm_k = operations.RMSNorm(head_dim, eps=eps, dtype=dtype, device=device)
+        # Dropout has p=0.0; presumably kept so the projection stays at key `to_out.0`
+        # to match the checkpoint layout — TODO confirm.
+        self.to_out = nn.Sequential(
+            operations.Linear(heads * head_dim, dim, bias=False, dtype=dtype, device=device),
+            nn.Dropout(0.0),
+        )
+        self.processor = BooguDoubleStreamProcessor(dim, head_dim, heads, kv_heads, dtype=dtype, device=device, operations=operations)
+
+    def forward(self, img_hidden_states, instruct_hidden_states, rotary_emb, attention_mask=None, transformer_options={}):
+        # Thin wrapper: passes `self` as the `attn` argument of the processor.
+        return self.processor(self, img_hidden_states, instruct_hidden_states, rotary_emb, attention_mask, transformer_options=transformer_options)
+
+
+class BooguDoubleStreamBlock(nn.Module):
+    # Dual-stream block: joint attention over [instruct ; img] + image self-attention, each stream with its own modulation/MLP.
+    # Both streams are modulated by the same `temb`, and forward() returns both updated streams.
+    def __init__(self, dim, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, dtype=None, device=None, operations=None):
+        super().__init__()
+        # NOTE: `ffn_dim_multiplier` is accepted but not used anywhere in this block.
+        head_dim = dim // num_attention_heads
+
+        # Both attention modules use a fixed eps=1e-5 rather than `norm_eps`.
+        self.img_instruct_attn = BooguJointAttention(dim, head_dim, num_attention_heads, num_kv_heads, eps=1e-5, dtype=dtype, device=device, operations=operations)
+        self.img_self_attn = Attention(
+            query_dim=dim, dim_head=head_dim, heads=num_attention_heads, kv_heads=num_kv_heads,
+            eps=1e-5, bias=False, dtype=dtype, device=device, operations=operations,
+        )
+
+        self.img_feed_forward = LuminaFeedForward(dim=dim, inner_dim=4 * dim, multiple_of=multiple_of, dtype=dtype, device=device, operations=operations)
+        self.instruct_feed_forward = LuminaFeedForward(dim=dim, inner_dim=4 * dim, multiple_of=multiple_of, dtype=dtype, device=device, operations=operations)
+
+        # Modulated input norms. Each returns a 4-tuple, of which forward() uses
+        # a different subset per norm (see the unpacking there).
+        self.img_norm1 = LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, dtype=dtype, device=device, operations=operations)
+        self.img_norm2 = LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, dtype=dtype, device=device, operations=operations)
+        self.img_norm3 = LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, dtype=dtype, device=device, operations=operations)
+        self.instruct_norm1 = LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, dtype=dtype, device=device, operations=operations)
+        self.instruct_norm2 = LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, dtype=dtype, device=device, operations=operations)
+
+        # Plain RMSNorms on the branch outputs (and on the MLP input), image stream.
+        self.img_attn_norm = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device)
+        self.img_self_attn_norm = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device)
+        self.img_ffn_norm1 = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device)
+        self.img_ffn_norm2 = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device)
+
+        # Same for the instruction stream, which has no self-attention branch.
+        self.instruct_attn_norm = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device)
+        self.instruct_ffn_norm1 = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device)
+        self.instruct_ffn_norm2 = operations.RMSNorm(dim, eps=norm_eps, dtype=dtype, device=device)
+
+    def forward(self, img_hidden_states, instruct_hidden_states, joint_rotary_emb, img_rotary_emb, temb, joint_attention_mask=None, img_attention_mask=None, transformer_options={}):
+        # joint_rotary_emb: rotary embedding for the [instruct ; img] sequence (joint attention).
+        # img_rotary_emb: rotary embedding for the image tokens only (image self-attention).
+        # temb: conditioning embedding fed to every modulated norm.
+        # Returns (img_hidden_states, instruct_hidden_states) after the residual updates.
+        L_instruct = instruct_hidden_states.shape[1]
+
+        # norm1 supplies the attention gate plus the MLP scale/gate. For norm2 and norm3 only
+        # the second return slot is kept, used as the MLP shift and the self-attention gate.
+        img_norm1_out, img_gate_msa, img_scale_mlp, img_gate_mlp = self.img_norm1(img_hidden_states, temb)
+        img_norm2_out, img_shift_mlp, _, _ = self.img_norm2(img_hidden_states, temb)
+        img_norm3_out, img_gate_self, _, _ = self.img_norm3(img_hidden_states, temb)
+
+        instruct_norm1_out, instruct_gate_msa, instruct_scale_mlp, instruct_gate_mlp = self.instruct_norm1(instruct_hidden_states, temb)
+        instruct_norm2_out, instruct_shift_mlp, _, _ = self.instruct_norm2(instruct_hidden_states, temb)
+
+        # Joint attention returns instruction tokens first, so split at L_instruct.
+        joint_attn_out = self.img_instruct_attn(img_norm1_out, instruct_norm1_out, joint_rotary_emb, joint_attention_mask, transformer_options=transformer_options)
+        instruct_attn_out = joint_attn_out[:, :L_instruct]
+        img_attn_out = joint_attn_out[:, L_instruct:]
+
+        img_self_attn_out = self.img_self_attn(img_norm3_out, img_norm3_out, img_attention_mask, img_rotary_emb, transformer_options=transformer_options)
+
+        # Image stream: three gated residual branches (joint attention, self-attention, MLP).
+        # Gates go through tanh and are broadcast over the token axis.
+        # NOTE(review): the MLP input uses img_norm2_out, computed from the hidden states
+        # *before* the attention residuals were added — presumably matches the reference; confirm.
+        img_hidden_states = img_hidden_states + img_gate_msa.unsqueeze(1).tanh() * self.img_attn_norm(img_attn_out)
+        img_hidden_states = img_hidden_states + img_gate_self.unsqueeze(1).tanh() * self.img_self_attn_norm(img_self_attn_out)
+        img_mlp_input = (1 + img_scale_mlp.unsqueeze(1)) * img_norm2_out + img_shift_mlp.unsqueeze(1)
+        img_mlp_out = self.img_feed_forward(self.img_ffn_norm1(img_mlp_input))
+        img_hidden_states = img_hidden_states + img_gate_mlp.unsqueeze(1).tanh() * self.img_ffn_norm2(img_mlp_out)
+
+        # Instruction stream: the same pattern with two branches (joint attention, MLP).
+        instruct_hidden_states = instruct_hidden_states + instruct_gate_msa.unsqueeze(1).tanh() * self.instruct_attn_norm(instruct_attn_out)
+        instruct_mlp_input = (1 + instruct_scale_mlp.unsqueeze(1)) * instruct_norm2_out + instruct_shift_mlp.unsqueeze(1)
+        instruct_mlp_out = self.instruct_feed_forward(self.instruct_ffn_norm1(instruct_mlp_input))
+        instruct_hidden_states = instruct_hidden_states + instruct_gate_mlp.unsqueeze(1).tanh() * self.instruct_ffn_norm2(instruct_mlp_out)
+
+        return img_hidden_states, instruct_hidden_states
+
+
+class BooguTransformer2DModel(nn.Module):
+    # Boogu-Image diffusion transformer.
+    # Forward order: context refiner (text) -> patch embed + refiners (noise / reference images)
+    # -> double-stream layers (text and image kept separate) -> single-stream layers over the
+    # concatenated [text ; image] sequence -> output norm/projection -> un-patchify.
+    def __init__(
+        self,
+        patch_size: int = 2,
+        in_channels: int = 16,
+        out_channels: Optional[int] = None,
+        hidden_size: int = 3360,
+        num_layers: int = 32,
+        num_double_stream_layers: int = 8,
+        num_refiner_layers: int = 2,
+        num_attention_heads: int = 28,
+        num_kv_heads: int = 7,
+        multiple_of: int = 256,
+        ffn_dim_multiplier: Optional[float] = None,
+        norm_eps: float = 1e-5,
+        axes_dim_rope: Tuple[int, int, int] = (40, 40, 40),
+        axes_lens: Tuple[int, int, int] = (2048, 1664, 1664),
+        instruction_feat_dim: int = 4096,
+        timestep_scale: float = 1000.0,
+        image_model=None,
+        device=None, dtype=None, operations=None,
+    ):
+        # num_layers counts the single-stream layers only; the double-stream layers are
+        # configured separately through num_double_stream_layers.
+        # `image_model` is accepted but not used in this constructor.
+        # With the defaults, hidden_size / num_attention_heads = 120 = sum(axes_dim_rope).
+        super().__init__()
+
+        self.patch_size = patch_size
+        # Falls back to in_channels when out_channels is None (or 0).
+        self.out_channels = out_channels or in_channels
+        self.hidden_size = hidden_size
+        self.dtype = dtype
+
+        self.rope_embedder = OmniGen2RotaryPosEmbed(
+            theta=10000,
+            axes_dim=axes_dim_rope,
+            axes_lens=axes_lens,
+            patch_size=patch_size,
+        )
+
+        # Separate patch embedders for the noisy latent and for reference-image latents.
+        self.x_embedder = operations.Linear(patch_size * patch_size * in_channels, hidden_size, dtype=dtype, device=device)
+        self.ref_image_patch_embedder = operations.Linear(patch_size * patch_size * in_channels, hidden_size, dtype=dtype, device=device)
+
+        self.time_caption_embed = Lumina2CombinedTimestepCaptionEmbedding(
+            hidden_size=hidden_size,
+            text_feat_dim=instruction_feat_dim,
+            norm_eps=norm_eps,
+            timestep_scale=timestep_scale, dtype=dtype, device=device, operations=operations
+        )
+
+        self.noise_refiner = nn.ModuleList([
+            OmniGen2TransformerBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, modulation=True, dtype=dtype, device=device, operations=operations)
+            for _ in range(num_refiner_layers)
+        ])
+
+        self.ref_image_refiner = nn.ModuleList([
+            OmniGen2TransformerBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, modulation=True, dtype=dtype, device=device, operations=operations)
+            for _ in range(num_refiner_layers)
+        ])
+
+        # The text refiner is built with modulation=False, and forward() calls it without temb.
+        self.context_refiner = nn.ModuleList([
+            OmniGen2TransformerBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, modulation=False, dtype=dtype, device=device, operations=operations)
+            for _ in range(num_refiner_layers)
+        ])
+
+        self.double_stream_layers = nn.ModuleList([
+            BooguDoubleStreamBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, dtype=dtype, device=device, operations=operations)
+            for _ in range(num_double_stream_layers)
+        ])
+
+        self.single_stream_layers = nn.ModuleList([
+            OmniGen2TransformerBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, modulation=True, dtype=dtype, device=device, operations=operations)
+            for _ in range(num_layers)
+        ])
+
+        self.norm_out = LuminaLayerNormContinuous(
+            embedding_dim=hidden_size,
+            conditioning_embedding_dim=min(hidden_size, 1024),
+            elementwise_affine=False,
+            eps=1e-6,
+            out_dim=patch_size * patch_size * self.out_channels, dtype=dtype, device=device, operations=operations
+        )
+
+        # Allocated uninitialised (torch.empty); presumably filled from the checkpoint and read
+        # by the borrowed img_patch_embed_and_refine helper — it is not referenced in this file.
+        self.image_index_embedding = nn.Parameter(torch.empty(5, hidden_size, device=device, dtype=dtype))
+
+    # Patchify/refine helpers are shared with OmniGen2: the plain functions are borrowed
+    # from that class so they bind to instances of this one when called through `self`.
+    flat_and_pad_to_seq = comfy.ldm.omnigen.omnigen2.OmniGen2Transformer2DModel.flat_and_pad_to_seq
+    img_patch_embed_and_refine = comfy.ldm.omnigen.omnigen2.OmniGen2Transformer2DModel.img_patch_embed_and_refine
+
+    def forward(self, x, timesteps, context, num_tokens, ref_latents=None, attention_mask=None, transformer_options={}, **kwargs):
+        # x: latent unpacked as (B, C, H, W).
+        # timesteps: sampler timesteps; the model is fed 1.0 - timesteps.
+        # context: instruction features, projected by time_caption_embed.
+        # num_tokens: per-sample text length handed to the rope embedder (same value for every sample).
+        # ref_latents: optional reference-image latents.
+        # attention_mask: text mask; only the context refiner receives it.
+        # Returns a tensor cropped to the input H x W, with its sign flipped.
+        B, C, H, W = x.shape
+        hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))
+        _, _, H_padded, W_padded = hidden_states.shape
+        # NOTE(review): time is reversed here and the output is negated at the end — presumably
+        # converting between the sampler's and the model's flow conventions; confirm against Omnigen2.
+        timestep = 1.0 - timesteps
+        text_hidden_states = context
+        text_attention_mask = attention_mask
+        ref_image_hidden_states = ref_latents
+        device = hidden_states.device
+
+        temb, text_hidden_states = self.time_caption_embed(timestep, text_hidden_states, hidden_states[0].dtype)
+
+        (
+            hidden_states, ref_image_hidden_states,
+            img_mask, ref_img_mask,
+            l_effective_ref_img_len, l_effective_img_len,
+            ref_img_sizes, img_sizes,
+        ) = self.flat_and_pad_to_seq(hidden_states, ref_image_hidden_states)
+
+        (
+            context_rotary_emb, ref_img_rotary_emb, noise_rotary_emb,
+            rotary_emb, encoder_seq_lengths, seq_lengths,
+        ) = self.rope_embedder(
+            hidden_states.shape[0], text_hidden_states.shape[1], [num_tokens] * text_hidden_states.shape[0],
+            l_effective_ref_img_len, l_effective_img_len,
+            ref_img_sizes, img_sizes, device,
+        )
+
+        for layer in self.context_refiner:
+            text_hidden_states = layer(text_hidden_states, text_attention_mask, context_rotary_emb, transformer_options=transformer_options)
+
+        # Noise-token count, captured before the reference tokens are combined in; used at the
+        # end to slice the noise tokens back out of the joint sequence.
+        img_len = hidden_states.shape[1]
+        combined_img_hidden_states = self.img_patch_embed_and_refine(
+            hidden_states, ref_image_hidden_states,
+            img_mask, ref_img_mask,
+            noise_rotary_emb, ref_img_rotary_emb,
+            l_effective_ref_img_len, l_effective_img_len,
+            temb,
+            transformer_options=transformer_options,
+        )
+
+        # Double-stream stage: the image self-attention only sees the [ref ; noise] tokens,
+        # which sit after the instruction tokens in the joint rope.
+        # Both attention masks are passed as None in this stage.
+        L_instruct = text_hidden_states.shape[1]
+        combined_img_rotary_emb = rotary_emb[:, L_instruct:]
+        for layer in self.double_stream_layers:
+            combined_img_hidden_states, text_hidden_states = layer(
+                combined_img_hidden_states, text_hidden_states,
+                rotary_emb, combined_img_rotary_emb, temb,
+                joint_attention_mask=None, img_attention_mask=None,
+                transformer_options=transformer_options,
+            )
+
+        # Single-stream stage over the joint [text ; image] sequence, again without a mask.
+        hidden_states = torch.cat([text_hidden_states, combined_img_hidden_states], dim=1)
+
+        for layer in self.single_stream_layers:
+            hidden_states = layer(hidden_states, None, rotary_emb, temb, transformer_options=transformer_options)
+
+        hidden_states = self.norm_out(hidden_states, temb)
+
+        # Keep the last img_len tokens (assumes the noise tokens end the sequence), fold the
+        # patches back into an image, then crop away the pad_to_patch_size padding.
+        p = self.patch_size
+        output = rearrange(hidden_states[:, -img_len:], 'b (h w) (p1 p2 c) -> b c (h p1) (w p2)', h=H_padded // p, w=W_padded // p, p1=p, p2=p)[:, :, :H, :W]
+
+        return -output
--- a/comfy/ldm/omnigen/omnigen2.py
+++ b/comfy/ldm/omnigen/omnigen2.py
@ -22,7 +22,7 @@ def apply_rotary_emb(x, freqs_cis):


 def swiglu(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+    # In-place variant: `x` is overwritten with silu(x) * y and that same tensor is returned,
+    # so callers must not reuse `x` afterwards. `y` is left untouched.
-    return F.silu(x) * y
+    return F.silu(x, inplace=True).mul_(y)


 class TimestepEmbedding(nn.Module):
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@ -54,6 +54,7 @@ import comfy.ldm.pixeldit.model
 import comfy.ldm.pixeldit.pid
 import comfy.ldm.ace.model
 import comfy.ldm.omnigen.omnigen2
+import comfy.ldm.boogu.model
 import comfy.ldm.qwen_image.model
 import comfy.ldm.ideogram4.model
 import comfy.ldm.kandinsky5.model
@ -2103,6 +2104,11 @@ class Omnigen2(BaseModel):
            out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16])
        return out

+class Boogu(Omnigen2):
+    # Boogu-Image wrapper: keeps the Omnigen2 class behaviour but builds the Boogu transformer.
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+        # Start the MRO lookup *after* Omnigen2 so that its own __init__ is skipped and the
+        # next base initialiser is handed the Boogu transformer class directly.
+        diffusion_cls = comfy.ldm.boogu.model.BooguTransformer2DModel
+        super(Omnigen2, self).__init__(
+            model_config,
+            model_type,
+            device=device,
+            unet_model=diffusion_cls,
+        )
+        # Set here because Omnigen2.__init__ does not run for this class.
+        self.memory_usage_factor_conds = ("ref_latents",)
+
+
 class QwenImage(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLUX, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.qwen_image.model.QwenImageTransformer2DModel)
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@ -761,6 +761,16 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):

        return dit_config

+    if '{}double_stream_layers.0.img_instruct_attn.processor.img_to_q.weight'.format(key_prefix) in state_dict_keys:  # Boogu-Image (OmniGen2 derivative + dual-stream stage)
+        dit_config = {}
+        dit_config["image_model"] = "boogu"
+        dit_config["hidden_size"] = state_dict['{}x_embedder.weight'.format(key_prefix)].shape[0]
+        dit_config["num_layers"] = count_blocks(state_dict_keys, '{}single_stream_layers.'.format(key_prefix) + '{}.')
+        dit_config["num_double_stream_layers"] = count_blocks(state_dict_keys, '{}double_stream_layers.'.format(key_prefix) + '{}.')
+        dit_config["num_refiner_layers"] = count_blocks(state_dict_keys, '{}noise_refiner.'.format(key_prefix) + '{}.')
+        dit_config["instruction_feat_dim"] = state_dict['{}time_caption_embed.caption_embedder.0.weight'.format(key_prefix)].shape[0]
+        return dit_config
+
    if '{}time_caption_embed.timestep_embedder.linear_1.bias'.format(key_prefix) in state_dict_keys:  # Omnigen2
        dit_config = {}
        dit_config["image_model"] = "omnigen2"
--- a/comfy/sd.py
+++ b/comfy/sd.py
@ -68,6 +68,7 @@ import comfy.text_encoders.ace15
 import comfy.text_encoders.longcat_image
 import comfy.text_encoders.qwen35
 import comfy.text_encoders.qwen3vl
+import comfy.text_encoders.boogu
 import comfy.text_encoders.ernie
 import comfy.text_encoders.gemma4
 import comfy.text_encoders.cogvideo
@ -1301,6 +1302,7 @@ class CLIPType(Enum):
    LENS = 28
    PIXELDIT = 29
    IDEOGRAM4 = 30
+    BOOGU = 31



@ -1622,6 +1624,10 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
                clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."})
                clip_target.clip = comfy.text_encoders.ideogram4.te_qwen3vl(**llama_detect(clip_data))
                clip_target.tokenizer = comfy.text_encoders.ideogram4.Ideogram4Qwen3VLTokenizer
+            elif clip_type == CLIPType.BOOGU and te_model == TEModel.QWEN3VL_8B:  # Boogu-Image: full Qwen3-VL-8B, last hidden state, no-think template.
+                clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."})
+                clip_target.clip = comfy.text_encoders.boogu.te(**llama_detect(clip_data))
+                clip_target.tokenizer = comfy.text_encoders.boogu.BooguTokenizer
            elif clip_type in (CLIPType.FLUX, CLIPType.FLUX2):  # Flux2 Klein reuses the Qwen3-VL LM (3-layer tap -> 12288); visual unused.
                klein_model_type = "qwen3_8b" if te_model == TEModel.QWEN3VL_8B else "qwen3_4b"
                clip_target.clip = comfy.text_encoders.flux.klein_te(**llama_detect(clip_data), model_type=klein_model_type)
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@ -25,6 +25,7 @@ import comfy.text_encoders.hunyuan_image
 import comfy.text_encoders.kandinsky5
 import comfy.text_encoders.z_image
 import comfy.text_encoders.ideogram4
+import comfy.text_encoders.boogu
 import comfy.text_encoders.anima
 import comfy.text_encoders.ace15
 import comfy.text_encoders.longcat_image
@ -1758,6 +1759,27 @@ class Omnigen2(supported_models_base.BASE):
        hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_3b.transformer.".format(pref))
        return supported_models_base.ClipTarget(comfy.text_encoders.omnigen2.Omnigen2Tokenizer, comfy.text_encoders.omnigen2.te(**hunyuan_detect))

+class Boogu(Omnigen2):
+    # Supported-model entry for Boogu-Image; overrides the Omnigen2 settings listed below.
+    unet_config = {"image_model": "boogu"}
+
+    sampling_settings = {"multiplier": 1.0, "shift": 3.16}
+
+    memory_usage_factor = 1.95 #TODO
+
+    def get_model(self, state_dict, prefix="", device=None):
+        # state_dict and prefix are part of the shared signature but are not needed here.
+        return model_base.Boogu(self, device=device)
+
+    def clip_target(self, state_dict={}):
+        # Detect the text-encoder options from the weights stored under the qwen3vl_8b prefix.
+        te_prefix = "{}qwen3vl_8b.transformer.".format(self.text_encoder_key_prefix[0])
+        detected = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, te_prefix)
+        return supported_models_base.ClipTarget(
+            comfy.text_encoders.boogu.BooguTokenizer,
+            comfy.text_encoders.boogu.te(**detected),
+        )
+
+
 class Ideogram4(supported_models_base.BASE):
    unet_config = {
        "image_model": "ideogram4",
@ -2300,6 +2322,7 @@ models = [
    ACEStep,
    ACEStep15,
    Omnigen2,
+    Boogu,
    QwenImage,
    Ideogram4,
    Flux2,
--- a/comfy/text_encoders/boogu.py
+++ b/comfy/text_encoders/boogu.py
@ -0,0 +1,58 @@
+"""Boogu-Image text encoder: full Qwen3-VL-8B, last hidden state (4096-dim).
+
+Boogu uses the final hidden state of Qwen3-VL as the per-token instruction feature
+(num_instruction_feature_layers=1, reduce_type=mean -> just the last layer).
+The model itself is the standard Qwen3-VL TE, only the chat template differs
+(a fixed system prompt and no <think> block).
+"""
+
+import comfy.text_encoders.qwen3vl
+from comfy import sd1_clip
+
+
+# System prompts from the reference pipeline (pipeline_boogu.py).
+# T2I (non-empty instruction, no image) uses the helpful-assistant prompt;
+# everything else (the CFG negative / "drop" condition, and any image case) uses the TI2I "describe" prompt.
+BOOGU_T2I_SYSTEM = "You are a helpful assistant that generates high-quality images based on user instructions. The instructions are as follows."
+BOOGU_DROP_SYSTEM = "Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate."
+
+
+class BooguTokenizer(comfy.text_encoders.qwen3vl.Qwen3VLTokenizer):
+    # Qwen3-VL tokenizer with Boogu's fixed system prompts baked into the chat templates.
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(
+            embedding_directory=embedding_directory,
+            tokenizer_data=tokenizer_data,
+            model_type="qwen3vl_8b",
+        )
+        # Templates follow apply_chat_template without add_generation_prompt.
+        # Text-only instruction:
+        self.llama_template = "<|im_start|>system\n" + BOOGU_T2I_SYSTEM + "<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n"
+        # Instruction accompanied by an image:
+        self.llama_template_images = "<|im_start|>system\n" + BOOGU_DROP_SYSTEM + "<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n"
+        # Reference SYSTEM_PROMPT_DROP: used for the empty negative/uncond instruction.
+        self.llama_template_drop = "<|im_start|>system\n" + BOOGU_DROP_SYSTEM + "<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n"
+
+    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, thinking=True, **kwargs):
+        # Only choose a template when the caller did not: blank text with no images is the
+        # "drop" (unconditional) case and gets the drop template.
+        if llama_template is None:
+            if len(images) == 0 and text.strip() == "":
+                llama_template = self.llama_template_drop
+        # Boogu conditions on the no-think template; thinking=True drops the empty <think> block qwen3vl adds by default.
+        return super().tokenize_with_weights(
+            text,
+            return_word_ids=return_word_ids,
+            llama_template=llama_template,
+            images=images,
+            prevent_empty_text=prevent_empty_text,
+            thinking=thinking,
+            **kwargs,
+        )
+
+
+class BooguQwen3VLClipModel(comfy.text_encoders.qwen3vl.Qwen3VLClipModel):
+    # Qwen3-VL clip model configured for Boogu's last-hidden-state conditioning.
+    def __init__(self, device="cpu", dtype=None, attention_mask=True, model_options={}, model_type="qwen3vl_8b"):
+        super().__init__(
+            device=device,
+            dtype=dtype,
+            attention_mask=attention_mask,
+            model_options=model_options,
+            model_type=model_type,
+        )
+        # apply the final RMSNorm to the tapped last layer
+        self.layer_norm_hidden_state = True
+
+
+class BooguTEModel(sd1_clip.SD1ClipModel):
+    # SD1ClipModel wrapper that registers the Boogu Qwen3-VL encoder under the name "qwen3vl_8b".
+    def __init__(self, device="cpu", dtype=None, model_options={}):
+        def clip_model(**kw):
+            # Factory handed to SD1ClipModel; pins the model type to the 8B variant.
+            return BooguQwen3VLClipModel(**kw, model_type="qwen3vl_8b")
+
+        super().__init__(
+            device=device,
+            dtype=dtype,
+            name="qwen3vl_8b",
+            clip_model=clip_model,
+            model_options=model_options,
+        )
+
+
+def te(dtype_llama=None, llama_quantization_metadata=None):
+    # Build a BooguTEModel subclass with the detected dtype / quantization metadata baked in.
+    class BooguTEModel_(BooguTEModel):
+        def __init__(self, device="cpu", dtype=None, model_options={}):
+            # A detected llama dtype takes precedence over the one passed by the caller.
+            chosen_dtype = dtype if dtype_llama is None else dtype_llama
+            options = model_options
+            if llama_quantization_metadata is not None:
+                # Copy first so the caller's dict is never modified.
+                options = model_options.copy()
+                options["quantization_metadata"] = llama_quantization_metadata
+            super().__init__(device=device, dtype=chosen_dtype, model_options=options)
+
+    return BooguTEModel_
--- a/comfy_extras/nodes_boogu.py
+++ b/comfy_extras/nodes_boogu.py
@ -0,0 +1,96 @@
+import math
+
+import node_helpers
+import comfy.utils
+from typing_extensions import override
+from comfy_api.latest import ComfyExtension, io
+
+
+# Pixel budget for the Qwen3-VL vision input (max_vlm_input_pil_pixels in pipeline_boogu.py).
+_BOOGU_VLM_PIXELS = 384 * 384
+# Pixel budget and alignment for the VAE reference latent (VAE /8 * patch_size 2 = 16 px).
+_BOOGU_REF_PIXELS = 1024 * 1024
+_BOOGU_REF_ALIGN = 16
+
+
+def _boogu_fit_to_area(width, height, target_area, multiple=1):
+    # Return (width, height) rescaled to roughly `target_area` pixels at the same aspect
+    # ratio, with each side snapped to `multiple`. Each side is clamped to at least
+    # `multiple`, so an extreme aspect ratio cannot round a side down to 0.
+    scale_by = math.sqrt(target_area / (width * height))
+    fitted_width = max(multiple, round(width * scale_by / multiple) * multiple)
+    fitted_height = max(multiple, round(height * scale_by / multiple) * multiple)
+    return fitted_width, fitted_height
+
+
+class TextEncodeBooguEdit(io.ComfyNode):
+    """Boogu-Image Edit conditioning.
+
+    The edit image is used twice, matching the reference pipeline:
+      - Qwen3-VL vision tokens (instruction understanding) -> positive only
+      - VAE reference latent (image identity)              -> positive and negative
+    The ref latent is in both conds so it cancels under CFG (identity preserved);
+    the vision tokens are only in the positive so CFG amplifies the instruction.
+    The tokenizer selects the right system prompt automatically (image -> TI2I,
+    empty negative -> DROP), so no template plumbing is needed here.
+    """
+
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="TextEncodeBooguEdit",
+            category="model/conditioning/boogu",
+            inputs=[
+                io.Clip.Input("clip"),
+                io.String.Input("prompt", multiline=True, dynamic_prompts=True),
+                io.Vae.Input("vae"),
+                io.Autogrow.Input(
+                    "images",
+                    template=io.Autogrow.TemplateNames(
+                        io.Image.Input("image"),
+                        names=[f"image_{i}" for i in range(1, 17)],
+                        min=1,
+                    ),
+                    tooltip="Reference image(s) to edit. Boogu focuses on one reference per sample; more are allowed.",
+                ),
+            ],
+            outputs=[
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, clip, prompt, vae=None, images: io.Autogrow.Type = None) -> io.NodeOutput:
+        """Encode the prompt and the edit image(s) into positive/negative conditioning.
+
+        Args:
+            clip: text encoder used to tokenize and encode the prompt.
+            prompt: edit instruction for the positive conditioning.
+            vae: VAE used to encode reference latents; when None no latents are attached.
+            images: mapping of "image_<n>" names to image tensors; None entries are skipped.
+
+        Returns:
+            io.NodeOutput holding (positive, negative) conditioning.
+        """
+        ref_latents = []
+        images_vl = []
+
+        images = images or {}
+        # Process the inputs in numeric order of their "image_<n>" suffix.
+        for name in sorted(images, key=lambda n: int(n.rsplit("_", 1)[-1])):
+            image = images[name]
+            if image is None:
+                continue
+            # Channels-last -> channels-first for the resize helper.
+            samples = image.movedim(-1, 1)
+            src_width = samples.shape[3]
+            src_height = samples.shape[2]
+
+            # Vision tower input: the reference caps the VLM image at 384x384
+            # (max_vlm_input_pil_pixels in pipeline_boogu.py).
+            width, height = _boogu_fit_to_area(src_width, src_height, _BOOGU_VLM_PIXELS)
+            s = comfy.utils.common_upscale(samples, width, height, "area", "disabled")
+            # Back to channels-last, keeping only the first three channels.
+            images_vl.append(s.movedim(1, -1)[:, :, :, :3])
+
+            # Reference latent: align to 16 px (VAE /8 * patch_size 2).
+            if vae is not None:
+                width, height = _boogu_fit_to_area(src_width, src_height, _BOOGU_REF_PIXELS, _BOOGU_REF_ALIGN)
+                s = comfy.utils.common_upscale(samples, width, height, "area", "disabled")
+                ref_latents.append(vae.encode(s.movedim(1, -1)[:, :, :, :3]))
+
+        # positive: instruction + vision tokens; negative: empty (no vision). Ref latent on both.
+        positive = clip.encode_from_tokens_scheduled(clip.tokenize(prompt, images=images_vl))
+        negative = clip.encode_from_tokens_scheduled(clip.tokenize(""))
+
+        if len(ref_latents) > 0:
+            positive = node_helpers.conditioning_set_values(positive, {"reference_latents": ref_latents}, append=True)
+            negative = node_helpers.conditioning_set_values(negative, {"reference_latents": ref_latents}, append=True)
+
+        return io.NodeOutput(positive, negative)
+
+
+class BooguExtension(ComfyExtension):
+    # Extension object exposing the Boogu node classes defined in this module.
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        node_classes: list[type[io.ComfyNode]] = [TextEncodeBooguEdit]
+        return node_classes
+
+
+async def comfy_entrypoint() -> BooguExtension:
+    # Module-level factory for this extension (presumably looked up by the node loader).
+    extension = BooguExtension()
+    return extension
--- a/nodes.py
+++ b/nodes.py
@ -969,7 +969,7 @@ class CLIPLoader:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ),
-                              "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image", "cogvideox", "lens", "pixeldit", "ideogram4"], ),
+                              "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image", "cogvideox", "lens", "pixeldit", "ideogram4", "boogu"], ),
                              },
                "optional": {
                              "device": (["default", "cpu"], {"advanced": True}),
@ -2425,6 +2425,7 @@ async def init_builtin_extra_nodes():
        "nodes_tcfg.py",
        "nodes_context_windows.py",
        "nodes_qwen.py",
+        "nodes_boogu.py",
        "nodes_chroma_radiance.py",
        "nodes_pid.py",
        "nodes_model_patch.py",