qwen_image: propagate attention mask. (#11966)

Omri Marom authored 2026-01-23 03:02:31 +02:00, committed by GitHub
parent 09a2e67151
commit d7f3241bf6
2 changed files with 13 additions and 1 deletion


@@ -170,8 +170,14 @@ class Attention(nn.Module):
         joint_query = apply_rope1(joint_query, image_rotary_emb)
         joint_key = apply_rope1(joint_key, image_rotary_emb)
+        if encoder_hidden_states_mask is not None:
+            attn_mask = torch.zeros((batch_size, 1, seq_txt + seq_img), dtype=hidden_states.dtype, device=hidden_states.device)
+            attn_mask[:, 0, :seq_txt] = encoder_hidden_states_mask
+        else:
+            attn_mask = None
         joint_hidden_states = optimized_attention_masked(joint_query, joint_key, joint_value, self.heads,
-                                                         attention_mask, transformer_options=transformer_options,
+                                                         attn_mask, transformer_options=transformer_options,
                                                          skip_reshape=True)
         txt_attn_output = joint_hidden_states[:, :seq_txt, :]
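
Note on the hunk above: the joint attention runs over concatenated text and image tokens, so the text-only encoder mask is widened into an additive mask covering all seq_txt + seq_img key positions. Text positions take the encoder mask values; image positions stay at zero and remain fully attendable. A minimal standalone sketch of that construction, with made-up batch and sequence sizes:

import torch

# Hypothetical sizes; in the model these come from the text and image streams.
batch_size, seq_txt, seq_img, heads = 2, 5, 16, 8
dtype = torch.float16

# Encoder mask already in additive form: 0 = attend, large negative = ignore.
encoder_hidden_states_mask = torch.zeros(batch_size, seq_txt, dtype=dtype)
encoder_hidden_states_mask[:, 3:] = -torch.finfo(dtype).max  # last two text tokens are padding

# Widen to the joint key length; image positions stay at 0 (fully attendable).
attn_mask = torch.zeros((batch_size, 1, seq_txt + seq_img), dtype=dtype)
attn_mask[:, 0, :seq_txt] = encoder_hidden_states_mask

# The (B, 1, K) mask broadcasts over heads and query positions when added to
# (B, H, Q, K) attention scores.
scores = torch.zeros(batch_size, heads, seq_txt + seq_img, seq_txt + seq_img, dtype=dtype)
print((scores + attn_mask.unsqueeze(1))[0, 0, 0])  # padded text keys get -65504
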
@@ -430,6 +436,9 @@ class QwenImageTransformer2DModel(nn.Module):
         encoder_hidden_states = context
         encoder_hidden_states_mask = attention_mask
+        if encoder_hidden_states_mask is not None and not torch.is_floating_point(encoder_hidden_states_mask):
+            encoder_hidden_states_mask = (encoder_hidden_states_mask - 1).to(x.dtype) * torch.finfo(x.dtype).max
         hidden_states, img_ids, orig_shape = self.process_img(x)
         num_embeds = hidden_states.shape[1]
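
The conversion in this hunk turns an integer or boolean keep-mask (1 = real token, 0 = padding) into an additive float mask in the model's dtype: (mask - 1) maps 1 to 0 and 0 to -1, and scaling by torch.finfo(x.dtype).max turns the -1 entries into the most negative representable bias. A tiny sketch with made-up values:

import torch

x_dtype = torch.bfloat16  # assumed model dtype, for illustration only
keep = torch.tensor([[1, 1, 1, 0, 0]])  # 1 = real token, 0 = padding

# Integer input, so torch.is_floating_point(keep) is False and the branch fires.
additive = (keep - 1).to(x_dtype) * torch.finfo(x_dtype).max
print(additive)  # [[0., 0., 0., ~-3.39e+38, ~-3.39e+38]] -> softmax weight ~0 on padding
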


@@ -1578,6 +1578,9 @@ class QwenImage(BaseModel):
     def extra_conds(self, **kwargs):
         out = super().extra_conds(**kwargs)
         attention_mask = kwargs.get("attention_mask", None)
+        if attention_mask is not None:
+            out['attention_mask'] = comfy.conds.CONDRegular(attention_mask)
         cross_attn = kwargs.get("cross_attn", None)
         if cross_attn is not None:
             out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
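
Here extra_conds forwards the mask only when one is present, wrapping it in comfy.conds.CONDRegular the same way the cross-attention conditioning is wrapped, so the sampler can batch it with the other conds and hand it to the transformer's forward as attention_mask. An illustrative-only stand-in for that wrapper (the class name and methods below are assumptions for the sketch, not ComfyUI's actual implementation):

import torch

class CondRegularSketch:
    """Hypothetical stand-in for comfy.conds.CONDRegular: holds a tensor so it
    can travel through sampling alongside the other conditioning entries."""
    def __init__(self, cond: torch.Tensor):
        self.cond = cond

    def concat(self, others):
        # Batch this cond with the masks from other prompts in the same call.
        return torch.cat([self.cond] + [o.cond for o in others], dim=0)

out = {}
attention_mask = torch.tensor([[1, 1, 1, 0]])  # made-up token keep-mask
if attention_mask is not None:
    out['attention_mask'] = CondRegularSketch(attention_mask)

batched = out['attention_mask'].concat([CondRegularSketch(torch.tensor([[1, 0, 0, 0]]))])
print(batched.shape)  # torch.Size([2, 4]): both prompts' masks travel together
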