feat: Add optional attention_mask input to LTXVAddGuide (CORE-220) (#13965)

2026-05-25 16:37:23 +08:00 · 2026-05-18 15:07:04 -06:00 · 2026-05-18 15:07:04 -06:00 · 292814c31e
commit 292814c31e
parent 187e5237e1
1 changed files with 12 additions and 5 deletions
--- a/comfy_extras/nodes_lt.py
+++ b/comfy_extras/nodes_lt.py
@ -175,7 +175,7 @@ class LTXVImgToVideoInplace(io.ComfyNode):
    generate = execute  # TODO: remove
-def _append_guide_attention_entry(positive, negative, pre_filter_count, latent_shape, strength=1.0):
+def _append_guide_attention_entry(positive, negative, pre_filter_count, latent_shape, strength=1.0, attention_mask=None):
    """Append a guide_attention_entry to both positive and negative conditioning.
    Each entry tracks one guide reference for per-reference attention control.
@ -184,9 +184,10 @@ def _append_guide_attention_entry(positive, negative, pre_filter_count, latent_s
    new_entry = {
        "pre_filter_count": pre_filter_count,
        "strength": strength,
-        "pixel_mask": None,
+        "pixel_mask": attention_mask.unsqueeze(0).unsqueeze(0) if attention_mask is not None else None,  # reshape to (1, 1, F, H, W)
        "latent_shape": latent_shape,
    }
    results = []
    for cond in (positive, negative):
        # Read existing entries from this specific conditioning
@ -196,8 +197,7 @@ def _append_guide_attention_entry(positive, negative, pre_filter_count, latent_s
            if found is not None:
                existing = found
                break
-        # Shallow copy and append (no deepcopy needed — entries contain
+        # Shallow copy only and append (pixel_mask is never mutated).
        # only scalars and None for pixel_mask at this call site).
        entries = [*existing, new_entry]
        results.append(node_helpers.conditioning_set_values(
            cond, {"guide_attention_entries": entries}
@ -263,6 +263,12 @@ class LTXVAddGuide(io.ComfyNode):
                            "down to the nearest multiple of 8. Negative values are counted from the end of the video.",
                ),
                io.Float.Input("strength", default=1.0, min=0.0, max=10.0, step=0.01),
                io.Mask.Input(
                    "attention_mask",
                    optional=True,
                    tooltip="Optional pixel-space spatial mask. Controls per-region "
                            "conditioning influence via self-attention, multiplied by strength.",
                ),
                ICLoRAParameters.Input(
                    "iclora_parameters",
                    optional=True,
@ -410,7 +416,7 @@ class LTXVAddGuide(io.ComfyNode):
        return latent_image, noise_mask
    @classmethod
-    def execute(cls, positive, negative, vae, latent, image, frame_idx, strength, iclora_parameters=None) -> io.NodeOutput:
+    def execute(cls, positive, negative, vae, latent, image, frame_idx, strength, attention_mask=None, iclora_parameters=None) -> io.NodeOutput:
        scale_factors = vae.downscale_index_formula
        latent_image = latent["samples"]
        noise_mask = get_noise_mask(latent)
@ -469,6 +475,7 @@ class LTXVAddGuide(io.ComfyNode):
        pre_filter_count = t.shape[2] * t.shape[3] * t.shape[4]
        positive, negative = _append_guide_attention_entry(
            positive, negative, pre_filter_count, guide_latent_shape, strength=strength,
            attention_mask=attention_mask,
        )
        return io.NodeOutput(positive, negative, {"samples": latent_image, "noise_mask": noise_mask})