From 831351a29e91ea758437227c2f3c915a6be6d1a6 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Fri, 13 Feb 2026 17:15:23 -0800
Subject: [PATCH] Support generating attention masks for left padded text
 encoders. (#12454)

---
 comfy/sd1_clip.py            | 15 +++++++++++----
 comfy/text_encoders/ace15.py |  8 +-------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py
index 4c817d468..b564d1529 100644
--- a/comfy/sd1_clip.py
+++ b/comfy/sd1_clip.py
@@ -171,8 +171,9 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
 
     def process_tokens(self, tokens, device):
         end_token = self.special_tokens.get("end", None)
+        pad_token = self.special_tokens.get("pad", -1)
         if end_token is None:
-            cmp_token = self.special_tokens.get("pad", -1)
+            cmp_token = pad_token
         else:
             cmp_token = end_token
 
@@ -186,15 +187,21 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
             other_embeds = []
             eos = False
             index = 0
+            left_pad = False
             for y in x:
                 if isinstance(y, numbers.Integral):
-                    if eos:
+                    token = int(y)
+                    if index == 0 and token == pad_token:
+                        left_pad = True
+
+                    if eos or (left_pad and token == pad_token):
                         attention_mask.append(0)
                     else:
                         attention_mask.append(1)
-                    token = int(y)
+                        left_pad = False
+
                     tokens_temp += [token]
-                    if not eos and token == cmp_token:
+                    if not eos and token == cmp_token and not left_pad:
                         if end_token is None:
                             attention_mask[-1] = 0
                         eos = True
diff --git a/comfy/text_encoders/ace15.py b/comfy/text_encoders/ace15.py
index 0fdd4669f..f135d74c1 100644
--- a/comfy/text_encoders/ace15.py
+++ b/comfy/text_encoders/ace15.py
@@ -10,7 +10,6 @@ import comfy.utils
 def sample_manual_loop_no_classes(
     model,
     ids=None,
-    paddings=[],
     execution_dtype=None,
     cfg_scale: float = 2.0,
     temperature: float = 0.85,
@@ -36,9 +35,6 @@ def sample_manual_loop_no_classes(
     embeds, attention_mask, num_tokens, embeds_info = model.process_tokens(ids, device)
     embeds_batch = embeds.shape[0]
 
-    for i, t in enumerate(paddings):
-        attention_mask[i, :t] = 0
-        attention_mask[i, t:] = 1
 
     output_audio_codes = []
     past_key_values = []
@@ -135,13 +131,11 @@ def generate_audio_codes(model, positive, negative, min_tokens=1, max_tokens=102
 
             pos_pad = (len(negative) - len(positive))
             positive = [model.special_tokens["pad"]] * pos_pad + positive
-            paddings = [pos_pad, neg_pad]
             ids = [positive, negative]
         else:
-            paddings = []
             ids = [positive]
 
-    return sample_manual_loop_no_classes(model, ids, paddings, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p, seed=seed, min_tokens=min_tokens, max_new_tokens=max_tokens)
+    return sample_manual_loop_no_classes(model, ids, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p, seed=seed, min_tokens=min_tokens, max_new_tokens=max_tokens)
 
 
 class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
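
For reference, the masking rule the sd1_clip.py hunk adds, pulled out into a
minimal standalone sketch. Assumptions: tokens are plain ints, the pad/end ids
are passed as arguments instead of being read from self.special_tokens, and
build_attention_mask is a hypothetical helper name, not part of ComfyUI.

# Minimal sketch of the left-padding mask rule from the sd1_clip.py hunk.
# Simplified from SDClipModel.process_tokens: tokens are plain ints and the
# pad/end ids are arguments; build_attention_mask is a hypothetical helper.
def build_attention_mask(tokens, pad_token, end_token=None):
    # With no dedicated end token, the first pad doubles as the end marker.
    cmp_token = pad_token if end_token is None else end_token
    mask = []
    eos = False
    left_pad = False
    for index, token in enumerate(tokens):
        # A pad in the first position means the prompt is left padded;
        # keep masking pads until the first real token appears.
        if index == 0 and token == pad_token:
            left_pad = True
        if eos or (left_pad and token == pad_token):
            mask.append(0)
        else:
            mask.append(1)
            left_pad = False
        # Once the end/pad marker is hit, mask everything after it
        # (and the marker itself when it is only a pad, not a real end).
        if not eos and token == cmp_token and not left_pad:
            if end_token is None:
                mask[-1] = 0
            eos = True
    return mask

# Right padded: trailing pads (the first acting as the end marker) are masked.
assert build_attention_mask([5, 6, 7, 0, 0], pad_token=0) == [1, 1, 1, 0, 0]
# Left padded: leading pads are masked, the content stays visible.
assert build_attention_mask([0, 0, 5, 6, 7], pad_token=0) == [0, 0, 1, 1, 1]
# A dedicated end token stays attended, as in CLIP-style encoders.
assert build_attention_mask([0, 5, 6, 2], pad_token=0, end_token=2) == [0, 1, 1, 1]

Clearing left_pad on the first non-pad token is what lets a pad id appearing
later in the sequence still act as the end marker: the `not left_pad` guard on
the cmp_token check only suppresses the end-of-text treatment for the leading
pads, which is why the ace15.py caller no longer needs to pass explicit
padding offsets and overwrite the mask by hand.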