From 831351a29e91ea758437227c2f3c915a6be6d1a6 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Fri, 13 Feb 2026 17:15:23 -0800
Subject: [PATCH] Support generating attention masks for left padded text
 encoders. (#12454)

---
 comfy/sd1_clip.py            | 15 +++++++++++----
 comfy/text_encoders/ace15.py |  8 +-------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py
index 4c817d468..b564d1529 100644
--- a/comfy/sd1_clip.py
+++ b/comfy/sd1_clip.py
@@ -171,8 +171,9 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
 
     def process_tokens(self, tokens, device):
         end_token = self.special_tokens.get("end", None)
+        pad_token = self.special_tokens.get("pad", -1)
         if end_token is None:
-            cmp_token = self.special_tokens.get("pad", -1)
+            cmp_token = pad_token
         else:
             cmp_token = end_token
 
@@ -186,15 +187,21 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
             other_embeds = []
             eos = False
             index = 0
+            left_pad = False
             for y in x:
                 if isinstance(y, numbers.Integral):
-                    if eos:
+                    token = int(y)
+                    if index == 0 and token == pad_token:
+                        left_pad = True
+
+                    if eos or (left_pad and token == pad_token):
                         attention_mask.append(0)
                     else:
                         attention_mask.append(1)
-                    token = int(y)
+                        left_pad = False
+
                     tokens_temp += [token]
-                    if not eos and token == cmp_token:
+                    if not eos and token == cmp_token and not left_pad:
                         if end_token is None:
                             attention_mask[-1] = 0
                         eos = True
diff --git a/comfy/text_encoders/ace15.py b/comfy/text_encoders/ace15.py
index 0fdd4669f..f135d74c1 100644
--- a/comfy/text_encoders/ace15.py
+++ b/comfy/text_encoders/ace15.py
@@ -10,7 +10,6 @@ import comfy.utils
 def sample_manual_loop_no_classes(
     model,
     ids=None,
-    paddings=[],
     execution_dtype=None,
     cfg_scale: float = 2.0,
     temperature: float = 0.85,
@@ -36,9 +35,6 @@ def sample_manual_loop_no_classes(
     embeds, attention_mask, num_tokens, embeds_info = model.process_tokens(ids, device)
     embeds_batch = embeds.shape[0]
 
-    for i, t in enumerate(paddings):
-        attention_mask[i, :t] = 0
-        attention_mask[i, t:] = 1
 
     output_audio_codes = []
     past_key_values = []
@@ -135,13 +131,11 @@ def generate_audio_codes(model, positive, negative, min_tokens=1, max_tokens=102
 
             pos_pad = (len(negative) - len(positive))
             positive = [model.special_tokens["pad"]] * pos_pad + positive
-            paddings = [pos_pad, neg_pad]
             ids = [positive, negative]
         else:
-            paddings = []
             ids = [positive]
 
-    return sample_manual_loop_no_classes(model, ids, paddings, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p, seed=seed, min_tokens=min_tokens, max_new_tokens=max_tokens)
+    return sample_manual_loop_no_classes(model, ids, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p, seed=seed, min_tokens=min_tokens, max_new_tokens=max_tokens)
 
 
 class ACE15Tokenizer(sd1_clip.SD1Tokenizer):
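
For reference, the masking rule the sd1_clip.py hunk adds, pulled out into a
minimal standalone sketch. Assumptions: tokens are plain ints, the pad/end ids
are passed as arguments instead of being read from self.special_tokens, and
build_attention_mask is a hypothetical helper name, not part of ComfyUI.

# Minimal sketch of the left-padding mask rule from the sd1_clip.py hunk.
# Simplified from SDClipModel.process_tokens: tokens are plain ints and the
# pad/end ids are arguments; build_attention_mask is a hypothetical helper.
def build_attention_mask(tokens, pad_token, end_token=None):
    # With no dedicated end token, the first pad doubles as the end marker.
    cmp_token = pad_token if end_token is None else end_token
    mask = []
    eos = False
    left_pad = False
    for index, token in enumerate(tokens):
        # A pad in the first position means the prompt is left padded;
        # keep masking pads until the first real token appears.
        if index == 0 and token == pad_token:
            left_pad = True
        if eos or (left_pad and token == pad_token):
            mask.append(0)
        else:
            mask.append(1)
            left_pad = False
        # Once the end/pad marker is hit, mask everything after it
        # (and the marker itself when it is only a pad, not a real end).
        if not eos and token == cmp_token and not left_pad:
            if end_token is None:
                mask[-1] = 0
            eos = True
    return mask

# Right padded: trailing pads (the first acting as the end marker) are masked.
assert build_attention_mask([5, 6, 7, 0, 0], pad_token=0) == [1, 1, 1, 0, 0]
# Left padded: leading pads are masked, the content stays visible.
assert build_attention_mask([0, 0, 5, 6, 7], pad_token=0) == [0, 0, 1, 1, 1]
# A dedicated end token stays attended, as in CLIP-style encoders.
assert build_attention_mask([0, 5, 6, 2], pad_token=0, end_token=2) == [0, 1, 1, 1]

Clearing left_pad on the first non-pad token is what lets a pad id appearing
later in the sequence still act as the end marker: the `not left_pad` guard on
the cmp_token check only suppresses the end-of-text treatment for the leading
pads, which is why the ace15.py caller no longer needs to pass explicit
padding offsets and overwrite the mask by hand.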