From 9e738b989eb1eed38bc1952d564f1d6c1adaf14b Mon Sep 17 00:00:00 2001
From: BigStationW
Date: Wed, 7 May 2025 00:09:07 +0200
Subject: [PATCH 1/6] Make the rendering of Comfy's implementation identical to the Chroma workflow.

---
 comfy/sd.py | 86 +++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 60 insertions(+), 26 deletions(-)

diff --git a/comfy/sd.py b/comfy/sd.py
index da9b36d0e..d382e1227 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -84,9 +84,12 @@ def load_lora_for_models(model, clip, lora, strength_model, strength_clip):
 
 
 class CLIP:
-    def __init__(self, target=None, embedding_directory=None, no_init=False, tokenizer_data={}, parameters=0, model_options={}):
+    def __init__(self, target=None, embedding_directory=None, no_init=False, tokenizer_data={}, parameters=0, model_options={}, clip_type_enum=None): # MODIFIED: Added clip_type_enum
         if no_init:
             return
+
+        self.clip_type_enum = clip_type_enum # MODIFIED: Store the original CLIPType
+
         params = target.params.copy()
         clip = target.clip
         tokenizer = target.tokenizer
@@ -131,6 +134,7 @@ class CLIP:
         n.tokenizer_options = self.tokenizer_options.copy()
         n.use_clip_schedule = self.use_clip_schedule
         n.apply_hooks_to_conds = self.apply_hooks_to_conds
+        n.clip_type_enum = self.clip_type_enum # MODIFIED: Clone the stored CLIPType
         return n
 
     def add_patches(self, patches, strength_patch=1.0, strength_model=1.0):
@@ -159,11 +163,9 @@ class CLIP:
         all_cond_pooled: list[tuple[torch.Tensor, dict[str]]] = []
         all_hooks = self.patcher.forced_hooks
         if all_hooks is None or not self.use_clip_schedule:
-            # if no hooks or shouldn't use clip schedule, do unscheduled encode_from_tokens and perform add_dict
            return_pooled = "unprojected" if unprojected else True
            pooled_dict = self.encode_from_tokens(tokens, return_pooled=return_pooled, return_dict=True)
            cond = pooled_dict.pop("cond")
-            # add/update any keys with the provided add_dict
            pooled_dict.update(add_dict)
            all_cond_pooled.append([cond, pooled_dict])
         else:
@@ -183,7 +185,6 @@ class CLIP:
 
             for scheduled_opts in scheduled_keyframes:
                 t_range = scheduled_opts[0]
-                # don't bother encoding any conds outside of start_percent and end_percent bounds
                 if "start_percent" in add_dict:
                     if t_range[1] < add_dict["start_percent"]:
                         continue
@@ -193,18 +194,25 @@ class CLIP:
                 hooks_keyframes = scheduled_opts[1]
                 for hook, keyframe in hooks_keyframes:
                     hook.hook_keyframe._current_keyframe = keyframe
-                # apply appropriate hooks with values that match new hook_keyframe
                 self.patcher.patch_hooks(all_hooks)
-                # perform encoding as normal
                 o = self.cond_stage_model.encode_token_weights(tokens)
                 cond, pooled = o[:2]
+
+                # --- MODIFICATION FOR SCHEDULED PATH (CONSISTENCY) ---
+                # Populate initial pooled_dict including o[2] if present, then filter
                 pooled_dict = {"pooled_output": pooled}
-                # add clip_start_percent and clip_end_percent in pooled
+                if len(o) > 2 and isinstance(o[2], dict): # Check if o[2] is a dict
+                    pooled_dict.update(o[2])
+
+                if hasattr(self, 'clip_type_enum') and self.clip_type_enum == CLIPType.CHROMA:
+                    if 'attention_mask' in pooled_dict:
+                        logging.debug(f"CLIP type {self.clip_type_enum.name} (scheduled path): Removing 'attention_mask' from conditioning output.")
+                        pooled_dict.pop('attention_mask', None)
+                # --- END MODIFICATION FOR SCHEDULED PATH ---
+
                 pooled_dict["clip_start_percent"] = t_range[0]
                 pooled_dict["clip_end_percent"] = t_range[1]
-                # add/update any keys with the provided add_dict
                 pooled_dict.update(add_dict)
-                # add hooks stored on clip
                 self.add_hooks_to_dict(pooled_dict)
                 all_cond_pooled.append([cond, pooled_dict])
                 if show_pbar:
@@ -227,10 +235,17 @@ class CLIP:
         cond, pooled = o[:2]
         if return_dict:
             out = {"cond": cond, "pooled_output": pooled}
-            if len(o) > 2:
+            if len(o) > 2 and isinstance(o[2], dict): # Check if o[2] is a dict
                 for k in o[2]:
                     out[k] = o[2][k]
             self.add_hooks_to_dict(out)
+
+            # ---- START MODIFICATION for non-scheduled path ----
+            if hasattr(self, 'clip_type_enum') and self.clip_type_enum == CLIPType.CHROMA:
+                if 'attention_mask' in out:
+                    logging.debug(f"CLIP type {self.clip_type_enum.name} (non-scheduled path): Removing 'attention_mask' from conditioning output.")
+                    out.pop('attention_mask', None)
+            # ---- END MODIFICATION for non-scheduled path ----
             return out
 
         if return_pooled:
@@ -261,6 +276,7 @@ class CLIP:
     def get_key_patches(self):
         return self.patcher.get_key_patches()
 
+
 class VAE:
     def __init__(self, sd=None, device=None, config=None, dtype=None, metadata=None):
         if 'decoder.up_blocks.0.resnets.0.norm1.weight' in sd.keys(): #diffusers format
@@ -788,8 +804,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
             if "transformer.resblocks.0.ln_1.weight" in clip_data[i]:
                 clip_data[i] = comfy.utils.clip_text_transformers_convert(clip_data[i], "", "")
             else:
-                if "text_projection" in clip_data[i]:
-                    clip_data[i]["text_projection.weight"] = clip_data[i]["text_projection"].transpose(0, 1) #old models saved with the CLIPSave node
+                # Ensure "text_projection" exists and is a tensor before trying to transpose
+                if "text_projection" in clip_data[i] and isinstance(clip_data[i]["text_projection"], torch.Tensor):
+                    clip_data[i]["text_projection.weight"] = clip_data[i]["text_projection"].transpose(0, 1)
 
     tokenizer_data = {}
     clip_target = EmptyClass()
@@ -813,25 +830,39 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
             clip_target.clip = comfy.text_encoders.sd2_clip.SD2ClipModel
             clip_target.tokenizer = comfy.text_encoders.sd2_clip.SD2Tokenizer
         elif te_model == TEModel.T5_XXL:
+            common_t5_args = t5xxl_detect(clip_data)
             if clip_type == CLIPType.SD3:
-                clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=False, clip_g=False, t5=True, **t5xxl_detect(clip_data))
+                clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=False, clip_g=False, t5=True, **common_t5_args)
                 clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
             elif clip_type == CLIPType.LTXV:
-                clip_target.clip = comfy.text_encoders.lt.ltxv_te(**t5xxl_detect(clip_data))
+                clip_target.clip = comfy.text_encoders.lt.ltxv_te(**common_t5_args)
                 clip_target.tokenizer = comfy.text_encoders.lt.LTXVT5Tokenizer
-            elif clip_type == CLIPType.PIXART or clip_type == CLIPType.CHROMA:
-                clip_target.clip = comfy.text_encoders.pixart_t5.pixart_te(**t5xxl_detect(clip_data))
+            # ---- START MODIFICATION for T5_XXL model selection ----
+            elif clip_type == CLIPType.PIXART: # CHROMA removed from this OR condition
+                # PIXART keeps its specific text encoder.
+                clip_target.clip = comfy.text_encoders.pixart_t5.pixart_te(**common_t5_args)
                 clip_target.tokenizer = comfy.text_encoders.pixart_t5.PixArtTokenizer
+            # ---- END MODIFICATION for T5_XXL model selection ----
             elif clip_type == CLIPType.WAN:
-                clip_target.clip = comfy.text_encoders.wan.te(**t5xxl_detect(clip_data))
+                clip_target.clip = comfy.text_encoders.wan.te(**common_t5_args)
                 clip_target.tokenizer = comfy.text_encoders.wan.WanT5Tokenizer
                 tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
             elif clip_type == CLIPType.HIDREAM:
-                clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**t5xxl_detect(clip_data),
+                clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**common_t5_args,
                     clip_l=False, clip_g=False, t5=True, llama=False, dtype_llama=None, llama_scaled_fp8=None)
                 clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
-            else: #CLIPType.MOCHI
-                clip_target.clip = comfy.text_encoders.genmo.mochi_te(**t5xxl_detect(clip_data))
+            else:
+                # This 'else' now covers:
+                # - MOCHI (T5XXL)
+                # - CHROMA (T5XXL) - because it's not caught by the PIXART elif anymore
+                # - STABLE_DIFFUSION (T5XXL) - if it falls here by default
+                # - Any other unhandled CLIPType with T5XXL
+                # All these will use comfy.text_encoders.genmo.mochi_te
+                if clip_type == CLIPType.CHROMA:
+                    logging.debug(f"TEModel.T5_XXL with CLIPType.CHROMA: Using Mochi-like TE (comfy.text_encoders.genmo.mochi_te) for tensor generation.")
+                else:
+                    logging.debug(f"TEModel.T5_XXL with CLIPType.{clip_type.name if clip_type else 'Unknown'}: Falling to Mochi-like TE (comfy.text_encoders.genmo.mochi_te).")
+                clip_target.clip = comfy.text_encoders.genmo.mochi_te(**common_t5_args)
                 clip_target.tokenizer = comfy.text_encoders.genmo.MochiT5Tokenizer
         elif te_model == TEModel.T5_XXL_OLD:
             clip_target.clip = comfy.text_encoders.cosmos.te(**t5xxl_detect(clip_data))
@@ -851,14 +882,17 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
                 clip_l=False, clip_g=False, t5=False, llama=True, dtype_t5=None, t5xxl_scaled_fp8=None)
             clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
         else:
-            # clip_l
+            # clip_l default
+            # This branch is taken for TEModel.CLIP_L or if te_model is None/unrecognized.
+            # If clip_type is CHROMA here (e.g. Chroma with a CLIP-L model),
+            # sd1_clip.SD1ClipModel will be used, and its attention_mask will be removed by the CLIP class logic.
            if clip_type == CLIPType.SD3:
                clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=True, clip_g=False, t5=False)
                clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
            elif clip_type == CLIPType.HIDREAM:
                clip_target.clip = comfy.text_encoders.hidream.hidream_clip(clip_l=True, clip_g=False, t5=False, llama=False, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None)
                clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
-            else:
+            else: # Default for CLIP_L like models (includes STABLE_DIFFUSION, and CHROMA if CLIP_L)
                clip_target.clip = sd1_clip.SD1ClipModel
                clip_target.tokenizer = sd1_clip.SD1Tokenizer
    elif len(clip_data) == 2:
@@ -876,7 +910,6 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
            clip_target.clip = comfy.text_encoders.hunyuan_video.hunyuan_video_clip(**llama_detect(clip_data))
            clip_target.tokenizer = comfy.text_encoders.hunyuan_video.HunyuanVideoTokenizer
        elif clip_type == CLIPType.HIDREAM:
-            # Detect
            hidream_dualclip_classes = []
            for hidream_te in clip_data:
                te_model = detect_te_model(hidream_te)
@@ -886,8 +919,7 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
            clip_g = TEModel.CLIP_G in hidream_dualclip_classes
            t5 = TEModel.T5_XXL in hidream_dualclip_classes
            llama = TEModel.LLAMA3_8 in hidream_dualclip_classes
-
-            # Initialize t5xxl_detect and llama_detect kwargs if needed
+
            t5_kwargs = t5xxl_detect(clip_data) if t5 else {}
            llama_kwargs = llama_detect(clip_data) if llama else {}
 
@@ -908,7 +940,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
        parameters += comfy.utils.calculate_parameters(c)
        tokenizer_data, model_options = comfy.text_encoders.long_clipl.model_options_long_clip(c, tokenizer_data, model_options)
 
-    clip = CLIP(clip_target, embedding_directory=embedding_directory, parameters=parameters, tokenizer_data=tokenizer_data, model_options=model_options)
+    # MODIFIED: Pass the original clip_type (enum) to the CLIP constructor
+    clip = CLIP(clip_target, embedding_directory=embedding_directory, parameters=parameters, tokenizer_data=tokenizer_data, model_options=model_options, clip_type_enum=clip_type)
+
    for c in clip_data:
        m, u = clip.load_sd(c)
        if len(m) > 0:

From 62529c4221193f1789161767c840e1709903c245 Mon Sep 17 00:00:00 2001
From: BigStationW
Date: Wed, 7 May 2025 00:41:01 +0200
Subject: [PATCH 2/6] Clean up the comments

---
 comfy/sd.py | 31 ++++++-------------------------
 1 file changed, 6 insertions(+), 25 deletions(-)

diff --git a/comfy/sd.py b/comfy/sd.py
index d382e1227..acab6864d 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -88,7 +88,7 @@ class CLIP:
         if no_init:
             return
 
-        self.clip_type_enum = clip_type_enum # MODIFIED: Store the original CLIPType
+        self.clip_type_enum = clip_type_enum
 
         params = target.params.copy()
         clip = target.clip
@@ -134,7 +134,7 @@ class CLIP:
         n.tokenizer_options = self.tokenizer_options.copy()
         n.use_clip_schedule = self.use_clip_schedule
         n.apply_hooks_to_conds = self.apply_hooks_to_conds
-        n.clip_type_enum = self.clip_type_enum # MODIFIED: Clone the stored CLIPType
+        n.clip_type_enum = self.clip_type_enum
         return n
 
     def add_patches(self, patches, strength_patch=1.0, strength_model=1.0):
@@ -198,17 +198,14 @@ class CLIP:
                 o = self.cond_stage_model.encode_token_weights(tokens)
                 cond, pooled = o[:2]
 
-                # --- MODIFICATION FOR SCHEDULED PATH (CONSISTENCY) ---
-                # Populate initial pooled_dict including o[2] if present, then filter
                 pooled_dict = {"pooled_output": pooled}
-                if len(o) > 2 and isinstance(o[2], dict): # Check if o[2] is a dict
+                if len(o) > 2 and isinstance(o[2], dict):
                     pooled_dict.update(o[2])
 
                 if hasattr(self, 'clip_type_enum') and self.clip_type_enum == CLIPType.CHROMA:
                     if 'attention_mask' in pooled_dict:
                         logging.debug(f"CLIP type {self.clip_type_enum.name} (scheduled path): Removing 'attention_mask' from conditioning output.")
                         pooled_dict.pop('attention_mask', None)
-                # --- END MODIFICATION FOR SCHEDULED PATH ---
 
                 pooled_dict["clip_start_percent"] = t_range[0]
                 pooled_dict["clip_end_percent"] = t_range[1]
@@ -235,17 +232,15 @@ class CLIP:
         cond, pooled = o[:2]
         if return_dict:
             out = {"cond": cond, "pooled_output": pooled}
-            if len(o) > 2 and isinstance(o[2], dict): # Check if o[2] is a dict
+            if len(o) > 2 and isinstance(o[2], dict):
                 for k in o[2]:
                     out[k] = o[2][k]
             self.add_hooks_to_dict(out)
 
-            # ---- START MODIFICATION for non-scheduled path ----
             if hasattr(self, 'clip_type_enum') and self.clip_type_enum == CLIPType.CHROMA:
                 if 'attention_mask' in out:
                     logging.debug(f"CLIP type {self.clip_type_enum.name} (non-scheduled path): Removing 'attention_mask' from conditioning output.")
                     out.pop('attention_mask', None)
-            # ---- END MODIFICATION for non-scheduled path ----
             return out
 
         if return_pooled:
@@ -837,12 +832,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
             elif clip_type == CLIPType.LTXV:
                 clip_target.clip = comfy.text_encoders.lt.ltxv_te(**common_t5_args)
                 clip_target.tokenizer = comfy.text_encoders.lt.LTXVT5Tokenizer
-            # ---- START MODIFICATION for T5_XXL model selection ----
-            elif clip_type == CLIPType.PIXART: # CHROMA removed from this OR condition
-                # PIXART keeps its specific text encoder.
+            elif clip_type == CLIPType.PIXART:
                 clip_target.clip = comfy.text_encoders.pixart_t5.pixart_te(**common_t5_args)
                 clip_target.tokenizer = comfy.text_encoders.pixart_t5.PixArtTokenizer
-            # ---- END MODIFICATION for T5_XXL model selection ----
             elif clip_type == CLIPType.WAN:
                 clip_target.clip = comfy.text_encoders.wan.te(**common_t5_args)
                 clip_target.tokenizer = comfy.text_encoders.wan.WanT5Tokenizer
@@ -852,12 +844,6 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
                     clip_l=False, clip_g=False, t5=True, llama=False, dtype_llama=None, llama_scaled_fp8=None)
                 clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
             else:
-                # This 'else' now covers:
-                # - MOCHI (T5XXL)
-                # - CHROMA (T5XXL) - because it's not caught by the PIXART elif anymore
-                # - STABLE_DIFFUSION (T5XXL) - if it falls here by default
-                # - Any other unhandled CLIPType with T5XXL
-                # All these will use comfy.text_encoders.genmo.mochi_te
                 if clip_type == CLIPType.CHROMA:
                     logging.debug(f"TEModel.T5_XXL with CLIPType.CHROMA: Using Mochi-like TE (comfy.text_encoders.genmo.mochi_te) for tensor generation.")
                 else:
                     logging.debug(f"TEModel.T5_XXL with CLIPType.{clip_type.name if clip_type else 'Unknown'}: Falling to Mochi-like TE (comfy.text_encoders.genmo.mochi_te).")
@@ -882,17 +868,13 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
                 clip_l=False, clip_g=False, t5=False, llama=True, dtype_t5=None, t5xxl_scaled_fp8=None)
             clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
         else:
-            # clip_l default
-            # This branch is taken for TEModel.CLIP_L or if te_model is None/unrecognized.
-            # If clip_type is CHROMA here (e.g. Chroma with a CLIP-L model),
-            # sd1_clip.SD1ClipModel will be used, and its attention_mask will be removed by the CLIP class logic.
            if clip_type == CLIPType.SD3:
                clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=True, clip_g=False, t5=False)
                clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
            elif clip_type == CLIPType.HIDREAM:
                clip_target.clip = comfy.text_encoders.hidream.hidream_clip(clip_l=True, clip_g=False, t5=False, llama=False, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None)
                clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
-            else: # Default for CLIP_L like models (includes STABLE_DIFFUSION, and CHROMA if CLIP_L)
+            else:
                clip_target.clip = sd1_clip.SD1ClipModel
                clip_target.tokenizer = sd1_clip.SD1Tokenizer
    elif len(clip_data) == 2:
@@ -940,7 +922,6 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
        parameters += comfy.utils.calculate_parameters(c)
        tokenizer_data, model_options = comfy.text_encoders.long_clipl.model_options_long_clip(c, tokenizer_data, model_options)
 
-    # MODIFIED: Pass the original clip_type (enum) to the CLIP constructor
    clip = CLIP(clip_target, embedding_directory=embedding_directory, parameters=parameters, tokenizer_data=tokenizer_data, model_options=model_options, clip_type_enum=clip_type)
 
    for c in clip_data:

From 0cb5aadce310a3b460e5a1e6a8a57a1d05169593 Mon Sep 17 00:00:00 2001
From: BigStationW
Date: Wed, 7 May 2025 00:48:34 +0200
Subject: [PATCH 3/6] put the legacy comments back

---
 comfy/sd.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/comfy/sd.py b/comfy/sd.py
index acab6864d..d2445a4d5 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -163,10 +163,13 @@ class CLIP:
         all_cond_pooled: list[tuple[torch.Tensor, dict[str]]] = []
         all_hooks = self.patcher.forced_hooks
         if all_hooks is None or not self.use_clip_schedule:
+            # if no hooks or shouldn't use clip schedule, do unscheduled encode_from_tokens and perform add_dict
             return_pooled = "unprojected" if unprojected else True
             pooled_dict = self.encode_from_tokens(tokens, return_pooled=return_pooled, return_dict=True)
             cond = pooled_dict.pop("cond")
+            # add/update any keys with the provided add_dict
             pooled_dict.update(add_dict)
+            # add hooks stored on clip
             all_cond_pooled.append([cond, pooled_dict])
         else:
             scheduled_keyframes = all_hooks.get_hooks_for_clip_schedule()
@@ -185,6 +188,7 @@ class CLIP:
 
             for scheduled_opts in scheduled_keyframes:
                 t_range = scheduled_opts[0]
+                # don't bother encoding any conds outside of start_percent and end_percent bounds
                 if "start_percent" in add_dict:
                     if t_range[1] < add_dict["start_percent"]:
                         continue
@@ -194,7 +198,9 @@ class CLIP:
                 hooks_keyframes = scheduled_opts[1]
                 for hook, keyframe in hooks_keyframes:
                     hook.hook_keyframe._current_keyframe = keyframe
+                # apply appropriate hooks with values that match new hook_keyframe
                 self.patcher.patch_hooks(all_hooks)
+                # perform encoding as normal
                 o = self.cond_stage_model.encode_token_weights(tokens)
                 cond, pooled = o[:2]
 
@@ -872,6 +878,7 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
             clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=True, clip_g=False, t5=False)
             clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
         elif clip_type == CLIPType.HIDREAM:
+            # Detect
             clip_target.clip = comfy.text_encoders.hidream.hidream_clip(clip_l=True, clip_g=False, t5=False, llama=False, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None)
             clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
         else:
@@ -901,6 +908,7 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
            clip_g = TEModel.CLIP_G in hidream_dualclip_classes
            t5 = TEModel.T5_XXL in hidream_dualclip_classes
            llama = TEModel.LLAMA3_8 in hidream_dualclip_classes
+            # Initialize t5xxl_detect and llama_detect kwargs if needed
            t5_kwargs = t5xxl_detect(clip_data) if t5 else {}
            llama_kwargs = llama_detect(clip_data) if llama else {}

From df8a1b1107ed78c70699716234c0458ffac30b30 Mon Sep 17 00:00:00 2001
From: BigStationW
Date: Wed, 7 May 2025 01:37:16 +0200
Subject: [PATCH 4/6] Simplify a part

---
 comfy/sd.py | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/comfy/sd.py b/comfy/sd.py
index d2445a4d5..e1e42a2ad 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -831,30 +831,25 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
             clip_target.clip = comfy.text_encoders.sd2_clip.SD2ClipModel
             clip_target.tokenizer = comfy.text_encoders.sd2_clip.SD2Tokenizer
         elif te_model == TEModel.T5_XXL:
-            common_t5_args = t5xxl_detect(clip_data)
             if clip_type == CLIPType.SD3:
-                clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=False, clip_g=False, t5=True, **common_t5_args)
+                clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=False, clip_g=False, t5=True, **t5xxl_detect(clip_data))
                 clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
             elif clip_type == CLIPType.LTXV:
-                clip_target.clip = comfy.text_encoders.lt.ltxv_te(**common_t5_args)
+                clip_target.clip = comfy.text_encoders.lt.ltxv_te(**t5xxl_detect(clip_data))
                 clip_target.tokenizer = comfy.text_encoders.lt.LTXVT5Tokenizer
-            elif clip_type == CLIPType.PIXART: 
-                clip_target.clip = comfy.text_encoders.pixart_t5.pixart_te(**common_t5_args)
+            elif clip_type == CLIPType.PIXART:
+                clip_target.clip = comfy.text_encoders.pixart_t5.pixart_te(**t5xxl_detect(clip_data))
                 clip_target.tokenizer = comfy.text_encoders.pixart_t5.PixArtTokenizer
             elif clip_type == CLIPType.WAN:
-                clip_target.clip = comfy.text_encoders.wan.te(**common_t5_args)
+                clip_target.clip = comfy.text_encoders.wan.te(**t5xxl_detect(clip_data))
                 clip_target.tokenizer = comfy.text_encoders.wan.WanT5Tokenizer
                 tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
             elif clip_type == CLIPType.HIDREAM:
-                clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**common_t5_args,
+                clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**t5xxl_detect(clip_data),
                     clip_l=False, clip_g=False, t5=True, llama=False, dtype_llama=None, llama_scaled_fp8=None)
                 clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
-            else:
-                if clip_type == CLIPType.CHROMA:
-                    logging.debug(f"TEModel.T5_XXL with CLIPType.CHROMA: Using Mochi-like TE (comfy.text_encoders.genmo.mochi_te) for tensor generation.")
-                else:
-                    logging.debug(f"TEModel.T5_XXL with CLIPType.{clip_type.name if clip_type else 'Unknown'}: Falling to Mochi-like TE (comfy.text_encoders.genmo.mochi_te).")
-                clip_target.clip = comfy.text_encoders.genmo.mochi_te(**common_t5_args)
+            else: #CLIPType.MOCHI or CLIPType.CHROMA
+                clip_target.clip = comfy.text_encoders.genmo.mochi_te(**t5xxl_detect(clip_data))
                 clip_target.tokenizer = comfy.text_encoders.genmo.MochiT5Tokenizer
         elif te_model == TEModel.T5_XXL_OLD:
             clip_target.clip = comfy.text_encoders.cosmos.te(**t5xxl_detect(clip_data))

From 8af5c6571ea03e4d3bbdf267a94c8057729cb15a Mon Sep 17 00:00:00 2001
From: BigStationW
Date: Wed, 7 May 2025 01:42:29 +0200
Subject: [PATCH 5/6] forgot that legacy comment

---
 comfy/sd.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/comfy/sd.py b/comfy/sd.py
index e1e42a2ad..21049ca82 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -205,6 +205,7 @@ class CLIP:
                 cond, pooled = o[:2]
 
                 pooled_dict = {"pooled_output": pooled}
+                # add clip_start_percent and clip_end_percent in pooled
                 if len(o) > 2 and isinstance(o[2], dict):
                     pooled_dict.update(o[2])

From 5da8950b5c0c7a515b603c3c5fd80593c778c6c4 Mon Sep 17 00:00:00 2001
From: BigStationW
Date: Wed, 7 May 2025 01:46:11 +0200
Subject: [PATCH 6/6] forgot that one

---
 comfy/sd.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/comfy/sd.py b/comfy/sd.py
index 21049ca82..84f284e9a 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -216,7 +216,9 @@ class CLIP:
 
                 pooled_dict["clip_start_percent"] = t_range[0]
                 pooled_dict["clip_end_percent"] = t_range[1]
+                # add/update any keys with the provided add_dict
                 pooled_dict.update(add_dict)
+                # add hooks stored on clip
                 self.add_hooks_to_dict(pooled_dict)
                 all_cond_pooled.append([cond, pooled_dict])
                 if show_pbar:
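
Notes on the series, with small self-contained sketches. The net effect of patches 1-2 on the CLIP wrapper: it now remembers the CLIPType it was constructed with and, when that type is CHROMA, strips the 'attention_mask' key from the conditioning dict on both the scheduled and non-scheduled encode paths. A minimal sketch of that filtering pattern, using stand-ins rather than the real comfy.sd classes:

    from enum import Enum, auto

    class CLIPType(Enum):  # stand-in for comfy.sd.CLIPType
        STABLE_DIFFUSION = auto()
        CHROMA = auto()

    def filter_conditioning(out, clip_type_enum):
        # Chroma's reference workflow does not feed an attention mask to the
        # model, so dropping the key makes the conditioning dict match it.
        if clip_type_enum == CLIPType.CHROMA:
            out.pop('attention_mask', None)
        return out

    cond = {"pooled_output": None, "attention_mask": [1, 1, 1, 0]}
    assert "attention_mask" not in filter_conditioning(dict(cond), CLIPType.CHROMA)
    assert "attention_mask" in filter_conditioning(dict(cond), CLIPType.STABLE_DIFFUSION)

The same pop-with-default idiom is used in both paths, so a missing key is never an error.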
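
The other half of the change is encoder selection for TEModel.T5_XXL: CHROMA no longer shares the PixArt branch and instead falls through to the default (Mochi) branch, which patch 4 collapses back into a plain if/elif chain. A reduced sketch of the dispatch shape, with strings standing in for the real constructors under comfy.text_encoders:

    def pick_t5xxl_te(clip_type):
        # Mirrors the if/elif chain under TEModel.T5_XXL in
        # load_text_encoder_state_dicts; return values are placeholders.
        if clip_type == "SD3":
            return "sd3_clip"
        elif clip_type == "LTXV":
            return "ltxv_te"
        elif clip_type == "PIXART":  # CHROMA was removed from this branch
            return "pixart_te"
        elif clip_type == "WAN":
            return "wan_te"
        elif clip_type == "HIDREAM":
            return "hidream_clip"
        else:  # MOCHI, CHROMA, and anything else unhandled
            return "mochi_te"

    assert pick_t5xxl_te("CHROMA") == "mochi_te"
    assert pick_t5xxl_te("PIXART") == "pixart_te"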
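
Both encode paths also guard the optional third element returned by encode_token_weights, merging it into the output dict only when it is actually a dict. The guard in isolation:

    def merge_extra(o, out):
        # encode_token_weights may return (cond, pooled) or (cond, pooled, extra);
        # merge 'extra' only when it is a dict, as the patched checks do.
        if len(o) > 2 and isinstance(o[2], dict):
            out.update(o[2])
        return out

    assert merge_extra((None, None, {"attention_mask": [1]}), {}) == {"attention_mask": [1]}
    assert merge_extra((None, None), {}) == {}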
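
Finally, patch 1 hardens the legacy text_projection conversion for checkpoints saved with the CLIPSave node: the key is transposed into text_projection.weight only when it actually holds a tensor. The guard in isolation (assuming torch is installed):

    import torch

    def convert_text_projection(sd):
        # Old checkpoints store the projection under "text_projection";
        # only a real tensor is transposed into "text_projection.weight".
        tp = sd.get("text_projection")
        if isinstance(tp, torch.Tensor):
            sd["text_projection.weight"] = tp.transpose(0, 1)
        return sd

    sd = convert_text_projection({"text_projection": torch.eye(4)})
    assert "text_projection.weight" in sd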