From 9e738b989eb1eed38bc1952d564f1d6c1adaf14b Mon Sep 17 00:00:00 2001
From: BigStationW
Date: Wed, 7 May 2025 00:09:07 +0200
Subject: [PATCH 1/6] Make the rendering of Comfy's implementation identical to the Chroma workflow.

---
 comfy/sd.py | 86 +++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 60 insertions(+), 26 deletions(-)

diff --git a/comfy/sd.py b/comfy/sd.py
index da9b36d0e..d382e1227 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -84,9 +84,12 @@ def load_lora_for_models(model, clip, lora, strength_model, strength_clip):
 
 
 class CLIP:
-    def __init__(self, target=None, embedding_directory=None, no_init=False, tokenizer_data={}, parameters=0, model_options={}):
+    def __init__(self, target=None, embedding_directory=None, no_init=False, tokenizer_data={}, parameters=0, model_options={}, clip_type_enum=None): # MODIFIED: Added clip_type_enum
         if no_init:
             return
+
+        self.clip_type_enum = clip_type_enum # MODIFIED: Store the original CLIPType
+
         params = target.params.copy()
         clip = target.clip
         tokenizer = target.tokenizer
@@ -131,6 +134,7 @@ class CLIP:
         n.tokenizer_options = self.tokenizer_options.copy()
         n.use_clip_schedule = self.use_clip_schedule
         n.apply_hooks_to_conds = self.apply_hooks_to_conds
+        n.clip_type_enum = self.clip_type_enum # MODIFIED: Clone the stored CLIPType
         return n
 
     def add_patches(self, patches, strength_patch=1.0, strength_model=1.0):
@@ -159,11 +163,9 @@ class CLIP:
         all_cond_pooled: list[tuple[torch.Tensor, dict[str]]] = []
         all_hooks = self.patcher.forced_hooks
         if all_hooks is None or not self.use_clip_schedule:
-            # if no hooks or shouldn't use clip schedule, do unscheduled encode_from_tokens and perform add_dict
            return_pooled = "unprojected" if unprojected else True
            pooled_dict = self.encode_from_tokens(tokens, return_pooled=return_pooled, return_dict=True)
            cond = pooled_dict.pop("cond")
-            # add/update any keys with the provided add_dict
            pooled_dict.update(add_dict)
            all_cond_pooled.append([cond, pooled_dict])
         else:
@@ -183,7 +185,6 @@ class CLIP:
 
             for scheduled_opts in scheduled_keyframes:
                 t_range = scheduled_opts[0]
-                # don't bother encoding any conds outside of start_percent and end_percent bounds
                 if "start_percent" in add_dict:
                     if t_range[1] < add_dict["start_percent"]:
                         continue
@@ -193,18 +194,25 @@ class CLIP:
                 hooks_keyframes = scheduled_opts[1]
                 for hook, keyframe in hooks_keyframes:
                     hook.hook_keyframe._current_keyframe = keyframe
-                # apply appropriate hooks with values that match new hook_keyframe
                 self.patcher.patch_hooks(all_hooks)
-                # perform encoding as normal
                 o = self.cond_stage_model.encode_token_weights(tokens)
                 cond, pooled = o[:2]
+
+                # --- MODIFICATION FOR SCHEDULED PATH (CONSISTENCY) ---
+                # Populate initial pooled_dict including o[2] if present, then filter
                 pooled_dict = {"pooled_output": pooled}
-                # add clip_start_percent and clip_end_percent in pooled
+                if len(o) > 2 and isinstance(o[2], dict): # Check if o[2] is a dict
+                    pooled_dict.update(o[2])
+
+                if hasattr(self, 'clip_type_enum') and self.clip_type_enum == CLIPType.CHROMA:
+                    if 'attention_mask' in pooled_dict:
+                        logging.debug(f"CLIP type {self.clip_type_enum.name} (scheduled path): Removing 'attention_mask' from conditioning output.")
+                        pooled_dict.pop('attention_mask', None)
+                # --- END MODIFICATION FOR SCHEDULED PATH ---
+
                 pooled_dict["clip_start_percent"] = t_range[0]
                 pooled_dict["clip_end_percent"] = t_range[1]
-                # add/update any keys with the provided add_dict
                 pooled_dict.update(add_dict)
-                # add hooks stored on clip
                 self.add_hooks_to_dict(pooled_dict)
                 all_cond_pooled.append([cond, pooled_dict])
                 if show_pbar:
@@ -227,10 +235,17 @@ class CLIP:
         cond, pooled = o[:2]
         if return_dict:
             out = {"cond": cond, "pooled_output": pooled}
-            if len(o) > 2:
+            if len(o) > 2 and isinstance(o[2], dict): # Check if o[2] is a dict
                 for k in o[2]:
                     out[k] = o[2][k]
             self.add_hooks_to_dict(out)
+
+            # ---- START MODIFICATION for non-scheduled path ----
+            if hasattr(self, 'clip_type_enum') and self.clip_type_enum == CLIPType.CHROMA:
+                if 'attention_mask' in out:
+                    logging.debug(f"CLIP type {self.clip_type_enum.name} (non-scheduled path): Removing 'attention_mask' from conditioning output.")
+                    out.pop('attention_mask', None)
+            # ---- END MODIFICATION for non-scheduled path ----
             return out
 
         if return_pooled:
@@ -261,6 +276,7 @@ class CLIP:
     def get_key_patches(self):
         return self.patcher.get_key_patches()
 
+
 class VAE:
     def __init__(self, sd=None, device=None, config=None, dtype=None, metadata=None):
         if 'decoder.up_blocks.0.resnets.0.norm1.weight' in sd.keys(): #diffusers format
@@ -788,8 +804,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
             if "transformer.resblocks.0.ln_1.weight" in clip_data[i]:
                 clip_data[i] = comfy.utils.clip_text_transformers_convert(clip_data[i], "", "")
             else:
-                if "text_projection" in clip_data[i]:
-                    clip_data[i]["text_projection.weight"] = clip_data[i]["text_projection"].transpose(0, 1) #old models saved with the CLIPSave node
+                # Ensure "text_projection" exists and is a tensor before trying to transpose
+                if "text_projection" in clip_data[i] and isinstance(clip_data[i]["text_projection"], torch.Tensor):
+                    clip_data[i]["text_projection.weight"] = clip_data[i]["text_projection"].transpose(0, 1)
 
     tokenizer_data = {}
     clip_target = EmptyClass()
@@ -813,25 +830,39 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
             clip_target.clip = comfy.text_encoders.sd2_clip.SD2ClipModel
             clip_target.tokenizer = comfy.text_encoders.sd2_clip.SD2Tokenizer
         elif te_model == TEModel.T5_XXL:
+            common_t5_args = t5xxl_detect(clip_data)
             if clip_type == CLIPType.SD3:
-                clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=False, clip_g=False, t5=True, **t5xxl_detect(clip_data))
+                clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=False, clip_g=False, t5=True, **common_t5_args)
                 clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
             elif clip_type == CLIPType.LTXV:
-                clip_target.clip = comfy.text_encoders.lt.ltxv_te(**t5xxl_detect(clip_data))
+                clip_target.clip = comfy.text_encoders.lt.ltxv_te(**common_t5_args)
                 clip_target.tokenizer = comfy.text_encoders.lt.LTXVT5Tokenizer
-            elif clip_type == CLIPType.PIXART or clip_type == CLIPType.CHROMA:
-                clip_target.clip = comfy.text_encoders.pixart_t5.pixart_te(**t5xxl_detect(clip_data))
+            # ---- START MODIFICATION for T5_XXL model selection ----
+            elif clip_type == CLIPType.PIXART: # CHROMA removed from this OR condition
+                # PIXART keeps its specific text encoder.
+                clip_target.clip = comfy.text_encoders.pixart_t5.pixart_te(**common_t5_args)
                 clip_target.tokenizer = comfy.text_encoders.pixart_t5.PixArtTokenizer
+            # ---- END MODIFICATION for T5_XXL model selection ----
             elif clip_type == CLIPType.WAN:
-                clip_target.clip = comfy.text_encoders.wan.te(**t5xxl_detect(clip_data))
+                clip_target.clip = comfy.text_encoders.wan.te(**common_t5_args)
                 clip_target.tokenizer = comfy.text_encoders.wan.WanT5Tokenizer
                 tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
             elif clip_type == CLIPType.HIDREAM:
-                clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**t5xxl_detect(clip_data),
+                clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**common_t5_args,
                     clip_l=False, clip_g=False, t5=True, llama=False, dtype_llama=None, llama_scaled_fp8=None)
                 clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
-            else: #CLIPType.MOCHI
-                clip_target.clip = comfy.text_encoders.genmo.mochi_te(**t5xxl_detect(clip_data))
+            else:
+                # This 'else' now covers:
+                # - MOCHI (T5XXL)
+                # - CHROMA (T5XXL) - because it's not caught by the PIXART elif anymore
+                # - STABLE_DIFFUSION (T5XXL) - if it falls here by default
+                # - Any other unhandled CLIPType with T5XXL
+                # All these will use comfy.text_encoders.genmo.mochi_te
+                if clip_type == CLIPType.CHROMA:
+                    logging.debug(f"TEModel.T5_XXL with CLIPType.CHROMA: Using Mochi-like TE (comfy.text_encoders.genmo.mochi_te) for tensor generation.")
+                else:
+                    logging.debug(f"TEModel.T5_XXL with CLIPType.{clip_type.name if clip_type else 'Unknown'}: Falling to Mochi-like TE (comfy.text_encoders.genmo.mochi_te).")
+                clip_target.clip = comfy.text_encoders.genmo.mochi_te(**common_t5_args)
                 clip_target.tokenizer = comfy.text_encoders.genmo.MochiT5Tokenizer
         elif te_model == TEModel.T5_XXL_OLD:
             clip_target.clip = comfy.text_encoders.cosmos.te(**t5xxl_detect(clip_data))
@@ -851,14 +882,17 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
                 clip_l=False, clip_g=False, t5=False, llama=True, dtype_t5=None, t5xxl_scaled_fp8=None)
             clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
         else:
-            # clip_l
+            # clip_l default
+            # This branch is taken for TEModel.CLIP_L or if te_model is None/unrecognized.
+            # If clip_type is CHROMA here (e.g. Chroma with a CLIP-L model),
+            # sd1_clip.SD1ClipModel will be used, and its attention_mask will be removed by the CLIP class logic.
            if clip_type == CLIPType.SD3:
                clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=True, clip_g=False, t5=False)
                clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
            elif clip_type == CLIPType.HIDREAM:
                clip_target.clip = comfy.text_encoders.hidream.hidream_clip(clip_l=True, clip_g=False, t5=False, llama=False, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None)
                clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
-            else:
+            else: # Default for CLIP_L like models (includes STABLE_DIFFUSION, and CHROMA if CLIP_L)
                clip_target.clip = sd1_clip.SD1ClipModel
                clip_target.tokenizer = sd1_clip.SD1Tokenizer
    elif len(clip_data) == 2:
@@ -876,7 +910,6 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
            clip_target.clip = comfy.text_encoders.hunyuan_video.hunyuan_video_clip(**llama_detect(clip_data))
            clip_target.tokenizer = comfy.text_encoders.hunyuan_video.HunyuanVideoTokenizer
        elif clip_type == CLIPType.HIDREAM:
-            # Detect
            hidream_dualclip_classes = []
            for hidream_te in clip_data:
                te_model = detect_te_model(hidream_te)
@@ -886,8 +919,7 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
            clip_g = TEModel.CLIP_G in hidream_dualclip_classes
            t5 = TEModel.T5_XXL in hidream_dualclip_classes
            llama = TEModel.LLAMA3_8 in hidream_dualclip_classes
-
-            # Initialize t5xxl_detect and llama_detect kwargs if needed
+
            t5_kwargs = t5xxl_detect(clip_data) if t5 else {}
            llama_kwargs = llama_detect(clip_data) if llama else {}
 
@@ -908,7 +940,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
        parameters += comfy.utils.calculate_parameters(c)
        tokenizer_data, model_options = comfy.text_encoders.long_clipl.model_options_long_clip(c, tokenizer_data, model_options)
 
-    clip = CLIP(clip_target, embedding_directory=embedding_directory, parameters=parameters, tokenizer_data=tokenizer_data, model_options=model_options)
+    # MODIFIED: Pass the original clip_type (enum) to the CLIP constructor
+    clip = CLIP(clip_target, embedding_directory=embedding_directory, parameters=parameters, tokenizer_data=tokenizer_data, model_options=model_options, clip_type_enum=clip_type)
+
    for c in clip_data:
        m, u = clip.load_sd(c)
        if len(m) > 0:

From 62529c4221193f1789161767c840e1709903c245 Mon Sep 17 00:00:00 2001
From: BigStationW
Date: Wed, 7 May 2025 00:41:01 +0200
Subject: [PATCH 2/6] Clean up the comments

---
 comfy/sd.py | 31 ++++++-------------------------
 1 file changed, 6 insertions(+), 25 deletions(-)

diff --git a/comfy/sd.py b/comfy/sd.py
index d382e1227..acab6864d 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -88,7 +88,7 @@ class CLIP:
         if no_init:
             return
 
-        self.clip_type_enum = clip_type_enum # MODIFIED: Store the original CLIPType
+        self.clip_type_enum = clip_type_enum
 
         params = target.params.copy()
         clip = target.clip
@@ -134,7 +134,7 @@ class CLIP:
         n.tokenizer_options = self.tokenizer_options.copy()
         n.use_clip_schedule = self.use_clip_schedule
         n.apply_hooks_to_conds = self.apply_hooks_to_conds
-        n.clip_type_enum = self.clip_type_enum # MODIFIED: Clone the stored CLIPType
+        n.clip_type_enum = self.clip_type_enum
         return n
 
     def add_patches(self, patches, strength_patch=1.0, strength_model=1.0):
@@ -198,17 +198,14 @@ class CLIP:
                 o = self.cond_stage_model.encode_token_weights(tokens)
                 cond, pooled = o[:2]
 
-                # --- MODIFICATION FOR SCHEDULED PATH (CONSISTENCY) ---
-                # Populate initial pooled_dict including o[2] if present, then filter
                 pooled_dict = {"pooled_output": pooled}
-                if len(o) > 2 and isinstance(o[2], dict): # Check if o[2] is a dict
+                if len(o) > 2 and isinstance(o[2], dict):
                     pooled_dict.update(o[2])
 
                 if hasattr(self, 'clip_type_enum') and self.clip_type_enum == CLIPType.CHROMA:
                     if 'attention_mask' in pooled_dict:
                         logging.debug(f"CLIP type {self.clip_type_enum.name} (scheduled path): Removing 'attention_mask' from conditioning output.")
                         pooled_dict.pop('attention_mask', None)
-                # --- END MODIFICATION FOR SCHEDULED PATH ---
 
                 pooled_dict["clip_start_percent"] = t_range[0]
                 pooled_dict["clip_end_percent"] = t_range[1]
@@ -235,17 +232,15 @@ class CLIP:
         cond, pooled = o[:2]
         if return_dict:
             out = {"cond": cond, "pooled_output": pooled}
-            if len(o) > 2 and isinstance(o[2], dict): # Check if o[2] is a dict
+            if len(o) > 2 and isinstance(o[2], dict):
                 for k in o[2]:
                     out[k] = o[2][k]
             self.add_hooks_to_dict(out)
 
-            # ---- START MODIFICATION for non-scheduled path ----
             if hasattr(self, 'clip_type_enum') and self.clip_type_enum == CLIPType.CHROMA:
                 if 'attention_mask' in out:
                     logging.debug(f"CLIP type {self.clip_type_enum.name} (non-scheduled path): Removing 'attention_mask' from conditioning output.")
                     out.pop('attention_mask', None)
-            # ---- END MODIFICATION for non-scheduled path ----
             return out
 
         if return_pooled:
@@ -837,12 +832,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
             elif clip_type == CLIPType.LTXV:
                 clip_target.clip = comfy.text_encoders.lt.ltxv_te(**common_t5_args)
                 clip_target.tokenizer = comfy.text_encoders.lt.LTXVT5Tokenizer
-            # ---- START MODIFICATION for T5_XXL model selection ----
-            elif clip_type == CLIPType.PIXART: # CHROMA removed from this OR condition
-                # PIXART keeps its specific text encoder.
+            elif clip_type == CLIPType.PIXART:
                 clip_target.clip = comfy.text_encoders.pixart_t5.pixart_te(**common_t5_args)
                 clip_target.tokenizer = comfy.text_encoders.pixart_t5.PixArtTokenizer
-            # ---- END MODIFICATION for T5_XXL model selection ----
             elif clip_type == CLIPType.WAN:
                 clip_target.clip = comfy.text_encoders.wan.te(**common_t5_args)
                 clip_target.tokenizer = comfy.text_encoders.wan.WanT5Tokenizer
@@ -852,12 +844,6 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
                     clip_l=False, clip_g=False, t5=True, llama=False, dtype_llama=None, llama_scaled_fp8=None)
                 clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
             else:
-                # This 'else' now covers:
-                # - MOCHI (T5XXL)
-                # - CHROMA (T5XXL) - because it's not caught by the PIXART elif anymore
-                # - STABLE_DIFFUSION (T5XXL) - if it falls here by default
-                # - Any other unhandled CLIPType with T5XXL
-                # All these will use comfy.text_encoders.genmo.mochi_te
                 if clip_type == CLIPType.CHROMA:
                     logging.debug(f"TEModel.T5_XXL with CLIPType.CHROMA: Using Mochi-like TE (comfy.text_encoders.genmo.mochi_te) for tensor generation.")
                 else:
                     logging.debug(f"TEModel.T5_XXL with CLIPType.{clip_type.name if clip_type else 'Unknown'}: Falling to Mochi-like TE (comfy.text_encoders.genmo.mochi_te).")
@@ -882,17 +868,13 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
                 clip_l=False, clip_g=False, t5=False, llama=True, dtype_t5=None, t5xxl_scaled_fp8=None)
             clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
         else:
-            # clip_l default
-            # This branch is taken for TEModel.CLIP_L or if te_model is None/unrecognized.
-            # If clip_type is CHROMA here (e.g. Chroma with a CLIP-L model),
-            # sd1_clip.SD1ClipModel will be used, and its attention_mask will be removed by the CLIP class logic.
            if clip_type == CLIPType.SD3:
                clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=True, clip_g=False, t5=False)
                clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
            elif clip_type == CLIPType.HIDREAM:
                clip_target.clip = comfy.text_encoders.hidream.hidream_clip(clip_l=True, clip_g=False, t5=False, llama=False, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None)
                clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
-            else: # Default for CLIP_L like models (includes STABLE_DIFFUSION, and CHROMA if CLIP_L)
+            else:
                clip_target.clip = sd1_clip.SD1ClipModel
                clip_target.tokenizer = sd1_clip.SD1Tokenizer
    elif len(clip_data) == 2:
@@ -940,7 +922,6 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
        parameters += comfy.utils.calculate_parameters(c)
        tokenizer_data, model_options = comfy.text_encoders.long_clipl.model_options_long_clip(c, tokenizer_data, model_options)
 
-    # MODIFIED: Pass the original clip_type (enum) to the CLIP constructor
    clip = CLIP(clip_target, embedding_directory=embedding_directory, parameters=parameters, tokenizer_data=tokenizer_data, model_options=model_options, clip_type_enum=clip_type)
 
    for c in clip_data:

From 0cb5aadce310a3b460e5a1e6a8a57a1d05169593 Mon Sep 17 00:00:00 2001
From: BigStationW
Date: Wed, 7 May 2025 00:48:34 +0200
Subject: [PATCH 3/6] put the legacy comments back

---
 comfy/sd.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/comfy/sd.py b/comfy/sd.py
index acab6864d..d2445a4d5 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -163,10 +163,13 @@ class CLIP:
         all_cond_pooled: list[tuple[torch.Tensor, dict[str]]] = []
         all_hooks = self.patcher.forced_hooks
         if all_hooks is None or not self.use_clip_schedule:
+            # if no hooks or shouldn't use clip schedule, do unscheduled encode_from_tokens and perform add_dict
             return_pooled = "unprojected" if unprojected else True
             pooled_dict = self.encode_from_tokens(tokens, return_pooled=return_pooled, return_dict=True)
             cond = pooled_dict.pop("cond")
+            # add/update any keys with the provided add_dict
             pooled_dict.update(add_dict)
+            # add hooks stored on clip
             all_cond_pooled.append([cond, pooled_dict])
         else:
             scheduled_keyframes = all_hooks.get_hooks_for_clip_schedule()
@@ -185,6 +188,7 @@ class CLIP:
 
             for scheduled_opts in scheduled_keyframes:
                 t_range = scheduled_opts[0]
+                # don't bother encoding any conds outside of start_percent and end_percent bounds
                 if "start_percent" in add_dict:
                     if t_range[1] < add_dict["start_percent"]:
                         continue
@@ -194,7 +198,9 @@ class CLIP:
                 hooks_keyframes = scheduled_opts[1]
                 for hook, keyframe in hooks_keyframes:
                     hook.hook_keyframe._current_keyframe = keyframe
+                # apply appropriate hooks with values that match new hook_keyframe
                 self.patcher.patch_hooks(all_hooks)
+                # perform encoding as normal
                 o = self.cond_stage_model.encode_token_weights(tokens)
                 cond, pooled = o[:2]
 
@@ -872,6 +878,7 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
             clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=True, clip_g=False, t5=False)
             clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
         elif clip_type == CLIPType.HIDREAM:
+            # Detect
             clip_target.clip = comfy.text_encoders.hidream.hidream_clip(clip_l=True, clip_g=False, t5=False, llama=False, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None)
             clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
         else:
@@ -901,6 +908,7 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
            clip_g = TEModel.CLIP_G in hidream_dualclip_classes
            t5 = TEModel.T5_XXL in hidream_dualclip_classes
            llama = TEModel.LLAMA3_8 in hidream_dualclip_classes
+            # Initialize t5xxl_detect and llama_detect kwargs if needed
            t5_kwargs = t5xxl_detect(clip_data) if t5 else {}
            llama_kwargs = llama_detect(clip_data) if llama else {}

From df8a1b1107ed78c70699716234c0458ffac30b30 Mon Sep 17 00:00:00 2001
From: BigStationW
Date: Wed, 7 May 2025 01:37:16 +0200
Subject: [PATCH 4/6] Simplify a part

---
 comfy/sd.py | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/comfy/sd.py b/comfy/sd.py
index d2445a4d5..e1e42a2ad 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -831,30 +831,25 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
             clip_target.clip = comfy.text_encoders.sd2_clip.SD2ClipModel
             clip_target.tokenizer = comfy.text_encoders.sd2_clip.SD2Tokenizer
         elif te_model == TEModel.T5_XXL:
-            common_t5_args = t5xxl_detect(clip_data)
             if clip_type == CLIPType.SD3:
-                clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=False, clip_g=False, t5=True, **common_t5_args)
+                clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=False, clip_g=False, t5=True, **t5xxl_detect(clip_data))
                 clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
             elif clip_type == CLIPType.LTXV:
-                clip_target.clip = comfy.text_encoders.lt.ltxv_te(**common_t5_args)
+                clip_target.clip = comfy.text_encoders.lt.ltxv_te(**t5xxl_detect(clip_data))
                 clip_target.tokenizer = comfy.text_encoders.lt.LTXVT5Tokenizer
-            elif clip_type == CLIPType.PIXART: 
-                clip_target.clip = comfy.text_encoders.pixart_t5.pixart_te(**common_t5_args)
+            elif clip_type == CLIPType.PIXART:
+                clip_target.clip = comfy.text_encoders.pixart_t5.pixart_te(**t5xxl_detect(clip_data))
                 clip_target.tokenizer = comfy.text_encoders.pixart_t5.PixArtTokenizer
             elif clip_type == CLIPType.WAN:
-                clip_target.clip = comfy.text_encoders.wan.te(**common_t5_args)
+                clip_target.clip = comfy.text_encoders.wan.te(**t5xxl_detect(clip_data))
                 clip_target.tokenizer = comfy.text_encoders.wan.WanT5Tokenizer
                 tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
             elif clip_type == CLIPType.HIDREAM:
-                clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**common_t5_args,
+                clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**t5xxl_detect(clip_data),
                     clip_l=False, clip_g=False, t5=True, llama=False, dtype_llama=None, llama_scaled_fp8=None)
                 clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
-            else:
-                if clip_type == CLIPType.CHROMA:
-                    logging.debug(f"TEModel.T5_XXL with CLIPType.CHROMA: Using Mochi-like TE (comfy.text_encoders.genmo.mochi_te) for tensor generation.")
-                else:
-                    logging.debug(f"TEModel.T5_XXL with CLIPType.{clip_type.name if clip_type else 'Unknown'}: Falling to Mochi-like TE (comfy.text_encoders.genmo.mochi_te).")
-                clip_target.clip = comfy.text_encoders.genmo.mochi_te(**common_t5_args)
+            else: #CLIPType.MOCHI or CLIPType.CHROMA
+                clip_target.clip = comfy.text_encoders.genmo.mochi_te(**t5xxl_detect(clip_data))
                 clip_target.tokenizer = comfy.text_encoders.genmo.MochiT5Tokenizer
         elif te_model == TEModel.T5_XXL_OLD:
             clip_target.clip = comfy.text_encoders.cosmos.te(**t5xxl_detect(clip_data))

From 8af5c6571ea03e4d3bbdf267a94c8057729cb15a Mon Sep 17 00:00:00 2001
From: BigStationW
Date: Wed, 7 May 2025 01:42:29 +0200
Subject: [PATCH 5/6] forgot that legacy comment

---
 comfy/sd.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/comfy/sd.py b/comfy/sd.py
index e1e42a2ad..21049ca82 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -205,6 +205,7 @@ class CLIP:
                 cond, pooled = o[:2]
 
                 pooled_dict = {"pooled_output": pooled}
+                # add clip_start_percent and clip_end_percent in pooled
                 if len(o) > 2 and isinstance(o[2], dict):
                     pooled_dict.update(o[2])

From 5da8950b5c0c7a515b603c3c5fd80593c778c6c4 Mon Sep 17 00:00:00 2001
From: BigStationW
Date: Wed, 7 May 2025 01:46:11 +0200
Subject: [PATCH 6/6] forgot that one

---
 comfy/sd.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/comfy/sd.py b/comfy/sd.py
index 21049ca82..84f284e9a 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -216,7 +216,9 @@ class CLIP:
 
                 pooled_dict["clip_start_percent"] = t_range[0]
                 pooled_dict["clip_end_percent"] = t_range[1]
+                # add/update any keys with the provided add_dict
                 pooled_dict.update(add_dict)
+                # add hooks stored on clip
                 self.add_hooks_to_dict(pooled_dict)
                 all_cond_pooled.append([cond, pooled_dict])
                 if show_pbar:
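
Notes on the series, with small self-contained sketches. The net effect of patches 1-2 on the CLIP wrapper: it now remembers the CLIPType it was constructed with and, when that type is CHROMA, strips the 'attention_mask' key from the conditioning dict on both the scheduled and non-scheduled encode paths. A minimal sketch of that filtering pattern, using stand-ins rather than the real comfy.sd classes:

    from enum import Enum, auto

    class CLIPType(Enum):  # stand-in for comfy.sd.CLIPType
        STABLE_DIFFUSION = auto()
        CHROMA = auto()

    def filter_conditioning(out, clip_type_enum):
        # Chroma's reference workflow does not feed an attention mask to the
        # model, so dropping the key makes the conditioning dict match it.
        if clip_type_enum == CLIPType.CHROMA:
            out.pop('attention_mask', None)
        return out

    cond = {"pooled_output": None, "attention_mask": [1, 1, 1, 0]}
    assert "attention_mask" not in filter_conditioning(dict(cond), CLIPType.CHROMA)
    assert "attention_mask" in filter_conditioning(dict(cond), CLIPType.STABLE_DIFFUSION)

The same pop-with-default idiom is used in both paths, so a missing key is never an error.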
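
The other half of the change is encoder selection for TEModel.T5_XXL: CHROMA no longer shares the PixArt branch and instead falls through to the default (Mochi) branch, which patch 4 collapses back into a plain if/elif chain. A reduced sketch of the dispatch shape, with strings standing in for the real constructors under comfy.text_encoders:

    def pick_t5xxl_te(clip_type):
        # Mirrors the if/elif chain under TEModel.T5_XXL in
        # load_text_encoder_state_dicts; return values are placeholders.
        if clip_type == "SD3":
            return "sd3_clip"
        elif clip_type == "LTXV":
            return "ltxv_te"
        elif clip_type == "PIXART":  # CHROMA was removed from this branch
            return "pixart_te"
        elif clip_type == "WAN":
            return "wan_te"
        elif clip_type == "HIDREAM":
            return "hidream_clip"
        else:  # MOCHI, CHROMA, and anything else unhandled
            return "mochi_te"

    assert pick_t5xxl_te("CHROMA") == "mochi_te"
    assert pick_t5xxl_te("PIXART") == "pixart_te"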
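
Both encode paths also guard the optional third element returned by encode_token_weights, merging it into the output dict only when it is actually a dict. The guard in isolation:

    def merge_extra(o, out):
        # encode_token_weights may return (cond, pooled) or (cond, pooled, extra);
        # merge 'extra' only when it is a dict, as the patched checks do.
        if len(o) > 2 and isinstance(o[2], dict):
            out.update(o[2])
        return out

    assert merge_extra((None, None, {"attention_mask": [1]}), {}) == {"attention_mask": [1]}
    assert merge_extra((None, None), {}) == {}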
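
Finally, patch 1 hardens the legacy text_projection conversion for checkpoints saved with the CLIPSave node: the key is transposed into text_projection.weight only when it actually holds a tensor. The guard in isolation (assuming torch is installed):

    import torch

    def convert_text_projection(sd):
        # Old checkpoints store the projection under "text_projection";
        # only a real tensor is transposed into "text_projection.weight".
        tp = sd.get("text_projection")
        if isinstance(tp, torch.Tensor):
            sd["text_projection.weight"] = tp.transpose(0, 1)
        return sd

    sd = convert_text_projection({"text_projection": torch.eye(4)})
    assert "text_projection.weight" in sd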