Mirror of https://github.com/comfyanonymous/ComfyUI.git
Make the rendering of Comfy's implementation identical to the Chroma workflow.
parent 271c9c5b9e
commit 9e738b989e
comfy/sd.py (84 changed lines)
@@ -84,9 +84,12 @@ def load_lora_for_models(model, clip, lora, strength_model, strength_clip):


 class CLIP:
-    def __init__(self, target=None, embedding_directory=None, no_init=False, tokenizer_data={}, parameters=0, model_options={}):
+    def __init__(self, target=None, embedding_directory=None, no_init=False, tokenizer_data={}, parameters=0, model_options={}, clip_type_enum=None): # MODIFIED: Added clip_type_enum
         if no_init:
             return
+
+        self.clip_type_enum = clip_type_enum # MODIFIED: Store the original CLIPType
+
         params = target.params.copy()
         clip = target.clip
         tokenizer = target.tokenizer
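The pattern this hunk (together with the clone() hunk just below) introduces is an optional constructor flag that is stored on the instance and survives cloning, so later encode paths can branch on it. A minimal standalone sketch, using a stand-in enum rather than ComfyUI's real comfy.sd.CLIPType:

    from enum import Enum

    class CLIPType(Enum):  # stand-in; the real comfy.sd.CLIPType has many more members
        STABLE_DIFFUSION = 1
        CHROMA = 2

    class MiniCLIP:
        def __init__(self, clip_type_enum=None):
            # Defaulting to None keeps old call sites working unchanged.
            self.clip_type_enum = clip_type_enum

        def clone(self):
            n = MiniCLIP()
            n.clip_type_enum = self.clip_type_enum  # without this, clones would forget the flag
            return n

    c = MiniCLIP(clip_type_enum=CLIPType.CHROMA)
    assert c.clone().clip_type_enum is CLIPType.CHROMA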
@@ -131,6 +134,7 @@ class CLIP:
         n.tokenizer_options = self.tokenizer_options.copy()
         n.use_clip_schedule = self.use_clip_schedule
         n.apply_hooks_to_conds = self.apply_hooks_to_conds
+        n.clip_type_enum = self.clip_type_enum # MODIFIED: Clone the stored CLIPType
         return n

     def add_patches(self, patches, strength_patch=1.0, strength_model=1.0):
@@ -159,11 +163,9 @@ class CLIP:
         all_cond_pooled: list[tuple[torch.Tensor, dict[str]]] = []
         all_hooks = self.patcher.forced_hooks
         if all_hooks is None or not self.use_clip_schedule:
-            # if no hooks or shouldn't use clip schedule, do unscheduled encode_from_tokens and perform add_dict
             return_pooled = "unprojected" if unprojected else True
             pooled_dict = self.encode_from_tokens(tokens, return_pooled=return_pooled, return_dict=True)
             cond = pooled_dict.pop("cond")
-            # add/update any keys with the provided add_dict
             pooled_dict.update(add_dict)
             all_cond_pooled.append([cond, pooled_dict])
         else:
@@ -183,7 +185,6 @@ class CLIP:

             for scheduled_opts in scheduled_keyframes:
                 t_range = scheduled_opts[0]
-                # don't bother encoding any conds outside of start_percent and end_percent bounds
                 if "start_percent" in add_dict:
                     if t_range[1] < add_dict["start_percent"]:
                         continue
@@ -193,18 +194,25 @@ class CLIP:
                 hooks_keyframes = scheduled_opts[1]
                 for hook, keyframe in hooks_keyframes:
                     hook.hook_keyframe._current_keyframe = keyframe
-                # apply appropriate hooks with values that match new hook_keyframe
                 self.patcher.patch_hooks(all_hooks)
-                # perform encoding as normal
                 o = self.cond_stage_model.encode_token_weights(tokens)
                 cond, pooled = o[:2]
+
+                # --- MODIFICATION FOR SCHEDULED PATH (CONSISTENCY) ---
+                # Populate initial pooled_dict including o[2] if present, then filter
                 pooled_dict = {"pooled_output": pooled}
-                # add clip_start_percent and clip_end_percent in pooled
+                if len(o) > 2 and isinstance(o[2], dict): # Check if o[2] is a dict
+                    pooled_dict.update(o[2])
+
+                if hasattr(self, 'clip_type_enum') and self.clip_type_enum == CLIPType.CHROMA:
+                    if 'attention_mask' in pooled_dict:
+                        logging.debug(f"CLIP type {self.clip_type_enum.name} (scheduled path): Removing 'attention_mask' from conditioning output.")
+                        pooled_dict.pop('attention_mask', None)
+                # --- END MODIFICATION FOR SCHEDULED PATH ---
+
                 pooled_dict["clip_start_percent"] = t_range[0]
                 pooled_dict["clip_end_percent"] = t_range[1]
-                # add/update any keys with the provided add_dict
                 pooled_dict.update(add_dict)
-                # add hooks stored on clip
                 self.add_hooks_to_dict(pooled_dict)
                 all_cond_pooled.append([cond, pooled_dict])
                 if show_pbar:
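encode_token_weights returns (cond, pooled) and sometimes a third element of extra outputs; the new guard merges that element into pooled_dict only when it really is a dict. A standalone illustration of the guard, with dummy tuples standing in for real encoder output:

    def merge_extra_outputs(pooled_dict, o):
        # o mimics encode_token_weights output: (cond, pooled) or (cond, pooled, extras)
        if len(o) > 2 and isinstance(o[2], dict):
            pooled_dict.update(o[2])
        return pooled_dict

    print(merge_extra_outputs({"pooled_output": 0}, ("c", "p", {"attention_mask": "m"})))
    # -> {'pooled_output': 0, 'attention_mask': 'm'}
    print(merge_extra_outputs({"pooled_output": 0}, ("c", "p")))
    # -> {'pooled_output': 0}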
@@ -227,10 +235,17 @@ class CLIP:
         cond, pooled = o[:2]
         if return_dict:
             out = {"cond": cond, "pooled_output": pooled}
-            if len(o) > 2:
+            if len(o) > 2 and isinstance(o[2], dict): # Check if o[2] is a dict
                 for k in o[2]:
                     out[k] = o[2][k]
             self.add_hooks_to_dict(out)
+
+            # ---- START MODIFICATION for non-scheduled path ----
+            if hasattr(self, 'clip_type_enum') and self.clip_type_enum == CLIPType.CHROMA:
+                if 'attention_mask' in out:
+                    logging.debug(f"CLIP type {self.clip_type_enum.name} (non-scheduled path): Removing 'attention_mask' from conditioning output.")
+                    out.pop('attention_mask', None)
+            # ---- END MODIFICATION for non-scheduled path ----
             return out

         if return_pooled:
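Both paths now apply the same filter: when the stored type is CHROMA, any attention_mask produced by the text encoder is dropped from the conditioning dict, matching what the original Chroma workflow feeds the model. The effect in isolation (a sketch; strip_attention_mask_for_chroma is an illustrative helper, not a ComfyUI function, and the tensor shapes are dummies):

    import torch

    def strip_attention_mask_for_chroma(out, is_chroma):
        # CHROMA conditioning must not carry attention_mask, so rendering
        # matches the reference Chroma workflow.
        if is_chroma:
            out.pop('attention_mask', None)
        return out

    out = {"cond": torch.zeros(1, 77, 4096),
           "pooled_output": torch.zeros(1, 2048),
           "attention_mask": torch.ones(1, 77, dtype=torch.bool)}
    out = strip_attention_mask_for_chroma(out, is_chroma=True)
    assert 'attention_mask' not in out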
@@ -261,6 +276,7 @@ class CLIP:
     def get_key_patches(self):
         return self.patcher.get_key_patches()

+
 class VAE:
     def __init__(self, sd=None, device=None, config=None, dtype=None, metadata=None):
         if 'decoder.up_blocks.0.resnets.0.norm1.weight' in sd.keys(): #diffusers format
@@ -788,8 +804,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
         if "transformer.resblocks.0.ln_1.weight" in clip_data[i]:
             clip_data[i] = comfy.utils.clip_text_transformers_convert(clip_data[i], "", "")
         else:
-            if "text_projection" in clip_data[i]:
-                clip_data[i]["text_projection.weight"] = clip_data[i]["text_projection"].transpose(0, 1) #old models saved with the CLIPSave node
+            # Ensure "text_projection" exists and is a tensor before trying to transpose
+            if "text_projection" in clip_data[i] and isinstance(clip_data[i]["text_projection"], torch.Tensor):
+                clip_data[i]["text_projection.weight"] = clip_data[i]["text_projection"].transpose(0, 1)

     tokenizer_data = {}
     clip_target = EmptyClass()
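The tightened check avoids calling .transpose() on a non-tensor entry stored under the same key. A quick standalone illustration with a dummy state dict (shapes are arbitrary):

    import torch

    sd = {"text_projection": torch.randn(768, 768)}
    # Old checkpoints saved with the CLIPSave node store the projection un-transposed;
    # only convert when the entry is actually a tensor.
    if "text_projection" in sd and isinstance(sd["text_projection"], torch.Tensor):
        sd["text_projection.weight"] = sd["text_projection"].transpose(0, 1)
    assert sd["text_projection.weight"].shape == (768, 768)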
@@ -813,25 +830,39 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
             clip_target.clip = comfy.text_encoders.sd2_clip.SD2ClipModel
             clip_target.tokenizer = comfy.text_encoders.sd2_clip.SD2Tokenizer
         elif te_model == TEModel.T5_XXL:
+            common_t5_args = t5xxl_detect(clip_data)
             if clip_type == CLIPType.SD3:
-                clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=False, clip_g=False, t5=True, **t5xxl_detect(clip_data))
+                clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=False, clip_g=False, t5=True, **common_t5_args)
                 clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
             elif clip_type == CLIPType.LTXV:
-                clip_target.clip = comfy.text_encoders.lt.ltxv_te(**t5xxl_detect(clip_data))
+                clip_target.clip = comfy.text_encoders.lt.ltxv_te(**common_t5_args)
                 clip_target.tokenizer = comfy.text_encoders.lt.LTXVT5Tokenizer
-            elif clip_type == CLIPType.PIXART or clip_type == CLIPType.CHROMA:
-                clip_target.clip = comfy.text_encoders.pixart_t5.pixart_te(**t5xxl_detect(clip_data))
+            # ---- START MODIFICATION for T5_XXL model selection ----
+            elif clip_type == CLIPType.PIXART: # CHROMA removed from this OR condition
+                # PIXART keeps its specific text encoder.
+                clip_target.clip = comfy.text_encoders.pixart_t5.pixart_te(**common_t5_args)
                 clip_target.tokenizer = comfy.text_encoders.pixart_t5.PixArtTokenizer
+            # ---- END MODIFICATION for T5_XXL model selection ----
             elif clip_type == CLIPType.WAN:
-                clip_target.clip = comfy.text_encoders.wan.te(**t5xxl_detect(clip_data))
+                clip_target.clip = comfy.text_encoders.wan.te(**common_t5_args)
                 clip_target.tokenizer = comfy.text_encoders.wan.WanT5Tokenizer
                 tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
             elif clip_type == CLIPType.HIDREAM:
-                clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**t5xxl_detect(clip_data),
+                clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**common_t5_args,
                     clip_l=False, clip_g=False, t5=True, llama=False, dtype_llama=None, llama_scaled_fp8=None)
                 clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
-            else: #CLIPType.MOCHI
-                clip_target.clip = comfy.text_encoders.genmo.mochi_te(**t5xxl_detect(clip_data))
+            else:
+                # This 'else' now covers:
+                # - MOCHI (T5XXL)
+                # - CHROMA (T5XXL) - because it's not caught by the PIXART elif anymore
+                # - STABLE_DIFFUSION (T5XXL) - if it falls here by default
+                # - Any other unhandled CLIPType with T5XXL
+                # All these will use comfy.text_encoders.genmo.mochi_te
+                if clip_type == CLIPType.CHROMA:
+                    logging.debug(f"TEModel.T5_XXL with CLIPType.CHROMA: Using Mochi-like TE (comfy.text_encoders.genmo.mochi_te) for tensor generation.")
+                else:
+                    logging.debug(f"TEModel.T5_XXL with CLIPType.{clip_type.name if clip_type else 'Unknown'}: Falling back to Mochi-like TE (comfy.text_encoders.genmo.mochi_te).")
+                clip_target.clip = comfy.text_encoders.genmo.mochi_te(**common_t5_args)
                 clip_target.tokenizer = comfy.text_encoders.genmo.MochiT5Tokenizer
         elif te_model == TEModel.T5_XXL_OLD:
             clip_target.clip = comfy.text_encoders.cosmos.te(**t5xxl_detect(clip_data))
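Condensed, the new routing for a single T5-XXL checkpoint: PIXART keeps pixart_te, while CHROMA now falls through with MOCHI (and any other unlisted type) to the genmo mochi_te wrapper. A name-only sketch of the decision, with strings standing in for the CLIPType members and text-encoder classes:

    def t5xxl_te_name(clip_type):
        # Simplified mirror of the elif chain above.
        routes = {
            "SD3": "sd3_clip.sd3_clip",
            "LTXV": "lt.ltxv_te",
            "PIXART": "pixart_t5.pixart_te",  # CHROMA no longer routes here
            "WAN": "wan.te",
            "HIDREAM": "hidream.hidream_clip",
        }
        # CHROMA, MOCHI, and anything else use the Mochi T5 wrapper.
        return routes.get(clip_type, "genmo.mochi_te")

    assert t5xxl_te_name("CHROMA") == "genmo.mochi_te"
    assert t5xxl_te_name("PIXART") == "pixart_t5.pixart_te"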
@@ -851,14 +882,17 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
                     clip_l=False, clip_g=False, t5=False, llama=True, dtype_t5=None, t5xxl_scaled_fp8=None)
                 clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
         else:
-            # clip_l
+            # clip_l default
+            # This branch is taken for TEModel.CLIP_L or if te_model is None/unrecognized.
+            # If clip_type is CHROMA here (e.g. Chroma with a CLIP-L model),
+            # sd1_clip.SD1ClipModel will be used, and its attention_mask will be removed by the CLIP class logic.
             if clip_type == CLIPType.SD3:
                 clip_target.clip = comfy.text_encoders.sd3_clip.sd3_clip(clip_l=True, clip_g=False, t5=False)
                 clip_target.tokenizer = comfy.text_encoders.sd3_clip.SD3Tokenizer
             elif clip_type == CLIPType.HIDREAM:
                 clip_target.clip = comfy.text_encoders.hidream.hidream_clip(clip_l=True, clip_g=False, t5=False, llama=False, dtype_t5=None, dtype_llama=None, t5xxl_scaled_fp8=None, llama_scaled_fp8=None)
                 clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
-            else:
+            else: # Default for CLIP_L like models (includes STABLE_DIFFUSION, and CHROMA if CLIP_L)
                 clip_target.clip = sd1_clip.SD1ClipModel
                 clip_target.tokenizer = sd1_clip.SD1Tokenizer
     elif len(clip_data) == 2:
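The consequence for a Chroma setup that loads a CLIP-L checkpoint instead of T5-XXL: encoder selection is untouched (SD1ClipModel), and the Chroma-specific behavior lives entirely in the encode-time attention_mask filter added earlier. A name-only sketch of this fallback chain, with strings standing in for the real classes:

    def clip_l_te_name(clip_type):
        # Mirrors the CLIP-L branch: SD3 and HIDREAM get dedicated wrappers,
        # everything else (STABLE_DIFFUSION, CHROMA, ...) gets SD1ClipModel.
        if clip_type == "SD3":
            return "sd3_clip.sd3_clip"
        if clip_type == "HIDREAM":
            return "hidream.hidream_clip"
        return "sd1_clip.SD1ClipModel"

    assert clip_l_te_name("CHROMA") == "sd1_clip.SD1ClipModel"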
@@ -876,7 +910,6 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
             clip_target.clip = comfy.text_encoders.hunyuan_video.hunyuan_video_clip(**llama_detect(clip_data))
             clip_target.tokenizer = comfy.text_encoders.hunyuan_video.HunyuanVideoTokenizer
         elif clip_type == CLIPType.HIDREAM:
-            # Detect
             hidream_dualclip_classes = []
             for hidream_te in clip_data:
                 te_model = detect_te_model(hidream_te)
@@ -887,7 +920,6 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
             t5 = TEModel.T5_XXL in hidream_dualclip_classes
             llama = TEModel.LLAMA3_8 in hidream_dualclip_classes

-            # Initialize t5xxl_detect and llama_detect kwargs if needed
             t5_kwargs = t5xxl_detect(clip_data) if t5 else {}
             llama_kwargs = llama_detect(clip_data) if llama else {}

@@ -908,7 +940,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
         parameters += comfy.utils.calculate_parameters(c)
         tokenizer_data, model_options = comfy.text_encoders.long_clipl.model_options_long_clip(c, tokenizer_data, model_options)

-    clip = CLIP(clip_target, embedding_directory=embedding_directory, parameters=parameters, tokenizer_data=tokenizer_data, model_options=model_options)
+    # MODIFIED: Pass the original clip_type (enum) to the CLIP constructor
+    clip = CLIP(clip_target, embedding_directory=embedding_directory, parameters=parameters, tokenizer_data=tokenizer_data, model_options=model_options, clip_type_enum=clip_type)
+
     for c in clip_data:
         m, u = clip.load_sd(c)
         if len(m) > 0:
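End to end, the enum travels from the loader into the CLIP instance, where the encode paths consult it. A hedged usage sketch, assuming ComfyUI with this commit applied; the checkpoint path is a placeholder, and the single-return shape of load_text_encoder_state_dicts is inferred from this diff's context:

    import comfy.sd
    import comfy.utils
    from comfy.sd import CLIPType

    # Placeholder path: any Chroma-compatible T5-XXL text-encoder checkpoint.
    sd = comfy.utils.load_torch_file("models/text_encoders/t5xxl.safetensors")
    clip = comfy.sd.load_text_encoder_state_dicts([sd], clip_type=CLIPType.CHROMA)

    # With this commit the instance remembers its type, and encode_from_tokens
    # will strip attention_mask from the conditioning it produces.
    assert clip.clip_type_enum == CLIPType.CHROMA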