diff --git a/comfy/ldm/ace/ace_step15.py b/comfy/ldm/ace/ace_step15.py
index 69338336d..1d7dc59a8 100644
--- a/comfy/ldm/ace/ace_step15.py
+++ b/comfy/ldm/ace/ace_step15.py
@@ -1110,7 +1110,7 @@ class AceStepConditionGenerationModel(nn.Module):
 
         return encoder_hidden, encoder_mask, context_latents
 
-    def forward(self, x, timestep, context, lyric_embed=None, refer_audio=None, audio_codes=None, is_covers=None, **kwargs):
+    def forward(self, x, timestep, context, lyric_embed=None, refer_audio=None, audio_codes=None, is_covers=None, replace_with_null_embeds=False, **kwargs):
         text_attention_mask = None
         lyric_attention_mask = None
         refer_audio_order_mask = None
@@ -1140,6 +1140,9 @@ class AceStepConditionGenerationModel(nn.Module):
             src_latents, chunk_masks, is_covers, precomputed_lm_hints_25Hz=precomputed_lm_hints_25Hz, audio_codes=audio_codes
         )
 
+        if replace_with_null_embeds:
+            enc_hidden[:] = self.null_condition_emb.to(enc_hidden)
+
         out = self.decoder(hidden_states=x,
                            timestep=timestep,
                            timestep_r=timestep,
diff --git a/comfy/ldm/cosmos/predict2.py b/comfy/ldm/cosmos/predict2.py
index c270e6333..2268bff38 100644
--- a/comfy/ldm/cosmos/predict2.py
+++ b/comfy/ldm/cosmos/predict2.py
@@ -335,7 +335,7 @@ class FinalLayer(nn.Module):
         device=None, dtype=None, operations=None
     ):
         super().__init__()
-        self.layer_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.layer_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
         self.linear = operations.Linear(
             hidden_size, spatial_patch_size * spatial_patch_size * temporal_patch_size * out_channels, bias=False, device=device, dtype=dtype
         )
@@ -463,6 +463,8 @@ class Block(nn.Module):
         extra_per_block_pos_emb: Optional[torch.Tensor] = None,
         transformer_options: Optional[dict] = {},
     ) -> torch.Tensor:
+        residual_dtype = x_B_T_H_W_D.dtype
+        compute_dtype = emb_B_T_D.dtype
         if extra_per_block_pos_emb is not None:
             x_B_T_H_W_D = x_B_T_H_W_D + extra_per_block_pos_emb
 
@@ -512,7 +514,7 @@ class Block(nn.Module):
         result_B_T_H_W_D = rearrange(
             self.self_attn(
                 # normalized_x_B_T_HW_D,
-                rearrange(normalized_x_B_T_H_W_D, "b t h w d -> b (t h w) d"),
+                rearrange(normalized_x_B_T_H_W_D.to(compute_dtype), "b t h w d -> b (t h w) d"),
                 None,
                 rope_emb=rope_emb_L_1_1_D,
                 transformer_options=transformer_options,
@@ -522,7 +524,7 @@ class Block(nn.Module):
             h=H,
             w=W,
         )
-        x_B_T_H_W_D = x_B_T_H_W_D + gate_self_attn_B_T_1_1_D * result_B_T_H_W_D
+        x_B_T_H_W_D = x_B_T_H_W_D + gate_self_attn_B_T_1_1_D.to(residual_dtype) * result_B_T_H_W_D.to(residual_dtype)
 
         def _x_fn(
             _x_B_T_H_W_D: torch.Tensor,
@@ -536,7 +538,7 @@ class Block(nn.Module):
             )
             _result_B_T_H_W_D = rearrange(
                 self.cross_attn(
-                    rearrange(_normalized_x_B_T_H_W_D, "b t h w d -> b (t h w) d"),
+                    rearrange(_normalized_x_B_T_H_W_D.to(compute_dtype), "b t h w d -> b (t h w) d"),
                     crossattn_emb,
                     rope_emb=rope_emb_L_1_1_D,
                     transformer_options=transformer_options,
@@ -555,7 +557,7 @@ class Block(nn.Module):
             shift_cross_attn_B_T_1_1_D,
             transformer_options=transformer_options,
         )
-        x_B_T_H_W_D = result_B_T_H_W_D * gate_cross_attn_B_T_1_1_D + x_B_T_H_W_D
+        x_B_T_H_W_D = result_B_T_H_W_D.to(residual_dtype) * gate_cross_attn_B_T_1_1_D.to(residual_dtype) + x_B_T_H_W_D
 
         normalized_x_B_T_H_W_D = _fn(
             x_B_T_H_W_D,
@@ -563,8 +565,8 @@
             scale_mlp_B_T_1_1_D,
             shift_mlp_B_T_1_1_D,
         )
-        result_B_T_H_W_D = self.mlp(normalized_x_B_T_H_W_D)
-        x_B_T_H_W_D = x_B_T_H_W_D + gate_mlp_B_T_1_1_D * result_B_T_H_W_D
+        result_B_T_H_W_D = self.mlp(normalized_x_B_T_H_W_D.to(compute_dtype))
+        x_B_T_H_W_D = x_B_T_H_W_D + gate_mlp_B_T_1_1_D.to(residual_dtype) * result_B_T_H_W_D.to(residual_dtype)
 
         return x_B_T_H_W_D
 
@@ -876,6 +878,14 @@ class MiniTrainDIT(nn.Module):
             "extra_per_block_pos_emb": extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D,
             "transformer_options": kwargs.get("transformer_options", {}),
         }
+
+        # The residual stream for this model has large values. To make fp16 compute_dtype work, we keep the residual stream
+        # in fp32, but run attention and MLP modules in fp16.
+        # An alternate method that clamps fp16 values "works" in the sense that it makes coherent images, but there is noticeable
+        # quality degradation and visual artifacts.
+        if x_B_T_H_W_D.dtype == torch.float16:
+            x_B_T_H_W_D = x_B_T_H_W_D.float()
+
         for block in self.blocks:
             x_B_T_H_W_D = block(
                 x_B_T_H_W_D,
@@ -884,6 +894,6 @@ class MiniTrainDIT(nn.Module):
             **block_kwargs,
        )
 
-        x_B_T_H_W_O = self.final_layer(x_B_T_H_W_D, t_embedding_B_T_D, adaln_lora_B_T_3D=adaln_lora_B_T_3D)
+        x_B_T_H_W_O = self.final_layer(x_B_T_H_W_D.to(crossattn_emb.dtype), t_embedding_B_T_D, adaln_lora_B_T_3D=adaln_lora_B_T_3D)
         x_B_C_Tt_Hp_Wp = self.unpatchify(x_B_T_H_W_O)[:, :, :orig_shape[-3], :orig_shape[-2], :orig_shape[-1]]
         return x_B_C_Tt_Hp_Wp
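The predict2.py hunks above implement a mixed-precision residual stream: activations accumulate in fp32 while the attention and MLP submodules compute in fp16, with casts at each boundary. A minimal standalone sketch of that pattern follows; MixedPrecisionBlock and all names in it are illustrative stand-ins, not the ComfyUI classes, and fp16 matmul on CPU needs a recent PyTorch:

import torch
import torch.nn as nn

class MixedPrecisionBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        # no affine params, so the norm can run directly on the fp32 stream
        self.norm = nn.LayerNorm(dim, elementwise_affine=False)
        # the heavy matmuls live in fp16
        self.mlp = nn.Sequential(
            nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim)
        ).to(torch.float16)

    def forward(self, x, gate):
        residual_dtype = x.dtype                # fp32 residual stream
        h = self.norm(x).to(torch.float16)      # downcast only for the compute
        out = self.mlp(h)                       # fp16 compute
        # upcast before the gated residual add so accumulation stays in fp32
        return x + gate.to(residual_dtype) * out.to(residual_dtype)

block = MixedPrecisionBlock(64)
x = torch.randn(2, 16, 64)                      # residual stream kept in fp32
y = block(x, gate=torch.ones(64, dtype=torch.float16))
assert y.dtype == torch.float32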
diff --git a/comfy/model_base.py b/comfy/model_base.py
index 3bb54f59e..858789b30 100644
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -1552,6 +1552,8 @@ class ACEStep15(BaseModel):
 
         cross_attn = kwargs.get("cross_attn", None)
         if cross_attn is not None:
+            if torch.count_nonzero(cross_attn) == 0:
+                out['replace_with_null_embeds'] = comfy.conds.CONDConstant(True)
             out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
 
         conditioning_lyrics = kwargs.get("conditioning_lyrics", None)
@@ -1575,6 +1577,10 @@ class ACEStep15(BaseModel):
         else:
             out['is_covers'] = comfy.conds.CONDConstant(False)
 
+        if refer_audio.shape[2] < noise.shape[2]:
+            pad = comfy.ldm.ace.ace_step15.get_silence_latent(noise.shape[2], device)
+            refer_audio = torch.cat([refer_audio.to(pad), pad[:, :, refer_audio.shape[2]:]], dim=2)
+
         out['refer_audio'] = comfy.conds.CONDRegular(refer_audio)
 
         return out
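The second model_base.py hunk pads a reference-audio latent that is shorter than the noise latent along the time axis (dim 2) with a precomputed silence latent. A rough sketch of the idea, with silence_latent standing in for the output of comfy.ldm.ace.ace_step15.get_silence_latent and an assumed shape of [1, C, T_target]:

import torch

def pad_with_silence(refer_audio, silence_latent):
    # refer_audio: [B, C, T_ref]; silence_latent: [1, C, T_target], T_target >= T_ref
    t_ref = refer_audio.shape[2]
    if t_ref < silence_latent.shape[2]:
        # take only the tail of the silence latent and broadcast it over the batch
        tail = silence_latent[:, :, t_ref:].expand(refer_audio.shape[0], -1, -1)
        refer_audio = torch.cat([refer_audio.to(tail), tail], dim=2)
    return refer_audio

padded = pad_with_silence(torch.randn(1, 8, 10), torch.zeros(1, 8, 25))
assert padded.shape[2] == 25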
diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index 77264ed28..d33db7507 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -993,7 +993,7 @@ class CosmosT2IPredict2(supported_models_base.BASE):
 
     memory_usage_factor = 1.0
 
-    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
 
     def __init__(self, unet_config):
         super().__init__(unet_config)
@@ -1023,11 +1023,7 @@ class Anima(supported_models_base.BASE):
 
     memory_usage_factor = 1.0
 
-    supported_inference_dtypes = [torch.bfloat16, torch.float32]
-
-    def __init__(self, unet_config):
-        super().__init__(unet_config)
-        self.memory_usage_factor = (unet_config.get("model_channels", 2048) / 2048) * 0.95
+    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
 
     def get_model(self, state_dict, prefix="", device=None):
         out = model_base.Anima(self, device=device)
@@ -1038,6 +1034,12 @@ class Anima(supported_models_base.BASE):
         detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen3_06b.transformer.".format(pref))
         return supported_models_base.ClipTarget(comfy.text_encoders.anima.AnimaTokenizer, comfy.text_encoders.anima.te(**detect))
 
+    def set_inference_dtype(self, dtype, manual_cast_dtype, **kwargs):
+        self.memory_usage_factor = (self.unet_config.get("model_channels", 2048) / 2048) * 0.95
+        if dtype is torch.float16:
+            self.memory_usage_factor *= 1.4
+        return super().set_inference_dtype(dtype, manual_cast_dtype, **kwargs)
+
 class CosmosI2VPredict2(CosmosT2IPredict2):
     unet_config = {
         "image_model": "cosmos_predict2",
diff --git a/comfy/text_encoders/anima.py b/comfy/text_encoders/anima.py
index b6f58cb25..d8c5a6f92 100644
--- a/comfy/text_encoders/anima.py
+++ b/comfy/text_encoders/anima.py
@@ -23,7 +23,7 @@ class AnimaTokenizer:
     def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
         out = {}
         qwen_ids = self.qwen3_06b.tokenize_with_weights(text, return_word_ids, **kwargs)
-        out["qwen3_06b"] = [[(token, 1.0) for token, _ in inner_list] for inner_list in qwen_ids] # Set weights to 1.0
+        out["qwen3_06b"] = [[(k[0], 1.0, k[2]) if return_word_ids else (k[0], 1.0) for k in inner_list] for inner_list in qwen_ids] # Set weights to 1.0
         out["t5xxl"] = self.t5xxl.tokenize_with_weights(text, return_word_ids, **kwargs)
         return out
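The anima.py change accounts for tokenize_with_weights returning (token, weight) pairs by default but (token, weight, word_id) triples when return_word_ids=True; the previous 2-tuple unpacking failed on the triples. A small illustration of the shape difference, with made-up token ids:

def reset_weights(token_lists, return_word_ids):
    # force every weight to 1.0 while preserving the optional word-id field
    if return_word_ids:
        return [[(k[0], 1.0, k[2]) for k in inner] for inner in token_lists]
    return [[(k[0], 1.0) for k in inner] for inner in token_lists]

assert reset_weights([[(101, 0.8), (102, 1.2)]], False) == [[(101, 1.0), (102, 1.0)]]
assert reset_weights([[(101, 0.8, 0), (102, 1.2, 1)]], True) == [[(101, 1.0, 0), (102, 1.0, 1)]]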
diff --git a/comfy/text_encoders/lt.py b/comfy/text_encoders/lt.py
index 26573fb12..3f87dfd6a 100644
--- a/comfy/text_encoders/lt.py
+++ b/comfy/text_encoders/lt.py
@@ -25,7 +25,7 @@ def ltxv_te(*args, **kwargs):
 class Gemma3_12BTokenizer(sd1_clip.SDTokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
         tokenizer = tokenizer_data.get("spiece_model", None)
-        super().__init__(tokenizer, pad_with_end=False, embedding_size=3840, embedding_key='gemma3_12b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data)
+        super().__init__(tokenizer, pad_with_end=False, embedding_size=3840, embedding_key='gemma3_12b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, disable_weights=True, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data)
 
     def state_dict(self):
         return {"spiece_model": self.tokenizer.serialize_model()}
diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py
index 8afd13acf..61a234634 100644
--- a/comfy_extras/nodes_custom_sampler.py
+++ b/comfy_extras/nodes_custom_sampler.py
@@ -622,6 +622,7 @@ class SamplerSASolver(io.ComfyNode):
     def define_schema(cls):
         return io.Schema(
             node_id="SamplerSASolver",
+            search_aliases=["sde"],
             category="sampling/custom_sampling/samplers",
             inputs=[
                 io.Model.Input("model"),
@@ -666,6 +667,7 @@ class SamplerSEEDS2(io.ComfyNode):
     def define_schema(cls):
         return io.Schema(
             node_id="SamplerSEEDS2",
+            search_aliases=["sde", "exp heun"],
             category="sampling/custom_sampling/samplers",
             inputs=[
                 io.Combo.Input("solver_type", options=["phi_1", "phi_2"]),
diff --git a/comfy_extras/nodes_easycache.py b/comfy_extras/nodes_easycache.py
index 51d1e5b9c..b1912392c 100644
--- a/comfy_extras/nodes_easycache.py
+++ b/comfy_extras/nodes_easycache.py
@@ -108,7 +108,7 @@ def lazycache_predict_noise_wrapper(executor, *args, **kwargs):
     easycache: LazyCacheHolder = model_options["transformer_options"]["easycache"]
     if easycache.is_past_end_timestep(timestep):
         return executor(*args, **kwargs)
-    x: torch.Tensor = _extract_tensor(args[0], easycache.output_channels)
+    x: torch.Tensor = args[0][:, :easycache.output_channels]
     # prepare next x_prev
     next_x_prev = x
     input_change = None
diff --git a/comfy_extras/nodes_latent.py b/comfy_extras/nodes_latent.py
index 6aecf1561..8d2d7297a 100644
--- a/comfy_extras/nodes_latent.py
+++ b/comfy_extras/nodes_latent.py
@@ -391,8 +391,9 @@ class LatentOperationTonemapReinhard(io.ComfyNode):
             latent_vector_magnitude = (torch.linalg.vector_norm(latent, dim=(1)) + 0.0000000001)[:,None]
             normalized_latent = latent / latent_vector_magnitude
 
-            mean = torch.mean(latent_vector_magnitude, dim=(1,2,3), keepdim=True)
-            std = torch.std(latent_vector_magnitude, dim=(1,2,3), keepdim=True)
+            dims = list(range(1, latent_vector_magnitude.ndim))
+            mean = torch.mean(latent_vector_magnitude, dim=dims, keepdim=True)
+            std = torch.std(latent_vector_magnitude, dim=dims, keepdim=True)
 
             top = (std * 5 + mean) * multiplier
diff --git a/requirements.txt b/requirements.txt
index 41cc9174b..5e34a2a49 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 comfyui-frontend-package==1.38.13
 comfyui-workflow-templates==0.8.31
-comfyui-embedded-docs==0.4.0
+comfyui-embedded-docs==0.4.1
 torch
 torchsde
 torchvision
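A note on the nodes_latent.py hunk above: hardcoding dim=(1,2,3) assumes a 4D magnitude tensor, so latents of other ranks either error out or reduce over the wrong axes. Deriving the reduction dims from ndim covers every non-batch axis; a quick sketch of that fix in isolation:

import torch

def magnitude_stats(mag):
    dims = list(range(1, mag.ndim))  # all non-batch dimensions, whatever the rank
    mean = torch.mean(mag, dim=dims, keepdim=True)
    std = torch.std(mag, dim=dims, keepdim=True)
    return mean, std

mean4, std4 = magnitude_stats(torch.rand(2, 1, 8, 8))     # image-style latent
mean5, std5 = magnitude_stats(torch.rand(2, 1, 4, 8, 8))  # video-style latent
assert mean4.shape == (2, 1, 1, 1) and mean5.shape == (2, 1, 1, 1, 1)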