diff --git a/comfy/model_detection.py b/comfy/model_detection.py
index f3355da5a..0131ca25a 100644
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@@ -190,6 +190,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
         # HunyuanVideo 1.5
         if '{}cond_type_embedding.weight'.format(key_prefix) in state_dict_keys:
             dit_config["use_cond_type_embedding"] = True
+        else:
+            dit_config["use_cond_type_embedding"] = False
         if '{}vision_in.proj.0.weight'.format(key_prefix) in state_dict_keys:
             dit_config["vision_in_dim"] = state_dict['{}vision_in.proj.0.weight'.format(key_prefix)].shape[0]
         else:
diff --git a/comfy/sd.py b/comfy/sd.py
index 1bf5bbc0a..6d5822e54 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -453,8 +453,8 @@ class VAE:
                                                             encoder_config={'target': "comfy.ldm.hunyuan_video.vae_refiner.Encoder", 'params': ddconfig},
                                                             decoder_config={'target': "comfy.ldm.hunyuan_video.vae_refiner.Decoder", 'params': ddconfig})
 
-                self.memory_used_encode = lambda shape, dtype: (1400 * shape[-2] * shape[-1]) * model_management.dtype_size(dtype)
-                self.memory_used_decode = lambda shape, dtype: (1400 * shape[-3] * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype)
+                self.memory_used_encode = lambda shape, dtype: (1400 * shape[-3] * shape[-2] * shape[-1]) * model_management.dtype_size(dtype)
+                self.memory_used_decode = lambda shape, dtype: (2800 * 4 * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype)
             elif "decoder.conv_in.conv.weight" in sd:
                 ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
                 ddconfig["conv3d"] = True
diff --git a/comfy_extras/nodes_hunyuan.py b/comfy_extras/nodes_hunyuan.py
index d405cc81a..ad4b89225 100644
--- a/comfy_extras/nodes_hunyuan.py
+++ b/comfy_extras/nodes_hunyuan.py
@@ -123,8 +123,6 @@ class HunyuanVideo15ImageToVideo(io.ComfyNode):
         out_latent["samples"] = latent
         return io.NodeOutput(positive, negative, out_latent)
 
-    encode = execute # TODO: remove
-
 
 class HunyuanVideo15RefinerLatent(io.ComfyNode):
     @classmethod
@@ -213,8 +211,6 @@ class LatentUpscaleModelLoader(io.ComfyNode):
 
         return io.NodeOutput(model)
 
-    load_model = execute # TODO: remove
-
 
 class HunyuanVideo15LatentUpscaleWithModel(io.ComfyNode):
     @classmethod
@@ -254,8 +250,6 @@ class HunyuanVideo15LatentUpscaleWithModel(io.ComfyNode):
         s = model.resample_latent(s)
         return io.NodeOutput({"samples": s.cpu().float()})
 
-    upscale = execute # TODO: remove
-
 
 PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = (
     "<|start_header_id|>system<|end_header_id|>\n\n\nDescribe the video by detailing the following aspects according to the reference image: "