Merge branch 'master' into dr-support-pip-cm

This commit is contained in:
Dr.Lt.Data 2025-10-14 07:36:42 +09:00
commit 2b47f4a38e
5 changed files with 189 additions and 137 deletions

View File

@ -657,51 +657,51 @@ class WanVAE(nn.Module):
) )
def encode(self, x): def encode(self, x):
self.clear_cache() conv_idx = [0]
feat_map = [None] * count_conv3d(self.encoder)
x = patchify(x, patch_size=2) x = patchify(x, patch_size=2)
t = x.shape[2] t = x.shape[2]
iter_ = 1 + (t - 1) // 4 iter_ = 1 + (t - 1) // 4
for i in range(iter_): for i in range(iter_):
self._enc_conv_idx = [0] conv_idx = [0]
if i == 0: if i == 0:
out = self.encoder( out = self.encoder(
x[:, :, :1, :, :], x[:, :, :1, :, :],
feat_cache=self._enc_feat_map, feat_cache=feat_map,
feat_idx=self._enc_conv_idx, feat_idx=conv_idx,
) )
else: else:
out_ = self.encoder( out_ = self.encoder(
x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :], x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :],
feat_cache=self._enc_feat_map, feat_cache=feat_map,
feat_idx=self._enc_conv_idx, feat_idx=conv_idx,
) )
out = torch.cat([out, out_], 2) out = torch.cat([out, out_], 2)
mu, log_var = self.conv1(out).chunk(2, dim=1) mu, log_var = self.conv1(out).chunk(2, dim=1)
self.clear_cache()
return mu return mu
def decode(self, z): def decode(self, z):
self.clear_cache() conv_idx = [0]
feat_map = [None] * count_conv3d(self.decoder)
iter_ = z.shape[2] iter_ = z.shape[2]
x = self.conv2(z) x = self.conv2(z)
for i in range(iter_): for i in range(iter_):
self._conv_idx = [0] conv_idx = [0]
if i == 0: if i == 0:
out = self.decoder( out = self.decoder(
x[:, :, i:i + 1, :, :], x[:, :, i:i + 1, :, :],
feat_cache=self._feat_map, feat_cache=feat_map,
feat_idx=self._conv_idx, feat_idx=conv_idx,
first_chunk=True, first_chunk=True,
) )
else: else:
out_ = self.decoder( out_ = self.decoder(
x[:, :, i:i + 1, :, :], x[:, :, i:i + 1, :, :],
feat_cache=self._feat_map, feat_cache=feat_map,
feat_idx=self._conv_idx, feat_idx=conv_idx,
) )
out = torch.cat([out, out_], 2) out = torch.cat([out, out_], 2)
out = unpatchify(out, patch_size=2) out = unpatchify(out, patch_size=2)
self.clear_cache()
return out return out
def reparameterize(self, mu, log_var): def reparameterize(self, mu, log_var):
@ -715,12 +715,3 @@ class WanVAE(nn.Module):
return mu return mu
std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0)) std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
return mu + std * torch.randn_like(std) return mu + std * torch.randn_like(std)
def clear_cache(self):
self._conv_num = count_conv3d(self.decoder)
self._conv_idx = [0]
self._feat_map = [None] * self._conv_num
# cache encode
self._enc_conv_num = count_conv3d(self.encoder)
self._enc_conv_idx = [0]
self._enc_feat_map = [None] * self._enc_conv_num

View File

@ -138,6 +138,7 @@ class BaseModel(torch.nn.Module):
else: else:
operations = model_config.custom_operations operations = model_config.custom_operations
self.diffusion_model = unet_model(**unet_config, device=device, operations=operations) self.diffusion_model = unet_model(**unet_config, device=device, operations=operations)
self.diffusion_model.eval()
if comfy.model_management.force_channels_last(): if comfy.model_management.force_channels_last():
self.diffusion_model.to(memory_format=torch.channels_last) self.diffusion_model.to(memory_format=torch.channels_last)
logging.debug("using channels last mode for diffusion model") logging.debug("using channels last mode for diffusion model")
@ -669,7 +670,6 @@ class Lotus(BaseModel):
class StableCascade_C(BaseModel): class StableCascade_C(BaseModel):
def __init__(self, model_config, model_type=ModelType.STABLE_CASCADE, device=None): def __init__(self, model_config, model_type=ModelType.STABLE_CASCADE, device=None):
super().__init__(model_config, model_type, device=device, unet_model=StageC) super().__init__(model_config, model_type, device=device, unet_model=StageC)
self.diffusion_model.eval().requires_grad_(False)
def extra_conds(self, **kwargs): def extra_conds(self, **kwargs):
out = {} out = {}
@ -698,7 +698,6 @@ class StableCascade_C(BaseModel):
class StableCascade_B(BaseModel): class StableCascade_B(BaseModel):
def __init__(self, model_config, model_type=ModelType.STABLE_CASCADE, device=None): def __init__(self, model_config, model_type=ModelType.STABLE_CASCADE, device=None):
super().__init__(model_config, model_type, device=device, unet_model=StageB) super().__init__(model_config, model_type, device=device, unet_model=StageB)
self.diffusion_model.eval().requires_grad_(False)
def extra_conds(self, **kwargs): def extra_conds(self, **kwargs):
out = {} out = {}

View File

@ -2,42 +2,60 @@ import nodes
import node_helpers import node_helpers
import torch import torch
import comfy.model_management import comfy.model_management
from typing_extensions import override
from comfy_api.latest import ComfyExtension, io
class CLIPTextEncodeHunyuanDiT: class CLIPTextEncodeHunyuanDiT(io.ComfyNode):
@classmethod @classmethod
def INPUT_TYPES(s): def define_schema(cls):
return {"required": { return io.Schema(
"clip": ("CLIP", ), node_id="CLIPTextEncodeHunyuanDiT",
"bert": ("STRING", {"multiline": True, "dynamicPrompts": True}), category="advanced/conditioning",
"mt5xl": ("STRING", {"multiline": True, "dynamicPrompts": True}), inputs=[
}} io.Clip.Input("clip"),
RETURN_TYPES = ("CONDITIONING",) io.String.Input("bert", multiline=True, dynamic_prompts=True),
FUNCTION = "encode" io.String.Input("mt5xl", multiline=True, dynamic_prompts=True),
],
outputs=[
io.Conditioning.Output(),
],
)
CATEGORY = "advanced/conditioning" @classmethod
def execute(cls, clip, bert, mt5xl) -> io.NodeOutput:
def encode(self, clip, bert, mt5xl):
tokens = clip.tokenize(bert) tokens = clip.tokenize(bert)
tokens["mt5xl"] = clip.tokenize(mt5xl)["mt5xl"] tokens["mt5xl"] = clip.tokenize(mt5xl)["mt5xl"]
return (clip.encode_from_tokens_scheduled(tokens), ) return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens))
class EmptyHunyuanLatentVideo: encode = execute # TODO: remove
class EmptyHunyuanLatentVideo(io.ComfyNode):
@classmethod @classmethod
def INPUT_TYPES(s): def define_schema(cls):
return {"required": { "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), return io.Schema(
"height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), node_id="EmptyHunyuanLatentVideo",
"length": ("INT", {"default": 25, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), category="latent/video",
"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}} inputs=[
RETURN_TYPES = ("LATENT",) io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
FUNCTION = "generate" io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
io.Int.Input("length", default=25, min=1, max=nodes.MAX_RESOLUTION, step=4),
io.Int.Input("batch_size", default=1, min=1, max=4096),
],
outputs=[
io.Latent.Output(),
],
)
CATEGORY = "latent/video" @classmethod
def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput:
def generate(self, width, height, length, batch_size=1):
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
return ({"samples":latent}, ) return io.NodeOutput({"samples":latent})
generate = execute # TODO: remove
PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = ( PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = (
"<|start_header_id|>system<|end_header_id|>\n\n<image>\nDescribe the video by detailing the following aspects according to the reference image: " "<|start_header_id|>system<|end_header_id|>\n\n<image>\nDescribe the video by detailing the following aspects according to the reference image: "
@ -50,45 +68,61 @@ PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = (
"<|start_header_id|>assistant<|end_header_id|>\n\n" "<|start_header_id|>assistant<|end_header_id|>\n\n"
) )
class TextEncodeHunyuanVideo_ImageToVideo: class TextEncodeHunyuanVideo_ImageToVideo(io.ComfyNode):
@classmethod @classmethod
def INPUT_TYPES(s): def define_schema(cls):
return {"required": { return io.Schema(
"clip": ("CLIP", ), node_id="TextEncodeHunyuanVideo_ImageToVideo",
"clip_vision_output": ("CLIP_VISION_OUTPUT", ), category="advanced/conditioning",
"prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}), inputs=[
"image_interleave": ("INT", {"default": 2, "min": 1, "max": 512, "tooltip": "How much the image influences things vs the text prompt. Higher number means more influence from the text prompt."}), io.Clip.Input("clip"),
}} io.ClipVisionOutput.Input("clip_vision_output"),
RETURN_TYPES = ("CONDITIONING",) io.String.Input("prompt", multiline=True, dynamic_prompts=True),
FUNCTION = "encode" io.Int.Input(
"image_interleave",
default=2,
min=1,
max=512,
tooltip="How much the image influences things vs the text prompt. Higher number means more influence from the text prompt.",
),
],
outputs=[
io.Conditioning.Output(),
],
)
CATEGORY = "advanced/conditioning" @classmethod
def execute(cls, clip, clip_vision_output, prompt, image_interleave) -> io.NodeOutput:
def encode(self, clip, clip_vision_output, prompt, image_interleave):
tokens = clip.tokenize(prompt, llama_template=PROMPT_TEMPLATE_ENCODE_VIDEO_I2V, image_embeds=clip_vision_output.mm_projected, image_interleave=image_interleave) tokens = clip.tokenize(prompt, llama_template=PROMPT_TEMPLATE_ENCODE_VIDEO_I2V, image_embeds=clip_vision_output.mm_projected, image_interleave=image_interleave)
return (clip.encode_from_tokens_scheduled(tokens), ) return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens))
class HunyuanImageToVideo: encode = execute # TODO: remove
class HunyuanImageToVideo(io.ComfyNode):
@classmethod @classmethod
def INPUT_TYPES(s): def define_schema(cls):
return {"required": {"positive": ("CONDITIONING", ), return io.Schema(
"vae": ("VAE", ), node_id="HunyuanImageToVideo",
"width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), category="conditioning/video_models",
"height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), inputs=[
"length": ("INT", {"default": 53, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), io.Conditioning.Input("positive"),
"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), io.Vae.Input("vae"),
"guidance_type": (["v1 (concat)", "v2 (replace)", "custom"], ) io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
}, io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
"optional": {"start_image": ("IMAGE", ), io.Int.Input("length", default=53, min=1, max=nodes.MAX_RESOLUTION, step=4),
}} io.Int.Input("batch_size", default=1, min=1, max=4096),
io.Combo.Input("guidance_type", options=["v1 (concat)", "v2 (replace)", "custom"]),
io.Image.Input("start_image", optional=True),
],
outputs=[
io.Conditioning.Output(display_name="positive"),
io.Latent.Output(display_name="latent"),
],
)
RETURN_TYPES = ("CONDITIONING", "LATENT") @classmethod
RETURN_NAMES = ("positive", "latent") def execute(cls, positive, vae, width, height, length, batch_size, guidance_type, start_image=None) -> io.NodeOutput:
FUNCTION = "encode"
CATEGORY = "conditioning/video_models"
def encode(self, positive, vae, width, height, length, batch_size, guidance_type, start_image=None):
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
out_latent = {} out_latent = {}
@ -111,51 +145,76 @@ class HunyuanImageToVideo:
positive = node_helpers.conditioning_set_values(positive, cond) positive = node_helpers.conditioning_set_values(positive, cond)
out_latent["samples"] = latent out_latent["samples"] = latent
return (positive, out_latent) return io.NodeOutput(positive, out_latent)
class EmptyHunyuanImageLatent: encode = execute # TODO: remove
class EmptyHunyuanImageLatent(io.ComfyNode):
@classmethod @classmethod
def INPUT_TYPES(s): def define_schema(cls):
return {"required": { "width": ("INT", {"default": 2048, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}), return io.Schema(
"height": ("INT", {"default": 2048, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}), node_id="EmptyHunyuanImageLatent",
"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}} category="latent",
RETURN_TYPES = ("LATENT",) inputs=[
FUNCTION = "generate" io.Int.Input("width", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32),
io.Int.Input("height", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32),
io.Int.Input("batch_size", default=1, min=1, max=4096),
],
outputs=[
io.Latent.Output(),
],
)
CATEGORY = "latent" @classmethod
def execute(cls, width, height, batch_size=1) -> io.NodeOutput:
def generate(self, width, height, batch_size=1):
latent = torch.zeros([batch_size, 64, height // 32, width // 32], device=comfy.model_management.intermediate_device()) latent = torch.zeros([batch_size, 64, height // 32, width // 32], device=comfy.model_management.intermediate_device())
return ({"samples":latent}, ) return io.NodeOutput({"samples":latent})
class HunyuanRefinerLatent: generate = execute # TODO: remove
class HunyuanRefinerLatent(io.ComfyNode):
@classmethod @classmethod
def INPUT_TYPES(s): def define_schema(cls):
return {"required": {"positive": ("CONDITIONING", ), return io.Schema(
"negative": ("CONDITIONING", ), node_id="HunyuanRefinerLatent",
"latent": ("LATENT", ), inputs=[
"noise_augmentation": ("FLOAT", {"default": 0.10, "min": 0.0, "max": 1.0, "step": 0.01}), io.Conditioning.Input("positive"),
}} io.Conditioning.Input("negative"),
io.Latent.Input("latent"),
io.Float.Input("noise_augmentation", default=0.10, min=0.0, max=1.0, step=0.01),
RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") ],
RETURN_NAMES = ("positive", "negative", "latent") outputs=[
io.Conditioning.Output(display_name="positive"),
io.Conditioning.Output(display_name="negative"),
io.Latent.Output(display_name="latent"),
],
)
FUNCTION = "execute" @classmethod
def execute(cls, positive, negative, latent, noise_augmentation) -> io.NodeOutput:
def execute(self, positive, negative, latent, noise_augmentation):
latent = latent["samples"] latent = latent["samples"]
positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": latent, "noise_augmentation": noise_augmentation}) positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": latent, "noise_augmentation": noise_augmentation})
negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": latent, "noise_augmentation": noise_augmentation}) negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": latent, "noise_augmentation": noise_augmentation})
out_latent = {} out_latent = {}
out_latent["samples"] = torch.zeros([latent.shape[0], 32, latent.shape[-3], latent.shape[-2], latent.shape[-1]], device=comfy.model_management.intermediate_device()) out_latent["samples"] = torch.zeros([latent.shape[0], 32, latent.shape[-3], latent.shape[-2], latent.shape[-1]], device=comfy.model_management.intermediate_device())
return (positive, negative, out_latent) return io.NodeOutput(positive, negative, out_latent)
NODE_CLASS_MAPPINGS = { class HunyuanExtension(ComfyExtension):
"CLIPTextEncodeHunyuanDiT": CLIPTextEncodeHunyuanDiT, @override
"TextEncodeHunyuanVideo_ImageToVideo": TextEncodeHunyuanVideo_ImageToVideo, async def get_node_list(self) -> list[type[io.ComfyNode]]:
"EmptyHunyuanLatentVideo": EmptyHunyuanLatentVideo, return [
"HunyuanImageToVideo": HunyuanImageToVideo, CLIPTextEncodeHunyuanDiT,
"EmptyHunyuanImageLatent": EmptyHunyuanImageLatent, TextEncodeHunyuanVideo_ImageToVideo,
"HunyuanRefinerLatent": HunyuanRefinerLatent, EmptyHunyuanLatentVideo,
} HunyuanImageToVideo,
EmptyHunyuanImageLatent,
HunyuanRefinerLatent,
]
async def comfy_entrypoint() -> HunyuanExtension:
return HunyuanExtension()

View File

@ -25,7 +25,7 @@ class PreviewAny():
value = str(source) value = str(source)
elif source is not None: elif source is not None:
try: try:
value = json.dumps(source) value = json.dumps(source, indent=4)
except Exception: except Exception:
try: try:
value = str(source) value = str(source)

View File

@ -1,25 +1,5 @@
#Rename this to extra_model_paths.yaml and ComfyUI will load it #Rename this to extra_model_paths.yaml and ComfyUI will load it
#config for a1111 ui
#all you have to do is change the base_path to where yours is installed
a111:
base_path: path/to/stable-diffusion-webui/
checkpoints: models/Stable-diffusion
configs: models/Stable-diffusion
vae: models/VAE
loras: |
models/Lora
models/LyCORIS
upscale_models: |
models/ESRGAN
models/RealESRGAN
models/SwinIR
embeddings: embeddings
hypernetworks: models/hypernetworks
controlnet: models/ControlNet
#config for comfyui #config for comfyui
#your base path should be either an existing comfy install or a central folder where you store all of your models, loras, etc. #your base path should be either an existing comfy install or a central folder where you store all of your models, loras, etc.
@ -41,6 +21,29 @@ a111:
# loras: models/loras/ # loras: models/loras/
# upscale_models: models/upscale_models/ # upscale_models: models/upscale_models/
# vae: models/vae/ # vae: models/vae/
# audio_encoders: models/audio_encoders/
# model_patches: models/model_patches/
#config for a1111 ui
#all you have to do is uncomment this (remove the #) and change the base_path to where yours is installed
#a111:
# base_path: path/to/stable-diffusion-webui/
# checkpoints: models/Stable-diffusion
# configs: models/Stable-diffusion
# vae: models/VAE
# loras: |
# models/Lora
# models/LyCORIS
# upscale_models: |
# models/ESRGAN
# models/RealESRGAN
# models/SwinIR
# embeddings: embeddings
# hypernetworks: models/hypernetworks
# controlnet: models/ControlNet
# For a full list of supported keys (style_models, vae_approx, hypernetworks, photomaker, # For a full list of supported keys (style_models, vae_approx, hypernetworks, photomaker,
# model_patches, audio_encoders, classifiers, etc.) see folder_paths.py. # model_patches, audio_encoders, classifiers, etc.) see folder_paths.py.