mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-05-25 16:37:23 +08:00
Compare commits
9 Commits
a7d7851917
...
5cf7f1c846
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5cf7f1c846 | ||
|
|
2806163f6e | ||
|
|
cea8d0925f | ||
|
|
b138133ffa | ||
|
|
025e6792ee | ||
|
|
867b8d2408 | ||
|
|
d0f0b15cf5 | ||
|
|
b5bb83c964 | ||
|
|
6447250bd6 |
@ -31,7 +31,8 @@
|
||||
[github-downloads-latest-shield]: https://img.shields.io/github/downloads/comfyanonymous/ComfyUI/latest/total?style=flat&label=downloads%40latest
|
||||
[github-downloads-link]: https://github.com/comfyanonymous/ComfyUI/releases
|
||||
|
||||
<img width="1590" height="795" alt="ComfyUI Screenshot" src="https://github.com/user-attachments/assets/4aab0bef-b413-4595-9766-a2c134676d27" />
|
||||
<img width="1590" height="795" alt="ComfyUI Screenshot" src="https://github.com/user-attachments/assets/36e065e0-bfae-4456-8c7f-8369d5ea48a2" />
|
||||
<br>
|
||||
</div>
|
||||
|
||||
ComfyUI is the AI creation engine for visual professionals who demand control over every model, every parameter, and every output. Its powerful and modular node graph interface empowers creatives to generate images, videos, 3D models, audio, and more...
|
||||
|
||||
@ -91,6 +91,7 @@ parser.add_argument("--directml", type=int, nargs="?", metavar="DIRECTML_DEVICE"
|
||||
|
||||
parser.add_argument("--oneapi-device-selector", type=str, default=None, metavar="SELECTOR_STRING", help="Sets the oneAPI device(s) this instance will use.")
|
||||
parser.add_argument("--supports-fp8-compute", action="store_true", help="ComfyUI will act like if the device supports fp8 compute.")
|
||||
parser.add_argument("--enable-triton-backend", action="store_true", help="ComfyUI will enable the use of Triton backend in comfy-kitchen. Is disabled at launch by default.")
|
||||
|
||||
class LatentPreviewMethod(enum.Enum):
|
||||
NoPreviews = "none"
|
||||
|
||||
@ -1,6 +1,8 @@
|
||||
import torch
|
||||
import logging
|
||||
|
||||
from comfy.cli_args import args
|
||||
|
||||
try:
|
||||
import comfy_kitchen as ck
|
||||
from comfy_kitchen.tensor import (
|
||||
@ -21,7 +23,15 @@ try:
|
||||
ck.registry.disable("cuda")
|
||||
logging.warning("WARNING: You need pytorch with cu130 or higher to use optimized CUDA operations.")
|
||||
|
||||
ck.registry.disable("triton")
|
||||
if args.enable_triton_backend:
|
||||
try:
|
||||
import triton
|
||||
logging.info("Found triton %s. Enabling comfy-kitchen triton backend.", triton.__version__)
|
||||
except ImportError as e:
|
||||
logging.error(f"Failed to import triton, Error: {e}, the comfy-kitchen triton backend will not be available.")
|
||||
ck.registry.disable("triton")
|
||||
else:
|
||||
ck.registry.disable("triton")
|
||||
for k, v in ck.list_backends().items():
|
||||
logging.info(f"Found comfy_kitchen backend {k}: {v}")
|
||||
except ImportError as e:
|
||||
|
||||
64
comfy/sd.py
64
comfy/sd.py
@ -1320,6 +1320,13 @@ def detect_te_model(sd):
|
||||
return TEModel.QWEN25_3B
|
||||
if weight.shape[0] == 512:
|
||||
return TEModel.QWEN25_7B
|
||||
# Qwen-VL checkpoints can be saved under model.language_model.* (e.g. HY-OmniWeave text encoder).
|
||||
if 'model.language_model.layers.0.self_attn.k_proj.bias' in sd:
|
||||
weight = sd['model.language_model.layers.0.self_attn.k_proj.bias']
|
||||
if weight.shape[0] == 256:
|
||||
return TEModel.QWEN25_3B
|
||||
if weight.shape[0] == 512:
|
||||
return TEModel.QWEN25_7B
|
||||
if "model.language_model.layers.0.linear_attn.A_log" in sd and "model.language_model.layers.0.input_layernorm.weight" in sd:
|
||||
weight = sd['model.language_model.layers.0.input_layernorm.weight']
|
||||
if weight.shape[0] == 1024:
|
||||
@ -1365,7 +1372,11 @@ def t5xxl_detect(clip_data):
|
||||
return {}
|
||||
|
||||
def llama_detect(clip_data):
|
||||
weight_names = ["model.layers.0.self_attn.k_proj.weight", "model.layers.0.linear_attn.in_proj_a.weight"]
|
||||
weight_names = [
|
||||
"model.layers.0.self_attn.k_proj.weight",
|
||||
"model.layers.0.linear_attn.in_proj_a.weight",
|
||||
"model.language_model.layers.0.self_attn.k_proj.weight",
|
||||
]
|
||||
|
||||
for sd in clip_data:
|
||||
for weight_name in weight_names:
|
||||
@ -1476,7 +1487,23 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
||||
clip_target.clip = comfy.text_encoders.omnigen2.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.omnigen2.Omnigen2Tokenizer
|
||||
elif te_model == TEModel.QWEN25_7B:
|
||||
if clip_type == CLIPType.HUNYUAN_IMAGE:
|
||||
# Some Qwen2.5-VL checkpoints (including HY-OmniWeave's text encoder)
|
||||
# are saved with "model.language_model.*" and "model.visual.*" prefixes.
|
||||
# Normalize keys to the layout expected by Comfy text encoder wrappers.
|
||||
for i, sd in enumerate(clip_data):
|
||||
if "model.language_model.layers.0.self_attn.k_proj.weight" in sd:
|
||||
clip_data[i] = comfy.utils.state_dict_prefix_replace(
|
||||
sd,
|
||||
{
|
||||
"model.language_model.": "model.",
|
||||
"model.visual.": "visual.",
|
||||
"final_layer_norm.": "model.norm.",
|
||||
},
|
||||
)
|
||||
if clip_type == CLIPType.HUNYUAN_VIDEO_15:
|
||||
clip_target.clip = comfy.text_encoders.hunyuan_image.te(byt5=False, **llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.hunyuan_video.HunyuanVideo15Tokenizer
|
||||
elif clip_type == CLIPType.HUNYUAN_IMAGE:
|
||||
clip_target.clip = comfy.text_encoders.hunyuan_image.te(byt5=False, **llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.hunyuan_image.HunyuanImageTokenizer
|
||||
elif clip_type == CLIPType.LONGCAT_IMAGE:
|
||||
@ -1814,6 +1841,39 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None, disable
|
||||
if custom_operations is None:
|
||||
sd, metadata = comfy.utils.convert_old_quants(sd, "", metadata=metadata)
|
||||
|
||||
# HY-OmniWeave checkpoints store double-block attention as split q/k/v tensors
|
||||
# while Comfy's HunyuanVideo implementation expects merged qkv tensors.
|
||||
if "double_blocks.0.img_attn_q.weight" in sd and "double_blocks.0.img_attn.qkv.weight" not in sd:
|
||||
converted_qkv = 0
|
||||
block_indices = set()
|
||||
for k in list(sd.keys()):
|
||||
if not k.startswith("double_blocks."):
|
||||
continue
|
||||
parts = k.split(".")
|
||||
if len(parts) < 3:
|
||||
continue
|
||||
if parts[2] == "img_attn_q":
|
||||
try:
|
||||
block_indices.add(int(parts[1]))
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
for idx in sorted(block_indices):
|
||||
for attn_prefix in ("img_attn", "txt_attn"):
|
||||
for end in ("weight", "bias"):
|
||||
q_key = f"double_blocks.{idx}.{attn_prefix}_q.{end}"
|
||||
k_key = f"double_blocks.{idx}.{attn_prefix}_k.{end}"
|
||||
v_key = f"double_blocks.{idx}.{attn_prefix}_v.{end}"
|
||||
qkv_key = f"double_blocks.{idx}.{attn_prefix}.qkv.{end}"
|
||||
if qkv_key in sd:
|
||||
continue
|
||||
if q_key in sd and k_key in sd and v_key in sd:
|
||||
sd[qkv_key] = torch.cat((sd.pop(q_key), sd.pop(k_key), sd.pop(v_key)), dim=0)
|
||||
converted_qkv += 1
|
||||
|
||||
if converted_qkv > 0:
|
||||
logging.info(f"Converted {converted_qkv} split HunyuanVideo attention tensors to qkv format.")
|
||||
|
||||
parameters = comfy.utils.calculate_parameters(sd)
|
||||
weight_dtype = comfy.utils.weight_dtype(sd)
|
||||
|
||||
|
||||
@ -202,14 +202,11 @@ class JoinImageWithAlpha(io.ComfyNode):
|
||||
|
||||
@classmethod
|
||||
def execute(cls, image: torch.Tensor, alpha: torch.Tensor) -> io.NodeOutput:
|
||||
batch_size = min(len(image), len(alpha))
|
||||
out_images = []
|
||||
|
||||
batch_size = max(len(image), len(alpha))
|
||||
alpha = 1.0 - resize_mask(alpha, image.shape[1:])
|
||||
for i in range(batch_size):
|
||||
out_images.append(torch.cat((image[i][:,:,:3], alpha[i].unsqueeze(2)), dim=2))
|
||||
|
||||
return io.NodeOutput(torch.stack(out_images))
|
||||
alpha = comfy.utils.repeat_to_batch_size(alpha, batch_size)
|
||||
image = comfy.utils.repeat_to_batch_size(image, batch_size)
|
||||
return io.NodeOutput(torch.cat((image[..., :3], alpha.unsqueeze(-1)), dim=-1))
|
||||
|
||||
|
||||
class CompositingExtension(ComfyExtension):
|
||||
|
||||
@ -2,6 +2,8 @@ import nodes
|
||||
import node_helpers
|
||||
import torch
|
||||
import comfy.model_management
|
||||
import comfy.utils
|
||||
import comfy.clip_vision
|
||||
from typing_extensions import override
|
||||
from comfy_api.latest import ComfyExtension, io
|
||||
from comfy.ldm.hunyuan_video.upsampler import HunyuanVideo15SRModel
|
||||
@ -301,6 +303,246 @@ class TextEncodeHunyuanVideo_ImageToVideo(io.ComfyNode):
|
||||
encode = execute # TODO: remove
|
||||
|
||||
|
||||
class TextEncodeHunyuanVideo15Omni(io.ComfyNode):
    """Text-encode node for the HunyuanVideo 1.5 Omni tasks.

    Builds a task-specific chat template (with one image placeholder per
    visual input), tokenizes the prompt with that template, and returns the
    scheduled encoding as conditioning.
    """

    @classmethod
    def define_schema(cls):
        """Declare the node id, category, inputs, and the single conditioning output."""
        task_options = ["t2v", "i2v", "interpolation", "reference2v", "editing", "tiv2v"]
        node_inputs = [
            io.Clip.Input("clip"),
            io.String.Input("prompt", multiline=True, dynamic_prompts=True),
            io.Combo.Input("task", options=task_options, default="t2v"),
            io.Boolean.Input("use_visual_inputs", default=True, advanced=True),
            io.Int.Input("max_visual_inputs", default=8, min=1, max=64, advanced=True),
            io.ClipVisionOutput.Input("clip_vision_output", optional=True),
        ]
        node_outputs = [
            io.Conditioning.Output(),
        ]
        return io.Schema(
            node_id="TextEncodeHunyuanVideo15Omni",
            display_name="Text Encode HunyuanVideo 15 Omni",
            category="advanced/conditioning",
            inputs=node_inputs,
            outputs=node_outputs,
        )

    @staticmethod
    def _task_system_prompt(task: str) -> str:
        """Return the system prompt for *task*; unknown tasks use the "t2v" prompt."""
        by_task = {
            "t2v": "Describe a high-quality target video from the user's request with concrete scene details, motion, camera behavior, and style.",
            "i2v": "Describe a target video that should stay consistent with the provided reference image while following the user's request.",
            "interpolation": "Describe a target video that smoothly transitions between the provided keyframe images while following the user's request.",
            "reference2v": "Describe a target video that composes the provided reference subjects into a coherent scene following the user's request.",
            "editing": "Describe an edited output video that follows the user's instruction while preserving relevant source video content.",
            "tiv2v": "Describe an edited output video using both the provided source video and reference image guidance according to the user's instruction.",
        }
        if task not in by_task:
            task = "t2v"
        return by_task[task]

    @classmethod
    def _build_template(cls, task: str, image_count: int) -> str:
        """Assemble the chat template for *task*.

        The user turn starts with ``image_count`` vision placeholders and
        contains a literal ``{}`` slot that is left in the returned string.
        """
        placeholders = "<|vision_start|><|image_pad|><|vision_end|>\n" * image_count
        pieces = [
            "<|im_start|>system\n",
            cls._task_system_prompt(task),
            "<|im_end|>\n",
            "<|im_start|>user\n",
            placeholders,
            # Kept as a plain (non-f) string so the "{}" slot survives verbatim.
            "{}<|im_end|>\n",
            "<|im_start|>assistant\n",
        ]
        return "".join(pieces)

    @staticmethod
    def _extract_image_embeds(clip_vision_output, max_visual_inputs: int):
        """Return a list of per-image embeddings read from ``mm_projected``.

        An absent output or absent/None ``mm_projected`` gives an empty list.
        A 2-D ``mm_projected`` is returned as a single entry; otherwise the
        first ``max_visual_inputs`` entries along dim 0 are returned.
        """
        if clip_vision_output is None:
            return []
        projected = getattr(clip_vision_output, "mm_projected", None)
        if projected is None:
            return []
        if projected.ndim == 2:
            return [projected]
        limit = min(projected.shape[0], max_visual_inputs)
        return [projected[index] for index in range(limit)]

    @classmethod
    def execute(cls, clip, prompt, task, use_visual_inputs, max_visual_inputs, clip_vision_output=None) -> io.NodeOutput:
        """Tokenize *prompt* with the task template and return the encoded conditioning."""
        if use_visual_inputs:
            image_embeds = cls._extract_image_embeds(clip_vision_output, max_visual_inputs)
        else:
            image_embeds = []
        template = cls._build_template(task, len(image_embeds))

        # HunyuanVideo 1.5 tokenizers use `images=...`; HunyuanVideo 1.0 uses `image_embeds=...`.
        try:
            tokens = clip.tokenize(prompt, llama_template=template, images=image_embeds)
        except TypeError:
            # Fallback call shape: a single stacked tensor, or None when there are no embeds.
            stacked = torch.stack(image_embeds, dim=0) if len(image_embeds) > 0 else None
            tokens = clip.tokenize(prompt, llama_template=template, image_embeds=stacked, image_interleave=1)

        conditioning = clip.encode_from_tokens_scheduled(tokens)
        return io.NodeOutput(conditioning)

    encode = execute  # TODO: remove
|
||||
|
||||
|
||||
class HunyuanClipVisionOutputConcat(io.ComfyNode):
    """Merge up to four CLIP vision outputs into a single output.

    Tensor attributes are concatenated along dim 0 and ``image_sizes`` lists
    are chained, both in input order.
    """

    @classmethod
    def define_schema(cls):
        """Declare one required and three optional CLIP vision inputs and one output."""
        return io.Schema(
            node_id="HunyuanClipVisionOutputConcat",
            display_name="Hunyuan CLIP Vision Output Concat",
            category="conditioning/video_models",
            inputs=[
                io.ClipVisionOutput.Input("clip_vision_output_1"),
                io.ClipVisionOutput.Input("clip_vision_output_2", optional=True),
                io.ClipVisionOutput.Input("clip_vision_output_3", optional=True),
                io.ClipVisionOutput.Input("clip_vision_output_4", optional=True),
            ],
            outputs=[
                io.ClipVisionOutput.Output(),
            ],
        )

    @classmethod
    def execute(cls, clip_vision_output_1, clip_vision_output_2=None, clip_vision_output_3=None, clip_vision_output_4=None) -> io.NodeOutput:
        """Concatenate the provided outputs attribute by attribute.

        Inputs that are None are ignored. For each known tensor attribute,
        only values that are actually tensors are concatenated; an attribute
        that no input carries as a tensor is left unset on the result.
        """
        outputs = [o for o in (clip_vision_output_1, clip_vision_output_2, clip_vision_output_3, clip_vision_output_4) if o is not None]
        merged = comfy.clip_vision.Output()
        tensor_attrs = ["last_hidden_state", "image_embeds", "penultimate_hidden_states", "all_hidden_states", "mm_projected"]
        for attr in tensor_attrs:
            # Filter per value rather than inspecting only the first one: a
            # missing/None entry on one input must neither crash torch.cat nor
            # hide the tensors supplied by the other inputs.
            # NOTE(review): if only some inputs carry an attribute, its merged
            # dim-0 size will be smaller than that of the other attributes.
            values = [v for v in (getattr(o, attr, None) for o in outputs) if torch.is_tensor(v)]
            if values:
                setattr(merged, attr, torch.cat(values, dim=0))

        image_sizes = []
        for o in outputs:
            sizes = getattr(o, "image_sizes", None)
            # An input may define image_sizes as None; extend() would raise on it.
            if sizes is not None:
                image_sizes.extend(sizes)
        if image_sizes:
            merged.image_sizes = image_sizes
        return io.NodeOutput(merged)
|
||||
|
||||
|
||||
class HunyuanVideo15OmniConditioning(io.ComfyNode):
    """Build conditioning and an empty latent for the HunyuanVideo 1.5 Omni tasks.

    For every task except "t2v", a conditioning latent and a per-latent-frame
    mask are built from the reference images and/or condition video and are
    attached to both the positive and negative conditioning.
    """

    @classmethod
    def define_schema(cls):
        """Declare the node's inputs and its positive/negative/latent outputs."""
        return io.Schema(
            node_id="HunyuanVideo15OmniConditioning",
            display_name="HunyuanVideo 15 Omni Conditioning",
            category="conditioning/video_models",
            inputs=[
                io.Conditioning.Input("positive"),
                io.Conditioning.Input("negative"),
                io.Vae.Input("vae"),
                io.Combo.Input("task", options=["t2v", "i2v", "interpolation", "reference2v", "editing", "tiv2v"], default="t2v"),
                io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
                io.Int.Input("length", default=81, min=1, max=nodes.MAX_RESOLUTION, step=4),
                io.Int.Input("batch_size", default=1, min=1, max=4096),
                io.Image.Input("reference_images", optional=True, tooltip="For i2v/interpolation/reference2v/tiv2v."),
                io.Image.Input("condition_video", optional=True, tooltip="For editing/tiv2v."),
                io.ClipVisionOutput.Input("clip_vision_output", optional=True),
            ],
            outputs=[
                io.Conditioning.Output(display_name="positive"),
                io.Conditioning.Output(display_name="negative"),
                io.Latent.Output(display_name="latent"),
            ],
        )

    @staticmethod
    def _latent_length(length: int) -> int:
        """Map a pixel-frame count to a latent-frame count: ((length - 1) // 4) + 1."""
        return ((length - 1) // 4) + 1

    @staticmethod
    def _upscale_frames(frames: torch.Tensor, width: int, height: int):
        """Resize channel-last frames to (width, height) with bilinear scaling and center crop."""
        return comfy.utils.common_upscale(frames.movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)

    @classmethod
    def _encode_single_image(cls, vae, image: torch.Tensor, width: int, height: int):
        """Resize the first image of *image* and VAE-encode its first three channels."""
        upscaled = cls._upscale_frames(image[:1], width, height)
        return vae.encode(upscaled[:, :, :, :3])

    @classmethod
    def _encode_video(cls, vae, video: torch.Tensor, width: int, height: int, length: int):
        """Resize the first *length* frames of *video* and VAE-encode their first three channels."""
        upscaled = cls._upscale_frames(video[:length], width, height)
        return vae.encode(upscaled[:, :, :, :3])

    @staticmethod
    def _assign_frame(target: torch.Tensor, source: torch.Tensor, frame_idx: int):
        """Copy the first frame (dim 2) of *source* into *target* at *frame_idx*.

        Out-of-range indices are ignored.
        """
        if frame_idx < 0 or frame_idx >= target.shape[2]:
            return
        target[:, :, frame_idx:frame_idx + 1] = source[:, :, :1]

    @staticmethod
    def _attach_clip_vision(positive, negative, clip_vision_output):
        """Attach *clip_vision_output* to both conditionings when it is provided."""
        if clip_vision_output is None:
            return positive, negative
        values = {"clip_vision_output": clip_vision_output}
        return (
            node_helpers.conditioning_set_values(positive, values),
            node_helpers.conditioning_set_values(negative, values),
        )

    @classmethod
    def execute(cls, positive, negative, vae, task, width, height, length, batch_size, reference_images=None, condition_video=None, clip_vision_output=None) -> io.NodeOutput:
        """Return (positive, negative, latent) for the selected task.

        Raises:
            ValueError: when the selected task is missing a required
                reference image or condition video.
        """
        latent_length = cls._latent_length(length)
        latent = torch.zeros([batch_size, 32, latent_length, height // 16, width // 16], device=comfy.model_management.intermediate_device())

        if task == "t2v":
            # Pure text-to-video: no concat latent or mask is attached.
            positive, negative = cls._attach_clip_vision(positive, negative, clip_vision_output)
            return io.NodeOutput(positive, negative, {"samples": latent})

        # Single-sample conditioning latent; expanded to batch_size further down.
        cond_latent = torch.zeros_like(latent[:1])
        # One entry per latent frame: 1.0 where a conditioning frame was written.
        omni_mask = torch.zeros((latent_length,), device=cond_latent.device, dtype=cond_latent.dtype)

        if task == "i2v":
            if reference_images is None or reference_images.shape[0] < 1:
                raise ValueError("Task i2v requires at least one reference image.")
            encoded = cls._encode_single_image(vae, reference_images, width, height)
            cls._assign_frame(cond_latent, encoded, 0)
            omni_mask[0] = 1.0

        elif task == "interpolation":
            if reference_images is None or reference_images.shape[0] < 2:
                raise ValueError("Task interpolation requires at least two reference images.")
            # First and last reference images become the first and last latent frames.
            encoded_first = cls._encode_single_image(vae, reference_images[:1], width, height)
            encoded_last = cls._encode_single_image(vae, reference_images[-1:], width, height)
            cls._assign_frame(cond_latent, encoded_first, 0)
            cls._assign_frame(cond_latent, encoded_last, latent_length - 1)
            omni_mask[0] = 1.0
            omni_mask[-1] = 1.0

        elif task == "reference2v":
            if reference_images is None or reference_images.shape[0] < 1:
                raise ValueError("Task reference2v requires at least one reference image.")
            # References are written from latent frame 1 onwards, capped by the
            # number of available frames.
            num_refs = min(reference_images.shape[0], max(1, latent_length - 1))
            for idx in range(num_refs):
                encoded = cls._encode_single_image(vae, reference_images[idx:idx + 1], width, height)
                frame_idx = min(idx + 1, latent_length - 1)
                cls._assign_frame(cond_latent, encoded, frame_idx)
                omni_mask[frame_idx] = 1.0

        elif task == "editing":
            if condition_video is None or condition_video.shape[0] < 1:
                raise ValueError("Task editing requires condition_video.")
            encoded = cls._encode_video(vae, condition_video, width, height, length)
            valid_frames = min(latent_length, encoded.shape[2])
            cond_latent[:, :, :valid_frames] = encoded[:, :, :valid_frames]
            omni_mask[:valid_frames] = 1.0

        elif task == "tiv2v":
            if condition_video is None or condition_video.shape[0] < 1:
                raise ValueError("Task tiv2v requires condition_video.")
            if reference_images is None or reference_images.shape[0] < 1:
                raise ValueError("Task tiv2v requires at least one reference image.")
            encoded_video = cls._encode_video(vae, condition_video, width, height, length)
            valid_frames = min(latent_length, encoded_video.shape[2])
            cond_latent[:, :, :valid_frames] = encoded_video[:, :, :valid_frames]
            omni_mask[:valid_frames] = 1.0

            encoded_ref = cls._encode_single_image(vae, reference_images[:1], width, height)
            ref_idx = 1 if latent_length > 1 else 0
            # NOTE(review): the reference latent is summed onto whatever the
            # video wrote at this frame; confirm that a sum (rather than an
            # overwrite) is what the model expects.
            cond_latent[:, :, ref_idx:ref_idx + 1] += encoded_ref[:, :, :1]
            # Assign instead of accumulating: the video span above may already
            # have set this entry to 1.0, and adding again would give 2.0 and
            # therefore a concat_mask of -1.0 below.
            omni_mask[ref_idx] = 1.0

        cond_latent = comfy.utils.resize_to_batch_size(cond_latent, batch_size)
        # BaseModel/HunyuanVideo15 inverts concat_mask (mask = 1 - concat_mask), so pass the pre-inverted mask.
        concat_mask = (1.0 - omni_mask).view(1, 1, latent_length, 1, 1).expand(cond_latent.shape[0], 1, latent_length, cond_latent.shape[-2], cond_latent.shape[-1]).to(cond_latent.dtype)

        positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": cond_latent, "concat_mask": concat_mask})
        negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": cond_latent, "concat_mask": concat_mask})
        positive, negative = cls._attach_clip_vision(positive, negative, clip_vision_output)

        return io.NodeOutput(positive, negative, {"samples": latent})
|
||||
|
||||
|
||||
class HunyuanImageToVideo(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
@ -411,9 +653,12 @@ class HunyuanExtension(ComfyExtension):
|
||||
return [
|
||||
CLIPTextEncodeHunyuanDiT,
|
||||
TextEncodeHunyuanVideo_ImageToVideo,
|
||||
TextEncodeHunyuanVideo15Omni,
|
||||
HunyuanClipVisionOutputConcat,
|
||||
EmptyHunyuanLatentVideo,
|
||||
EmptyHunyuanVideo15Latent,
|
||||
HunyuanVideo15ImageToVideo,
|
||||
HunyuanVideo15OmniConditioning,
|
||||
HunyuanVideo15SuperResolution,
|
||||
HunyuanVideo15LatentUpscaleWithModel,
|
||||
LatentUpscaleModelLoader,
|
||||
|
||||
@ -49,7 +49,7 @@ class Int(io.ComfyNode):
|
||||
display_name="Int",
|
||||
category="utils/primitive",
|
||||
inputs=[
|
||||
io.Int.Input("value", min=-sys.maxsize, max=sys.maxsize, control_after_generate=True),
|
||||
io.Int.Input("value", min=-sys.maxsize, max=sys.maxsize, control_after_generate=io.ControlAfterGenerate.fixed),
|
||||
],
|
||||
outputs=[io.Int.Output()],
|
||||
)
|
||||
|
||||
@ -86,6 +86,6 @@ def image_alpha_fix(destination, source):
|
||||
if destination.shape[-1] < source.shape[-1]:
|
||||
source = source[...,:destination.shape[-1]]
|
||||
elif destination.shape[-1] > source.shape[-1]:
|
||||
destination = torch.nn.functional.pad(destination, (0, 1))
|
||||
destination[..., -1] = 1.0
|
||||
source = torch.nn.functional.pad(source, (0, 1))
|
||||
source[..., -1] = 1.0
|
||||
return destination, source
|
||||
|
||||
70
nodes.py
70
nodes.py
@ -958,7 +958,7 @@ class CLIPLoader:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ),
|
||||
"type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image"], ),
|
||||
"type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "hunyuan_video_15", "flux2", "ovis", "longcat_image"], ),
|
||||
},
|
||||
"optional": {
|
||||
"device": (["default", "cpu"], {"advanced": True}),
|
||||
@ -968,7 +968,7 @@ class CLIPLoader:
|
||||
|
||||
CATEGORY = "advanced/loaders"
|
||||
|
||||
DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\n hidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B"
|
||||
DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\nhidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B\nhunyuan_video_15: qwen2.5-vl (single-file fallback without byT5)"
|
||||
|
||||
def load_clip(self, clip_name, type="stable_diffusion", device="default"):
|
||||
clip_type = getattr(comfy.sd.CLIPType, type.upper(), comfy.sd.CLIPType.STABLE_DIFFUSION)
|
||||
@ -1754,57 +1754,49 @@ class LoadImage:
|
||||
|
||||
return True
|
||||
|
||||
class LoadImageMask:
|
||||
|
||||
class LoadImageMask(LoadImage):
|
||||
ESSENTIALS_CATEGORY = "Image Tools"
|
||||
SEARCH_ALIASES = ["import mask", "alpha mask", "channel mask"]
|
||||
|
||||
_color_channels = ["alpha", "red", "green", "blue"]
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
input_dir = folder_paths.get_input_directory()
|
||||
files = [f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f))]
|
||||
return {"required":
|
||||
{"image": (sorted(files), {"image_upload": True}),
|
||||
"channel": (s._color_channels, ), }
|
||||
}
|
||||
types = super().INPUT_TYPES()
|
||||
return {
|
||||
"required": {
|
||||
**types["required"],
|
||||
"channel": (s._color_channels, )
|
||||
}
|
||||
}
|
||||
|
||||
CATEGORY = "mask"
|
||||
|
||||
RETURN_TYPES = ("MASK",)
|
||||
FUNCTION = "load_image"
|
||||
def load_image(self, image, channel):
|
||||
image_path = folder_paths.get_annotated_filepath(image)
|
||||
i = node_helpers.pillow(Image.open, image_path)
|
||||
i = node_helpers.pillow(ImageOps.exif_transpose, i)
|
||||
if i.getbands() != ("R", "G", "B", "A"):
|
||||
if i.mode == 'I':
|
||||
i = i.point(lambda i: i * (1 / 255))
|
||||
i = i.convert("RGBA")
|
||||
mask = None
|
||||
FUNCTION = "load_image_mask"
|
||||
|
||||
def load_image_mask(self, image, channel):
|
||||
image_tensor, mask_tensor = super().load_image(image)
|
||||
c = channel[0].upper()
|
||||
if c in i.getbands():
|
||||
mask = np.array(i.getchannel(c)).astype(np.float32) / 255.0
|
||||
mask = torch.from_numpy(mask)
|
||||
if c == 'A':
|
||||
mask = 1. - mask
|
||||
|
||||
if c == 'A':
|
||||
return (mask_tensor,)
|
||||
|
||||
channel_idx = {'R': 0, 'G': 1, 'B': 2}.get(c, 0)
|
||||
|
||||
if channel_idx < image_tensor.shape[-1]:
|
||||
return (image_tensor[..., channel_idx].clone(),)
|
||||
else:
|
||||
mask = torch.zeros((64,64), dtype=torch.float32, device="cpu")
|
||||
return (mask.unsqueeze(0),)
|
||||
empty_mask = torch.zeros(
|
||||
image_tensor.shape[:-1],
|
||||
dtype=image_tensor.dtype,
|
||||
device=image_tensor.device
|
||||
)
|
||||
return (empty_mask,)
|
||||
|
||||
@classmethod
|
||||
def IS_CHANGED(s, image, channel):
|
||||
image_path = folder_paths.get_annotated_filepath(image)
|
||||
m = hashlib.sha256()
|
||||
with open(image_path, 'rb') as f:
|
||||
m.update(f.read())
|
||||
return m.digest().hex()
|
||||
|
||||
@classmethod
|
||||
def VALIDATE_INPUTS(s, image):
|
||||
if not folder_paths.exists_annotated_filepath(image):
|
||||
return "Invalid image file: {}".format(image)
|
||||
|
||||
return True
|
||||
return super().IS_CHANGED(image)
|
||||
|
||||
|
||||
class LoadImageOutput(LoadImage):
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
import errno
|
||||
import os
|
||||
import sys
|
||||
import asyncio
|
||||
@ -1245,7 +1246,13 @@ class PromptServer():
|
||||
address = addr[0]
|
||||
port = addr[1]
|
||||
site = web.TCPSite(runner, address, port, ssl_context=ssl_ctx)
|
||||
await site.start()
|
||||
try:
|
||||
await site.start()
|
||||
except OSError as e:
|
||||
if e.errno == errno.EADDRINUSE:
|
||||
logging.error(f"Port {port} is already in use on address {address}. Please close the other application or use a different port with --port.")
|
||||
raise SystemExit(1)
|
||||
raise
|
||||
|
||||
if not hasattr(self, 'address'):
|
||||
self.address = address #TODO: remove this
|
||||
|
||||
Loading…
Reference in New Issue
Block a user