Merge branch 'master' into dr-support-pip-cm

This commit is contained in:
Dr.Lt.Data 2025-08-30 06:12:16 +09:00
commit 1224d58a17
7 changed files with 69 additions and 45 deletions

View File

@ -1110,9 +1110,10 @@ class WAN21(BaseModel):
shape_image[1] = extra_channels shape_image[1] = extra_channels
image = torch.zeros(shape_image, dtype=noise.dtype, layout=noise.layout, device=noise.device) image = torch.zeros(shape_image, dtype=noise.dtype, layout=noise.layout, device=noise.device)
else: else:
latent_dim = self.latent_format.latent_channels
image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center") image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
for i in range(0, image.shape[1], 16): for i in range(0, image.shape[1], latent_dim):
image[:, i: i + 16] = self.process_latent_in(image[:, i: i + 16]) image[:, i: i + latent_dim] = self.process_latent_in(image[:, i: i + latent_dim])
image = utils.resize_to_batch_size(image, noise.shape[0]) image = utils.resize_to_batch_size(image, noise.shape[0])
if extra_channels != image.shape[1] + 4: if extra_channels != image.shape[1] + 4:
@ -1245,18 +1246,14 @@ class WAN22_S2V(WAN21):
out['reference_motion'] = reference_motion.shape out['reference_motion'] = reference_motion.shape
return out return out
class WAN22(BaseModel): class WAN22(WAN21):
def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None): def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None):
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel) super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel)
self.image_to_video = image_to_video self.image_to_video = image_to_video
def extra_conds(self, **kwargs): def extra_conds(self, **kwargs):
out = super().extra_conds(**kwargs) out = super().extra_conds(**kwargs)
cross_attn = kwargs.get("cross_attn", None) denoise_mask = kwargs.get("denoise_mask", None)
if cross_attn is not None:
out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
denoise_mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
if denoise_mask is not None: if denoise_mask is not None:
out["denoise_mask"] = comfy.conds.CONDRegular(denoise_mask) out["denoise_mask"] = comfy.conds.CONDRegular(denoise_mask)
return out return out

View File

@ -8,6 +8,7 @@ import av
import io import io
import json import json
import numpy as np import numpy as np
import math
import torch import torch
from comfy_api.latest._util import VideoContainer, VideoCodec, VideoComponents from comfy_api.latest._util import VideoContainer, VideoCodec, VideoComponents
@ -282,8 +283,6 @@ class VideoFromComponents(VideoInput):
if self.__components.audio: if self.__components.audio:
audio_sample_rate = int(self.__components.audio['sample_rate']) audio_sample_rate = int(self.__components.audio['sample_rate'])
audio_stream = output.add_stream('aac', rate=audio_sample_rate) audio_stream = output.add_stream('aac', rate=audio_sample_rate)
audio_stream.sample_rate = audio_sample_rate
audio_stream.format = 'fltp'
# Encode video # Encode video
for i, frame in enumerate(self.__components.images): for i, frame in enumerate(self.__components.images):
@ -298,27 +297,12 @@ class VideoFromComponents(VideoInput):
output.mux(packet) output.mux(packet)
if audio_stream and self.__components.audio: if audio_stream and self.__components.audio:
# Encode audio waveform = self.__components.audio['waveform']
samples_per_frame = int(audio_sample_rate / frame_rate) waveform = waveform[:, :, :math.ceil((audio_sample_rate / frame_rate) * self.__components.images.shape[0])]
num_frames = self.__components.audio['waveform'].shape[2] // samples_per_frame frame = av.AudioFrame.from_ndarray(waveform.movedim(2, 1).reshape(1, -1).float().numpy(), format='flt', layout='mono' if waveform.shape[1] == 1 else 'stereo')
for i in range(num_frames): frame.sample_rate = audio_sample_rate
start = i * samples_per_frame frame.pts = 0
end = start + samples_per_frame output.mux(audio_stream.encode(frame))
# TODO(Feature) - Add support for stereo audio
chunk = (
self.__components.audio["waveform"][0, 0, start:end]
.unsqueeze(0)
.contiguous()
.numpy()
)
audio_frame = av.AudioFrame.from_ndarray(chunk, format='fltp', layout='mono')
audio_frame.sample_rate = audio_sample_rate
audio_frame.pts = i * samples_per_frame
for packet in audio_stream.encode(audio_frame):
output.mux(packet)
# Flush audio
for packet in audio_stream.encode(None):
output.mux(packet)
# Flush encoder
output.mux(audio_stream.encode(None))

View File

@ -1,6 +1,7 @@
import comfy.utils import comfy.utils
import comfy_extras.nodes_post_processing import comfy_extras.nodes_post_processing
import torch import torch
import nodes
def reshape_latent_to(target_shape, latent, repeat_batch=True): def reshape_latent_to(target_shape, latent, repeat_batch=True):
@ -137,6 +138,41 @@ class LatentConcat:
samples_out["samples"] = torch.cat(c, dim=dim) samples_out["samples"] = torch.cat(c, dim=dim)
return (samples_out,) return (samples_out,)
class LatentCut:
@classmethod
def INPUT_TYPES(s):
return {"required": {"samples": ("LATENT",),
"dim": (["x", "y", "t"], ),
"index": ("INT", {"default": 0, "min": -nodes.MAX_RESOLUTION, "max": nodes.MAX_RESOLUTION, "step": 1}),
"amount": ("INT", {"default": 1, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 1})}}
RETURN_TYPES = ("LATENT",)
FUNCTION = "op"
CATEGORY = "latent/advanced"
def op(self, samples, dim, index, amount):
samples_out = samples.copy()
s1 = samples["samples"]
if "x" in dim:
dim = s1.ndim - 1
elif "y" in dim:
dim = s1.ndim - 2
elif "t" in dim:
dim = s1.ndim - 3
if index >= 0:
index = min(index, s1.shape[dim] - 1)
amount = min(s1.shape[dim] - index, amount)
else:
index = max(index, -s1.shape[dim])
amount = min(-index, amount)
samples_out["samples"] = torch.narrow(s1, dim, index, amount)
return (samples_out,)
class LatentBatch: class LatentBatch:
@classmethod @classmethod
def INPUT_TYPES(s): def INPUT_TYPES(s):
@ -312,6 +348,7 @@ NODE_CLASS_MAPPINGS = {
"LatentMultiply": LatentMultiply, "LatentMultiply": LatentMultiply,
"LatentInterpolate": LatentInterpolate, "LatentInterpolate": LatentInterpolate,
"LatentConcat": LatentConcat, "LatentConcat": LatentConcat,
"LatentCut": LatentCut,
"LatentBatch": LatentBatch, "LatentBatch": LatentBatch,
"LatentBatchSeedBehavior": LatentBatchSeedBehavior, "LatentBatchSeedBehavior": LatentBatchSeedBehavior,
"LatentApplyOperation": LatentApplyOperation, "LatentApplyOperation": LatentApplyOperation,

View File

@ -139,8 +139,13 @@ class Wan22FunControlToVideo(io.ComfyNode):
@classmethod @classmethod
def execute(cls, positive, negative, vae, width, height, length, batch_size, ref_image=None, start_image=None, control_video=None) -> io.NodeOutput: def execute(cls, positive, negative, vae, width, height, length, batch_size, ref_image=None, start_image=None, control_video=None) -> io.NodeOutput:
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) spacial_scale = vae.spacial_compression_encode()
concat_latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) latent_channels = vae.latent_channels
latent = torch.zeros([batch_size, latent_channels, ((length - 1) // 4) + 1, height // spacial_scale, width // spacial_scale], device=comfy.model_management.intermediate_device())
concat_latent = torch.zeros([batch_size, latent_channels, ((length - 1) // 4) + 1, height // spacial_scale, width // spacial_scale], device=comfy.model_management.intermediate_device())
if latent_channels == 48:
concat_latent = comfy.latent_formats.Wan22().process_out(concat_latent)
else:
concat_latent = comfy.latent_formats.Wan21().process_out(concat_latent) concat_latent = comfy.latent_formats.Wan21().process_out(concat_latent)
concat_latent = concat_latent.repeat(1, 2, 1, 1, 1) concat_latent = concat_latent.repeat(1, 2, 1, 1, 1)
mask = torch.ones((1, 1, latent.shape[2] * 4, latent.shape[-2], latent.shape[-1])) mask = torch.ones((1, 1, latent.shape[2] * 4, latent.shape[-2], latent.shape[-1]))
@ -148,7 +153,7 @@ class Wan22FunControlToVideo(io.ComfyNode):
if start_image is not None: if start_image is not None:
start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
concat_latent_image = vae.encode(start_image[:, :, :, :3]) concat_latent_image = vae.encode(start_image[:, :, :, :3])
concat_latent[:,16:,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]] concat_latent[:,latent_channels:,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]]
mask[:, :, :start_image.shape[0] + 3] = 0.0 mask[:, :, :start_image.shape[0] + 3] = 0.0
ref_latent = None ref_latent = None
@ -159,11 +164,11 @@ class Wan22FunControlToVideo(io.ComfyNode):
if control_video is not None: if control_video is not None:
control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) control_video = comfy.utils.common_upscale(control_video[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
concat_latent_image = vae.encode(control_video[:, :, :, :3]) concat_latent_image = vae.encode(control_video[:, :, :, :3])
concat_latent[:,:16,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]] concat_latent[:,:latent_channels,:concat_latent_image.shape[2]] = concat_latent_image[:,:,:concat_latent.shape[2]]
mask = mask.view(1, mask.shape[2] // 4, 4, mask.shape[3], mask.shape[4]).transpose(1, 2) mask = mask.view(1, mask.shape[2] // 4, 4, mask.shape[3], mask.shape[4]).transpose(1, 2)
positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent, "concat_mask": mask, "concat_mask_index": 16}) positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent, "concat_mask": mask, "concat_mask_index": latent_channels})
negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent, "concat_mask": mask, "concat_mask_index": 16}) negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent, "concat_mask": mask, "concat_mask_index": latent_channels})
if ref_latent is not None: if ref_latent is not None:
positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True) positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True)
@ -201,7 +206,8 @@ class WanFirstLastFrameToVideo(io.ComfyNode):
@classmethod @classmethod
def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, end_image=None, clip_vision_start_image=None, clip_vision_end_image=None) -> io.NodeOutput: def execute(cls, positive, negative, vae, width, height, length, batch_size, start_image=None, end_image=None, clip_vision_start_image=None, clip_vision_end_image=None) -> io.NodeOutput:
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) spacial_scale = vae.spacial_compression_encode()
latent = torch.zeros([batch_size, vae.latent_channels, ((length - 1) // 4) + 1, height // spacial_scale, width // spacial_scale], device=comfy.model_management.intermediate_device())
if start_image is not None: if start_image is not None:
start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
if end_image is not None: if end_image is not None:

View File

@ -1,3 +1,3 @@
# This file is automatically generated by the build process when version is # This file is automatically generated by the build process when version is
# updated in pyproject.toml. # updated in pyproject.toml.
__version__ = "0.3.54" __version__ = "0.3.55"

View File

@ -1,6 +1,6 @@
[project] [project]
name = "ComfyUI" name = "ComfyUI"
version = "0.3.54" version = "0.3.55"
readme = "README.md" readme = "README.md"
license = { file = "LICENSE" } license = { file = "LICENSE" }
requires-python = ">=3.9" requires-python = ">=3.9"

View File

@ -1,5 +1,5 @@
comfyui-frontend-package==1.25.11 comfyui-frontend-package==1.25.11
comfyui-workflow-templates==0.1.68 comfyui-workflow-templates==0.1.70
comfyui-embedded-docs==0.2.6 comfyui-embedded-docs==0.2.6
comfyui_manager==4.0.1b2 comfyui_manager==4.0.1b2
torch torch