mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-01-26 14:20:27 +08:00
Merge branch 'comfyanonymous:master' into master
This commit is contained in:
commit
025d9ed896
@ -379,6 +379,7 @@ class LTXVModel(torch.nn.Module):
|
|||||||
positional_embedding_max_pos=[20, 2048, 2048],
|
positional_embedding_max_pos=[20, 2048, 2048],
|
||||||
dtype=None, device=None, operations=None, **kwargs):
|
dtype=None, device=None, operations=None, **kwargs):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
self.generator = None
|
||||||
self.dtype = dtype
|
self.dtype = dtype
|
||||||
self.out_channels = in_channels
|
self.out_channels = in_channels
|
||||||
self.inner_dim = num_attention_heads * attention_head_dim
|
self.inner_dim = num_attention_heads * attention_head_dim
|
||||||
@ -415,7 +416,7 @@ class LTXVModel(torch.nn.Module):
|
|||||||
|
|
||||||
self.patchifier = SymmetricPatchifier(1)
|
self.patchifier = SymmetricPatchifier(1)
|
||||||
|
|
||||||
def forward(self, x, timestep, context, attention_mask, frame_rate=25, guiding_latent=None, transformer_options={}, **kwargs):
|
def forward(self, x, timestep, context, attention_mask, frame_rate=25, guiding_latent=None, guiding_latent_noise_scale=0, transformer_options={}, **kwargs):
|
||||||
patches_replace = transformer_options.get("patches_replace", {})
|
patches_replace = transformer_options.get("patches_replace", {})
|
||||||
|
|
||||||
indices_grid = self.patchifier.get_grid(
|
indices_grid = self.patchifier.get_grid(
|
||||||
@ -431,10 +432,22 @@ class LTXVModel(torch.nn.Module):
|
|||||||
ts = torch.ones([x.shape[0], 1, x.shape[2], x.shape[3], x.shape[4]], device=x.device, dtype=x.dtype)
|
ts = torch.ones([x.shape[0], 1, x.shape[2], x.shape[3], x.shape[4]], device=x.device, dtype=x.dtype)
|
||||||
input_ts = timestep.view([timestep.shape[0]] + [1] * (x.ndim - 1))
|
input_ts = timestep.view([timestep.shape[0]] + [1] * (x.ndim - 1))
|
||||||
ts *= input_ts
|
ts *= input_ts
|
||||||
ts[:, :, 0] = 0.0
|
ts[:, :, 0] = guiding_latent_noise_scale * (input_ts[:, :, 0] ** 2)
|
||||||
timestep = self.patchifier.patchify(ts)
|
timestep = self.patchifier.patchify(ts)
|
||||||
input_x = x.clone()
|
input_x = x.clone()
|
||||||
x[:, :, 0] = guiding_latent[:, :, 0]
|
x[:, :, 0] = guiding_latent[:, :, 0]
|
||||||
|
if guiding_latent_noise_scale > 0:
|
||||||
|
if self.generator is None:
|
||||||
|
self.generator = torch.Generator(device=x.device).manual_seed(42)
|
||||||
|
elif self.generator.device != x.device:
|
||||||
|
self.generator = torch.Generator(device=x.device).set_state(self.generator.get_state())
|
||||||
|
|
||||||
|
noise_shape = [guiding_latent.shape[0], guiding_latent.shape[1], 1, guiding_latent.shape[3], guiding_latent.shape[4]]
|
||||||
|
scale = guiding_latent_noise_scale * (input_ts ** 2)
|
||||||
|
guiding_noise = scale * torch.randn(size=noise_shape, device=x.device, generator=self.generator)
|
||||||
|
|
||||||
|
x[:, :, 0] = guiding_noise[:, :, 0] + x[:, :, 0] * (1.0 - scale[:, :, 0])
|
||||||
|
|
||||||
|
|
||||||
orig_shape = list(x.shape)
|
orig_shape = list(x.shape)
|
||||||
|
|
||||||
|
|||||||
@ -804,5 +804,9 @@ class LTXV(BaseModel):
|
|||||||
if guiding_latent is not None:
|
if guiding_latent is not None:
|
||||||
out['guiding_latent'] = comfy.conds.CONDRegular(guiding_latent)
|
out['guiding_latent'] = comfy.conds.CONDRegular(guiding_latent)
|
||||||
|
|
||||||
|
guiding_latent_noise_scale = kwargs.get("guiding_latent_noise_scale", None)
|
||||||
|
if guiding_latent_noise_scale is not None:
|
||||||
|
out["guiding_latent_noise_scale"] = comfy.conds.CONDConstant(guiding_latent_noise_scale)
|
||||||
|
|
||||||
out['frame_rate'] = comfy.conds.CONDConstant(kwargs.get("frame_rate", 25))
|
out['frame_rate'] = comfy.conds.CONDConstant(kwargs.get("frame_rate", 25))
|
||||||
return out
|
return out
|
||||||
|
|||||||
@ -243,7 +243,7 @@ class ModelSamplingDiscreteFlow(torch.nn.Module):
|
|||||||
return 1.0
|
return 1.0
|
||||||
if percent >= 1.0:
|
if percent >= 1.0:
|
||||||
return 0.0
|
return 0.0
|
||||||
return 1.0 - percent
|
return time_snr_shift(self.shift, 1.0 - percent)
|
||||||
|
|
||||||
class StableCascadeSampling(ModelSamplingDiscrete):
|
class StableCascadeSampling(ModelSamplingDiscrete):
|
||||||
def __init__(self, model_config=None):
|
def __init__(self, model_config=None):
|
||||||
@ -336,4 +336,4 @@ class ModelSamplingFlux(torch.nn.Module):
|
|||||||
return 1.0
|
return 1.0
|
||||||
if percent >= 1.0:
|
if percent >= 1.0:
|
||||||
return 0.0
|
return 0.0
|
||||||
return 1.0 - percent
|
return flux_time_shift(self.shift, 1.0, 1.0 - percent)
|
||||||
|
|||||||
@ -32,7 +32,9 @@ class LTXVImgToVideo:
|
|||||||
"width": ("INT", {"default": 768, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}),
|
"width": ("INT", {"default": 768, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}),
|
||||||
"height": ("INT", {"default": 512, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}),
|
"height": ("INT", {"default": 512, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}),
|
||||||
"length": ("INT", {"default": 97, "min": 9, "max": nodes.MAX_RESOLUTION, "step": 8}),
|
"length": ("INT", {"default": 97, "min": 9, "max": nodes.MAX_RESOLUTION, "step": 8}),
|
||||||
"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}}
|
"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
|
||||||
|
"image_noise_scale": ("FLOAT", {"default": 0.15, "min": 0, "max": 1.0, "step": 0.01, "tooltip": "Amount of noise to apply on conditioning image latent."})
|
||||||
|
}}
|
||||||
|
|
||||||
RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
|
RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
|
||||||
RETURN_NAMES = ("positive", "negative", "latent")
|
RETURN_NAMES = ("positive", "negative", "latent")
|
||||||
@ -40,12 +42,12 @@ class LTXVImgToVideo:
|
|||||||
CATEGORY = "conditioning/video_models"
|
CATEGORY = "conditioning/video_models"
|
||||||
FUNCTION = "generate"
|
FUNCTION = "generate"
|
||||||
|
|
||||||
def generate(self, positive, negative, image, vae, width, height, length, batch_size):
|
def generate(self, positive, negative, image, vae, width, height, length, batch_size, image_noise_scale):
|
||||||
pixels = comfy.utils.common_upscale(image.movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
|
pixels = comfy.utils.common_upscale(image.movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
|
||||||
encode_pixels = pixels[:, :, :, :3]
|
encode_pixels = pixels[:, :, :, :3]
|
||||||
t = vae.encode(encode_pixels)
|
t = vae.encode(encode_pixels)
|
||||||
positive = node_helpers.conditioning_set_values(positive, {"guiding_latent": t})
|
positive = node_helpers.conditioning_set_values(positive, {"guiding_latent": t, "guiding_latent_noise_scale": image_noise_scale})
|
||||||
negative = node_helpers.conditioning_set_values(negative, {"guiding_latent": t})
|
negative = node_helpers.conditioning_set_values(negative, {"guiding_latent": t, "guiding_latent_noise_scale": image_noise_scale})
|
||||||
|
|
||||||
latent = torch.zeros([batch_size, 128, ((length - 1) // 8) + 1, height // 32, width // 32], device=comfy.model_management.intermediate_device())
|
latent = torch.zeros([batch_size, 128, ((length - 1) // 8) + 1, height // 32, width // 32], device=comfy.model_management.intermediate_device())
|
||||||
latent[:, :, :t.shape[2]] = t
|
latent[:, :, :t.shape[2]] = t
|
||||||
@ -109,6 +111,7 @@ class ModelSamplingLTXV:
|
|||||||
model_sampling = ModelSamplingAdvanced(model.model.model_config)
|
model_sampling = ModelSamplingAdvanced(model.model.model_config)
|
||||||
model_sampling.set_parameters(shift=shift)
|
model_sampling.set_parameters(shift=shift)
|
||||||
m.add_object_patch("model_sampling", model_sampling)
|
m.add_object_patch("model_sampling", model_sampling)
|
||||||
|
|
||||||
return (m, )
|
return (m, )
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user