Mirror of https://github.com/comfyanonymous/ComfyUI.git
Merge branch 'master' into curve-node
commit efb912c36c
(File diff suppressed because one or more lines are too long.)

@@ -386,7 +386,7 @@ class Flux(nn.Module):
                 h = max(h, ref.shape[-2] + h_offset)
                 w = max(w, ref.shape[-1] + w_offset)

-                kontext, kontext_ids = self.process_img(ref, index=index, h_offset=h_offset, w_offset=w_offset)
+                kontext, kontext_ids = self.process_img(ref, index=index, h_offset=h_offset, w_offset=w_offset, transformer_options=transformer_options)
                 img = torch.cat([img, kontext], dim=1)
                 img_ids = torch.cat([img_ids, kontext_ids], dim=1)
                 ref_num_tokens.append(kontext.shape[1])

@@ -681,6 +681,33 @@ class LTXAVModel(LTXVModel):
         additional_args["has_spatial_mask"] = has_spatial_mask

         ax, a_latent_coords = self.a_patchifier.patchify(ax)

+        # Inject reference audio for ID-LoRA in-context conditioning
+        ref_audio = kwargs.get("ref_audio", None)
+        ref_audio_seq_len = 0
+        if ref_audio is not None:
+            ref_tokens = ref_audio["tokens"].to(dtype=ax.dtype, device=ax.device)
+            if ref_tokens.shape[0] < ax.shape[0]:
+                ref_tokens = ref_tokens.expand(ax.shape[0], -1, -1)
+            ref_audio_seq_len = ref_tokens.shape[1]
+            B = ax.shape[0]
+
+            # Compute negative temporal positions matching ID-LoRA convention:
+            # offset by -(end_of_last_token + time_per_latent) so reference ends just before t=0
+            p = self.a_patchifier
+            tpl = p.hop_length * p.audio_latent_downsample_factor / p.sample_rate
+            ref_start = p._get_audio_latent_time_in_sec(0, ref_audio_seq_len, torch.float32, ax.device)
+            ref_end = p._get_audio_latent_time_in_sec(1, ref_audio_seq_len + 1, torch.float32, ax.device)
+            time_offset = ref_end[-1].item() + tpl
+            ref_start = (ref_start - time_offset).unsqueeze(0).expand(B, -1).unsqueeze(1)
+            ref_end = (ref_end - time_offset).unsqueeze(0).expand(B, -1).unsqueeze(1)
+            ref_pos = torch.stack([ref_start, ref_end], dim=-1)
+
+            additional_args["ref_audio_seq_len"] = ref_audio_seq_len
+            additional_args["target_audio_seq_len"] = ax.shape[1]
+            ax = torch.cat([ref_tokens, ax], dim=1)
+            a_latent_coords = torch.cat([ref_pos.to(a_latent_coords), a_latent_coords], dim=2)
+
         ax = self.audio_patchify_proj(ax)

         # additional_args.update({"av_orig_shape": list(x.shape)})

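Note on the temporal offset above: assuming the patchifier maps latent index k to k * tpl seconds (which matches the offset arithmetic in the diff), the reference clip lands entirely at negative times, ending one latent-duration before t=0. A standalone sketch with hypothetical constants (the real values live on self.a_patchifier):

    import torch

    hop_length, downsample, sample_rate = 320, 4, 16000  # hypothetical values
    ref_len = 5
    tpl = hop_length * downsample / sample_rate  # time per latent: 0.08 s

    ref_start = torch.arange(0, ref_len, dtype=torch.float32) * tpl
    ref_end = torch.arange(1, ref_len + 1, dtype=torch.float32) * tpl
    time_offset = ref_end[-1].item() + tpl  # end_of_last_token + time_per_latent

    print(ref_start - time_offset)  # tensor([-0.4800, -0.4000, -0.3200, -0.2400, -0.1600])
    print(ref_end - time_offset)    # tensor([-0.4000, -0.3200, -0.2400, -0.1600, -0.0800])
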
@@ -721,6 +748,14 @@ class LTXAVModel(LTXVModel):

         # Prepare audio timestep
         a_timestep = kwargs.get("a_timestep")
+        ref_audio_seq_len = kwargs.get("ref_audio_seq_len", 0)
+        if ref_audio_seq_len > 0 and a_timestep is not None:
+            # Reference tokens must have timestep=0, expand scalar/1D timestep to per-token so ref=0 and target=sigma.
+            target_len = kwargs.get("target_audio_seq_len")
+            if a_timestep.dim() <= 1:
+                a_timestep = a_timestep.view(-1, 1).expand(batch_size, target_len)
+            ref_ts = torch.zeros(batch_size, ref_audio_seq_len, *a_timestep.shape[2:], device=a_timestep.device, dtype=a_timestep.dtype)
+            a_timestep = torch.cat([ref_ts, a_timestep], dim=1)
         if a_timestep is not None:
             a_timestep_scaled = a_timestep * self.timestep_scale_multiplier
             a_timestep_flat = a_timestep_scaled.flatten()

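The expansion above gives reference tokens a timestep of zero while target tokens keep their sigma. An isolated illustration (shapes assume one scalar sigma per batch element, as the dim() <= 1 branch implies):

    import torch

    batch_size, ref_len, target_len = 2, 5, 12
    a_timestep = torch.tensor([0.7, 0.7])  # one sigma per batch element

    if a_timestep.dim() <= 1:
        a_timestep = a_timestep.view(-1, 1).expand(batch_size, target_len)
    ref_ts = torch.zeros(batch_size, ref_len, dtype=a_timestep.dtype)
    a_timestep = torch.cat([ref_ts, a_timestep], dim=1)

    print(a_timestep.shape)  # torch.Size([2, 17])
    print(a_timestep[0])     # first 5 entries are 0.0 (reference), the remaining 12 are 0.7
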
@@ -955,6 +990,13 @@ class LTXAVModel(LTXVModel):
         v_embedded_timestep = embedded_timestep[0]
         a_embedded_timestep = embedded_timestep[1]

+        # Trim reference audio tokens before unpatchification
+        ref_audio_seq_len = kwargs.get("ref_audio_seq_len", 0)
+        if ref_audio_seq_len > 0:
+            ax = ax[:, ref_audio_seq_len:]
+            if a_embedded_timestep.shape[1] > 1:
+                a_embedded_timestep = a_embedded_timestep[:, ref_audio_seq_len:]
+
         # Expand compressed video timestep if needed
         if isinstance(v_embedded_timestep, CompressedTimestep):
             v_embedded_timestep = v_embedded_timestep.expand()

@@ -376,11 +376,16 @@ class Decoder3d(nn.Module):
             return

         layer = self.upsamples[layer_idx]
-        if isinstance(layer, Resample) and layer.mode == 'upsample3d' and x.shape[2] > 1:
-            for frame_idx in range(x.shape[2]):
+        if feat_cache is not None:
+            x = layer(x, feat_cache, feat_idx)
+        else:
+            x = layer(x)
+
+        if isinstance(layer, Resample) and layer.mode == 'upsample3d' and x.shape[2] > 2:
+            for frame_idx in range(0, x.shape[2], 2):
                 self.run_up(
-                    layer_idx,
-                    [x[:, :, frame_idx:frame_idx + 1, :, :]],
+                    layer_idx + 1,
+                    [x[:, :, frame_idx:frame_idx + 2, :, :]],
                     feat_cache,
                     feat_idx.copy(),
                     out_chunks,

@@ -388,11 +393,6 @@ class Decoder3d(nn.Module):
                 del x
             return

-        if feat_cache is not None:
-            x = layer(x, feat_cache, feat_idx)
-        else:
-            x = layer(x)
-
         next_x_ref = [x]
         del x
         self.run_up(layer_idx + 1, next_x_ref, feat_cache, feat_idx, out_chunks)

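Taken together, the two Decoder3d hunks apply the current layer once up front and then recurse on 2-frame chunks with layer_idx + 1; the old code recursed per single frame and applied the layer inside the recursion. A toy sketch of the new chunking pattern:

    import torch

    x = torch.randn(1, 8, 7, 16, 16)  # (B, C, T, H, W)
    chunks = [x[:, :, i:i + 2, :, :] for i in range(0, x.shape[2], 2)]
    print([c.shape[2] for c in chunks])  # [2, 2, 2, 1]; the last chunk may be shorter
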
@@ -937,9 +937,10 @@ class LongCatImage(Flux):
         transformer_options = transformer_options.copy()
         rope_opts = transformer_options.get("rope_options", {})
         rope_opts = dict(rope_opts)
+        pe_len = float(c_crossattn.shape[1]) if c_crossattn is not None else 512.0
         rope_opts.setdefault("shift_t", 1.0)
-        rope_opts.setdefault("shift_y", 512.0)
-        rope_opts.setdefault("shift_x", 512.0)
+        rope_opts.setdefault("shift_y", pe_len)
+        rope_opts.setdefault("shift_x", pe_len)
         transformer_options["rope_options"] = rope_opts
         return super()._apply_model(x, t, c_concat, c_crossattn, control, transformer_options, **kwargs)

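The RoPE shift defaults now track the actual text-conditioning length rather than a hard-coded 512. Because setdefault only fills absent keys, user-supplied rope_options still take precedence; a quick illustration:

    rope_opts = {"shift_y": 256.0}          # user-provided value...
    rope_opts.setdefault("shift_y", 77.0)   # ...is kept; setdefault is a no-op here
    rope_opts.setdefault("shift_x", 77.0)   # only missing keys get the default
    print(rope_opts)                        # {'shift_y': 256.0, 'shift_x': 77.0}
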
@@ -1060,6 +1061,10 @@ class LTXAV(BaseModel):
         if guide_attention_entries is not None:
             out['guide_attention_entries'] = comfy.conds.CONDConstant(guide_attention_entries)

+        ref_audio = kwargs.get("ref_audio", None)
+        if ref_audio is not None:
+            out['ref_audio'] = comfy.conds.CONDConstant(ref_audio)
+
         return out

     def process_timestep(self, timestep, x, denoise_mask=None, audio_denoise_mask=None, **kwargs):

@@ -8,12 +8,12 @@ import comfy.nested_tensor

 def prepare_noise_inner(latent_image, generator, noise_inds=None):
     if noise_inds is None:
-        return torch.randn(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, generator=generator, device="cpu")
+        return torch.randn(latent_image.size(), dtype=torch.float32, layout=latent_image.layout, generator=generator, device="cpu").to(dtype=latent_image.dtype)

     unique_inds, inverse = np.unique(noise_inds, return_inverse=True)
     noises = []
     for i in range(unique_inds[-1]+1):
-        noise = torch.randn([1] + list(latent_image.size())[1:], dtype=latent_image.dtype, layout=latent_image.layout, generator=generator, device="cpu")
+        noise = torch.randn([1] + list(latent_image.size())[1:], dtype=torch.float32, layout=latent_image.layout, generator=generator, device="cpu").to(dtype=latent_image.dtype)
         if i in unique_inds:
             noises.append(noise)
     noises = [noises[i] for i in inverse]

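Drawing the noise in float32 and casting afterwards makes the pattern a function of the seed alone rather than of the latent dtype. A minimal check (standalone helper mirroring the changed call):

    import torch

    def noise_like(latent, seed):
        g = torch.Generator(device="cpu").manual_seed(seed)
        return torch.randn(latent.size(), dtype=torch.float32, layout=latent.layout,
                           generator=g, device="cpu").to(dtype=latent.dtype)

    a = noise_like(torch.zeros(1, 4, 8, 8), 42)                       # fp32 latent
    b = noise_like(torch.zeros(1, 4, 8, 8, dtype=torch.float16), 42)  # fp16 latent
    print(torch.allclose(a, b.float(), atol=1e-2))  # True: same pattern, lower precision
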
@@ -985,8 +985,8 @@ class CFGGuider:
         self.inner_model, self.conds, self.loaded_models = comfy.sampler_helpers.prepare_sampling(self.model_patcher, noise.shape, self.conds, self.model_options)
         device = self.model_patcher.load_device

-        noise = noise.to(device)
-        latent_image = latent_image.to(device)
+        noise = noise.to(device=device, dtype=torch.float32)
+        latent_image = latent_image.to(device=device, dtype=torch.float32)
         sigmas = sigmas.to(device)
         cast_to_load_options(self.model_options, device=device, dtype=self.model_patcher.model_dtype())

@@ -1028,6 +1028,7 @@ class CFGGuider:
             denoise_mask, _ = comfy.utils.pack_latents(denoise_masks)
         else:
             denoise_mask = denoise_masks[0]
+        denoise_mask = denoise_mask.float()

         self.conds = {}
         for k in self.original_conds:

@@ -1028,12 +1028,19 @@ class Qwen25_7BVLI(BaseLlama, BaseGenerate, torch.nn.Module):
                 grid = e.get("extra", None)
                 start = e.get("index")
                 if position_ids is None:
-                    position_ids = torch.zeros((3, embeds.shape[1]), device=embeds.device)
+                    position_ids = torch.ones((3, embeds.shape[1]), device=embeds.device, dtype=torch.long)
                     position_ids[:, :start] = torch.arange(0, start, device=embeds.device)
                 end = e.get("size") + start
                 len_max = int(grid.max()) // 2
                 start_next = len_max + start
-                position_ids[:, end:] = torch.arange(start_next + offset, start_next + (embeds.shape[1] - end) + offset, device=embeds.device)
+                if attention_mask is not None:
+                    # Assign compact sequential positions to attended tokens only,
+                    # skipping over padding so post-padding tokens aren't inflated.
+                    after_mask = attention_mask[0, end:]
+                    text_positions = after_mask.cumsum(0) - 1 + start_next + offset
+                    position_ids[:, end:] = torch.where(after_mask.bool(), text_positions, position_ids[0, end:])
+                else:
+                    position_ids[:, end:] = torch.arange(start_next + offset, start_next + (embeds.shape[1] - end) + offset, device=embeds.device)
                 position_ids[0, start:end] = start + offset
                 max_d = int(grid[0][1]) // 2
                 position_ids[1, start:end] = torch.arange(start + offset, start + max_d + offset, device=embeds.device).unsqueeze(1).repeat(1, math.ceil((end - start) / max_d)).flatten(0)[:end - start]

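The cumsum trick assigns consecutive positions to attended tokens only, so padding no longer inflates the positions of tokens that follow it; padding slots keep the tensor's initializer value, which is why the initializer changed from zeros to ones. Toy illustration (hypothetical mask, with start_next + offset folded into one base):

    import torch

    after_mask = torch.tensor([1, 1, 0, 0, 1, 1, 1])  # 1 = real token, 0 = padding
    base = 100

    text_positions = after_mask.cumsum(0) - 1 + base
    filler = torch.ones(after_mask.shape[0], dtype=torch.long)  # stands in for position_ids[0, end:]
    out = torch.where(after_mask.bool(), text_positions, filler)
    print(out)  # tensor([100, 101,   1,   1, 102, 103, 104])
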
@@ -64,7 +64,13 @@ class LongCatImageBaseTokenizer(Qwen25_7BVLITokenizer):
         return [output]


+IMAGE_PAD_TOKEN_ID = 151655
+
 class LongCatImageTokenizer(sd1_clip.SD1Tokenizer):
+    T2I_PREFIX = "<|im_start|>system\nAs an image captioning expert, generate a descriptive text prompt based on an image content, suitable for input to a text-to-image model.<|im_end|>\n<|im_start|>user\n"
+    EDIT_PREFIX = "<|im_start|>system\nAs an image editing expert, first analyze the content and attributes of the input image(s). Then, based on the user's editing instructions, clearly and precisely determine how to modify the given image(s), ensuring that only the specified parts are altered and all other aspects remain consistent with the original(s).<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
+    SUFFIX = "<|im_end|>\n<|im_start|>assistant\n"
+
     def __init__(self, embedding_directory=None, tokenizer_data={}):
         super().__init__(
             embedding_directory=embedding_directory,

@@ -72,10 +78,8 @@ class LongCatImageTokenizer(sd1_clip.SD1Tokenizer):
             name="qwen25_7b",
             tokenizer=LongCatImageBaseTokenizer,
         )
-        self.longcat_template_prefix = "<|im_start|>system\nAs an image captioning expert, generate a descriptive text prompt based on an image content, suitable for input to a text-to-image model.<|im_end|>\n<|im_start|>user\n"
-        self.longcat_template_suffix = "<|im_end|>\n<|im_start|>assistant\n"

-    def tokenize_with_weights(self, text, return_word_ids=False, **kwargs):
+    def tokenize_with_weights(self, text, return_word_ids=False, images=None, **kwargs):
         skip_template = False
         if text.startswith("<|im_start|>"):
             skip_template = True

@@ -90,11 +94,14 @@ class LongCatImageTokenizer(sd1_clip.SD1Tokenizer):
                 text, return_word_ids=return_word_ids, disable_weights=True, **kwargs
             )
         else:
+            has_images = images is not None and len(images) > 0
+            template_prefix = self.EDIT_PREFIX if has_images else self.T2I_PREFIX
+
             prefix_ids = base_tok.tokenizer(
-                self.longcat_template_prefix, add_special_tokens=False
+                template_prefix, add_special_tokens=False
             )["input_ids"]
             suffix_ids = base_tok.tokenizer(
-                self.longcat_template_suffix, add_special_tokens=False
+                self.SUFFIX, add_special_tokens=False
             )["input_ids"]

             prompt_tokens = base_tok.tokenize_with_weights(

@@ -106,6 +113,14 @@ class LongCatImageTokenizer(sd1_clip.SD1Tokenizer):
             suffix_pairs = [(t, 1.0) for t in suffix_ids]

             combined = prefix_pairs + prompt_pairs + suffix_pairs
+
+            if has_images:
+                embed_count = 0
+                for i in range(len(combined)):
+                    if combined[i][0] == IMAGE_PAD_TOKEN_ID and embed_count < len(images):
+                        combined[i] = ({"type": "image", "data": images[embed_count], "original_type": "image"}, combined[i][1])
+                        embed_count += 1
+
         tokens = {"qwen25_7b": [combined]}

         return tokens

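What the pad-token substitution does, on dummy data (stand-in payloads; the real IMAGE_PAD_TOKEN_ID is 151655 as defined above):

    IMAGE_PAD_TOKEN_ID = 151655
    images = ["img-0", "img-1"]  # stand-ins for image tensors

    combined = [(10, 1.0), (IMAGE_PAD_TOKEN_ID, 1.0), (11, 1.0), (IMAGE_PAD_TOKEN_ID, 1.0)]
    embed_count = 0
    for i in range(len(combined)):
        if combined[i][0] == IMAGE_PAD_TOKEN_ID and embed_count < len(images):
            combined[i] = ({"type": "image", "data": images[embed_count], "original_type": "image"}, combined[i][1])
            embed_count += 1

    print([t for t, _ in combined])
    # [10, {'type': 'image', 'data': 'img-0', ...}, 11, {'type': 'image', 'data': 'img-1', ...}]
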
@@ -425,4 +425,7 @@ class Qwen2VLVisionTransformer(nn.Module):
             hidden_states = block(hidden_states, position_embeddings, cu_seqlens_now, optimized_attention=optimized_attention)

         hidden_states = self.merger(hidden_states)
+        # Potentially important for spatially precise edits. This is present in the HF implementation.
+        reverse_indices = torch.argsort(window_index)
+        hidden_states = hidden_states[reverse_indices, :]
         return hidden_states

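torch.argsort of a permutation yields its inverse, so indexing with it restores the pre-windowing token order:

    import torch

    window_index = torch.tensor([2, 0, 3, 1])      # permutation applied earlier
    tokens = torch.arange(4)[window_index]         # reordered: tensor([2, 0, 3, 1])
    reverse_indices = torch.argsort(window_index)  # inverse permutation: tensor([1, 3, 0, 2])
    print(tokens[reverse_indices])                 # tensor([0, 1, 2, 3]), original order
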
@@ -29,13 +29,21 @@ class ImageEditRequest(BaseModel):
 class VideoGenerationRequest(BaseModel):
     model: str = Field(...)
     prompt: str = Field(...)
-    image: InputUrlObject | None = Field(...)
+    image: InputUrlObject | None = Field(None)
+    reference_images: list[InputUrlObject] | None = Field(None)
     duration: int = Field(...)
     aspect_ratio: str | None = Field(...)
     resolution: str = Field(...)
     seed: int = Field(...)


+class VideoExtensionRequest(BaseModel):
+    prompt: str = Field(...)
+    video: InputUrlObject = Field(...)
+    duration: int = Field(default=6)
+    model: str | None = Field(default=None)
+
+
 class VideoEditRequest(BaseModel):
     model: str = Field(...)
     prompt: str = Field(...)

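The Field(...) to Field(None) change is what actually makes image omittable: in pydantic, Field(...) (Ellipsis) marks a field required even when its annotation allows None. A hedged sketch, assuming pydantic v2-style usage as in the surrounding models:

    from pydantic import BaseModel, Field

    class Demo(BaseModel):
        image: str | None = Field(None)  # optional: may be omitted
        prompt: str = Field(...)         # required despite the explicit Field()

    Demo(prompt="hello")  # ok, image defaults to None
    # Demo()              # would raise ValidationError: prompt is required
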
@@ -8,6 +8,7 @@ from comfy_api_nodes.apis.grok import (
     ImageGenerationResponse,
     InputUrlObject,
     VideoEditRequest,
+    VideoExtensionRequest,
     VideoGenerationRequest,
     VideoGenerationResponse,
     VideoStatusResponse,

@@ -21,6 +22,7 @@ from comfy_api_nodes.util import (
     poll_op,
     sync_op,
     tensor_to_base64_string,
+    upload_images_to_comfyapi,
     upload_video_to_comfyapi,
     validate_string,
     validate_video_duration,

@@ -33,6 +35,13 @@ def _extract_grok_price(response) -> float | None:
     return None


+def _extract_grok_video_price(response) -> float | None:
+    price = _extract_grok_price(response)
+    if price is not None:
+        return price * 1.43
+    return None
+
+
 class GrokImageNode(IO.ComfyNode):

     @classmethod

@@ -354,6 +363,8 @@ class GrokVideoNode(IO.ComfyNode):
         seed: int,
         image: Input.Image | None = None,
     ) -> IO.NodeOutput:
+        if model == "grok-imagine-video-beta":
+            model = "grok-imagine-video"
         image_url = None
         if image is not None:
             if get_number_of_images(image) != 1:

@@ -462,6 +473,244 @@ class GrokVideoEditNode(IO.ComfyNode):
         return IO.NodeOutput(await download_url_to_video_output(response.video.url))


+class GrokVideoReferenceNode(IO.ComfyNode):
+
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="GrokVideoReferenceNode",
+            display_name="Grok Reference-to-Video",
+            category="api node/video/Grok",
+            description="Generate video guided by reference images as style and content references.",
+            inputs=[
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    tooltip="Text description of the desired video.",
+                ),
+                IO.DynamicCombo.Input(
+                    "model",
+                    options=[
+                        IO.DynamicCombo.Option(
+                            "grok-imagine-video",
+                            [
+                                IO.Autogrow.Input(
+                                    "reference_images",
+                                    template=IO.Autogrow.TemplatePrefix(
+                                        IO.Image.Input("image"),
+                                        prefix="reference_",
+                                        min=1,
+                                        max=7,
+                                    ),
+                                    tooltip="Up to 7 reference images to guide the video generation.",
+                                ),
+                                IO.Combo.Input(
+                                    "resolution",
+                                    options=["480p", "720p"],
+                                    tooltip="The resolution of the output video.",
+                                ),
+                                IO.Combo.Input(
+                                    "aspect_ratio",
+                                    options=["16:9", "4:3", "3:2", "1:1", "2:3", "3:4", "9:16"],
+                                    tooltip="The aspect ratio of the output video.",
+                                ),
+                                IO.Int.Input(
+                                    "duration",
+                                    default=6,
+                                    min=2,
+                                    max=10,
+                                    step=1,
+                                    tooltip="The duration of the output video in seconds.",
+                                    display_mode=IO.NumberDisplay.slider,
+                                ),
+                            ],
+                        ),
+                    ],
+                    tooltip="The model to use for video generation.",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    step=1,
+                    display_mode=IO.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="Seed to determine if node should re-run; "
+                    "actual results are nondeterministic regardless of seed.",
+                ),
+            ],
+            outputs=[
+                IO.Video.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(
+                    widgets=["model.duration", "model.resolution"],
+                    input_groups=["model.reference_images"],
+                ),
+                expr="""
+                (
+                    $res := $lookup(widgets, "model.resolution");
+                    $dur := $lookup(widgets, "model.duration");
+                    $refs := inputGroups["model.reference_images"];
+                    $rate := $res = "720p" ? 0.07 : 0.05;
+                    $price := ($rate * $dur + 0.002 * $refs) * 1.43;
+                    {"type":"usd","usd": $price}
+                )
+                """,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        prompt: str,
+        model: dict,
+        seed: int,
+    ) -> IO.NodeOutput:
+        validate_string(prompt, strip_whitespace=True, min_length=1)
+        ref_image_urls = await upload_images_to_comfyapi(
+            cls,
+            list(model["reference_images"].values()),
+            mime_type="image/png",
+            wait_label="Uploading base images",
+            max_images=7,
+        )
+        initial_response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/xai/v1/videos/generations", method="POST"),
+            data=VideoGenerationRequest(
+                model=model["model"],
+                reference_images=[InputUrlObject(url=i) for i in ref_image_urls],
+                prompt=prompt,
+                resolution=model["resolution"],
+                duration=model["duration"],
+                aspect_ratio=model["aspect_ratio"],
+                seed=seed,
+            ),
+            response_model=VideoGenerationResponse,
+        )
+        response = await poll_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/xai/v1/videos/{initial_response.request_id}"),
+            status_extractor=lambda r: r.status if r.status is not None else "complete",
+            response_model=VideoStatusResponse,
+            price_extractor=_extract_grok_video_price,
+        )
+        return IO.NodeOutput(await download_url_to_video_output(response.video.url))
+
+
+class GrokVideoExtendNode(IO.ComfyNode):
+
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="GrokVideoExtendNode",
+            display_name="Grok Video Extend",
+            category="api node/video/Grok",
+            description="Extend an existing video with a seamless continuation based on a text prompt.",
+            inputs=[
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    tooltip="Text description of what should happen next in the video.",
+                ),
+                IO.Video.Input("video", tooltip="Source video to extend. MP4 format, 2-15 seconds."),
+                IO.DynamicCombo.Input(
+                    "model",
+                    options=[
+                        IO.DynamicCombo.Option(
+                            "grok-imagine-video",
+                            [
+                                IO.Int.Input(
+                                    "duration",
+                                    default=8,
+                                    min=2,
+                                    max=10,
+                                    step=1,
+                                    tooltip="Length of the extension in seconds.",
+                                    display_mode=IO.NumberDisplay.slider,
+                                ),
+                            ],
+                        ),
+                    ],
+                    tooltip="The model to use for video extension.",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    step=1,
+                    display_mode=IO.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="Seed to determine if node should re-run; "
+                    "actual results are nondeterministic regardless of seed.",
+                ),
+            ],
+            outputs=[
+                IO.Video.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(widgets=["model.duration"]),
+                expr="""
+                (
+                    $dur := $lookup(widgets, "model.duration");
+                    {
+                        "type": "range_usd",
+                        "min_usd": (0.02 + 0.05 * $dur) * 1.43,
+                        "max_usd": (0.15 + 0.05 * $dur) * 1.43
+                    }
+                )
+                """,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        prompt: str,
+        video: Input.Video,
+        model: dict,
+        seed: int,
+    ) -> IO.NodeOutput:
+        validate_string(prompt, strip_whitespace=True, min_length=1)
+        validate_video_duration(video, min_duration=2, max_duration=15)
+        video_size = get_fs_object_size(video.get_stream_source())
+        if video_size > 50 * 1024 * 1024:
+            raise ValueError(f"Video size ({video_size / 1024 / 1024:.1f}MB) exceeds 50MB limit.")
+        initial_response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/xai/v1/videos/extensions", method="POST"),
+            data=VideoExtensionRequest(
+                prompt=prompt,
+                video=InputUrlObject(url=await upload_video_to_comfyapi(cls, video)),
+                duration=model["duration"],
+            ),
+            response_model=VideoGenerationResponse,
+        )
+        response = await poll_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/xai/v1/videos/{initial_response.request_id}"),
+            status_extractor=lambda r: r.status if r.status is not None else "complete",
+            response_model=VideoStatusResponse,
+            price_extractor=_extract_grok_video_price,
+        )
+        return IO.NodeOutput(await download_url_to_video_output(response.video.url))
+
+
 class GrokExtension(ComfyExtension):
     @override
     async def get_node_list(self) -> list[type[IO.ComfyNode]]:

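Working the reference-to-video price expression by hand (an illustration of the JSONata-style expr above, not a statement about actual billing), at 720p, 6 s, with 3 reference images:

    rate = 0.07             # 720p rate; 480p would be 0.05
    dur, refs = 6, 3
    price = (rate * dur + 0.002 * refs) * 1.43
    print(round(price, 4))  # 0.6092
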
@@ -469,7 +718,9 @@ class GrokExtension(ComfyExtension):
             GrokImageNode,
             GrokImageEditNode,
             GrokVideoNode,
+            GrokVideoReferenceNode,
             GrokVideoEditNode,
+            GrokVideoExtendNode,
         ]

@@ -3,6 +3,7 @@ import node_helpers
 import torch
 import comfy.model_management
 import comfy.model_sampling
+import comfy.samplers
 import comfy.utils
 import math
 import numpy as np

@@ -682,6 +683,84 @@ class LTXVSeparateAVLatent(io.ComfyNode):
         return io.NodeOutput(video_latent, audio_latent)


+class LTXVReferenceAudio(io.ComfyNode):
+    @classmethod
+    def define_schema(cls) -> io.Schema:
+        return io.Schema(
+            node_id="LTXVReferenceAudio",
+            display_name="LTXV Reference Audio (ID-LoRA)",
+            category="conditioning/audio",
+            description="Set reference audio for ID-LoRA speaker identity transfer. Encodes a reference audio clip into the conditioning and optionally patches the model with identity guidance (extra forward pass without reference, amplifying the speaker identity effect).",
+            inputs=[
+                io.Model.Input("model"),
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Audio.Input("reference_audio", tooltip="Reference audio clip whose speaker identity to transfer. ~5 seconds recommended (training duration). Shorter or longer clips may degrade voice identity transfer."),
+                io.Vae.Input(id="audio_vae", display_name="Audio VAE", tooltip="LTXV Audio VAE for encoding."),
+                io.Float.Input("identity_guidance_scale", default=3.0, min=0.0, max=100.0, step=0.01, round=0.01, tooltip="Strength of identity guidance. Runs an extra forward pass without reference each step to amplify speaker identity. Set to 0 to disable (no extra pass)."),
+                io.Float.Input("start_percent", default=0.0, min=0.0, max=1.0, step=0.001, advanced=True, tooltip="Start of the sigma range where identity guidance is active."),
+                io.Float.Input("end_percent", default=1.0, min=0.0, max=1.0, step=0.001, advanced=True, tooltip="End of the sigma range where identity guidance is active."),
+            ],
+            outputs=[
+                io.Model.Output(),
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, model, positive, negative, reference_audio, audio_vae, identity_guidance_scale, start_percent, end_percent) -> io.NodeOutput:
+        # Encode reference audio to latents and patchify
+        audio_latents = audio_vae.encode(reference_audio)
+        b, c, t, f = audio_latents.shape
+        ref_tokens = audio_latents.permute(0, 2, 1, 3).reshape(b, t, c * f)
+        ref_audio = {"tokens": ref_tokens}
+
+        positive = node_helpers.conditioning_set_values(positive, {"ref_audio": ref_audio})
+        negative = node_helpers.conditioning_set_values(negative, {"ref_audio": ref_audio})
+
+        # Patch model with identity guidance
+        m = model.clone()
+        scale = identity_guidance_scale
+        model_sampling = m.get_model_object("model_sampling")
+        sigma_start = model_sampling.percent_to_sigma(start_percent)
+        sigma_end = model_sampling.percent_to_sigma(end_percent)
+
+        def post_cfg_function(args):
+            if scale == 0:
+                return args["denoised"]
+
+            sigma = args["sigma"]
+            sigma_ = sigma[0].item()
+            if sigma_ > sigma_start or sigma_ < sigma_end:
+                return args["denoised"]
+
+            cond_pred = args["cond_denoised"]
+            cond = args["cond"]
+            cfg_result = args["denoised"]
+            model_options = args["model_options"].copy()
+            x = args["input"]
+
+            # Strip ref_audio from conditioning for the no-reference pass
+            noref_cond = []
+            for entry in cond:
+                new_entry = entry.copy()
+                mc = new_entry.get("model_conds", {}).copy()
+                mc.pop("ref_audio", None)
+                new_entry["model_conds"] = mc
+                noref_cond.append(new_entry)
+
+            (pred_noref,) = comfy.samplers.calc_cond_batch(
+                args["model"], [noref_cond], x, sigma, model_options
+            )
+
+            return cfg_result + (cond_pred - pred_noref) * scale
+
+        m.set_model_sampler_post_cfg_function(post_cfg_function)
+
+        return io.NodeOutput(m, positive, negative)
+
+
 class LtxvExtension(ComfyExtension):
     @override
     async def get_node_list(self) -> list[type[io.ComfyNode]]:

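The post_cfg hook adds a second guidance axis on top of CFG: each step re-denoises without the reference audio and pushes the result along the difference, out = cfg_result + scale * (cond_pred - pred_noref). Numeric sketch:

    import torch

    cfg_result = torch.tensor([0.50])  # denoised output after normal CFG
    cond_pred = torch.tensor([0.60])   # conditional pass WITH reference audio
    pred_noref = torch.tensor([0.40])  # extra pass WITHOUT reference audio
    scale = 3.0

    out = cfg_result + (cond_pred - pred_noref) * scale
    print(out)  # tensor([1.1000]): the reference-driven direction is amplified
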
@@ -697,6 +776,7 @@ class LtxvExtension(ComfyExtension):
             LTXVCropGuides,
             LTXVConcatAVLatent,
             LTXVSeparateAVLatent,
+            LTXVReferenceAudio,
         ]

@@ -1,3 +1,3 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.18.0"
+__version__ = "0.18.1"

main.py
@@ -471,6 +471,9 @@ if __name__ == "__main__":
     if sys.version_info.major == 3 and sys.version_info.minor < 10:
         logging.warning("WARNING: You are using a python version older than 3.10, please upgrade to a newer one. 3.12 and above is recommended.")

+    if args.disable_dynamic_vram:
+        logging.warning("Dynamic vram disabled with argument. If you have any issues with dynamic vram enabled please give us a detailed report, as this argument will be removed soon.")
+
     event_loop, _, start_all_func = start_comfyui()
     try:
         x = start_all_func()

@@ -1 +1 @@
-comfyui_manager==4.1b6
+comfyui_manager==4.1b8

@@ -1,6 +1,6 @@
 [project]
 name = "ComfyUI"
-version = "0.18.0"
+version = "0.18.1"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.10"

@@ -1,4 +1,4 @@
-comfyui-frontend-package==1.41.21
+comfyui-frontend-package==1.42.8
 comfyui-workflow-templates==0.9.26
 comfyui-embedded-docs==0.4.3
 torch