Merge branch 'master' into dr-support-pip-cm

Dr.Lt.Data 2025-10-07 14:30:16 +09:00
commit 6b20418ad1
15 changed files with 1561 additions and 1576 deletions

View File

@@ -23,8 +23,6 @@ class MusicDCAE(torch.nn.Module):
         else:
             self.source_sample_rate = source_sample_rate
-        # self.resampler = torchaudio.transforms.Resample(source_sample_rate, 44100)

         self.transform = transforms.Compose([
             transforms.Normalize(0.5, 0.5),
         ])
@@ -37,10 +35,6 @@ class MusicDCAE(torch.nn.Module):
         self.scale_factor = 0.1786
         self.shift_factor = -1.9091

-    def load_audio(self, audio_path):
-        audio, sr = torchaudio.load(audio_path)
-        return audio, sr

     def forward_mel(self, audios):
         mels = []
         for i in range(len(audios)):
@@ -73,10 +67,8 @@ class MusicDCAE(torch.nn.Module):
             latent = self.dcae.encoder(mel.unsqueeze(0))
             latents.append(latent)
         latents = torch.cat(latents, dim=0)
-        # latent_lengths = (audio_lengths / sr * 44100 / 512 / self.time_dimention_multiple).long()
         latents = (latents - self.shift_factor) * self.scale_factor
         return latents
-        # return latents, latent_lengths

     @torch.no_grad()
     def decode(self, latents, audio_lengths=None, sr=None):
@@ -91,9 +83,7 @@ class MusicDCAE(torch.nn.Module):
             wav = self.vocoder.decode(mels[0]).squeeze(1)

             if sr is not None:
-                # resampler = torchaudio.transforms.Resample(44100, sr).to(latents.device).to(latents.dtype)
                 wav = torchaudio.functional.resample(wav, 44100, sr)
-                # wav = resampler(wav)
             else:
                 sr = 44100
             pred_wavs.append(wav)
@@ -101,7 +91,6 @@ class MusicDCAE(torch.nn.Module):
         if audio_lengths is not None:
             pred_wavs = [wav[:, :length].cpu() for wav, length in zip(pred_wavs, audio_lengths)]
         return torch.stack(pred_wavs)
-        # return sr, pred_wavs

     def forward(self, audios, audio_lengths=None, sr=None):
         latents, latent_lengths = self.encode(audios=audios, audio_lengths=audio_lengths, sr=sr)

View File

@@ -365,8 +365,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
         dit_config["patch_size"] = 2
         dit_config["in_channels"] = 16
         dit_config["dim"] = 2304
-        dit_config["cap_feat_dim"] = 2304
-        dit_config["n_layers"] = 26
+        dit_config["cap_feat_dim"] = state_dict['{}cap_embedder.1.weight'.format(key_prefix)].shape[1]
+        dit_config["n_layers"] = count_blocks(state_dict_keys, '{}layers.'.format(key_prefix) + '{}.')
         dit_config["n_heads"] = 24
         dit_config["n_kv_heads"] = 8
         dit_config["qk_norm"] = True

View File

@@ -890,6 +890,7 @@ class TEModel(Enum):
     QWEN25_3B = 10
     QWEN25_7B = 11
     BYT5_SMALL_GLYPH = 12
+    GEMMA_3_4B = 13

 def detect_te_model(sd):
     if "text_model.encoder.layers.30.mlp.fc1.weight" in sd:
@@ -912,6 +913,8 @@ def detect_te_model(sd):
             return TEModel.BYT5_SMALL_GLYPH
         return TEModel.T5_BASE
     if 'model.layers.0.post_feedforward_layernorm.weight' in sd:
+        if 'model.layers.0.self_attn.q_norm.weight' in sd:
+            return TEModel.GEMMA_3_4B
         return TEModel.GEMMA_2_2B
     if 'model.layers.0.self_attn.k_proj.bias' in sd:
         weight = sd['model.layers.0.self_attn.k_proj.bias']
@@ -1016,6 +1019,10 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
                 clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data))
                 clip_target.tokenizer = comfy.text_encoders.lumina2.LuminaTokenizer
                 tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
+            elif te_model == TEModel.GEMMA_3_4B:
+                clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data), model_type="gemma3_4b")
+                clip_target.tokenizer = comfy.text_encoders.lumina2.NTokenizer
+                tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
             elif te_model == TEModel.LLAMA3_8:
                 clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**llama_detect(clip_data),
                                                                              clip_l=False, clip_g=False, t5=False, llama=True, dtype_t5=None, t5xxl_scaled_fp8=None)
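Gemma 3 checkpoints are told apart from Gemma 2 by the extra per-head q_norm/k_norm weights (the same norms added to Attention in the next file's changes). A toy illustration of the key-presence check, with placeholder values rather than real tensors:

toy_sd = {
    "model.layers.0.post_feedforward_layernorm.weight": "...",  # present in both Gemma 2 and Gemma 3
    "model.layers.0.self_attn.q_norm.weight": "...",            # only present in Gemma 3
}
# With both keys present detect_te_model() returns TEModel.GEMMA_3_4B;
# without the q_norm key it falls through to TEModel.GEMMA_2_2B.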

View File

@@ -3,6 +3,7 @@ import torch.nn as nn
 from dataclasses import dataclass
 from typing import Optional, Any
 import math
+import logging

 from comfy.ldm.modules.attention import optimized_attention_for_device
 import comfy.model_management
@@ -28,6 +29,9 @@ class Llama2Config:
     mlp_activation = "silu"
     qkv_bias = False
     rope_dims = None
+    q_norm = None
+    k_norm = None
+    rope_scale = None

 @dataclass
 class Qwen25_3BConfig:
@@ -46,6 +50,9 @@ class Qwen25_3BConfig:
     mlp_activation = "silu"
     qkv_bias = True
     rope_dims = None
+    q_norm = None
+    k_norm = None
+    rope_scale = None

 @dataclass
 class Qwen25_7BVLI_Config:
@@ -64,6 +71,9 @@ class Qwen25_7BVLI_Config:
     mlp_activation = "silu"
     qkv_bias = True
     rope_dims = [16, 24, 24]
+    q_norm = None
+    k_norm = None
+    rope_scale = None

 @dataclass
 class Gemma2_2B_Config:
@@ -82,6 +92,32 @@ class Gemma2_2B_Config:
     mlp_activation = "gelu_pytorch_tanh"
     qkv_bias = False
     rope_dims = None
+    q_norm = None
+    k_norm = None
+    sliding_attention = None
+    rope_scale = None
+
+@dataclass
+class Gemma3_4B_Config:
+    vocab_size: int = 262208
+    hidden_size: int = 2560
+    intermediate_size: int = 10240
+    num_hidden_layers: int = 34
+    num_attention_heads: int = 8
+    num_key_value_heads: int = 4
+    max_position_embeddings: int = 131072
+    rms_norm_eps: float = 1e-6
+    rope_theta = [10000.0, 1000000.0]
+    transformer_type: str = "gemma3"
+    head_dim = 256
+    rms_norm_add = True
+    mlp_activation = "gelu_pytorch_tanh"
+    qkv_bias = False
+    rope_dims = None
+    q_norm = "gemma3"
+    k_norm = "gemma3"
+    sliding_attention = [False, False, False, False, False, 1024]
+    rope_scale = [1.0, 8.0]

 class RMSNorm(nn.Module):
     def __init__(self, dim: int, eps: float = 1e-5, add=False, device=None, dtype=None):
@@ -106,9 +142,20 @@ def rotate_half(x):
     return torch.cat((-x2, x1), dim=-1)

-def precompute_freqs_cis(head_dim, position_ids, theta, rope_dims=None, device=None):
-    theta_numerator = torch.arange(0, head_dim, 2, device=device).float()
-    inv_freq = 1.0 / (theta ** (theta_numerator / head_dim))
-    inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
-    position_ids_expanded = position_ids[:, None, :].float()
+def precompute_freqs_cis(head_dim, position_ids, theta, rope_scale=None, rope_dims=None, device=None):
+    if not isinstance(theta, list):
+        theta = [theta]
+
+    out = []
+    for index, t in enumerate(theta):
+        theta_numerator = torch.arange(0, head_dim, 2, device=device).float()
+        inv_freq = 1.0 / (t ** (theta_numerator / head_dim))
+        if rope_scale is not None:
+            if isinstance(rope_scale, list):
+                inv_freq /= rope_scale[index]
+            else:
+                inv_freq /= rope_scale
+
+        inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        position_ids_expanded = position_ids[:, None, :].float()
@@ -123,8 +170,12 @@ def precompute_freqs_cis(head_dim, position_ids, theta, rope_dims=None, device=None):
         else:
             cos = cos.unsqueeze(1)
             sin = sin.unsqueeze(1)
+        out.append((cos, sin))

-    return (cos, sin)
+    if len(out) == 1:
+        return out[0]
+    return out

 def apply_rope(xq, xk, freqs_cis):
@@ -152,6 +203,14 @@ class Attention(nn.Module):
         self.v_proj = ops.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=config.qkv_bias, device=device, dtype=dtype)
         self.o_proj = ops.Linear(self.inner_size, config.hidden_size, bias=False, device=device, dtype=dtype)

+        self.q_norm = None
+        self.k_norm = None
+
+        if config.q_norm == "gemma3":
+            self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
+        if config.k_norm == "gemma3":
+            self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
+
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -168,6 +227,11 @@ class Attention(nn.Module):
         xk = xk.view(batch_size, seq_length, self.num_kv_heads, self.head_dim).transpose(1, 2)
         xv = xv.view(batch_size, seq_length, self.num_kv_heads, self.head_dim).transpose(1, 2)

+        if self.q_norm is not None:
+            xq = self.q_norm(xq)
+        if self.k_norm is not None:
+            xk = self.k_norm(xk)
+
         xq, xk = apply_rope(xq, xk, freqs_cis=freqs_cis)

         xk = xk.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1)
@@ -192,7 +256,7 @@ class MLP(nn.Module):
         return self.down_proj(self.activation(self.gate_proj(x)) * self.up_proj(x))

 class TransformerBlock(nn.Module):
-    def __init__(self, config: Llama2Config, device=None, dtype=None, ops: Any = None):
+    def __init__(self, config: Llama2Config, index, device=None, dtype=None, ops: Any = None):
         super().__init__()
         self.self_attn = Attention(config, device=device, dtype=dtype, ops=ops)
         self.mlp = MLP(config, device=device, dtype=dtype, ops=ops)
@@ -226,7 +290,7 @@ class TransformerBlock(nn.Module):
         return x

 class TransformerBlockGemma2(nn.Module):
-    def __init__(self, config: Llama2Config, device=None, dtype=None, ops: Any = None):
+    def __init__(self, config: Llama2Config, index, device=None, dtype=None, ops: Any = None):
         super().__init__()
         self.self_attn = Attention(config, device=device, dtype=dtype, ops=ops)
         self.mlp = MLP(config, device=device, dtype=dtype, ops=ops)
@@ -235,6 +299,13 @@ class TransformerBlockGemma2(nn.Module):
         self.pre_feedforward_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
         self.post_feedforward_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)

+        if config.sliding_attention is not None:  # TODO: implement. (Not that necessary since models are trained on less than 1024 tokens)
+            self.sliding_attention = config.sliding_attention[index % len(config.sliding_attention)]
+        else:
+            self.sliding_attention = False
+
+        self.transformer_type = config.transformer_type
+
     def forward(
         self,
         x: torch.Tensor,
@@ -242,6 +313,14 @@ class TransformerBlockGemma2(nn.Module):
         freqs_cis: Optional[torch.Tensor] = None,
         optimized_attention=None,
     ):
+        if self.transformer_type == 'gemma3':
+            if self.sliding_attention:
+                if x.shape[1] > self.sliding_attention:
+                    logging.warning("Warning: sliding attention not implemented, results may be incorrect")
+                freqs_cis = freqs_cis[1]
+            else:
+                freqs_cis = freqs_cis[0]
+
         # Self Attention
         residual = x
         x = self.input_layernorm(x)
@@ -276,7 +355,7 @@ class Llama2_(nn.Module):
             device=device,
             dtype=dtype
         )
-        if self.config.transformer_type == "gemma2":
+        if self.config.transformer_type == "gemma2" or self.config.transformer_type == "gemma3":
             transformer = TransformerBlockGemma2
             self.normalize_in = True
         else:
@@ -284,8 +363,8 @@ class Llama2_(nn.Module):
             self.normalize_in = False

         self.layers = nn.ModuleList([
-            transformer(config, device=device, dtype=dtype, ops=ops)
-            for _ in range(config.num_hidden_layers)
+            transformer(config, index=i, device=device, dtype=dtype, ops=ops)
+            for i in range(config.num_hidden_layers)
         ])
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
         # self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype)
@@ -305,6 +384,7 @@ class Llama2_(nn.Module):
         freqs_cis = precompute_freqs_cis(self.config.head_dim,
                                          position_ids,
                                          self.config.rope_theta,
+                                         self.config.rope_scale,
                                          self.config.rope_dims,
                                          device=x.device)
@@ -433,3 +513,12 @@ class Gemma2_2B(BaseLlama, torch.nn.Module):
         self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
         self.dtype = dtype
+
+class Gemma3_4B(BaseLlama, torch.nn.Module):
+    def __init__(self, config_dict, dtype, device, operations):
+        super().__init__()
+        config = Gemma3_4B_Config(**config_dict)
+        self.num_layers = config.num_hidden_layers
+        self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
+        self.dtype = dtype
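For intuition, here is a small, self-contained sketch of the dual-frequency rotary embedding that the Gemma 3 config drives (rope_theta=[10000.0, 1000000.0] with rope_scale=[1.0, 8.0]); the helper name and shapes are illustrative, not the library function:

import torch

def sketch_dual_rope(head_dim, position_ids, thetas=(10000.0, 1000000.0), scales=(1.0, 8.0)):
    # One (cos, sin) pair per theta, matching the config above: index 0 is the
    # unscaled 10k set, index 1 is the 1M set with frequencies divided by 8.
    # TransformerBlockGemma2.forward then picks freqs_cis[1] for layers whose
    # sliding_attention entry is set and freqs_cis[0] for the rest.
    out = []
    for theta, scale in zip(thetas, scales):
        inv_freq = 1.0 / (theta ** (torch.arange(0, head_dim, 2).float() / head_dim))
        inv_freq = inv_freq / scale
        freqs = torch.outer(position_ids.float(), inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        out.append((emb.cos(), emb.sin()))
    return out

pair_10k, pair_1m = sketch_dual_rope(256, torch.arange(16))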

View File

@@ -11,23 +11,41 @@ class Gemma2BTokenizer(sd1_clip.SDTokenizer):
     def state_dict(self):
         return {"spiece_model": self.tokenizer.serialize_model()}

+class Gemma3_4BTokenizer(sd1_clip.SDTokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        tokenizer = tokenizer_data.get("spiece_model", None)
+        super().__init__(tokenizer, pad_with_end=False, embedding_size=2560, embedding_key='gemma3_4b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data)
+
+    def state_dict(self):
+        return {"spiece_model": self.tokenizer.serialize_model()}
+
 class LuminaTokenizer(sd1_clip.SD1Tokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
         super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="gemma2_2b", tokenizer=Gemma2BTokenizer)

+class NTokenizer(sd1_clip.SD1Tokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="gemma3_4b", tokenizer=Gemma3_4BTokenizer)
+
 class Gemma2_2BModel(sd1_clip.SDClipModel):
     def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options={}):
         super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"start": 2, "pad": 0}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Gemma2_2B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)

+class Gemma3_4BModel(sd1_clip.SDClipModel):
+    def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options={}):
+        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"start": 2, "pad": 0}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Gemma3_4B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
+
 class LuminaModel(sd1_clip.SD1ClipModel):
-    def __init__(self, device="cpu", dtype=None, model_options={}):
-        super().__init__(device=device, dtype=dtype, name="gemma2_2b", clip_model=Gemma2_2BModel, model_options=model_options)
+    def __init__(self, device="cpu", dtype=None, model_options={}, name="gemma2_2b", clip_model=Gemma2_2BModel):
+        super().__init__(device=device, dtype=dtype, name=name, clip_model=clip_model, model_options=model_options)

-def te(dtype_llama=None, llama_scaled_fp8=None):
+def te(dtype_llama=None, llama_scaled_fp8=None, model_type="gemma2_2b"):
+    if model_type == "gemma2_2b":
+        model = Gemma2_2BModel
+    elif model_type == "gemma3_4b":
+        model = Gemma3_4BModel
+
     class LuminaTEModel_(LuminaModel):
         def __init__(self, device="cpu", dtype=None, model_options={}):
             if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options:
@@ -35,5 +53,5 @@ def te(dtype_llama=None, llama_scaled_fp8=None):
                 model_options["scaled_fp8"] = llama_scaled_fp8
             if dtype_llama is not None:
                 dtype = dtype_llama
-            super().__init__(device=device, dtype=dtype, model_options=model_options)
+            super().__init__(device=device, dtype=dtype, name=model_type, model_options=model_options, clip_model=model)
     return LuminaTEModel_

View File

@@ -152,7 +152,7 @@ def validate_aspect_ratio(
         raise TypeError(
             f"Aspect ratio cannot reduce to any less than {minimum_ratio_str} ({minimum_ratio}), but was {aspect_ratio} ({calculated_ratio})."
         )
-    elif calculated_ratio > maximum_ratio:
+    if calculated_ratio > maximum_ratio:
         raise TypeError(
             f"Aspect ratio cannot reduce to any greater than {maximum_ratio_str} ({maximum_ratio}), but was {aspect_ratio} ({calculated_ratio})."
         )
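The elif can become a plain if because the preceding branch always raises, so the second check is only reached when the first one passed; this mirrors the no-else-raise pylint cleanup reflected in the pyproject.toml change further down. A minimal sketch of the equivalent control flow (hypothetical function, for illustration only):

def check_ratio(calculated_ratio, minimum_ratio, maximum_ratio):
    if calculated_ratio < minimum_ratio:
        raise TypeError("aspect ratio too small")  # execution never continues past a raise
    if calculated_ratio > maximum_ratio:           # equivalent to the old "elif"
        raise TypeError("aspect ratio too large")
    return calculated_ratio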

File diff suppressed because it is too large

View File

@@ -473,7 +473,7 @@ class MoonvalleyImg2VideoNode(comfy_io.ComfyNode):
             height=width_height["height"],
             use_negative_prompts=True,
         )
-        """Upload image to comfy backend to have a URL available for further processing"""
         # Get MIME type from tensor - assuming PNG format for image tensors
         mime_type = "image/png"
@@ -591,7 +591,6 @@ class MoonvalleyVideo2VideoNode(comfy_io.ComfyNode):
         validated_video = validate_video_to_video_input(video)
         video_url = await upload_video_to_comfyapi(validated_video, auth_kwargs=auth)

-        """Validate prompts and inference input"""
         validate_prompts(prompt, negative_prompt)

         # Only include motion_intensity for Motion Transfer

View File

@@ -5,14 +5,16 @@ Pika API docs: https://pika-827374fb.mintlify.app/api-reference
 """
 from __future__ import annotations

-import io
+from io import BytesIO
 import logging
 from typing import Optional, TypeVar
+from enum import Enum

 import numpy as np
 import torch

-from comfy.comfy_types.node_typing import IO, ComfyNodeABC, InputTypeOptions
+from typing_extensions import override
+from comfy_api.latest import ComfyExtension, io as comfy_io
 from comfy_api.input_impl import VideoFromFile
 from comfy_api.input_impl.video_types import VideoCodec, VideoContainer, VideoInput
 from comfy_api_nodes.apinode_utils import (
@@ -20,7 +22,6 @@ from comfy_api_nodes.apinode_utils import (
     tensor_to_bytesio,
 )
 from comfy_api_nodes.apis import (
-    IngredientsMode,
     PikaBodyGenerate22C2vGenerate22PikascenesPost,
     PikaBodyGenerate22I2vGenerate22I2vPost,
     PikaBodyGenerate22KeyframeGenerate22PikaframesPost,
@@ -28,10 +29,7 @@ from comfy_api_nodes.apis import (
     PikaBodyGeneratePikadditionsGeneratePikadditionsPost,
     PikaBodyGeneratePikaffectsGeneratePikaffectsPost,
     PikaBodyGeneratePikaswapsGeneratePikaswapsPost,
-    PikaDurationEnum,
-    Pikaffect,
     PikaGenerateResponse,
-    PikaResolutionEnum,
     PikaVideoResponse,
 )
 from comfy_api_nodes.apis.client import (
@@ -41,7 +39,6 @@ from comfy_api_nodes.apis.client import (
     PollingOperation,
     SynchronousOperation,
 )
-from comfy_api_nodes.mapper_utils import model_field_to_node_input

 R = TypeVar("R")
@@ -58,6 +55,35 @@ PATH_PIKASCENES = f"/proxy/pika/generate/{PIKA_API_VERSION}/pikascenes"
 PATH_VIDEO_GET = "/proxy/pika/videos"

+
+class PikaDurationEnum(int, Enum):
+    integer_5 = 5
+    integer_10 = 10
+
+
+class PikaResolutionEnum(str, Enum):
+    field_1080p = "1080p"
+    field_720p = "720p"
+
+
+class Pikaffect(str, Enum):
+    Cake_ify = "Cake-ify"
+    Crumble = "Crumble"
+    Crush = "Crush"
+    Decapitate = "Decapitate"
+    Deflate = "Deflate"
+    Dissolve = "Dissolve"
+    Explode = "Explode"
+    Eye_pop = "Eye-pop"
+    Inflate = "Inflate"
+    Levitate = "Levitate"
+    Melt = "Melt"
+    Peel = "Peel"
+    Poke = "Poke"
+    Squish = "Squish"
+    Ta_da = "Ta-da"
+    Tear = "Tear"
+
+
 class PikaApiError(Exception):
     """Exception for Pika API errors."""
@@ -74,60 +100,11 @@ def is_valid_initial_response(response: PikaGenerateResponse) -> bool:
     return hasattr(response, "video_id") and response.video_id is not None

-class PikaNodeBase(ComfyNodeABC):
-    """Base class for Pika nodes."""
-
-    @classmethod
-    def get_base_inputs_types(
-        cls, request_model
-    ) -> dict[str, tuple[IO, InputTypeOptions]]:
-        """Get the base required inputs types common to all Pika nodes."""
-        return {
-            "prompt_text": model_field_to_node_input(
-                IO.STRING,
-                request_model,
-                "promptText",
-                multiline=True,
-            ),
-            "negative_prompt": model_field_to_node_input(
-                IO.STRING,
-                request_model,
-                "negativePrompt",
-                multiline=True,
-            ),
-            "seed": model_field_to_node_input(
-                IO.INT,
-                request_model,
-                "seed",
-                min=0,
-                max=0xFFFFFFFF,
-                control_after_generate=True,
-            ),
-            "resolution": model_field_to_node_input(
-                IO.COMBO,
-                request_model,
-                "resolution",
-                enum_type=PikaResolutionEnum,
-            ),
-            "duration": model_field_to_node_input(
-                IO.COMBO,
-                request_model,
-                "duration",
-                enum_type=PikaDurationEnum,
-            ),
-        }
-
-    CATEGORY = "api node/video/Pika"
-    API_NODE = True
-    FUNCTION = "api_call"
-    RETURN_TYPES = ("VIDEO",)
-
-    async def poll_for_task_status(
-        self,
+async def poll_for_task_status(
     task_id: str,
     auth_kwargs: Optional[dict[str, str]] = None,
     node_id: Optional[str] = None,
 ) -> PikaGenerateResponse:
     polling_operation = PollingOperation(
         poll_endpoint=ApiEndpoint(
             path=f"{PATH_VIDEO_GET}/{task_id}",
@@ -154,12 +131,12 @@ class PikaNodeBase(ComfyNodeABC):
     )
     return await polling_operation.execute()

-    async def execute_task(
-        self,
+async def execute_task(
     initial_operation: SynchronousOperation[R, PikaGenerateResponse],
     auth_kwargs: Optional[dict[str, str]] = None,
     node_id: Optional[str] = None,
 ) -> tuple[VideoFromFile]:
     """Executes the initial operation then polls for the task status until it is completed.

     Args:
@@ -176,7 +153,7 @@ class PikaNodeBase(ComfyNodeABC):
         raise PikaApiError(error_msg)

     task_id = initial_response.video_id
-    final_response = await self.poll_for_task_status(task_id, auth_kwargs)
+    final_response = await poll_for_task_status(task_id, auth_kwargs, node_id=node_id)
     if not is_valid_video_response(final_response):
         error_msg = (
             f"Pika task {task_id} succeeded but no video data found in response."
@@ -190,39 +167,54 @@ class PikaNodeBase(ComfyNodeABC):
     return (await download_url_to_video_output(video_url),)

-class PikaImageToVideoV2_2(PikaNodeBase):
+
+def get_base_inputs_types() -> list[comfy_io.Input]:
+    """Get the base required inputs types common to all Pika nodes."""
+    return [
+        comfy_io.String.Input("prompt_text", multiline=True),
+        comfy_io.String.Input("negative_prompt", multiline=True),
+        comfy_io.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True),
+        comfy_io.Combo.Input(
+            "resolution", options=[resolution.value for resolution in PikaResolutionEnum], default="1080p"
+        ),
+        comfy_io.Combo.Input(
+            "duration", options=[duration.value for duration in PikaDurationEnum], default=5
+        ),
+    ]
+
+
+class PikaImageToVideoV2_2(comfy_io.ComfyNode):
     """Pika 2.2 Image to Video Node."""

     @classmethod
-    def INPUT_TYPES(cls):
-        return {
-            "required": {
-                "image": (
-                    IO.IMAGE,
-                    {"tooltip": "The image to convert to video"},
-                ),
-                **cls.get_base_inputs_types(PikaBodyGenerate22I2vGenerate22I2vPost),
-            },
-            "hidden": {
-                "auth_token": "AUTH_TOKEN_COMFY_ORG",
-                "comfy_api_key": "API_KEY_COMFY_ORG",
-                "unique_id": "UNIQUE_ID",
-            },
-        }
-
-    DESCRIPTION = "Sends an image and prompt to the Pika API v2.2 to generate a video."
-
-    async def api_call(
-        self,
+    def define_schema(cls) -> comfy_io.Schema:
+        return comfy_io.Schema(
+            node_id="PikaImageToVideoNode2_2",
+            display_name="Pika Image to Video",
+            description="Sends an image and prompt to the Pika API v2.2 to generate a video.",
+            category="api node/video/Pika",
+            inputs=[
+                comfy_io.Image.Input("image", tooltip="The image to convert to video"),
+                *get_base_inputs_types(),
+            ],
+            outputs=[comfy_io.Video.Output()],
+            hidden=[
+                comfy_io.Hidden.auth_token_comfy_org,
+                comfy_io.Hidden.api_key_comfy_org,
+                comfy_io.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
         image: torch.Tensor,
         prompt_text: str,
         negative_prompt: str,
         seed: int,
         resolution: str,
         duration: int,
-        unique_id: str,
-        **kwargs,
-    ) -> tuple[VideoFromFile]:
+    ) -> comfy_io.NodeOutput:
         # Convert image to BytesIO
         image_bytes_io = tensor_to_bytesio(image)
         image_bytes_io.seek(0)
@@ -237,7 +229,10 @@ class PikaImageToVideoV2_2(PikaNodeBase):
             resolution=resolution,
             duration=duration,
         )
+        auth = {
+            "auth_token": cls.hidden.auth_token_comfy_org,
+            "comfy_api_key": cls.hidden.api_key_comfy_org,
+        }
         initial_operation = SynchronousOperation(
             endpoint=ApiEndpoint(
                 path=PATH_IMAGE_TO_VIDEO,
@@ -248,50 +243,55 @@ class PikaImageToVideoV2_2(PikaNodeBase):
             request=pika_request_data,
             files=pika_files,
             content_type="multipart/form-data",
-            auth_kwargs=kwargs,
+            auth_kwargs=auth,
         )
-
-        return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id)
+        return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id)

-class PikaTextToVideoNodeV2_2(PikaNodeBase):
+
+class PikaTextToVideoNodeV2_2(comfy_io.ComfyNode):
     """Pika Text2Video v2.2 Node."""

     @classmethod
-    def INPUT_TYPES(cls):
-        return {
-            "required": {
-                **cls.get_base_inputs_types(PikaBodyGenerate22T2vGenerate22T2vPost),
-                "aspect_ratio": model_field_to_node_input(
-                    IO.FLOAT,
-                    PikaBodyGenerate22T2vGenerate22T2vPost,
-                    "aspectRatio",
+    def define_schema(cls) -> comfy_io.Schema:
+        return comfy_io.Schema(
+            node_id="PikaTextToVideoNode2_2",
+            display_name="Pika Text to Video",
+            description="Sends a text prompt to the Pika API v2.2 to generate a video.",
+            category="api node/video/Pika",
+            inputs=[
+                *get_base_inputs_types(),
+                comfy_io.Float.Input(
+                    "aspect_ratio",
                     step=0.001,
                     min=0.4,
                     max=2.5,
                     default=1.7777777777777777,
-                ),
-            },
-            "hidden": {
-                "auth_token": "AUTH_TOKEN_COMFY_ORG",
-                "comfy_api_key": "API_KEY_COMFY_ORG",
-                "unique_id": "UNIQUE_ID",
-            },
-        }
-
-    DESCRIPTION = "Sends a text prompt to the Pika API v2.2 to generate a video."
-
-    async def api_call(
-        self,
+                    tooltip="Aspect ratio (width / height)",
+                )
+            ],
+            outputs=[comfy_io.Video.Output()],
+            hidden=[
+                comfy_io.Hidden.auth_token_comfy_org,
+                comfy_io.Hidden.api_key_comfy_org,
+                comfy_io.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
         prompt_text: str,
         negative_prompt: str,
         seed: int,
         resolution: str,
         duration: int,
         aspect_ratio: float,
-        unique_id: str,
-        **kwargs,
-    ) -> tuple[VideoFromFile]:
+    ) -> comfy_io.NodeOutput:
+        auth = {
+            "auth_token": cls.hidden.auth_token_comfy_org,
+            "comfy_api_key": cls.hidden.api_key_comfy_org,
+        }
         initial_operation = SynchronousOperation(
             endpoint=ApiEndpoint(
                 path=PATH_TEXT_TO_VIDEO,
@@ -307,62 +307,75 @@ class PikaTextToVideoNodeV2_2(PikaNodeBase):
                 duration=duration,
                 aspectRatio=aspect_ratio,
             ),
-            auth_kwargs=kwargs,
+            auth_kwargs=auth,
             content_type="application/x-www-form-urlencoded",
         )
-
-        return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id)
+        return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id)

-class PikaScenesV2_2(PikaNodeBase):
+
+class PikaScenesV2_2(comfy_io.ComfyNode):
     """PikaScenes v2.2 Node."""

     @classmethod
-    def INPUT_TYPES(cls):
-        image_ingredient_input = (
-            IO.IMAGE,
-            {"tooltip": "Image that will be used as ingredient to create a video."},
-        )
-        return {
-            "required": {
-                **cls.get_base_inputs_types(
-                    PikaBodyGenerate22C2vGenerate22PikascenesPost,
-                ),
-                "ingredients_mode": model_field_to_node_input(
-                    IO.COMBO,
-                    PikaBodyGenerate22C2vGenerate22PikascenesPost,
-                    "ingredientsMode",
-                    enum_type=IngredientsMode,
+    def define_schema(cls) -> comfy_io.Schema:
+        return comfy_io.Schema(
+            node_id="PikaScenesV2_2",
+            display_name="Pika Scenes (Video Image Composition)",
+            description="Combine your images to create a video with the objects in them. Upload multiple images as ingredients and generate a high-quality video that incorporates all of them.",
+            category="api node/video/Pika",
+            inputs=[
+                *get_base_inputs_types(),
+                comfy_io.Combo.Input(
+                    "ingredients_mode",
+                    options=["creative", "precise"],
                     default="creative",
                 ),
-                "aspect_ratio": model_field_to_node_input(
-                    IO.FLOAT,
-                    PikaBodyGenerate22C2vGenerate22PikascenesPost,
-                    "aspectRatio",
+                comfy_io.Float.Input(
+                    "aspect_ratio",
                     step=0.001,
                     min=0.4,
                     max=2.5,
                     default=1.7777777777777777,
+                    tooltip="Aspect ratio (width / height)",
                 ),
-            },
-            "optional": {
-                "image_ingredient_1": image_ingredient_input,
-                "image_ingredient_2": image_ingredient_input,
-                "image_ingredient_3": image_ingredient_input,
-                "image_ingredient_4": image_ingredient_input,
-                "image_ingredient_5": image_ingredient_input,
-            },
-            "hidden": {
-                "auth_token": "AUTH_TOKEN_COMFY_ORG",
-                "comfy_api_key": "API_KEY_COMFY_ORG",
-                "unique_id": "UNIQUE_ID",
-            },
-        }
-
-    DESCRIPTION = "Combine your images to create a video with the objects in them. Upload multiple images as ingredients and generate a high-quality video that incorporates all of them."
-
-    async def api_call(
-        self,
+                comfy_io.Image.Input(
+                    "image_ingredient_1",
+                    optional=True,
+                    tooltip="Image that will be used as ingredient to create a video.",
+                ),
+                comfy_io.Image.Input(
+                    "image_ingredient_2",
+                    optional=True,
+                    tooltip="Image that will be used as ingredient to create a video.",
+                ),
+                comfy_io.Image.Input(
+                    "image_ingredient_3",
+                    optional=True,
+                    tooltip="Image that will be used as ingredient to create a video.",
+                ),
+                comfy_io.Image.Input(
+                    "image_ingredient_4",
+                    optional=True,
+                    tooltip="Image that will be used as ingredient to create a video.",
+                ),
+                comfy_io.Image.Input(
+                    "image_ingredient_5",
+                    optional=True,
+                    tooltip="Image that will be used as ingredient to create a video.",
+                ),
+            ],
+            outputs=[comfy_io.Video.Output()],
+            hidden=[
+                comfy_io.Hidden.auth_token_comfy_org,
+                comfy_io.Hidden.api_key_comfy_org,
+                comfy_io.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
         prompt_text: str,
         negative_prompt: str,
         seed: int,
@@ -370,14 +383,12 @@ class PikaScenesV2_2(PikaNodeBase):
         duration: int,
         ingredients_mode: str,
         aspect_ratio: float,
-        unique_id: str,
         image_ingredient_1: Optional[torch.Tensor] = None,
         image_ingredient_2: Optional[torch.Tensor] = None,
         image_ingredient_3: Optional[torch.Tensor] = None,
         image_ingredient_4: Optional[torch.Tensor] = None,
         image_ingredient_5: Optional[torch.Tensor] = None,
-        **kwargs,
-    ) -> tuple[VideoFromFile]:
+    ) -> comfy_io.NodeOutput:
         # Convert all passed images to BytesIO
         all_image_bytes_io = []
         for image in [
@@ -406,7 +417,10 @@ class PikaScenesV2_2(PikaNodeBase):
             duration=duration,
             aspectRatio=aspect_ratio,
         )
+        auth = {
+            "auth_token": cls.hidden.auth_token_comfy_org,
+            "comfy_api_key": cls.hidden.api_key_comfy_org,
+        }
         initial_operation = SynchronousOperation(
             endpoint=ApiEndpoint(
                 path=PATH_PIKASCENES,
@@ -417,63 +431,54 @@ class PikaScenesV2_2(PikaNodeBase):
             request=pika_request_data,
             files=pika_files,
             content_type="multipart/form-data",
-            auth_kwargs=kwargs,
+            auth_kwargs=auth,
         )
-
-        return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id)
+        return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id)

-class PikAdditionsNode(PikaNodeBase):
+
+class PikAdditionsNode(comfy_io.ComfyNode):
     """Pika Pikadditions Node. Add an image into a video."""

     @classmethod
-    def INPUT_TYPES(cls):
-        return {
-            "required": {
-                "video": (IO.VIDEO, {"tooltip": "The video to add an image to."}),
-                "image": (IO.IMAGE, {"tooltip": "The image to add to the video."}),
-                "prompt_text": model_field_to_node_input(
-                    IO.STRING,
-                    PikaBodyGeneratePikadditionsGeneratePikadditionsPost,
-                    "promptText",
-                    multiline=True,
-                ),
-                "negative_prompt": model_field_to_node_input(
-                    IO.STRING,
-                    PikaBodyGeneratePikadditionsGeneratePikadditionsPost,
-                    "negativePrompt",
-                    multiline=True,
-                ),
-                "seed": model_field_to_node_input(
-                    IO.INT,
-                    PikaBodyGeneratePikadditionsGeneratePikadditionsPost,
+    def define_schema(cls) -> comfy_io.Schema:
+        return comfy_io.Schema(
+            node_id="Pikadditions",
+            display_name="Pikadditions (Video Object Insertion)",
+            description="Add any object or image into your video. Upload a video and specify what you'd like to add to create a seamlessly integrated result.",
+            category="api node/video/Pika",
+            inputs=[
+                comfy_io.Video.Input("video", tooltip="The video to add an image to."),
+                comfy_io.Image.Input("image", tooltip="The image to add to the video."),
+                comfy_io.String.Input("prompt_text", multiline=True),
+                comfy_io.String.Input("negative_prompt", multiline=True),
+                comfy_io.Int.Input(
                     "seed",
                     min=0,
                     max=0xFFFFFFFF,
                     control_after_generate=True,
                 ),
-            },
-            "hidden": {
-                "auth_token": "AUTH_TOKEN_COMFY_ORG",
-                "comfy_api_key": "API_KEY_COMFY_ORG",
-                "unique_id": "UNIQUE_ID",
-            },
-        }
-
-    DESCRIPTION = "Add any object or image into your video. Upload a video and specify what you'd like to add to create a seamlessly integrated result."
-
-    async def api_call(
-        self,
+            ],
+            outputs=[comfy_io.Video.Output()],
+            hidden=[
+                comfy_io.Hidden.auth_token_comfy_org,
+                comfy_io.Hidden.api_key_comfy_org,
+                comfy_io.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
         video: VideoInput,
         image: torch.Tensor,
         prompt_text: str,
         negative_prompt: str,
         seed: int,
-        unique_id: str,
-        **kwargs,
-    ) -> tuple[VideoFromFile]:
+    ) -> comfy_io.NodeOutput:
         # Convert video to BytesIO
-        video_bytes_io = io.BytesIO()
+        video_bytes_io = BytesIO()
         video.save_to(video_bytes_io, format=VideoContainer.MP4, codec=VideoCodec.H264)
         video_bytes_io.seek(0)
@@ -492,7 +497,10 @@ class PikAdditionsNode(PikaNodeBase):
             negativePrompt=negative_prompt,
             seed=seed,
         )
+        auth = {
+            "auth_token": cls.hidden.auth_token_comfy_org,
+            "comfy_api_key": cls.hidden.api_key_comfy_org,
+        }
         initial_operation = SynchronousOperation(
             endpoint=ApiEndpoint(
                 path=PATH_PIKADDITIONS,
@@ -503,74 +511,51 @@ class PikAdditionsNode(PikaNodeBase):
             request=pika_request_data,
             files=pika_files,
             content_type="multipart/form-data",
-            auth_kwargs=kwargs,
+            auth_kwargs=auth,
         )
-
-        return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id)
+        return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id)

-class PikaSwapsNode(PikaNodeBase):
+
+class PikaSwapsNode(comfy_io.ComfyNode):
     """Pika Pikaswaps Node."""

     @classmethod
-    def INPUT_TYPES(cls):
-        return {
-            "required": {
-                "video": (IO.VIDEO, {"tooltip": "The video to swap an object in."}),
-                "image": (
-                    IO.IMAGE,
-                    {
-                        "tooltip": "The image used to replace the masked object in the video."
-                    },
-                ),
-                "mask": (
-                    IO.MASK,
-                    {"tooltip": "Use the mask to define areas in the video to replace"},
-                ),
-                "prompt_text": model_field_to_node_input(
-                    IO.STRING,
-                    PikaBodyGeneratePikaswapsGeneratePikaswapsPost,
-                    "promptText",
-                    multiline=True,
-                ),
-                "negative_prompt": model_field_to_node_input(
-                    IO.STRING,
-                    PikaBodyGeneratePikaswapsGeneratePikaswapsPost,
-                    "negativePrompt",
-                    multiline=True,
-                ),
-                "seed": model_field_to_node_input(
-                    IO.INT,
-                    PikaBodyGeneratePikaswapsGeneratePikaswapsPost,
-                    "seed",
-                    min=0,
-                    max=0xFFFFFFFF,
-                    control_after_generate=True,
-                ),
-            },
-            "hidden": {
-                "auth_token": "AUTH_TOKEN_COMFY_ORG",
-                "comfy_api_key": "API_KEY_COMFY_ORG",
-                "unique_id": "UNIQUE_ID",
-            },
-        }
-
-    DESCRIPTION = "Swap out any object or region of your video with a new image or object. Define areas to replace either with a mask or coordinates."
-    RETURN_TYPES = ("VIDEO",)
-
-    async def api_call(
-        self,
+    def define_schema(cls) -> comfy_io.Schema:
+        return comfy_io.Schema(
+            node_id="Pikaswaps",
+            display_name="Pika Swaps (Video Object Replacement)",
+            description="Swap out any object or region of your video with a new image or object. Define areas to replace either with a mask or coordinates.",
+            category="api node/video/Pika",
+            inputs=[
+                comfy_io.Video.Input("video", tooltip="The video to swap an object in."),
+                comfy_io.Image.Input("image", tooltip="The image used to replace the masked object in the video."),
+                comfy_io.Mask.Input("mask", tooltip="Use the mask to define areas in the video to replace"),
+                comfy_io.String.Input("prompt_text", multiline=True),
+                comfy_io.String.Input("negative_prompt", multiline=True),
+                comfy_io.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True),
+            ],
+            outputs=[comfy_io.Video.Output()],
+            hidden=[
+                comfy_io.Hidden.auth_token_comfy_org,
+                comfy_io.Hidden.api_key_comfy_org,
+                comfy_io.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
         video: VideoInput,
         image: torch.Tensor,
         mask: torch.Tensor,
         prompt_text: str,
         negative_prompt: str,
         seed: int,
-        unique_id: str,
-        **kwargs,
-    ) -> tuple[VideoFromFile]:
+    ) -> comfy_io.NodeOutput:
         # Convert video to BytesIO
-        video_bytes_io = io.BytesIO()
+        video_bytes_io = BytesIO()
         video.save_to(video_bytes_io, format=VideoContainer.MP4, codec=VideoCodec.H264)
         video_bytes_io.seek(0)
@@ -579,7 +564,7 @@ class PikaSwapsNode(PikaNodeBase):
         mask = mask.repeat(1, 3, 1, 1)

         # Convert 3-channel binary mask to BytesIO
-        mask_bytes_io = io.BytesIO()
+        mask_bytes_io = BytesIO()
         mask_bytes_io.write(mask.numpy().astype(np.uint8))
         mask_bytes_io.seek(0)
@@ -599,7 +584,10 @@ class PikaSwapsNode(PikaNodeBase):
             negativePrompt=negative_prompt,
             seed=seed,
         )
+        auth = {
+            "auth_token": cls.hidden.auth_token_comfy_org,
+            "comfy_api_key": cls.hidden.api_key_comfy_org,
+        }
         initial_operation = SynchronousOperation(
             endpoint=ApiEndpoint(
                 path=PATH_PIKADDITIONS,
@@ -610,71 +598,52 @@ class PikaSwapsNode(PikaNodeBase):
             request=pika_request_data,
             files=pika_files,
             content_type="multipart/form-data",
-            auth_kwargs=kwargs,
+            auth_kwargs=auth,
         )
-
-        return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id)
+        return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id)

-class PikaffectsNode(PikaNodeBase):
+
+class PikaffectsNode(comfy_io.ComfyNode):
     """Pika Pikaffects Node."""

     @classmethod
-    def INPUT_TYPES(cls):
-        return {
-            "required": {
-                "image": (
-                    IO.IMAGE,
-                    {"tooltip": "The reference image to apply the Pikaffect to."},
+    def define_schema(cls) -> comfy_io.Schema:
+        return comfy_io.Schema(
+            node_id="Pikaffects",
+            display_name="Pikaffects (Video Effects)",
+            description="Generate a video with a specific Pikaffect. Supported Pikaffects: Cake-ify, Crumble, Crush, Decapitate, Deflate, Dissolve, Explode, Eye-pop, Inflate, Levitate, Melt, Peel, Poke, Squish, Ta-da, Tear",
+            category="api node/video/Pika",
+            inputs=[
+                comfy_io.Image.Input("image", tooltip="The reference image to apply the Pikaffect to."),
+                comfy_io.Combo.Input(
+                    "pikaffect", options=[pikaffect.value for pikaffect in Pikaffect], default="Cake-ify"
                 ),
-                "pikaffect": model_field_to_node_input(
-                    IO.COMBO,
-                    PikaBodyGeneratePikaffectsGeneratePikaffectsPost,
-                    "pikaffect",
-                    enum_type=Pikaffect,
-                    default="Cake-ify",
-                ),
-                "prompt_text": model_field_to_node_input(
-                    IO.STRING,
-                    PikaBodyGeneratePikaffectsGeneratePikaffectsPost,
-                    "promptText",
-                    multiline=True,
-                ),
-                "negative_prompt": model_field_to_node_input(
-                    IO.STRING,
-                    PikaBodyGeneratePikaffectsGeneratePikaffectsPost,
-                    "negativePrompt",
-                    multiline=True,
-                ),
-                "seed": model_field_to_node_input(
-                    IO.INT,
-                    PikaBodyGeneratePikaffectsGeneratePikaffectsPost,
-                    "seed",
-                    min=0,
-                    max=0xFFFFFFFF,
-                    control_after_generate=True,
-                ),
-            },
-            "hidden": {
-                "auth_token": "AUTH_TOKEN_COMFY_ORG",
-                "comfy_api_key": "API_KEY_COMFY_ORG",
-                "unique_id": "UNIQUE_ID",
-            },
-        }
-
-    DESCRIPTION = "Generate a video with a specific Pikaffect. Supported Pikaffects: Cake-ify, Crumble, Crush, Decapitate, Deflate, Dissolve, Explode, Eye-pop, Inflate, Levitate, Melt, Peel, Poke, Squish, Ta-da, Tear"
-
-    async def api_call(
-        self,
+                comfy_io.String.Input("prompt_text", multiline=True),
+                comfy_io.String.Input("negative_prompt", multiline=True),
+                comfy_io.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True),
+            ],
+            outputs=[comfy_io.Video.Output()],
+            hidden=[
+                comfy_io.Hidden.auth_token_comfy_org,
+                comfy_io.Hidden.api_key_comfy_org,
+                comfy_io.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
         image: torch.Tensor,
         pikaffect: str,
         prompt_text: str,
         negative_prompt: str,
         seed: int,
-        unique_id: str,
-        **kwargs,
-    ) -> tuple[VideoFromFile]:
+    ) -> comfy_io.NodeOutput:
+        auth = {
+            "auth_token": cls.hidden.auth_token_comfy_org,
+            "comfy_api_key": cls.hidden.api_key_comfy_org,
+        }
         initial_operation = SynchronousOperation(
             endpoint=ApiEndpoint(
                 path=PATH_PIKAFFECTS,
@@ -690,36 +659,38 @@ class PikaffectsNode(PikaNodeBase):
             ),
             files={"image": ("image.png", tensor_to_bytesio(image), "image/png")},
             content_type="multipart/form-data",
-            auth_kwargs=kwargs,
+            auth_kwargs=auth,
         )
-
-        return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id)
+        return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id)

-class PikaStartEndFrameNode2_2(PikaNodeBase):
+
+class PikaStartEndFrameNode2_2(comfy_io.ComfyNode):
     """PikaFrames v2.2 Node."""

     @classmethod
-    def INPUT_TYPES(cls):
-        return {
-            "required": {
-                "image_start": (IO.IMAGE, {"tooltip": "The first image to combine."}),
-                "image_end": (IO.IMAGE, {"tooltip": "The last image to combine."}),
-                **cls.get_base_inputs_types(
-                    PikaBodyGenerate22KeyframeGenerate22PikaframesPost
-                ),
-            },
-            "hidden": {
-                "auth_token": "AUTH_TOKEN_COMFY_ORG",
-                "comfy_api_key": "API_KEY_COMFY_ORG",
-                "unique_id": "UNIQUE_ID",
-            },
-        }
-
-    DESCRIPTION = "Generate a video by combining your first and last frame. Upload two images to define the start and end points, and let the AI create a smooth transition between them."
-
-    async def api_call(
-        self,
+    def define_schema(cls) -> comfy_io.Schema:
+        return comfy_io.Schema(
+            node_id="PikaStartEndFrameNode2_2",
+            display_name="Pika Start and End Frame to Video",
+            description="Generate a video by combining your first and last frame. Upload two images to define the start and end points, and let the AI create a smooth transition between them.",
+            category="api node/video/Pika",
+            inputs=[
+                comfy_io.Image.Input("image_start", tooltip="The first image to combine."),
+                comfy_io.Image.Input("image_end", tooltip="The last image to combine."),
+                *get_base_inputs_types(),
+            ],
+            outputs=[comfy_io.Video.Output()],
+            hidden=[
+                comfy_io.Hidden.auth_token_comfy_org,
+                comfy_io.Hidden.api_key_comfy_org,
+                comfy_io.Hidden.unique_id,
+            ],
+            is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
         image_start: torch.Tensor,
         image_end: torch.Tensor,
         prompt_text: str,
@@ -727,15 +698,15 @@ class PikaStartEndFrameNode2_2(PikaNodeBase):
         seed: int,
         resolution: str,
         duration: int,
-        unique_id: str,
-        **kwargs,
-    ) -> tuple[VideoFromFile]:
+    ) -> comfy_io.NodeOutput:
         pika_files = [
             ("keyFrames", ("image_start.png", tensor_to_bytesio(image_start), "image/png")),
             ("keyFrames", ("image_end.png", tensor_to_bytesio(image_end), "image/png")),
         ]
+        auth = {
+            "auth_token": cls.hidden.auth_token_comfy_org,
+            "comfy_api_key": cls.hidden.api_key_comfy_org,
+        }
         initial_operation = SynchronousOperation(
             endpoint=ApiEndpoint(
                 path=PATH_PIKAFRAMES,
@@ -752,28 +723,24 @@ class PikaStartEndFrameNode2_2(PikaNodeBase):
             ),
             files=pika_files,
             content_type="multipart/form-data",
-            auth_kwargs=kwargs,
+            auth_kwargs=auth,
         )
-
-        return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id)
+        return await execute_task(initial_operation, auth_kwargs=auth, node_id=cls.hidden.unique_id)

-NODE_CLASS_MAPPINGS = {
-    "PikaImageToVideoNode2_2": PikaImageToVideoV2_2,
-    "PikaTextToVideoNode2_2": PikaTextToVideoNodeV2_2,
-    "PikaScenesV2_2": PikaScenesV2_2,
-    "Pikadditions": PikAdditionsNode,
-    "Pikaswaps": PikaSwapsNode,
-    "Pikaffects": PikaffectsNode,
-    "PikaStartEndFrameNode2_2": PikaStartEndFrameNode2_2,
-}
-
-NODE_DISPLAY_NAME_MAPPINGS = {
-    "PikaImageToVideoNode2_2": "Pika Image to Video",
-    "PikaTextToVideoNode2_2": "Pika Text to Video",
-    "PikaScenesV2_2": "Pika Scenes (Video Image Composition)",
-    "Pikadditions": "Pikadditions (Video Object Insertion)",
-    "Pikaswaps": "Pika Swaps (Video Object Replacement)",
-    "Pikaffects": "Pikaffects (Video Effects)",
-    "PikaStartEndFrameNode2_2": "Pika Start and End Frame to Video",
-}
+
+class PikaApiNodesExtension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]:
+        return [
+            PikaImageToVideoV2_2,
+            PikaTextToVideoNodeV2_2,
+            PikaScenesV2_2,
+            PikAdditionsNode,
+            PikaSwapsNode,
+            PikaffectsNode,
+            PikaStartEndFrameNode2_2,
+        ]
+
+
+async def comfy_entrypoint() -> PikaApiNodesExtension:
+    return PikaApiNodesExtension()
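Taken together, the rewrite moves every Pika node from the INPUT_TYPES/api_call pattern to the schema-based API: inputs, outputs and hidden fields are declared in define_schema, and hidden values are read from cls.hidden inside a classmethod execute instead of arriving as **kwargs. A stripped-down sketch of that shape, using only constructs that appear in the diff above (the node id and prompt handling are placeholders, not a real node):

from comfy_api.latest import io as comfy_io

class MinimalPikaStyleNode(comfy_io.ComfyNode):
    @classmethod
    def define_schema(cls) -> comfy_io.Schema:
        return comfy_io.Schema(
            node_id="MinimalPikaStyleNode",  # placeholder id, for illustration only
            category="api node/video/Pika",
            inputs=[comfy_io.String.Input("prompt_text", multiline=True)],
            outputs=[comfy_io.Video.Output()],
            hidden=[
                comfy_io.Hidden.auth_token_comfy_org,
                comfy_io.Hidden.api_key_comfy_org,
                comfy_io.Hidden.unique_id,
            ],
            is_api_node=True,
        )

    @classmethod
    async def execute(cls, prompt_text: str) -> comfy_io.NodeOutput:
        # Hidden values come from cls.hidden rather than **kwargs.
        auth = {
            "auth_token": cls.hidden.auth_token_comfy_org,
            "comfy_api_key": cls.hidden.api_key_comfy_org,
        }
        raise NotImplementedError("placeholder: build the API operation here as in the nodes above")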

View File

@@ -146,7 +146,7 @@ class PixverseTextToVideoNode(comfy_io.ComfyNode):
                 comfy_io.String.Input(
                     "negative_prompt",
                     default="",
-                    force_input=True,
+                    multiline=True,
                     tooltip="An optional text description of undesired elements on an image.",
                     optional=True,
                 ),
@@ -284,7 +284,7 @@ class PixverseImageToVideoNode(comfy_io.ComfyNode):
                 comfy_io.String.Input(
                     "negative_prompt",
                     default="",
-                    force_input=True,
+                    multiline=True,
                     tooltip="An optional text description of undesired elements on an image.",
                     optional=True,
                 ),
@@ -425,7 +425,7 @@ class PixverseTransitionVideoNode(comfy_io.ComfyNode):
                 comfy_io.String.Input(
                     "negative_prompt",
                     default="",
-                    force_input=True,
+                    multiline=True,
                     tooltip="An optional text description of undesired elements on an image.",
                     optional=True,
                 ),

View File

@@ -107,7 +107,7 @@ def recraft_multipart_parser(data, parent_key=None, formatter: callable=None, co
         # if list already exists exists, just extend list with data
         for check_list in lists_to_check:
             for conv_tuple in check_list:
-                if conv_tuple[0] == parent_key and type(conv_tuple[1]) is list:
+                if conv_tuple[0] == parent_key and isinstance(conv_tuple[1], list):
                     conv_tuple[1].append(formatter(data))
                     return True
         return False
@@ -119,7 +119,7 @@ def recraft_multipart_parser(data, parent_key=None, formatter: callable=None, co
     if formatter is None:
         formatter = lambda v: v  # Multipart representation of value

-    if type(data) is not dict:
+    if not isinstance(data, dict):
         # if list already exists exists, just extend list with data
         added = handle_converted_lists(data, parent_key, converted_to_check)
         if added:
@@ -136,9 +136,9 @@ def recraft_multipart_parser(data, parent_key=None, formatter: callable=None, co
     for key, value in data.items():
         current_key = key if parent_key is None else f"{parent_key}[{key}]"

-        if type(value) is dict:
+        if isinstance(value, dict):
             converted.extend(recraft_multipart_parser(value, current_key, formatter, next_check).items())
-        elif type(value) is list:
+        elif isinstance(value, list):
             for ind, list_value in enumerate(value):
                 iter_key = f"{current_key}[]"
                 converted.extend(recraft_multipart_parser(list_value, iter_key, formatter, next_check, is_list=True).items())
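The type() comparisons were replaced with isinstance() (and the unidiomatic-typecheck suppression is dropped from pyproject.toml below). A minimal illustration of the behavioral difference, using a toy subclass rather than anything from the repo:

class TaggedList(list):
    pass

value = TaggedList([1, 2])
print(type(value) is list)      # False: an exact-type check rejects subclasses
print(isinstance(value, list))  # True: isinstance accepts subclasses as well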

View File

@@ -360,7 +360,7 @@ class RecordAudio:
     def load(self, audio):
         audio_path = folder_paths.get_annotated_filepath(audio)

-        waveform, sample_rate = torchaudio.load(audio_path)
+        waveform, sample_rate = load(audio_path)
         audio = {"waveform": waveform.unsqueeze(0), "sample_rate": sample_rate}
         return (audio, )

View File

@@ -1,3 +1,3 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.3.62"
+__version__ = "0.3.63"

View File

@@ -1,6 +1,6 @@
 [project]
 name = "ComfyUI"
-version = "0.3.62"
+version = "0.3.63"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.9"
@@ -57,18 +57,14 @@ messages_control.disable = [
     "redefined-builtin",
     "unnecessary-lambda",
     "dangerous-default-value",
+    "invalid-overridden-method",
     # next warnings should be fixed in future
     "bad-classmethod-argument", # Class method should have 'cls' as first argument
     "wrong-import-order", # Standard imports should be placed before third party imports
     "logging-fstring-interpolation", # Use lazy % formatting in logging functions
     "ungrouped-imports",
     "unnecessary-pass",
-    "unidiomatic-typecheck",
     "unnecessary-lambda-assignment",
     "no-else-return",
-    "no-else-raise",
-    "invalid-overridden-method",
     "unused-variable",
-    "pointless-string-statement",
-    "redefined-outer-name",
 ]

View File

@@ -1,5 +1,5 @@
 comfyui-frontend-package==1.27.7
-comfyui-workflow-templates==0.1.91
+comfyui-workflow-templates==0.1.93
 comfyui-embedded-docs==0.2.6
 comfyui_manager==4.0.2
 torch
@@ -26,6 +26,5 @@ av>=14.2.0
 #non essential dependencies:
 kornia>=0.7.1
 spandrel
-soundfile
 pydantic~=2.0
 pydantic-settings~=2.0