From 1978f59ffdf242389ded3eec76274a4cbed9cc3d Mon Sep 17 00:00:00 2001
From: ComfyUI Wiki
Date: Tue, 17 Feb 2026 06:33:43 +0800
Subject: [PATCH 001/317] chore: update workflow templates to v0.8.42 (#12491)

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index e939e486a..0930bbbb8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 comfyui-frontend-package==1.38.14
-comfyui-workflow-templates==0.8.38
+comfyui-workflow-templates==0.8.42
 comfyui-embedded-docs==0.4.1
 torch
 torchsde

From 4454fab7f003c655e07f059c315e2aae0e5fb087 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Mon, 16 Feb 2026 17:09:24 -0800
Subject: [PATCH 002/317] Remove code to support RMSNorm on old pytorch.
 (#12499)

---
 comfy/ops.py     |  6 ++----
 comfy/rmsnorm.py | 55 ++++--------------------------------------------
 2 files changed, 6 insertions(+), 55 deletions(-)

diff --git a/comfy/ops.py b/comfy/ops.py
index 688937e43..026062f56 100644
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -21,7 +21,6 @@ import logging
 import comfy.model_management
 from comfy.cli_args import args, PerformanceFeature, enables_dynamic_vram
 import comfy.float
-import comfy.rmsnorm
 import json
 import comfy.memory_management
 import comfy.pinned_memory
@@ -463,7 +462,7 @@ class disable_weight_init:
             else:
                 return super().forward(*args, **kwargs)
 
-    class RMSNorm(comfy.rmsnorm.RMSNorm, CastWeightBiasOp):
+    class RMSNorm(torch.nn.RMSNorm, CastWeightBiasOp):
         def reset_parameters(self):
             self.bias = None
             return None
@@ -475,8 +474,7 @@ class disable_weight_init:
                 weight = None
                 bias = None
                 offload_stream = None
-            x = comfy.rmsnorm.rms_norm(input, weight, self.eps)  # TODO: switch to commented out line when old torch is deprecated
-            # x = torch.nn.functional.rms_norm(input, self.normalized_shape, weight, self.eps)
+            x = torch.nn.functional.rms_norm(input, self.normalized_shape, weight, self.eps)
             uncast_bias_weight(self, weight, bias, offload_stream)
             return x
 
diff --git a/comfy/rmsnorm.py b/comfy/rmsnorm.py
index 555542a46..ab7cf14fa 100644
--- a/comfy/rmsnorm.py
+++ b/comfy/rmsnorm.py
@@ -1,57 +1,10 @@
 import torch
 import comfy.model_management
-import numbers
-import logging
-
-RMSNorm = None
-
-try:
-    rms_norm_torch = torch.nn.functional.rms_norm
-    RMSNorm = torch.nn.RMSNorm
-except:
-    rms_norm_torch = None
-    logging.warning("Please update pytorch to use native RMSNorm")
+
+RMSNorm = torch.nn.RMSNorm
 
 
 def rms_norm(x, weight=None, eps=1e-6):
-    if rms_norm_torch is not None and not (torch.jit.is_tracing() or torch.jit.is_scripting()):
-        if weight is None:
-            return rms_norm_torch(x, (x.shape[-1],), eps=eps)
-        else:
-            return rms_norm_torch(x, weight.shape, weight=comfy.model_management.cast_to(weight, dtype=x.dtype, device=x.device), eps=eps)
+    if weight is None:
+        return torch.nn.functional.rms_norm(x, (x.shape[-1],), eps=eps)
     else:
-        r = x * torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps)
-        if weight is None:
-            return r
-        else:
-            return r * comfy.model_management.cast_to(weight, dtype=x.dtype, device=x.device)
-
-
-if RMSNorm is None:
-    class RMSNorm(torch.nn.Module):
-        def __init__(
-            self,
-            normalized_shape,
-            eps=1e-6,
-            elementwise_affine=True,
-            device=None,
-            dtype=None,
-        ):
-            factory_kwargs = {"device": device, "dtype": dtype}
-            super().__init__()
-            if isinstance(normalized_shape, numbers.Integral):
-                # mypy error: incompatible types in assignment
-                normalized_shape = (normalized_shape,)  # type: ignore[assignment]
-            self.normalized_shape = tuple(normalized_shape)  # type: ignore[arg-type]
-            self.eps = eps
-            self.elementwise_affine = elementwise_affine
-            if self.elementwise_affine:
-                self.weight = torch.nn.Parameter(
-                    torch.empty(self.normalized_shape, **factory_kwargs)
-                )
-            else:
-                self.register_parameter("weight", None)
-            self.bias = None
-
-        def forward(self, x):
-            return rms_norm(x, self.weight, self.eps)
+        return torch.nn.functional.rms_norm(x, weight.shape, weight=comfy.model_management.cast_to(weight, dtype=x.dtype, device=x.device), eps=eps)

From b44fc4c589c66e39686239d6eff7d6088668c9a8 Mon Sep 17 00:00:00 2001
From: Alex Butler
Date: Tue, 17 Feb 2026 03:16:19 +0000
Subject: [PATCH 003/317] add venv* to gitignore (#12431)

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 4e8cea71e..2700ad5c2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,7 +11,7 @@ extra_model_paths.yaml
 /.vs
 .vscode/
 .idea/
-venv/
+venv*/
 .venv/
 /web/extensions/*
 !/web/extensions/logging.js.example

From 8a6fbc2dc29d0b15c1e9655c24e7501829249995 Mon Sep 17 00:00:00 2001
From: Jedrzej Kosinski
Date: Mon, 16 Feb 2026 19:20:21 -0800
Subject: [PATCH 004/317] Allow control_after_generate to be type
 ControlAfterGenerate in v3 schema (#12187)

---
 comfy_api/latest/_io.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py
index 95d79c035..d18330d0b 100644
--- a/comfy_api/latest/_io.py
+++ b/comfy_api/latest/_io.py
@@ -75,6 +75,12 @@ class NumberDisplay(str, Enum):
     slider = "slider"
 
 
+class ControlAfterGenerate(str, Enum):
+    fixed = "fixed"
+    increment = "increment"
+    decrement = "decrement"
+    randomize = "randomize"
+
 class _ComfyType(ABC):
     Type = Any
     io_type: str = None
@@ -263,7 +269,7 @@ class Int(ComfyTypeIO):
     class Input(WidgetInput):
         '''Integer input.'''
         def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None,
-                     default: int=None, min: int=None, max: int=None, step: int=None, control_after_generate: bool=None,
+                     default: int=None, min: int=None, max: int=None, step: int=None, control_after_generate: bool | ControlAfterGenerate=None,
                      display_mode: NumberDisplay=None, socketless: bool=None, force_input: bool=None, extra_dict=None, raw_link: bool=None, advanced: bool=None):
             super().__init__(id, display_name, optional, tooltip, lazy, default, socketless, None, force_input, extra_dict, raw_link, advanced)
             self.min = min
@@ -345,7 +351,7 @@ class Combo(ComfyTypeIO):
                     tooltip: str=None,
                     lazy: bool=None,
                     default: str | int | Enum = None,
-                    control_after_generate: bool=None,
+                    control_after_generate: bool | ControlAfterGenerate=None,
                     upload: UploadType=None,
                     image_folder: FolderType=None,
                     remote: RemoteOptions=None,
@@ -389,7 +395,7 @@ class MultiCombo(ComfyTypeI):
     Type = list[str]
     class Input(Combo.Input):
         def __init__(self, id: str, options: list[str], display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None,
-                     default: list[str]=None, placeholder: str=None, chip: bool=None, control_after_generate: bool=None,
+                     default: list[str]=None, placeholder: str=None, chip: bool=None, control_after_generate: bool | ControlAfterGenerate=None,
                      socketless: bool=None, extra_dict=None, raw_link: bool=None, advanced: bool=None):
             super().__init__(id, options, display_name, optional, tooltip, lazy, default, control_after_generate, socketless=socketless, extra_dict=extra_dict, raw_link=raw_link, advanced=advanced)
             self.multiselect = True
@@ -2097,6 +2103,7 @@ __all__ = [
     "UploadType",
     "RemoteOptions",
     "NumberDisplay",
+    "ControlAfterGenerate",
 
     "comfytype",
     "Custom",

From 18927538a15d44c734653513e9fdbbe1e79a9f0c Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Mon, 16 Feb 2026 20:30:34 -0800
Subject: [PATCH 005/317] Implement NAG on all the models based on the Flux
 code. (#12500)

Use the Normalized Attention Guidance node.

Flux, Flux2, Klein, Chroma, Chroma radiance, Hunyuan Video, etc.
---
 comfy/ldm/chroma/model.py        |  2 +
 comfy/ldm/flux/layers.py         | 18 ++++++
 comfy/ldm/flux/model.py          |  2 +
 comfy/ldm/hunyuan_video/model.py |  2 +
 comfy/model_patcher.py           |  5 +-
 comfy_extras/nodes_nag.py        | 99 ++++++++++++++++++++++++++++++++
 nodes.py                         |  1 +
 7 files changed, 128 insertions(+), 1 deletion(-)
 create mode 100644 comfy_extras/nodes_nag.py

diff --git a/comfy/ldm/chroma/model.py b/comfy/ldm/chroma/model.py
index 2e8ef0687..9fd865f20 100644
--- a/comfy/ldm/chroma/model.py
+++ b/comfy/ldm/chroma/model.py
@@ -152,6 +152,7 @@ class Chroma(nn.Module):
         transformer_options={},
         attn_mask: Tensor = None,
     ) -> Tensor:
+        transformer_options = transformer_options.copy()
         patches_replace = transformer_options.get("patches_replace", {})
 
         # running on sequences img
@@ -228,6 +229,7 @@ class Chroma(nn.Module):
 
         transformer_options["total_blocks"] = len(self.single_blocks)
         transformer_options["block_type"] = "single"
+        transformer_options["img_slice"] = [txt.shape[1], img.shape[1]]
         for i, block in enumerate(self.single_blocks):
             transformer_options["block_index"] = i
             if i not in self.skip_dit:
diff --git a/comfy/ldm/flux/layers.py b/comfy/ldm/flux/layers.py
index 3518a1922..8b3f500d7 100644
--- a/comfy/ldm/flux/layers.py
+++ b/comfy/ldm/flux/layers.py
@@ -196,6 +196,9 @@ class DoubleStreamBlock(nn.Module):
         else:
             (img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec
 
+        transformer_patches = transformer_options.get("patches", {})
+        extra_options = transformer_options.copy()
+
         # prepare image for attention
         img_modulated = self.img_norm1(img)
         img_modulated = apply_mod(img_modulated, (1 + img_mod1.scale), img_mod1.shift, modulation_dims_img)
@@ -224,6 +227,12 @@ class DoubleStreamBlock(nn.Module):
         attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options)
         del q, k, v
 
+        if "attn1_output_patch" in transformer_patches:
+            extra_options["img_slice"] = [txt.shape[1], attn.shape[1]]
+            patch = transformer_patches["attn1_output_patch"]
+            for p in patch:
+                attn = p(attn, extra_options)
+
         txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:]
 
         # calculate the img bloks
@@ -303,6 +312,9 @@ class SingleStreamBlock(nn.Module):
         else:
             mod = vec
 
+        transformer_patches = transformer_options.get("patches", {})
+        extra_options = transformer_options.copy()
+
         qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim_first], dim=-1)
 
         q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
@@ -312,6 +324,12 @@ class SingleStreamBlock(nn.Module):
         # compute attention
         attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options)
         del q, k, v
+
+        if "attn1_output_patch" in transformer_patches:
+            patch = transformer_patches["attn1_output_patch"]
+            for p in patch:
+                attn = p(attn, extra_options)
+
         # compute activation in mlp stream, cat again and run second linear layer
         if self.yak_mlp:
             mlp = self.mlp_act(mlp[..., self.mlp_hidden_dim_first // 2:]) * mlp[..., :self.mlp_hidden_dim_first // 2]
diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py
index 260ccad7e..ef4dcf7c5 100644
--- a/comfy/ldm/flux/model.py
+++ b/comfy/ldm/flux/model.py
@@ -142,6 +142,7 @@ class Flux(nn.Module):
         attn_mask: Tensor = None,
     ) -> Tensor:
 
+        transformer_options = transformer_options.copy()
         patches = transformer_options.get("patches", {})
         patches_replace = transformer_options.get("patches_replace", {})
         if img.ndim != 3 or txt.ndim != 3:
@@ -231,6 +232,7 @@ class Flux(nn.Module):
 
         transformer_options["total_blocks"] = len(self.single_blocks)
         transformer_options["block_type"] = "single"
+        transformer_options["img_slice"] = [txt.shape[1], img.shape[1]]
         for i, block in enumerate(self.single_blocks):
             transformer_options["block_index"] = i
             if ("single_block", i) in blocks_replace:
diff --git a/comfy/ldm/hunyuan_video/model.py b/comfy/ldm/hunyuan_video/model.py
index 563f28f6b..b94cdfa87 100644
--- a/comfy/ldm/hunyuan_video/model.py
+++ b/comfy/ldm/hunyuan_video/model.py
@@ -304,6 +304,7 @@ class HunyuanVideo(nn.Module):
         control=None,
         transformer_options={},
     ) -> Tensor:
+        transformer_options = transformer_options.copy()
        patches_replace = transformer_options.get("patches_replace", {})
 
        initial_shape = list(img.shape)
@@ -416,6 +417,7 @@ class HunyuanVideo(nn.Module):
 
         transformer_options["total_blocks"] = len(self.single_blocks)
         transformer_options["block_type"] = "single"
+        transformer_options["img_slice"] = [txt.shape[1], img.shape[1]]
         for i, block in enumerate(self.single_blocks):
             transformer_options["block_index"] = i
             if ("single_block", i) in blocks_replace:
diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py
index f01818f50..21b4ce53e 100644
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -406,13 +406,16 @@ class ModelPatcher:
     def memory_required(self, input_shape):
         return self.model.memory_required(input_shape=input_shape)
 
+    def disable_model_cfg1_optimization(self):
+        self.model_options["disable_cfg1_optimization"] = True
+
     def set_model_sampler_cfg_function(self, sampler_cfg_function, disable_cfg1_optimization=False):
         if len(inspect.signature(sampler_cfg_function).parameters) == 3:
             self.model_options["sampler_cfg_function"] = lambda args: sampler_cfg_function(args["cond"], args["uncond"], args["cond_scale"]) #Old way
         else:
             self.model_options["sampler_cfg_function"] = sampler_cfg_function
         if disable_cfg1_optimization:
-            self.model_options["disable_cfg1_optimization"] = True
+            self.disable_model_cfg1_optimization()
 
     def set_model_sampler_post_cfg_function(self, post_cfg_function, disable_cfg1_optimization=False):
         self.model_options = set_model_options_post_cfg_function(self.model_options, post_cfg_function, disable_cfg1_optimization)
diff --git a/comfy_extras/nodes_nag.py b/comfy_extras/nodes_nag.py
new file mode 100644
index 000000000..033e40eb9
--- /dev/null
+++ b/comfy_extras/nodes_nag.py
@@ -0,0 +1,99 @@
+import torch
+from comfy_api.latest import ComfyExtension, io
+from typing_extensions import override
+
+
+class NAGuidance(io.ComfyNode):
+    @classmethod
+    def define_schema(cls) -> io.Schema:
+        return io.Schema(
+            node_id="NAGuidance",
+            display_name="Normalized Attention Guidance",
+            description="Applies Normalized Attention Guidance to models, enabling negative prompts on distilled/schnell models.",
+            category="",
+            is_experimental=True,
+            inputs=[
+                io.Model.Input("model", tooltip="The model to apply NAG to."),
+                io.Float.Input("nag_scale", min=0.0, default=5.0, max=50.0, step=0.1, tooltip="The guidance scale factor. Higher values push further from the negative prompt."),
+                io.Float.Input("nag_alpha", min=0.0, default=0.5, max=1.0, step=0.01, tooltip="Blending factor for the normalized attention. 1.0 is full replacement, 0.0 is no effect."),
+                io.Float.Input("nag_tau", min=1.0, default=1.5, max=10.0, step=0.01),
+                # io.Float.Input("start_percent", min=0.0, default=0.0, max=1.0, step=0.01, tooltip="The relative sampling step to begin applying NAG."),
+                # io.Float.Input("end_percent", min=0.0, default=1.0, max=1.0, step=0.01, tooltip="The relative sampling step to stop applying NAG."),
+            ],
+            outputs=[
+                io.Model.Output(tooltip="The patched model with NAG enabled."),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, model: io.Model.Type, nag_scale: float, nag_alpha: float, nag_tau: float) -> io.NodeOutput:
+        m = model.clone()
+
+        # sigma_start = m.get_model_object("model_sampling").percent_to_sigma(start_percent)
+        # sigma_end = m.get_model_object("model_sampling").percent_to_sigma(end_percent)
+
+        def nag_attention_output_patch(out, extra_options):
+            cond_or_uncond = extra_options.get("cond_or_uncond", None)
+            if cond_or_uncond is None:
+                return out
+
+            if not (1 in cond_or_uncond and 0 in cond_or_uncond):
+                return out
+
+            # sigma = extra_options.get("sigmas", None)
+            # if sigma is not None and len(sigma) > 0:
+            #     sigma = sigma[0].item()
+            #     if sigma > sigma_start or sigma < sigma_end:
+            #         return out
+
+            img_slice = extra_options.get("img_slice", None)
+
+            if img_slice is not None:
+                orig_out = out
+                out = out[:, img_slice[0]:img_slice[1]] # only apply on img part
+
+            batch_size = out.shape[0]
+            half_size = batch_size // len(cond_or_uncond)
+
+            ind_neg = cond_or_uncond.index(1)
+            ind_pos = cond_or_uncond.index(0)
+            z_pos = out[half_size * ind_pos:half_size * (ind_pos + 1)]
+            z_neg = out[half_size * ind_neg:half_size * (ind_neg + 1)]
+
+            guided = z_pos * nag_scale - z_neg * (nag_scale - 1.0)
+
+            eps = 1e-6
+            norm_pos = torch.norm(z_pos, p=1, dim=-1, keepdim=True).clamp_min(eps)
+            norm_guided = torch.norm(guided, p=1, dim=-1, keepdim=True).clamp_min(eps)
+
+            ratio = norm_guided / norm_pos
+            scale_factor = torch.minimum(ratio, torch.full_like(ratio, nag_tau)) / ratio
+
+            guided_normalized = guided * scale_factor
+
+            z_final = guided_normalized * nag_alpha + z_pos * (1.0 - nag_alpha)
+
+            if img_slice is not None:
+                orig_out[half_size * ind_neg:half_size * (ind_neg + 1), img_slice[0]:img_slice[1]] = z_final
+                orig_out[half_size * ind_pos:half_size * (ind_pos + 1), img_slice[0]:img_slice[1]] = z_final
+                return orig_out
+            else:
+                out[half_size * ind_pos:half_size * (ind_pos + 1)] = z_final
+                return out
+
+        m.set_model_attn1_output_patch(nag_attention_output_patch)
+        m.disable_model_cfg1_optimization()
+
+        return io.NodeOutput(m)
+
+
+class NagExtension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [
+            NAGuidance,
+        ]
+
+
+async def comfy_entrypoint() -> NagExtension:
+    return NagExtension()
diff --git a/nodes.py b/nodes.py
index db5f98408..dff56b79c 100644
--- a/nodes.py
+++ b/nodes.py
@@ -2437,6 +2437,7 @@ async def init_builtin_extra_nodes():
         "nodes_color.py",
         "nodes_toolkit.py",
         "nodes_replacements.py",
+        "nodes_nag.py",
     ]
 
     import_failed = []

From c39653163d77161b2df2d57419129a4d6d081aa1 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Mon, 16 Feb 2026 21:29:20 -0800
Subject: [PATCH 006/317] Fix anima preprocess text embeds not using right
 inference dtype. (#12501)

---
 comfy/model_base.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/comfy/model_base.py b/comfy/model_base.py
index 4a74cb1ce..9dcef8741 100644
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -178,10 +178,7 @@ class BaseModel(torch.nn.Module):
             xc = torch.cat([xc] + [comfy.model_management.cast_to_device(c_concat, xc.device, xc.dtype)], dim=1)
 
         context = c_crossattn
-        dtype = self.get_dtype()
-
-        if self.manual_cast_dtype is not None:
-            dtype = self.manual_cast_dtype
+        dtype = self.get_dtype_inference()
 
         xc = xc.to(dtype)
         device = xc.device
@@ -218,6 +215,13 @@ class BaseModel(torch.nn.Module):
     def get_dtype(self):
         return self.diffusion_model.dtype
 
+    def get_dtype_inference(self):
+        dtype = self.get_dtype()
+
+        if self.manual_cast_dtype is not None:
+            dtype = self.manual_cast_dtype
+        return dtype
+
     def encode_adm(self, **kwargs):
         return None
 
@@ -372,9 +376,7 @@ class BaseModel(torch.nn.Module):
             input_shapes += shape
 
         if comfy.model_management.xformers_enabled() or comfy.model_management.pytorch_attention_flash_attention():
-            dtype = self.get_dtype()
-            if self.manual_cast_dtype is not None:
-                dtype = self.manual_cast_dtype
+            dtype = self.get_dtype_inference()
             #TODO: this needs to be tweaked
             area = sum(map(lambda input_shape: input_shape[0] * math.prod(input_shape[2:]), input_shapes))
             return (area * comfy.model_management.dtype_size(dtype) * 0.01 * self.memory_usage_factor) * (1024 * 1024)
@@ -1165,7 +1167,7 @@ class Anima(BaseModel):
             t5xxl_ids = t5xxl_ids.unsqueeze(0)
 
         if torch.is_inference_mode_enabled(): # if not we are training
-            cross_attn = self.diffusion_model.preprocess_text_embeds(cross_attn.to(device=device, dtype=self.get_dtype()), t5xxl_ids.to(device=device), t5xxl_weights=t5xxl_weights.to(device=device, dtype=self.get_dtype()))
+            cross_attn = self.diffusion_model.preprocess_text_embeds(cross_attn.to(device=device, dtype=self.get_dtype_inference()), t5xxl_ids.to(device=device), t5xxl_weights=t5xxl_weights.to(device=device, dtype=self.get_dtype_inference()))
         else:
             out['t5xxl_ids'] = comfy.conds.CONDRegular(t5xxl_ids)
             out['t5xxl_weights'] = comfy.conds.CONDRegular(t5xxl_weights)

From fe52843fe55b92dedaabff684294dd7a115d2204 Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Tue, 17 Feb 2026 00:39:54 -0500
Subject: [PATCH 007/317] ComfyUI v0.14.0

---
 comfyui_version.py | 2 +-
 pyproject.toml     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/comfyui_version.py b/comfyui_version.py
index cf4e89816..8f7f3228e 100644
--- a/comfyui_version.py
+++ b/comfyui_version.py
@@ -1,3 +1,3 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.13.0"
+__version__ = "0.14.0"
diff --git a/pyproject.toml b/pyproject.toml
index 9dab9a50c..b132bb9c4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "ComfyUI"
-version = "0.13.0"
+version = "0.14.0"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.10"

From 44f85985215b4d819665e4cec84c00ef87aa9a7a Mon Sep 17 00:00:00 2001
From: chaObserv <154517000+chaObserv@users.noreply.github.com>
Date: Tue, 17 Feb 2026 23:56:44 +0800
Subject: [PATCH 008/317] Fix anima LLM adapter forward when manual cast
 (#12504)

---
 comfy/ldm/anima/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/comfy/ldm/anima/model.py b/comfy/ldm/anima/model.py
index 6fb51c4a4..6fcf8df90 100644
--- a/comfy/ldm/anima/model.py
+++ b/comfy/ldm/anima/model.py
@@ -179,8 +179,8 @@ class LLMAdapter(nn.Module):
         if source_attention_mask.ndim == 2:
             source_attention_mask = source_attention_mask.unsqueeze(1).unsqueeze(1)
 
-        x = self.in_proj(self.embed(target_input_ids))
         context = source_hidden_states
+        x = self.in_proj(self.embed(target_input_ids, out_dtype=context.dtype))
         position_ids = torch.arange(x.shape[1], device=x.device).unsqueeze(0)
         position_ids_context = torch.arange(context.shape[1], device=x.device).unsqueeze(0)
         position_embeddings = self.rotary_emb(x, position_ids)

From 5284e6bf69b6e2e856c672595fd413fd505377ee Mon Sep 17 00:00:00 2001
From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com>
Date: Tue, 17 Feb 2026 20:07:14 +0200
Subject: [PATCH 009/317] feat(api-nodes): add "viduq3-turbo" model and
 Vidu3StartEnd node; fix the price badges (#12482)

---
 comfy_api_nodes/nodes_vidu.py | 226 ++++++++++++++++++++++++++++++++--
 1 file changed, 218 insertions(+), 8 deletions(-)

diff --git a/comfy_api_nodes/nodes_vidu.py b/comfy_api_nodes/nodes_vidu.py
index 80de14dfe..bbe7ebba2 100644
--- a/comfy_api_nodes/nodes_vidu.py
+++ b/comfy_api_nodes/nodes_vidu.py
@@ -54,6 +54,7 @@ async def execute_task(
         response_model=TaskStatusResponse,
         status_extractor=lambda r: r.state,
         progress_extractor=lambda r: r.progress,
+        price_extractor=lambda r: r.credits * 0.005 if r.credits is not None else None,
         max_poll_attempts=max_poll_attempts,
     )
     if not response.creations:
@@ -1306,6 +1307,36 @@ class Vidu3TextToVideoNode(IO.ComfyNode):
                             ),
                         ],
                     ),
+                    IO.DynamicCombo.Option(
+                        "viduq3-turbo",
+                        [
+                            IO.Combo.Input(
+                                "aspect_ratio",
+                                options=["16:9", "9:16", "3:4", "4:3", "1:1"],
+                                tooltip="The aspect ratio of the output video.",
+                            ),
+                            IO.Combo.Input(
+                                "resolution",
+                                options=["720p", "1080p"],
+                                tooltip="Resolution of the output video.",
+                            ),
+                            IO.Int.Input(
+                                "duration",
+                                default=5,
+                                min=1,
+                                max=16,
+                                step=1,
+                                display_mode=IO.NumberDisplay.slider,
+                                tooltip="Duration of the output video in seconds.",
+                            ),
+                            IO.Boolean.Input(
+                                "audio",
+                                default=False,
+                                tooltip="When enabled, outputs video with sound "
+                                "(including dialogue and sound effects).",
+                            ),
+                        ],
+                    ),
                 ],
                 tooltip="Model to use for video generation.",
             ),
@@ -1334,13 +1365,20 @@ class Vidu3TextToVideoNode(IO.ComfyNode):
             ],
             is_api_node=True,
             price_badge=IO.PriceBadge(
-                depends_on=IO.PriceBadgeDepends(widgets=["model.duration", "model.resolution"]),
+                depends_on=IO.PriceBadgeDepends(widgets=["model", "model.duration", "model.resolution"]),
                 expr="""
                 (
                     $res := $lookup(widgets, "model.resolution");
-                    $base := $lookup({"720p": 0.075, "1080p": 0.1}, $res);
-                    $perSec := $lookup({"720p": 0.025, "1080p": 0.05}, $res);
-                    {"type":"usd","usd": $base + $perSec * ($lookup(widgets, "model.duration") - 1)}
+                    $d := $lookup(widgets, "model.duration");
+                    $contains(widgets.model, "turbo")
+                    ? (
+                        $rate := $lookup({"720p": 0.06, "1080p": 0.08}, $res);
+                        {"type":"usd","usd": $rate * $d}
+                    )
+                    : (
+                        $rate := $lookup({"720p": 0.15, "1080p": 0.16}, $res);
+                        {"type":"usd","usd": $rate * $d}
+                    )
                 )
                 """,
             ),
@@ -1409,6 +1447,31 @@ class Vidu3ImageToVideoNode(IO.ComfyNode):
                             ),
                         ],
                     ),
+                    IO.DynamicCombo.Option(
+                        "viduq3-turbo",
+                        [
+                            IO.Combo.Input(
+                                "resolution",
+                                options=["720p", "1080p"],
+                                tooltip="Resolution of the output video.",
+                            ),
+                            IO.Int.Input(
+                                "duration",
+                                default=5,
+                                min=1,
+                                max=16,
+                                step=1,
+                                display_mode=IO.NumberDisplay.slider,
+                                tooltip="Duration of the output video in seconds.",
+                            ),
+                            IO.Boolean.Input(
+                                "audio",
+                                default=False,
+                                tooltip="When enabled, outputs video with sound "
+                                "(including dialogue and sound effects).",
+                            ),
+                        ],
+                    ),
                 ],
                 tooltip="Model to use for video generation.",
            ),
@@ -1442,13 +1505,20 @@ class Vidu3ImageToVideoNode(IO.ComfyNode):
             ],
             is_api_node=True,
             price_badge=IO.PriceBadge(
-                depends_on=IO.PriceBadgeDepends(widgets=["model.duration", "model.resolution"]),
+                depends_on=IO.PriceBadgeDepends(widgets=["model", "model.duration", "model.resolution"]),
                 expr="""
                 (
                     $res := $lookup(widgets, "model.resolution");
-                    $base := $lookup({"720p": 0.075, "1080p": 0.275, "2k": 0.35}, $res);
-                    $perSec := $lookup({"720p": 0.05, "1080p": 0.075, "2k": 0.075}, $res);
-                    {"type":"usd","usd": $base + $perSec * ($lookup(widgets, "model.duration") - 1)}
+                    $d := $lookup(widgets, "model.duration");
+                    $contains(widgets.model, "turbo")
+                    ? (
+                        $rate := $lookup({"720p": 0.06, "1080p": 0.08}, $res);
+                        {"type":"usd","usd": $rate * $d}
+                    )
+                    : (
+                        $rate := $lookup({"720p": 0.15, "1080p": 0.16, "2k": 0.2}, $res);
+                        {"type":"usd","usd": $rate * $d}
+                    )
                )
                """,
            ),
@@ -1481,6 +1551,145 @@ class Vidu3ImageToVideoNode(IO.ComfyNode):
         return IO.NodeOutput(await download_url_to_video_output(results[0].url))
 
 
+class Vidu3StartEndToVideoNode(IO.ComfyNode):
+
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="Vidu3StartEndToVideoNode",
+            display_name="Vidu Q3 Start/End Frame-to-Video Generation",
+            category="api node/video/Vidu",
+            description="Generate a video from a start frame, an end frame, and a prompt.",
+            inputs=[
+                IO.DynamicCombo.Input(
+                    "model",
+                    options=[
+                        IO.DynamicCombo.Option(
+                            "viduq3-pro",
+                            [
+                                IO.Combo.Input(
+                                    "resolution",
+                                    options=["720p", "1080p"],
+                                    tooltip="Resolution of the output video.",
+                                ),
+                                IO.Int.Input(
+                                    "duration",
+                                    default=5,
+                                    min=1,
+                                    max=16,
+                                    step=1,
+                                    display_mode=IO.NumberDisplay.slider,
+                                    tooltip="Duration of the output video in seconds.",
+                                ),
+                                IO.Boolean.Input(
+                                    "audio",
+                                    default=False,
+                                    tooltip="When enabled, outputs video with sound "
+                                    "(including dialogue and sound effects).",
+                                ),
+                            ],
+                        ),
+                        IO.DynamicCombo.Option(
+                            "viduq3-turbo",
+                            [
+                                IO.Combo.Input(
+                                    "resolution",
+                                    options=["720p", "1080p"],
+                                    tooltip="Resolution of the output video.",
+                                ),
+                                IO.Int.Input(
+                                    "duration",
+                                    default=5,
+                                    min=1,
+                                    max=16,
+                                    step=1,
+                                    display_mode=IO.NumberDisplay.slider,
+                                    tooltip="Duration of the output video in seconds.",
+                                ),
+                                IO.Boolean.Input(
+                                    "audio",
+                                    default=False,
+                                    tooltip="When enabled, outputs video with sound "
+                                    "(including dialogue and sound effects).",
+                                ),
+                            ],
+                        ),
+                    ],
+                    tooltip="Model to use for video generation.",
+                ),
+                IO.Image.Input("first_frame"),
+                IO.Image.Input("end_frame"),
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    tooltip="Prompt description (max 2000 characters).",
+                ),
+                IO.Int.Input(
describing the image to generate') - size: RecraftImageSize | None = Field(None, description='The size of the generated image (e.g., "1024x1024")') + size: str | None = Field(None, description='The size of the generated image (e.g., "1024x1024")') n: int = Field(..., description='The number of images to generate') negative_prompt: str | None = Field(None, description='A text description of undesired elements on an image') - model: RecraftModel | None = Field(RecraftModel.recraftv3, description='The model to use for generation (e.g., "recraftv3")') + model: str = Field(...) style: str | None = Field(None, description='The style to apply to the generated image (e.g., "digital_illustration")') substyle: str | None = Field(None, description='The substyle to apply to the generated image, depending on the style input') controls: RecraftControlsObject | None = Field(None, description='A set of custom parameters to tweak generation process') style_id: str | None = Field(None, description='Use a previously uploaded style as a reference; UUID') strength: float | None = Field(None, description='Defines the difference with the original image, should lie in [0, 1], where 0 means almost identical, and 1 means miserable similarity') random_seed: int | None = Field(None, description="Seed for video generation") - # text_layout class RecraftReturnedObject(BaseModel): diff --git a/comfy_api_nodes/nodes_recraft.py b/comfy_api_nodes/nodes_recraft.py index 3a1f32263..773cb7dbe 100644 --- a/comfy_api_nodes/nodes_recraft.py +++ b/comfy_api_nodes/nodes_recraft.py @@ -1,5 +1,4 @@ from io import BytesIO -from typing import Optional, Union import aiohttp import torch @@ -9,6 +8,8 @@ from typing_extensions import override from comfy.utils import ProgressBar from comfy_api.latest import IO, ComfyExtension from comfy_api_nodes.apis.recraft import ( + RECRAFT_V4_PRO_SIZES, + RECRAFT_V4_SIZES, RecraftColor, RecraftColorChain, RecraftControls, @@ -18,7 +19,6 @@ from comfy_api_nodes.apis.recraft import ( RecraftImageGenerationResponse, RecraftImageSize, RecraftIO, - RecraftModel, RecraftStyle, RecraftStyleV3, get_v3_substyles, @@ -39,7 +39,7 @@ async def handle_recraft_file_request( cls: type[IO.ComfyNode], image: torch.Tensor, path: str, - mask: Optional[torch.Tensor] = None, + mask: torch.Tensor | None = None, total_pixels: int = 4096 * 4096, timeout: int = 1024, request=None, @@ -73,11 +73,11 @@ async def handle_recraft_file_request( def recraft_multipart_parser( data, parent_key=None, - formatter: Optional[type[callable]] = None, - converted_to_check: Optional[list[list]] = None, + formatter: type[callable] | None = None, + converted_to_check: list[list] | None = None, is_list: bool = False, return_mode: str = "formdata", # "dict" | "formdata" -) -> Union[dict, aiohttp.FormData]: +) -> dict | aiohttp.FormData: """ Formats data such that multipart/form-data will work with aiohttp library when both files and data are present. 
@@ -309,7 +309,7 @@ class RecraftStyleInfiniteStyleLibrary(IO.ComfyNode): node_id="RecraftStyleV3InfiniteStyleLibrary", display_name="Recraft Style - Infinite Style Library", category="api node/image/Recraft", - description="Select style based on preexisting UUID from Recraft's Infinite Style Library.", + description="Choose style based on preexisting UUID from Recraft's Infinite Style Library.", inputs=[ IO.String.Input("style_id", default="", tooltip="UUID of style from Infinite Style Library."), ], @@ -485,7 +485,7 @@ class RecraftTextToImageNode(IO.ComfyNode): data=RecraftImageGenerationRequest( prompt=prompt, negative_prompt=negative_prompt, - model=RecraftModel.recraftv3, + model="recraftv3", size=size, n=n, style=recraft_style.style, @@ -598,7 +598,7 @@ class RecraftImageToImageNode(IO.ComfyNode): request = RecraftImageGenerationRequest( prompt=prompt, negative_prompt=negative_prompt, - model=RecraftModel.recraftv3, + model="recraftv3", n=n, strength=round(strength, 2), style=recraft_style.style, @@ -698,7 +698,7 @@ class RecraftImageInpaintingNode(IO.ComfyNode): request = RecraftImageGenerationRequest( prompt=prompt, negative_prompt=negative_prompt, - model=RecraftModel.recraftv3, + model="recraftv3", n=n, style=recraft_style.style, substyle=recraft_style.substyle, @@ -810,7 +810,7 @@ class RecraftTextToVectorNode(IO.ComfyNode): data=RecraftImageGenerationRequest( prompt=prompt, negative_prompt=negative_prompt, - model=RecraftModel.recraftv3, + model="recraftv3", size=size, n=n, style=recraft_style.style, @@ -933,7 +933,7 @@ class RecraftReplaceBackgroundNode(IO.ComfyNode): request = RecraftImageGenerationRequest( prompt=prompt, negative_prompt=negative_prompt, - model=RecraftModel.recraftv3, + model="recraftv3", n=n, style=recraft_style.style, substyle=recraft_style.substyle, @@ -1078,6 +1078,252 @@ class RecraftCreativeUpscaleNode(RecraftCrispUpscaleNode): ) +class RecraftV4TextToImageNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftV4TextToImageNode", + display_name="Recraft V4 Text to Image", + category="api node/image/Recraft", + description="Generates images using Recraft V4 or V4 Pro models.", + inputs=[ + IO.String.Input( + "prompt", + multiline=True, + tooltip="Prompt for the image generation. 
Maximum 10,000 characters.", + ), + IO.String.Input( + "negative_prompt", + multiline=True, + tooltip="An optional text description of undesired elements on an image.", + ), + IO.DynamicCombo.Input( + "model", + options=[ + IO.DynamicCombo.Option( + "recraftv4", + [ + IO.Combo.Input( + "size", + options=RECRAFT_V4_SIZES, + default="1024x1024", + tooltip="The size of the generated image.", + ), + ], + ), + IO.DynamicCombo.Option( + "recraftv4_pro", + [ + IO.Combo.Input( + "size", + options=RECRAFT_V4_PRO_SIZES, + default="2048x2048", + tooltip="The size of the generated image.", + ), + ], + ), + ], + tooltip="The model to use for generation.", + ), + IO.Int.Input( + "n", + default=1, + min=1, + max=6, + tooltip="The number of images to generate.", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; " + "actual results are nondeterministic regardless of seed.", + ), + IO.Custom(RecraftIO.CONTROLS).Input( + "recraft_controls", + tooltip="Optional additional controls over the generation via the Recraft Controls node.", + optional=True, + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["model", "n"]), + expr=""" + ( + $prices := {"recraftv4": 0.04, "recraftv4_pro": 0.25}; + {"type":"usd","usd": $lookup($prices, widgets.model) * widgets.n} + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + prompt: str, + negative_prompt: str, + model: dict, + n: int, + seed: int, + recraft_controls: RecraftControls | None = None, + ) -> IO.NodeOutput: + validate_string(prompt, strip_whitespace=False, min_length=1, max_length=10000) + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/recraft/image_generation", method="POST"), + response_model=RecraftImageGenerationResponse, + data=RecraftImageGenerationRequest( + prompt=prompt, + negative_prompt=negative_prompt if negative_prompt else None, + model=model["model"], + size=model["size"], + n=n, + controls=recraft_controls.create_api_model() if recraft_controls else None, + ), + max_retries=1, + ) + images = [] + for data in response.data: + with handle_recraft_image_output(): + image = bytesio_to_image_tensor(await download_url_as_bytesio(data.url, timeout=1024)) + if len(image.shape) < 4: + image = image.unsqueeze(0) + images.append(image) + return IO.NodeOutput(torch.cat(images, dim=0)) + + +class RecraftV4TextToVectorNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftV4TextToVectorNode", + display_name="Recraft V4 Text to Vector", + category="api node/image/Recraft", + description="Generates SVG using Recraft V4 or V4 Pro models.", + inputs=[ + IO.String.Input( + "prompt", + multiline=True, + tooltip="Prompt for the image generation. 
Maximum 10,000 characters.", + ), + IO.String.Input( + "negative_prompt", + multiline=True, + tooltip="An optional text description of undesired elements on an image.", + ), + IO.DynamicCombo.Input( + "model", + options=[ + IO.DynamicCombo.Option( + "recraftv4", + [ + IO.Combo.Input( + "size", + options=RECRAFT_V4_SIZES, + default="1024x1024", + tooltip="The size of the generated image.", + ), + ], + ), + IO.DynamicCombo.Option( + "recraftv4_pro", + [ + IO.Combo.Input( + "size", + options=RECRAFT_V4_PRO_SIZES, + default="2048x2048", + tooltip="The size of the generated image.", + ), + ], + ), + ], + tooltip="The model to use for generation.", + ), + IO.Int.Input( + "n", + default=1, + min=1, + max=6, + tooltip="The number of images to generate.", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; " + "actual results are nondeterministic regardless of seed.", + ), + IO.Custom(RecraftIO.CONTROLS).Input( + "recraft_controls", + tooltip="Optional additional controls over the generation via the Recraft Controls node.", + optional=True, + ), + ], + outputs=[ + IO.SVG.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["model", "n"]), + expr=""" + ( + $prices := {"recraftv4": 0.08, "recraftv4_pro": 0.30}; + {"type":"usd","usd": $lookup($prices, widgets.model) * widgets.n} + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + prompt: str, + negative_prompt: str, + model: dict, + n: int, + seed: int, + recraft_controls: RecraftControls | None = None, + ) -> IO.NodeOutput: + validate_string(prompt, strip_whitespace=False, min_length=1, max_length=10000) + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/recraft/image_generation", method="POST"), + response_model=RecraftImageGenerationResponse, + data=RecraftImageGenerationRequest( + prompt=prompt, + negative_prompt=negative_prompt if negative_prompt else None, + model=model["model"], + size=model["size"], + n=n, + style="vector_illustration", + substyle=None, + controls=recraft_controls.create_api_model() if recraft_controls else None, + ), + max_retries=1, + ) + svg_data = [] + for data in response.data: + svg_data.append(await download_url_as_bytesio(data.url, timeout=1024)) + return IO.NodeOutput(SVG(svg_data)) + + class RecraftExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[IO.ComfyNode]]: @@ -1098,6 +1344,8 @@ class RecraftExtension(ComfyExtension): RecraftCreateStyleNode, RecraftColorRGBNode, RecraftControlsNode, + RecraftV4TextToImageNode, + RecraftV4TextToVectorNode, ] From 73c3f869737bbb1035f6b72b2e1068a1a5642764 Mon Sep 17 00:00:00 2001 From: ComfyUI Wiki Date: Wed, 18 Feb 2026 02:25:55 +0800 Subject: [PATCH 011/317] chore: update workflow templates to v0.8.43 (#12507) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 0930bbbb8..881d6bd58 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.38.14 -comfyui-workflow-templates==0.8.42 +comfyui-workflow-templates==0.8.43 comfyui-embedded-docs==0.4.1 torch torchsde From 19236edfa4d2f66070d66a6b3aee592c9c2ad574 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Tue, 17 Feb 2026 13:28:06 -0500 Subject: [PATCH 012/317] ComfyUI v0.14.1 --- 
 comfyui_version.py | 2 +-
 pyproject.toml     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/comfyui_version.py b/comfyui_version.py
index 8f7f3228e..f24c15cc5 100644
--- a/comfyui_version.py
+++ b/comfyui_version.py
@@ -1,3 +1,3 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.14.0"
+__version__ = "0.14.1"
diff --git a/pyproject.toml b/pyproject.toml
index b132bb9c4..51c3d224d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "ComfyUI"
-version = "0.14.0"
+version = "0.14.1"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.10"

From 58dcc97dcfadc548ac8d8d5e80741ddfb807d213 Mon Sep 17 00:00:00 2001
From: rattus <46076784+rattus128@users.noreply.github.com>
Date: Tue, 17 Feb 2026 12:32:27 -0800
Subject: [PATCH 013/317] ops: limit return of requants (#12506)

This check was far too broad and the dtype is not a reliable indicator
of wanting the requant (as QT returns the compute dtype as the dtype).
So explicitly plumb whether fp8mm wants the requant or not.
---
 comfy/ops.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/comfy/ops.py b/comfy/ops.py
index 026062f56..a6c642795 100644
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -79,7 +79,7 @@ def cast_to_input(weight, input, non_blocking=False, copy=True):
     return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy)
 
 
-def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compute_dtype):
+def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compute_dtype, want_requant):
     offload_stream = None
     xfer_dest = None
 
@@ -170,10 +170,10 @@ def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compu
                 #FIXME: this is not accurate, we need to be sensitive to the compute dtype
                 x = lowvram_fn(x)
             if (isinstance(orig, QuantizedTensor) and
-                (orig.dtype == dtype and len(fns) == 0 or update_weight)):
+                (want_requant and len(fns) == 0 or update_weight)):
                 seed = comfy.utils.string_to_seed(s.seed_key)
                 y = QuantizedTensor.from_float(x, s.layout_type, scale="recalculate", stochastic_rounding=seed)
-                if orig.dtype == dtype and len(fns) == 0:
+                if want_requant and len(fns) == 0:
                     #The layer actually wants our freshly saved QT
                     x = y
                 elif update_weight:
@@ -194,7 +194,7 @@ def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compu
     return weight, bias, (offload_stream, device if signature is not None else None, None)
 
 
-def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, offloadable=False, compute_dtype=None):
+def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, offloadable=False, compute_dtype=None, want_requant=False):
    # NOTE: offloadable=False is a a legacy and if you are a custom node author reading this please pass
    # offloadable=True and call uncast_bias_weight() after your last usage of the weight/bias. This
    # will add async-offload support to your cast and improve performance.
@@ -212,7 +212,7 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of
     non_blocking = comfy.model_management.device_supports_non_blocking(device)
 
     if hasattr(s, "_v"):
-        return cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compute_dtype)
+        return cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compute_dtype, want_requant)
 
     if offloadable and (device != s.weight.device or
                         (s.bias is not None and device != s.bias.device)):
@@ -850,8 +850,8 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec
         def _forward(self, input, weight, bias):
             return torch.nn.functional.linear(input, weight, bias)
 
-        def forward_comfy_cast_weights(self, input, compute_dtype=None):
-            weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True, compute_dtype=compute_dtype)
+        def forward_comfy_cast_weights(self, input, compute_dtype=None, want_requant=False):
+            weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True, compute_dtype=compute_dtype, want_requant=want_requant)
             x = self._forward(input, weight, bias)
             uncast_bias_weight(self, weight, bias, offload_stream)
             return x
@@ -881,8 +881,7 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec
                 scale = comfy.model_management.cast_to_device(scale, input.device, None)
                 input = QuantizedTensor.from_float(input_reshaped, self.layout_type, scale=scale)
 
-
-            output = self.forward_comfy_cast_weights(input, compute_dtype)
+            output = self.forward_comfy_cast_weights(input, compute_dtype, want_requant=isinstance(input, QuantizedTensor))
 
             # Reshape output back to 3D if input was 3D
             if reshaped_3d:

From 6c14f129af4fd94c4197644e6950bddbba0c9e51 Mon Sep 17 00:00:00 2001
From: Comfy Org PR Bot
Date: Wed, 18 Feb 2026 06:41:34 +0900
Subject: [PATCH 014/317] Bump comfyui-frontend-package to 1.39.14 (#12494)

* Bump comfyui-frontend-package to 1.39.13

* Update requirements.txt

---------

Co-authored-by: Christian Byrne
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 881d6bd58..807fea5e0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-comfyui-frontend-package==1.38.14
+comfyui-frontend-package==1.39.14
 comfyui-workflow-templates==0.8.43
 comfyui-embedded-docs==0.4.1
 torch

From 8ad38d2073b019a204f730182dcf5456fb260858 Mon Sep 17 00:00:00 2001
From: Terry Jia
Date: Tue, 17 Feb 2026 20:13:39 -0500
Subject: [PATCH 015/317] BBox widget (#11594)

* Boundingbox widget

* code improve

---------

Co-authored-by: Jedrzej Kosinski
Co-authored-by: Christian Byrne
---
 comfy_api/latest/_io.py      | 25 ++++++++++++++++
 comfy_extras/nodes_images.py | 56 +++++++++++++++++++++++++++++++++++-
 2 files changed, 80 insertions(+), 1 deletion(-)

diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py
index d18330d0b..312681249 100644
--- a/comfy_api/latest/_io.py
+++ b/comfy_api/latest/_io.py
@@ -1209,6 +1209,30 @@ class Color(ComfyTypeIO):
         def as_dict(self):
             return super().as_dict()
 
+@comfytype(io_type="BOUNDING_BOX")
+class BoundingBox(ComfyTypeIO):
+    class BoundingBoxDict(TypedDict):
+        x: int
+        y: int
+        width: int
+        height: int
+    Type = BoundingBoxDict
+
+    class Input(WidgetInput):
+        def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None,
+                     socketless: bool=True, default: dict=None, component: str=None):
+            super().__init__(id, display_name, optional, tooltip, None, default, socketless)
+            self.component = component
+            if default is None:
+                self.default = {"x": 0, "y": 0, "width": 512, "height": 512}
+
+        def as_dict(self):
+            d = super().as_dict()
+            if self.component:
+                d["component"] = self.component
+            return d
+
+
 DYNAMIC_INPUT_LOOKUP: dict[str, Callable[[dict[str, Any], dict[str, Any], tuple[str, dict[str, Any]], str, list[str] | None], None]] = {}
 def register_dynamic_input_func(io_type: str, func: Callable[[dict[str, Any], dict[str, Any], tuple[str, dict[str, Any]], str, list[str] | None], None]):
     DYNAMIC_INPUT_LOOKUP[io_type] = func
@@ -2190,5 +2214,6 @@ __all__ = [
     "ImageCompare",
     "PriceBadgeDepends",
    "PriceBadge",
+    "BoundingBox",
    "NodeReplace",
 ]
diff --git a/comfy_extras/nodes_images.py b/comfy_extras/nodes_images.py
index cb4fb24a1..23419a65d 100644
--- a/comfy_extras/nodes_images.py
+++ b/comfy_extras/nodes_images.py
@@ -23,8 +23,9 @@ class ImageCrop(IO.ComfyNode):
         return IO.Schema(
             node_id="ImageCrop",
             search_aliases=["trim"],
-            display_name="Image Crop",
+            display_name="Image Crop (Deprecated)",
             category="image/transform",
+            is_deprecated=True,
             inputs=[
                 IO.Image.Input("image"),
                 IO.Int.Input("width", default=512, min=1, max=nodes.MAX_RESOLUTION, step=1),
@@ -47,6 +48,57 @@ class ImageCrop(IO.ComfyNode):
     crop = execute  # TODO: remove
 
 
+class ImageCropV2(IO.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="ImageCropV2",
+            search_aliases=["trim"],
+            display_name="Image Crop",
+            category="image/transform",
+            inputs=[
+                IO.Image.Input("image"),
+                IO.BoundingBox.Input("crop_region", component="ImageCrop"),
+            ],
+            outputs=[IO.Image.Output()],
+        )
+
+    @classmethod
+    def execute(cls, image, crop_region) -> IO.NodeOutput:
+        x = crop_region.get("x", 0)
+        y = crop_region.get("y", 0)
+        width = crop_region.get("width", 512)
+        height = crop_region.get("height", 512)
+
+        x = min(x, image.shape[2] - 1)
+        y = min(y, image.shape[1] - 1)
+        to_x = width + x
+        to_y = height + y
+        img = image[:,y:to_y, x:to_x, :]
+        return IO.NodeOutput(img, ui=UI.PreviewImage(img))
+
+
+class BoundingBox(IO.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="PrimitiveBoundingBox",
+            display_name="Bounding Box",
+            category="utils/primitive",
+            inputs=[
+                IO.Int.Input("x", default=0, min=0, max=MAX_RESOLUTION),
+                IO.Int.Input("y", default=0, min=0, max=MAX_RESOLUTION),
+                IO.Int.Input("width", default=512, min=1, max=MAX_RESOLUTION),
+                IO.Int.Input("height", default=512, min=1, max=MAX_RESOLUTION),
+            ],
+            outputs=[IO.BoundingBox.Output()],
+        )
+
+    @classmethod
+    def execute(cls, x, y, width, height) -> IO.NodeOutput:
+        return IO.NodeOutput({"x": x, "y": y, "width": width, "height": height})
+
+
 class RepeatImageBatch(IO.ComfyNode):
     @classmethod
     def define_schema(cls):
@@ -632,6 +684,8 @@ class ImagesExtension(ComfyExtension):
     async def get_node_list(self) -> list[type[IO.ComfyNode]]:
         return [
             ImageCrop,
+            ImageCropV2,
+            BoundingBox,
             RepeatImageBatch,
             ImageFromBatch,
             ImageAddNoise,

From 83dd65f23ae78186df5be7f579af5c0cdb61f0f9 Mon Sep 17 00:00:00 2001
From: Hunter
Date: Wed, 18 Feb 2026 00:03:54 -0500
Subject: [PATCH 016/317] fix: use glob matching for Gemini image MIME types
 (#12511)

gemini-3-pro-image-preview nondeterministically returns image/jpeg
instead of image/png. get_image_from_response() hardcoded
get_parts_by_type(response, "image/png"), silently dropping JPEG
responses and falling back to torch.zeros (all-black output).

Add _mime_matches() helper using fnmatch for glob-style MIME matching.
Change get_image_from_response() to request "image/*" so any image format returned by the API is correctly captured. --- comfy_api_nodes/nodes_gemini.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index 3b31caa7b..5287a777a 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -6,6 +6,7 @@ See: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/infer import base64 import os from enum import Enum +from fnmatch import fnmatch from io import BytesIO from typing import Literal @@ -119,6 +120,13 @@ async def create_image_parts( return image_parts +def _mime_matches(mime: GeminiMimeType | None, pattern: str) -> bool: + """Check if a MIME type matches a pattern. Supports fnmatch globs (e.g. 'image/*').""" + if mime is None: + return False + return fnmatch(mime.value, pattern) + + def get_parts_by_type(response: GeminiGenerateContentResponse, part_type: Literal["text"] | str) -> list[GeminiPart]: """ Filter response parts by their type. @@ -151,9 +159,9 @@ def get_parts_by_type(response: GeminiGenerateContentResponse, part_type: Litera for part in candidate.content.parts: if part_type == "text" and part.text: parts.append(part) - elif part.inlineData and part.inlineData.mimeType == part_type: + elif part.inlineData and _mime_matches(part.inlineData.mimeType, part_type): parts.append(part) - elif part.fileData and part.fileData.mimeType == part_type: + elif part.fileData and _mime_matches(part.fileData.mimeType, part_type): parts.append(part) if not parts and blocked_reasons: @@ -178,7 +186,7 @@ def get_text_from_response(response: GeminiGenerateContentResponse) -> str: async def get_image_from_response(response: GeminiGenerateContentResponse) -> Input.Image: image_tensors: list[Input.Image] = [] - parts = get_parts_by_type(response, "image/png") + parts = get_parts_by_type(response, "image/*") for part in parts: if part.inlineData: image_data = base64.b64decode(part.inlineData.data) From 239ddd332724c63934bf517cfc6d0026214d8aee Mon Sep 17 00:00:00 2001 From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com> Date: Wed, 18 Feb 2026 09:15:23 +0200 Subject: [PATCH 017/317] fix(api-nodes): add price badge for Rodin Gen-2 node (#12512) --- comfy_api_nodes/nodes_rodin.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/comfy_api_nodes/nodes_rodin.py b/comfy_api_nodes/nodes_rodin.py index f9cff121f..9c1adaa51 100644 --- a/comfy_api_nodes/nodes_rodin.py +++ b/comfy_api_nodes/nodes_rodin.py @@ -505,6 +505,9 @@ class Rodin3D_Gen2(IO.ComfyNode): IO.Hidden.unique_id, ], is_api_node=True, + price_badge=IO.PriceBadge( + expr="""{"type":"usd","usd":0.4}""", + ), ) @classmethod From f262444dd4818b6acdbc1350856679dd6245f7f5 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 18 Feb 2026 15:36:35 -0800 Subject: [PATCH 018/317] Add simple 3 band equalizer node for audio. 
From 239ddd332724c63934bf517cfc6d0026214d8aee Mon Sep 17 00:00:00 2001
From: Alexander Piskun <13381981+bigcat88@users.noreply.github.com>
Date: Wed, 18 Feb 2026 09:15:23 +0200
Subject: [PATCH 017/317] fix(api-nodes): add price badge for Rodin Gen-2 node (#12512)

---
 comfy_api_nodes/nodes_rodin.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/comfy_api_nodes/nodes_rodin.py b/comfy_api_nodes/nodes_rodin.py
index f9cff121f..9c1adaa51 100644
--- a/comfy_api_nodes/nodes_rodin.py
+++ b/comfy_api_nodes/nodes_rodin.py
@@ -505,6 +505,9 @@ class Rodin3D_Gen2(IO.ComfyNode):
                 IO.Hidden.unique_id,
             ],
             is_api_node=True,
+            price_badge=IO.PriceBadge(
+                expr="""{"type":"usd","usd":0.4}""",
+            ),
         )
 
     @classmethod

From f262444dd4818b6acdbc1350856679dd6245f7f5 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Wed, 18 Feb 2026 15:36:35 -0800
Subject: [PATCH 018/317] Add simple 3 band equalizer node for audio. (#12519)

---
 comfy_extras/nodes_audio.py | 62 +++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py
index b63dd8e97..7e74169f2 100644
--- a/comfy_extras/nodes_audio.py
+++ b/comfy_extras/nodes_audio.py
@@ -698,6 +698,67 @@ class EmptyAudio(IO.ComfyNode):
 
     create_empty_audio = execute  # TODO: remove
 
+class AudioEqualizer3Band(IO.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return IO.Schema(
+            node_id="AudioEqualizer3Band",
+            search_aliases=["eq", "bass boost", "treble boost", "equalizer"],
+            display_name="Audio Equalizer (3-Band)",
+            category="audio",
+            is_experimental=True,
+            inputs=[
+                IO.Audio.Input("audio"),
+                IO.Float.Input("low_gain_dB", default=0.0, min=-24.0, max=24.0, step=0.1, tooltip="Gain for Low frequencies (Bass)"),
+                IO.Int.Input("low_freq", default=100, min=20, max=500, tooltip="Cutoff frequency for Low shelf"),
+                IO.Float.Input("mid_gain_dB", default=0.0, min=-24.0, max=24.0, step=0.1, tooltip="Gain for Mid frequencies"),
+                IO.Int.Input("mid_freq", default=1000, min=200, max=4000, tooltip="Center frequency for Mids"),
+                IO.Float.Input("mid_q", default=0.707, min=0.1, max=10.0, step=0.1, tooltip="Q factor (bandwidth) for Mids"),
+                IO.Float.Input("high_gain_dB", default=0.0, min=-24.0, max=24.0, step=0.1, tooltip="Gain for High frequencies (Treble)"),
+                IO.Int.Input("high_freq", default=5000, min=1000, max=15000, tooltip="Cutoff frequency for High shelf"),
+            ],
+            outputs=[IO.Audio.Output()],
+        )
+
+    @classmethod
+    def execute(cls, audio, low_gain_dB, low_freq, mid_gain_dB, mid_freq, mid_q, high_gain_dB, high_freq) -> IO.NodeOutput:
+        waveform = audio["waveform"]
+        sample_rate = audio["sample_rate"]
+        eq_waveform = waveform.clone()
+
+        # 1. Apply Low Shelf (Bass)
+        if low_gain_dB != 0:
+            eq_waveform = torchaudio.functional.bass_biquad(
+                eq_waveform,
+                sample_rate,
+                gain=low_gain_dB,
+                central_freq=float(low_freq),
+                Q=0.707
+            )
+
+        # 2. Apply Peaking EQ (Mids)
+        if mid_gain_dB != 0:
+            eq_waveform = torchaudio.functional.equalizer_biquad(
+                eq_waveform,
+                sample_rate,
+                center_freq=float(mid_freq),
+                gain=mid_gain_dB,
+                Q=mid_q
+            )
+
+        # 3. Apply High Shelf (Treble)
+        if high_gain_dB != 0:
+            eq_waveform = torchaudio.functional.treble_biquad(
+                eq_waveform,
+                sample_rate,
+                gain=high_gain_dB,
+                central_freq=float(high_freq),
+                Q=0.707
+            )
+
+        return IO.NodeOutput({"waveform": eq_waveform, "sample_rate": sample_rate})
+
+
 class AudioExtension(ComfyExtension):
     @override
     async def get_node_list(self) -> list[type[IO.ComfyNode]]:
@@ -720,6 +781,7 @@ class AudioExtension(ComfyExtension):
             AudioMerge,
             AudioAdjustVolume,
             EmptyAudio,
+            AudioEqualizer3Band,
         ]
 
 async def comfy_entrypoint() -> AudioExtension:
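The node is a straight cascade of torchaudio biquads, so its output can be prototyped outside ComfyUI. A minimal sketch with made-up gain settings, assuming only that torch and torchaudio are installed:

    import torch
    import torchaudio

    sr = 44100
    wav = torch.randn(1, 2, sr)  # (batch, channels, samples): one second of noise

    # Same order as AudioEqualizer3Band.execute: low shelf -> peaking mid -> high shelf.
    wav = torchaudio.functional.bass_biquad(wav, sr, gain=6.0, central_freq=100.0, Q=0.707)
    wav = torchaudio.functional.equalizer_biquad(wav, sr, center_freq=1000.0, gain=-3.0, Q=0.707)
    wav = torchaudio.functional.treble_biquad(wav, sr, gain=2.0, central_freq=5000.0, Q=0.707)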
From 6d11cc73549e14a0a31e9ff8c90bfd71b380fe2c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com>
Date: Thu, 19 Feb 2026 03:49:43 +0200
Subject: [PATCH 019/317] feat: Add basic text generation support with native
 models, initially supporting Gemma3 (#12392)

---
 comfy/sd.py                             |  29 +++-
 comfy/sd1_clip.py                       |  18 +++
 comfy/text_encoders/llama.py            | 148 +++++++++++++++++++-
 comfy/text_encoders/lt.py               |  92 ++++++++++---
 comfy/text_encoders/lumina2.py          |  36 ++++-
 comfy/text_encoders/spiece_tokenizer.py |  27 +++-
 comfy/utils.py                          |   8 ++
 comfy_extras/nodes_textgen.py           | 176 ++++++++++++++++++++++++
 nodes.py                                |   1 +
 9 files changed, 502 insertions(+), 33 deletions(-)
 create mode 100644 comfy_extras/nodes_textgen.py

diff --git a/comfy/sd.py b/comfy/sd.py
index f65e7cadd..164f30803 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -423,6 +423,19 @@ class CLIP:
     def get_key_patches(self):
         return self.patcher.get_key_patches()
 
+    def generate(self, tokens, do_sample=True, max_length=256, temperature=1.0, top_k=50, top_p=0.95, min_p=0.0, repetition_penalty=1.0, seed=None):
+        self.cond_stage_model.reset_clip_options()
+
+        if self.layer_idx is not None:
+            self.cond_stage_model.set_clip_options({"layer": self.layer_idx})
+
+        self.load_model()
+        self.cond_stage_model.set_clip_options({"execution_device": self.patcher.load_device})
+        return self.cond_stage_model.generate(tokens, do_sample=do_sample, max_length=max_length, temperature=temperature, top_k=top_k, top_p=top_p, min_p=min_p, repetition_penalty=repetition_penalty, seed=seed)
+
+    def decode(self, token_ids, skip_special_tokens=True):
+        return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
+
 class VAE:
     def __init__(self, sd=None, device=None, config=None, dtype=None, metadata=None):
         if 'decoder.up_blocks.0.resnets.0.norm1.weight' in sd.keys(): #diffusers format
@@ -1182,6 +1195,7 @@ class TEModel(Enum):
     JINA_CLIP_2 = 19
     QWEN3_8B = 20
     QWEN3_06B = 21
+    GEMMA_3_4B_VISION = 22
 
 
 def detect_te_model(sd):
@@ -1210,7 +1224,10 @@ def detect_te_model(sd):
         if 'model.layers.47.self_attn.q_norm.weight' in sd:
             return TEModel.GEMMA_3_12B
         if 'model.layers.0.self_attn.q_norm.weight' in sd:
-            return TEModel.GEMMA_3_4B
+            if 'vision_model.embeddings.patch_embedding.weight' in sd:
+                return TEModel.GEMMA_3_4B_VISION
+            else:
+                return TEModel.GEMMA_3_4B
         return TEModel.GEMMA_2_2B
     if 'model.layers.0.self_attn.k_proj.bias' in sd:
         weight = sd['model.layers.0.self_attn.k_proj.bias']
@@ -1270,6 +1287,8 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
         else:
             if "text_projection" in clip_data[i]:
                 clip_data[i]["text_projection.weight"] = clip_data[i]["text_projection"].transpose(0, 1) #old models saved with the CLIPSave node
+            if "lm_head.weight" in clip_data[i]:
+                clip_data[i]["model.lm_head.weight"] = clip_data[i].pop("lm_head.weight") # prefix missing in some models
 
     tokenizer_data = {}
     clip_target = EmptyClass()
@@ -1335,6 +1354,14 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
             clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data), model_type="gemma3_4b")
             clip_target.tokenizer = comfy.text_encoders.lumina2.NTokenizer
             tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
+        elif te_model == TEModel.GEMMA_3_4B_VISION:
+            clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data), model_type="gemma3_4b_vision")
+            clip_target.tokenizer = comfy.text_encoders.lumina2.NTokenizer
+            tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
+        elif te_model == TEModel.GEMMA_3_12B:
+            clip_target.clip = comfy.text_encoders.lt.gemma3_te(**llama_detect(clip_data))
+            clip_target.tokenizer = comfy.text_encoders.lt.Gemma3_12BTokenizer
+            tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
         elif te_model == TEModel.LLAMA3_8:
             clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**llama_detect(clip_data),
                                                                         clip_l=False, clip_g=False, t5=False, llama=True, dtype_t5=None)

diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py
index b564d1529..d9d014055 100644
--- a/comfy/sd1_clip.py
+++ b/comfy/sd1_clip.py
@@ -308,6 +308,15 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
     def load_sd(self, sd):
         return self.transformer.load_state_dict(sd, strict=False, assign=getattr(self, "can_assign_sd", False))
 
+    def generate(self, tokens, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, stop_tokens=[]):
+        if isinstance(tokens, dict):
+            tokens_only = next(iter(tokens.values())) # todo: get this better?
+        else:
+            tokens_only = tokens
+        tokens_only = [[t[0] for t in b] for b in tokens_only]
+        embeds = self.process_tokens(tokens_only, device=self.execution_device)[0]
+        return self.transformer.generate(embeds, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, stop_tokens)
+
 def parse_parentheses(string):
     result = []
     current_item = ""
@@ -663,6 +672,9 @@ class SDTokenizer:
     def state_dict(self):
         return {}
 
+    def decode(self, token_ids, skip_special_tokens=True):
+        return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
+
 class SD1Tokenizer:
     def __init__(self, embedding_directory=None, tokenizer_data={}, clip_name="l", tokenizer=SDTokenizer, name=None):
         if name is not None:
@@ -686,6 +698,9 @@ class SD1Tokenizer:
     def state_dict(self):
         return getattr(self, self.clip).state_dict()
 
+    def decode(self, token_ids, skip_special_tokens=True):
+        return getattr(self, self.clip).decode(token_ids, skip_special_tokens=skip_special_tokens)
+
 class SD1CheckpointClipModel(SDClipModel):
     def __init__(self, device="cpu", dtype=None, model_options={}):
         super().__init__(device=device, return_projected_pooled=False, dtype=dtype, model_options=model_options)
@@ -722,3 +737,6 @@ class SD1ClipModel(torch.nn.Module):
 
     def load_sd(self, sd):
         return getattr(self, self.clip).load_sd(sd)
+
+    def generate(self, tokens, do_sample=True, max_length=256, temperature=1.0, top_k=50, top_p=0.95, min_p=0.0, repetition_penalty=1.0, seed=None):
+        return getattr(self, self.clip).generate(tokens, do_sample=do_sample, max_length=max_length, temperature=temperature, top_k=top_k, top_p=top_p, min_p=min_p, repetition_penalty=repetition_penalty, seed=seed)

diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py
index 54f3d5595..e5d21fa74 100644
--- a/comfy/text_encoders/llama.py
+++ b/comfy/text_encoders/llama.py
@@ -3,6 +3,8 @@ import torch.nn as nn
 from dataclasses import dataclass
 from typing import Optional, Any, Tuple
 import math
+from tqdm import tqdm
+import comfy.utils
 
 from comfy.ldm.modules.attention import optimized_attention_for_device
 import comfy.model_management
@@ -313,6 +315,13 @@ class Gemma3_4B_Config:
     final_norm: bool = True
     lm_head: bool = False
 
+GEMMA3_VISION_CONFIG = {"num_channels": 3, "hidden_act": "gelu_pytorch_tanh", "hidden_size": 1152, "image_size": 896, "intermediate_size": 4304, "model_type": "siglip_vision_model", "num_attention_heads": 16, "num_hidden_layers": 27, "patch_size": 14}
+
+@dataclass
+class Gemma3_4B_Vision_Config(Gemma3_4B_Config):
+    vision_config = GEMMA3_VISION_CONFIG
+    mm_tokens_per_image = 256
+
 @dataclass
 class Gemma3_12B_Config:
     vocab_size: int = 262208
@@ -336,7 +345,7 @@ class Gemma3_12B_Config:
     rope_scale = [8.0, 1.0]
     final_norm: bool = True
     lm_head: bool = False
-    vision_config = {"num_channels": 3, "hidden_act": "gelu_pytorch_tanh", "hidden_size": 1152, "image_size": 896, "intermediate_size": 4304, "model_type": "siglip_vision_model", "num_attention_heads": 16, "num_hidden_layers": 27, "patch_size": 14}
+    vision_config = GEMMA3_VISION_CONFIG
     mm_tokens_per_image = 256
 
 class RMSNorm(nn.Module):
@@ -441,8 +450,10 @@ class Attention(nn.Module):
         freqs_cis: Optional[torch.Tensor] = None,
         optimized_attention=None,
         past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        sliding_window: Optional[int] = None,
     ):
         batch_size, seq_length, _ = hidden_states.shape
+
         xq = self.q_proj(hidden_states)
         xk = self.k_proj(hidden_states)
         xv = self.v_proj(hidden_states)
@@ -477,6 +488,11 @@ class Attention(nn.Module):
         else:
             present_key_value = (xk, xv, index + num_tokens)
 
+        if sliding_window is not None and xk.shape[2] > sliding_window:
+            xk = xk[:, :, -sliding_window:]
+            xv = xv[:, :, -sliding_window:]
+            attention_mask = attention_mask[..., -sliding_window:] if attention_mask is not None else None
+
         xk = xk.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1)
         xv = xv.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1)
 
@@ -559,10 +575,12 @@ class TransformerBlockGemma2(nn.Module):
         optimized_attention=None,
         past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
     ):
+        sliding_window = None
        if self.transformer_type == 'gemma3':
            if self.sliding_attention:
+                sliding_window = self.sliding_attention
                if x.shape[1] > self.sliding_attention:
-                    sliding_mask = torch.full((x.shape[1], x.shape[1]), float("-inf"), device=x.device, dtype=x.dtype)
+                    sliding_mask = torch.full((x.shape[1], x.shape[1]), torch.finfo(x.dtype).min, device=x.device, dtype=x.dtype)
                    sliding_mask.tril_(diagonal=-self.sliding_attention)
                    if attention_mask is not None:
                        attention_mask = attention_mask + sliding_mask
@@ -581,6 +599,7 @@ class TransformerBlockGemma2(nn.Module):
             freqs_cis=freqs_cis,
             optimized_attention=optimized_attention,
             past_key_value=past_key_value,
+            sliding_window=sliding_window,
         )
 
         x = self.post_attention_layernorm(x)
@@ -765,6 +784,104 @@ class BaseLlama:
     def forward(self, input_ids, *args, **kwargs):
         return self.model(input_ids, *args, **kwargs)
 
+class BaseGenerate:
+    def logits(self, x):
+        input = x[:, -1:]
+        if hasattr(self.model, "lm_head"):
+            module = self.model.lm_head
+        else:
+            module = self.model.embed_tokens
+
+        offload_stream = None
+        if module.comfy_cast_weights:
+            weight, _, offload_stream = comfy.ops.cast_bias_weight(module, input, offloadable=True)
+        else:
+            weight = module.weight.to(x)  # use the selected head's weight (falls back to tied embeddings when lm_head is absent)
+
+        x = torch.nn.functional.linear(input, weight, None)
+        comfy.ops.uncast_bias_weight(module, weight, None, offload_stream)
+        return x
+
+    def generate(self, embeds=None, do_sample=True, max_length=256, temperature=1.0, top_k=50, top_p=0.9, min_p=0.0, repetition_penalty=1.0, seed=42, stop_tokens=[], initial_tokens=[], execution_dtype=None, min_tokens=0):
+        device = embeds.device
+        model_config = self.model.config
+
+        if execution_dtype is None:
+            if comfy.model_management.should_use_bf16(device):
+                execution_dtype = torch.bfloat16
+            else:
+                execution_dtype = torch.float32
+        embeds = embeds.to(execution_dtype)
+
+        if embeds.ndim == 2:
+            embeds = embeds.unsqueeze(0)
+
+        past_key_values = [] #kv_cache init
+        max_cache_len = embeds.shape[1] + max_length
+        for x in range(model_config.num_hidden_layers):
+            past_key_values.append((torch.empty([embeds.shape[0], model_config.num_key_value_heads, max_cache_len, model_config.head_dim], device=device, dtype=execution_dtype),
+                                    torch.empty([embeds.shape[0], model_config.num_key_value_heads, max_cache_len, model_config.head_dim], device=device, dtype=execution_dtype), 0))
+
+        generator = torch.Generator(device=device).manual_seed(seed) if do_sample else None
+
+        generated_token_ids = []
+        pbar = comfy.utils.ProgressBar(max_length)
+
+        # Generation loop
+        for step in tqdm(range(max_length), desc="Generating tokens"):
+            x, _, past_key_values = self.model.forward(None, embeds=embeds, attention_mask=None, past_key_values=past_key_values)
+            logits = self.logits(x)[:, -1]
+            next_token = self.sample_token(logits, temperature, top_k, top_p, min_p, repetition_penalty, initial_tokens + generated_token_ids, generator, do_sample=do_sample)
+            token_id = next_token[0].item()
+            generated_token_ids.append(token_id)
+
+            embeds = self.model.embed_tokens(next_token).to(execution_dtype)
+            pbar.update(1)
+
+            if token_id in stop_tokens:
+                break
+
+        return generated_token_ids
+
+    def sample_token(self, logits, temperature, top_k, top_p, min_p, repetition_penalty, token_history, generator, do_sample=True):
+        if not do_sample or temperature == 0.0:
+            return torch.argmax(logits, dim=-1, keepdim=True)
+
+        # Sampling mode
+        if repetition_penalty != 1.0:
+            for i in range(logits.shape[0]):
+                for token_id in set(token_history):
+                    logits[i, token_id] *= repetition_penalty if logits[i, token_id] < 0 else 1/repetition_penalty
+
+        if temperature != 1.0:
+            logits = logits / temperature
+
+        if top_k > 0:
+            indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
+            logits[indices_to_remove] = torch.finfo(logits.dtype).min
+
+        if min_p > 0.0:
+            probs_before_filter = torch.nn.functional.softmax(logits, dim=-1)
+            top_probs, _ = probs_before_filter.max(dim=-1, keepdim=True)
+            min_threshold = min_p * top_probs
+            indices_to_remove = probs_before_filter < min_threshold
+            logits[indices_to_remove] = torch.finfo(logits.dtype).min
+
+        if top_p < 1.0:
+            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+            cumulative_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1)
+            sorted_indices_to_remove = cumulative_probs > top_p
+            sorted_indices_to_remove[..., 0] = False
+            indices_to_remove = torch.zeros_like(logits, dtype=torch.bool)
+            indices_to_remove.scatter_(1, sorted_indices, sorted_indices_to_remove)
+            logits[indices_to_remove] = torch.finfo(logits.dtype).min
+
+        probs = torch.nn.functional.softmax(logits, dim=-1)
+
+        return torch.multinomial(probs, num_samples=1, generator=generator)
+
 class BaseQwen3:
     def logits(self, x):
         input = x[:, -1:]
@@ -871,7 +988,7 @@ class Ovis25_2B(BaseLlama, torch.nn.Module):
         self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
         self.dtype = dtype
 
-class Qwen25_7BVLI(BaseLlama, torch.nn.Module):
+class Qwen25_7BVLI(BaseLlama, BaseGenerate, torch.nn.Module):
     def __init__(self, config_dict, dtype, device, operations):
         super().__init__()
         config = Qwen25_7BVLI_Config(**config_dict)
@@ -881,6 +998,9 @@ class Qwen25_7BVLI(BaseLlama, torch.nn.Module):
         self.visual = qwen_vl.Qwen2VLVisionTransformer(hidden_size=1280, output_hidden_size=config.hidden_size, device=device, dtype=dtype, ops=operations)
         self.dtype = dtype
 
+        # todo: should this be tied or not?
+        #self.lm_head = operations.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype)
+
     def preprocess_embed(self, embed, device):
         if embed["type"] == "image":
             image, grid = qwen_vl.process_qwen2vl_images(embed["data"])
@@ -923,7 +1043,7 @@ class Gemma2_2B(BaseLlama, torch.nn.Module):
         self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
         self.dtype = dtype
 
-class Gemma3_4B(BaseLlama, torch.nn.Module):
+class Gemma3_4B(BaseLlama, BaseGenerate, torch.nn.Module):
     def __init__(self, config_dict, dtype, device, operations):
         super().__init__()
         config = Gemma3_4B_Config(**config_dict)
@@ -932,7 +1052,25 @@
         self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
         self.dtype = dtype
 
-class Gemma3_12B(BaseLlama, torch.nn.Module):
+class Gemma3_4B_Vision(BaseLlama, BaseGenerate, torch.nn.Module):
+    def __init__(self, config_dict, dtype, device, operations):
+        super().__init__()
+        config = Gemma3_4B_Vision_Config(**config_dict)
+        self.num_layers = config.num_hidden_layers
+
+        self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
+        self.dtype = dtype
+        self.multi_modal_projector = Gemma3MultiModalProjector(config, dtype, device, operations)
+        self.vision_model = comfy.clip_model.CLIPVision(config.vision_config, dtype, device, operations)
+        self.image_size = config.vision_config["image_size"]
+
+    def preprocess_embed(self, embed, device):
+        if embed["type"] == "image":
+            image = comfy.clip_model.clip_preprocess(embed["data"], size=self.image_size, mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], crop=True)
+            return self.multi_modal_projector(self.vision_model(image.to(device, dtype=torch.float32))[0]), None
+        return None, None
+
+class Gemma3_12B(BaseLlama, BaseGenerate, torch.nn.Module):
     def __init__(self, config_dict, dtype, device, operations):
         super().__init__()
         config = Gemma3_12B_Config(**config_dict)
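For readers skimming sample_token above: the truncation filters run in the order top-k, then min-p, then top-p, each masking rejected logits to the dtype minimum before the final softmax/multinomial draw. A compact standalone sketch of that filtering order (illustrative names and values, not part of the patch):

    import torch

    def filter_logits(logits, top_k=50, top_p=0.9, min_p=0.05):
        neg_inf = torch.finfo(logits.dtype).min
        if top_k > 0:  # keep only the k highest-scoring tokens
            kth = torch.topk(logits, top_k)[0][..., -1, None]
            logits = logits.masked_fill(logits < kth, neg_inf)
        if min_p > 0.0:  # drop tokens much less likely than the current best
            probs = torch.softmax(logits, dim=-1)
            logits = logits.masked_fill(probs < min_p * probs.amax(-1, keepdim=True), neg_inf)
        if top_p < 1.0:  # nucleus: keep the smallest set covering top_p probability mass
            sorted_logits, sorted_idx = torch.sort(logits, descending=True)
            cum = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
            remove = cum > top_p
            remove[..., 0] = False  # always keep the single most likely token
            logits = logits.masked_fill(torch.zeros_like(remove).scatter_(-1, sorted_idx, remove), neg_inf)
        return logits

    probs = torch.softmax(filter_logits(torch.randn(1, 262208)), dim=-1)  # 262208 = Gemma3 vocab size
    next_token = torch.multinomial(probs, num_samples=1)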
tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data) - +class Gemma3_Tokenizer(): def state_dict(self): return {"spiece_model": self.tokenizer.serialize_model()} + def tokenize_with_weights(self, text, return_word_ids=False, image=None, llama_template=None, skip_template=True, **kwargs): + self.llama_template = "system\nYou are a helpful assistant.\nuser\n{}\nmodel\n" + self.llama_template_images = "system\nYou are a helpful assistant.\nuser\n\n{}\n\nmodel\n" + + if image is None: + images = [] + else: + samples = image.movedim(-1, 1) + total = int(896 * 896) + + scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2])) + width = round(samples.shape[3] * scale_by) + height = round(samples.shape[2] * scale_by) + + s = comfy.utils.common_upscale(samples, width, height, "area", "disabled").movedim(1, -1) + images = [s[:, :, :, :3]] + + if text.startswith(''): + skip_template = True + + if skip_template: + llama_text = text + else: + if llama_template is None: + if len(images) > 0: + llama_text = self.llama_template_images.format(text) + else: + llama_text = self.llama_template.format(text) + else: + llama_text = llama_template.format(text) + + text_tokens = super().tokenize_with_weights(llama_text, return_word_ids) + + if len(images) > 0: + embed_count = 0 + for r in text_tokens: + for i, token in enumerate(r): + if token[0] == 262144 and embed_count < len(images): + r[i] = ({"type": "image", "data": images[embed_count]},) + token[1:] + embed_count += 1 + return text_tokens + +class Gemma3_12BTokenizer(Gemma3_Tokenizer, sd1_clip.SDTokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + tokenizer = tokenizer_data.get("spiece_model", None) + special_tokens = {"": 262144, "": 106} + super().__init__(tokenizer, pad_with_end=False, embedding_size=3840, embedding_key='gemma3_12b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=512, pad_left=True, disable_weights=True, tokenizer_args={"add_bos": True, "add_eos": False, "special_tokens": special_tokens}, tokenizer_data=tokenizer_data) + + class LTXAVGemmaTokenizer(sd1_clip.SD1Tokenizer): def __init__(self, embedding_directory=None, tokenizer_data={}): super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="gemma3_12b", tokenizer=Gemma3_12BTokenizer) + class Gemma3_12BModel(sd1_clip.SDClipModel): def __init__(self, device="cpu", layer="all", layer_idx=None, dtype=None, attention_mask=True, model_options={}): llama_quantization_metadata = model_options.get("llama_quantization_metadata", None) if llama_quantization_metadata is not None: model_options = model_options.copy() model_options["quantization_metadata"] = llama_quantization_metadata - + self.dtypes = set() + self.dtypes.add(dtype) super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"start": 2, "pad": 0}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Gemma3_12B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options) - def tokenize_with_weights(self, text, return_word_ids=False, llama_template="{}", image_embeds=None, **kwargs): - text = llama_template.format(text) - text_tokens = super().tokenize_with_weights(text, return_word_ids) - embed_count = 0 - for k in text_tokens: - tt = text_tokens[k] - for r in tt: - for i in range(len(r)): - if r[i][0] == 262144: - if image_embeds is not None 
and embed_count < image_embeds.shape[0]: - r[i] = ({"type": "embedding", "data": image_embeds[embed_count], "original_type": "image"},) + r[i][1:] - embed_count += 1 - return text_tokens + def generate(self, tokens, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed): + tokens_only = [[t[0] for t in b] for b in tokens] + embeds, _, _, embeds_info = self.process_tokens(tokens_only, self.execution_device) + comfy.utils.normalize_image_embeddings(embeds, embeds_info, self.transformer.model.config.hidden_size ** 0.5) + return self.transformer.generate(embeds, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, stop_tokens=[106]) # 106 is class LTXAVTEModel(torch.nn.Module): def __init__(self, dtype_llama=None, device="cpu", dtype=None, model_options={}): @@ -112,6 +152,9 @@ class LTXAVTEModel(torch.nn.Module): return out.to(out_device), pooled + def generate(self, tokens, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed): + return self.gemma3_12b.generate(tokens["gemma3_12b"], do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed) + def load_sd(self, sd): if "model.layers.47.self_attn.q_norm.weight" in sd: return self.gemma3_12b.load_sd(sd) @@ -152,3 +195,14 @@ def ltxav_te(dtype_llama=None, llama_quantization_metadata=None): dtype = dtype_llama super().__init__(dtype_llama=dtype_llama, device=device, dtype=dtype, model_options=model_options) return LTXAVTEModel_ + +def gemma3_te(dtype_llama=None, llama_quantization_metadata=None): + class Gemma3_12BModel_(Gemma3_12BModel): + def __init__(self, device="cpu", dtype=None, model_options={}): + if llama_quantization_metadata is not None: + model_options = model_options.copy() + model_options["llama_quantization_metadata"] = llama_quantization_metadata + if dtype_llama is not None: + dtype = dtype_llama + super().__init__(device=device, dtype=dtype, model_options=model_options) + return Gemma3_12BModel_ diff --git a/comfy/text_encoders/lumina2.py b/comfy/text_encoders/lumina2.py index b29a7cc87..1b731e094 100644 --- a/comfy/text_encoders/lumina2.py +++ b/comfy/text_encoders/lumina2.py @@ -1,23 +1,23 @@ from comfy import sd1_clip from .spiece_tokenizer import SPieceTokenizer import comfy.text_encoders.llama - +from comfy.text_encoders.lt import Gemma3_Tokenizer +import comfy.utils class Gemma2BTokenizer(sd1_clip.SDTokenizer): def __init__(self, embedding_directory=None, tokenizer_data={}): tokenizer = tokenizer_data.get("spiece_model", None) - super().__init__(tokenizer, pad_with_end=False, embedding_size=2304, embedding_key='gemma2_2b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data) + special_tokens = {"": 107} + super().__init__(tokenizer, pad_with_end=False, embedding_size=2304, embedding_key='gemma2_2b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_args={"add_bos": True, "add_eos": False, "special_tokens": special_tokens}, tokenizer_data=tokenizer_data) def state_dict(self): return {"spiece_model": self.tokenizer.serialize_model()} -class Gemma3_4BTokenizer(sd1_clip.SDTokenizer): +class Gemma3_4BTokenizer(Gemma3_Tokenizer, sd1_clip.SDTokenizer): def __init__(self, embedding_directory=None, tokenizer_data={}): tokenizer = tokenizer_data.get("spiece_model", None) - super().__init__(tokenizer, 
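To make the template plumbing concrete: with skip_template=False and no image, roughly the following reaches the SentencePiece tokenizer. The special-token names follow the Gemma3 vocabulary (id 106 is <end_of_turn>, id 262144 is <image_soft_token>); treat the exact template text as a reconstruction rather than a verbatim quote:

    template = ("<start_of_turn>system\nYou are a helpful assistant.<end_of_turn>\n"
                "<start_of_turn>user\n{}<end_of_turn>\n<start_of_turn>model\n")
    llama_text = template.format("Describe a rainy street at night.")
    # SPieceTokenizer's special_tokens table turns each literal <end_of_turn>
    # into id 106 rather than spelling it out character by character, and
    # generate(stop_tokens=[106]) later halts when the model emits that same id.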
diff --git a/comfy/text_encoders/lumina2.py b/comfy/text_encoders/lumina2.py
index b29a7cc87..1b731e094 100644
--- a/comfy/text_encoders/lumina2.py
+++ b/comfy/text_encoders/lumina2.py
@@ -1,23 +1,23 @@
 from comfy import sd1_clip
 from .spiece_tokenizer import SPieceTokenizer
 import comfy.text_encoders.llama
-
+from comfy.text_encoders.lt import Gemma3_Tokenizer
+import comfy.utils
 
 class Gemma2BTokenizer(sd1_clip.SDTokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
         tokenizer = tokenizer_data.get("spiece_model", None)
-        super().__init__(tokenizer, pad_with_end=False, embedding_size=2304, embedding_key='gemma2_2b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data)
+        special_tokens = {"<end_of_turn>": 107}
+        super().__init__(tokenizer, pad_with_end=False, embedding_size=2304, embedding_key='gemma2_2b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_args={"add_bos": True, "add_eos": False, "special_tokens": special_tokens}, tokenizer_data=tokenizer_data)
 
     def state_dict(self):
         return {"spiece_model": self.tokenizer.serialize_model()}
 
-class Gemma3_4BTokenizer(sd1_clip.SDTokenizer):
+class Gemma3_4BTokenizer(Gemma3_Tokenizer, sd1_clip.SDTokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
         tokenizer = tokenizer_data.get("spiece_model", None)
-        super().__init__(tokenizer, pad_with_end=False, embedding_size=2560, embedding_key='gemma3_4b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_args={"add_bos": True, "add_eos": False}, disable_weights=True, tokenizer_data=tokenizer_data)
-
-    def state_dict(self):
-        return {"spiece_model": self.tokenizer.serialize_model()}
+        special_tokens = {"<image_soft_token>": 262144, "<end_of_turn>": 106}
+        super().__init__(tokenizer, pad_with_end=False, embedding_size=2560, embedding_key='gemma3_4b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_args={"add_bos": True, "add_eos": False, "special_tokens": special_tokens}, disable_weights=True, tokenizer_data=tokenizer_data)
 
 class LuminaTokenizer(sd1_clip.SD1Tokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
@@ -31,6 +31,9 @@ class Gemma2_2BModel(sd1_clip.SDClipModel):
     def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options={}):
         super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"start": 2, "pad": 0}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Gemma2_2B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
 
+    def generate(self, embeds, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed):
+        return super().generate(embeds, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, stop_tokens=[107])
+
 class Gemma3_4BModel(sd1_clip.SDClipModel):
     def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options={}):
         llama_quantization_metadata = model_options.get("llama_quantization_metadata", None)
@@ -40,6 +43,23 @@ class Gemma3_4BModel(sd1_clip.SDClipModel):
 
         super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"start": 2, "pad": 0}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Gemma3_4B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
 
+    def generate(self, embeds, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed):
+        return super().generate(embeds, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, stop_tokens=[106])
+
+class Gemma3_4B_Vision_Model(sd1_clip.SDClipModel):
+    def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options={}):
+        llama_quantization_metadata = model_options.get("llama_quantization_metadata", None)
+        if llama_quantization_metadata is not None:
+            model_options = model_options.copy()
+            model_options["quantization_metadata"] = llama_quantization_metadata
+
+        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"start": 2, "pad": 0}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Gemma3_4B_Vision, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
+
+    def process_tokens(self, tokens, device):
+        embeds, _, _, embeds_info = super().process_tokens(tokens, device)
+        comfy.utils.normalize_image_embeddings(embeds, embeds_info, self.transformer.model.config.hidden_size ** 0.5)
+        return embeds
+
 class LuminaModel(sd1_clip.SD1ClipModel):
     def __init__(self, device="cpu", dtype=None, model_options={}, name="gemma2_2b", clip_model=Gemma2_2BModel):
         super().__init__(device=device, dtype=dtype, name=name, clip_model=clip_model, model_options=model_options)
@@ -50,6 +70,8 @@ def te(dtype_llama=None, llama_quantization_metadata=None, model_type="gemma2_2b
         model = Gemma2_2BModel
     elif model_type == "gemma3_4b":
         model = Gemma3_4BModel
+    elif model_type == "gemma3_4b_vision":
+        model = Gemma3_4B_Vision_Model
 
     class LuminaTEModel_(LuminaModel):
         def __init__(self, device="cpu", dtype=None, model_options={}):

diff --git a/comfy/text_encoders/spiece_tokenizer.py b/comfy/text_encoders/spiece_tokenizer.py
index caccb3ca2..099d8d2d9 100644
--- a/comfy/text_encoders/spiece_tokenizer.py
+++ b/comfy/text_encoders/spiece_tokenizer.py
@@ -6,9 +6,10 @@ class SPieceTokenizer:
     def from_pretrained(path, **kwargs):
         return SPieceTokenizer(path, **kwargs)
 
-    def __init__(self, tokenizer_path, add_bos=False, add_eos=True):
+    def __init__(self, tokenizer_path, add_bos=False, add_eos=True, special_tokens=None):
         self.add_bos = add_bos
         self.add_eos = add_eos
+        self.special_tokens = special_tokens
         import sentencepiece
         if torch.is_tensor(tokenizer_path):
             tokenizer_path = tokenizer_path.numpy().tobytes()
@@ -27,8 +28,32 @@ class SPieceTokenizer:
         return out
 
     def __call__(self, string):
+        if self.special_tokens is not None:
+            import re
+            special_tokens_pattern = '|'.join(re.escape(token) for token in self.special_tokens.keys())
+            if special_tokens_pattern and re.search(special_tokens_pattern, string):
+                parts = re.split(f'({special_tokens_pattern})', string)
+                result = []
+                for part in parts:
+                    if not part:
+                        continue
+                    if part in self.special_tokens:
+                        result.append(self.special_tokens[part])
+                    else:
+                        encoded = self.tokenizer.encode(part, add_bos=False, add_eos=False)
+                        result.extend(encoded)
+                return {"input_ids": result}
+
         out = self.tokenizer.encode(string)
         return {"input_ids": out}
 
+    def decode(self, token_ids, skip_special_tokens=False):
+        if skip_special_tokens and self.special_tokens:
+            special_token_ids = set(self.special_tokens.values())
+            token_ids = [tid for tid in token_ids if tid not in special_token_ids]
+
+        return self.tokenizer.decode(token_ids)
+
     def serialize_model(self):
         return torch.ByteTensor(list(self.tokenizer.serialized_model_proto()))

diff --git a/comfy/utils.py b/comfy/utils.py
index c1ce540b5..17443b4cc 100644
--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -1418,3 +1418,11 @@ def deepcopy_list_dict(obj, memo=None):
 
     memo[obj_id] = res
     return res
+
+def normalize_image_embeddings(embeds, embeds_info, scale_factor):
+    """Normalize image embeddings to match text embedding scale"""
+    for info in embeds_info:
+        if info.get("type") == "image":
+            start_idx = info["index"]
+            end_idx = start_idx + info["size"]
+            embeds[:, start_idx:end_idx, :] /= scale_factor
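The scale_factor passed by callers is hidden_size ** 0.5, which appears to offset Gemma's embedding normalizer: token embeddings are multiplied by sqrt(hidden_size) inside the model, while projected image embeddings arrive unscaled. A toy illustration of the helper with made-up shapes:

    import torch

    hidden_size = 2560                                        # Gemma3 4B width
    embeds = torch.randn(1, 10, hidden_size)                  # mixed text+image sequence
    embeds_info = [{"type": "image", "index": 2, "size": 4}]  # tokens 2..5 came from an image

    for info in embeds_info:  # same loop as normalize_image_embeddings above
        if info.get("type") == "image":
            start = info["index"]
            embeds[:, start:start + info["size"], :] /= hidden_size ** 0.5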
diff --git a/comfy_extras/nodes_textgen.py b/comfy_extras/nodes_textgen.py
new file mode 100644
index 000000000..dd4f6b0d3
--- /dev/null
+++ b/comfy_extras/nodes_textgen.py
@@ -0,0 +1,176 @@
+from comfy_api.latest import ComfyExtension, io
+from typing_extensions import override
+
+class TextGenerate(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        # Define dynamic combo options for sampling mode
+        sampling_options = [
+            io.DynamicCombo.Option(
+                key="on",
+                inputs=[
+                    io.Float.Input("temperature", default=0.7, min=0.01, max=2.0, step=0.000001),
+                    io.Int.Input("top_k", default=64, min=0, max=1000),
+                    io.Float.Input("top_p", default=0.95, min=0.0, max=1.0, step=0.01),
+                    io.Float.Input("min_p", default=0.05, min=0.0, max=1.0, step=0.01),
+                    io.Float.Input("repetition_penalty", default=1.05, min=0.0, max=5.0, step=0.01),
+                    io.Int.Input("seed", default=0, min=0, max=0xffffffffffffffff),
+                ]
+            ),
+            io.DynamicCombo.Option(
+                key="off",
+                inputs=[]
+            ),
+        ]
+
+        return io.Schema(
+            node_id="TextGenerate",
+            category="textgen/",
+            search_aliases=["LLM", "gemma"],
+            inputs=[
+                io.Clip.Input("clip"),
+                io.String.Input("prompt", multiline=True, dynamic_prompts=True, default=""),
+                io.Image.Input("image", optional=True),
+                io.Int.Input("max_length", default=256, min=1, max=2048),
+                io.DynamicCombo.Input("sampling_mode", options=sampling_options, display_name="Sampling Mode"),
+            ],
+            outputs=[
+                io.String.Output(display_name="generated_text"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, clip, prompt, max_length, sampling_mode, image=None) -> io.NodeOutput:
+        tokens = clip.tokenize(prompt, image=image, skip_template=False)
+
+        # Get sampling parameters from dynamic combo
+        do_sample = sampling_mode.get("sampling_mode") == "on"
+        temperature = sampling_mode.get("temperature", 1.0)
+        top_k = sampling_mode.get("top_k", 50)
+        top_p = sampling_mode.get("top_p", 1.0)
+        min_p = sampling_mode.get("min_p", 0.0)
+        seed = sampling_mode.get("seed", None)
+        repetition_penalty = sampling_mode.get("repetition_penalty", 1.0)
+
+        generated_ids = clip.generate(
+            tokens,
+            do_sample=do_sample,
+            max_length=max_length,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            min_p=min_p,
+            repetition_penalty=repetition_penalty,
+            seed=seed
+        )
+
+        generated_text = clip.decode(generated_ids, skip_special_tokens=True)
+        return io.NodeOutput(generated_text)
+
+
+LTX2_T2V_SYSTEM_PROMPT = """You are a Creative Assistant. Given a user's raw input prompt describing a scene or concept, expand it into a detailed video generation prompt with specific visuals and integrated audio to guide a text-to-video model.
+#### Guidelines
+- Strictly follow all aspects of the user's raw input: include every element requested (style, visuals, motions, actions, camera movement, audio).
+  - If the input is vague, invent concrete details: lighting, textures, materials, scene settings, etc.
+  - For characters: describe gender, clothing, hair, expressions. DO NOT invent unrequested characters.
+- Use active language: present-progressive verbs ("is walking," "speaking"). If no action specified, describe natural movements.
+- Maintain chronological flow: use temporal connectors ("as," "then," "while").
+- Audio layer: Describe complete soundscape (background audio, ambient sounds, SFX, speech/music when requested). Integrate sounds chronologically alongside actions. Be specific (e.g., "soft footsteps on tile"), not vague (e.g., "ambient sound is present").
+- Speech (only when requested):
+  - For ANY speech-related input (talking, conversation, singing, etc.), ALWAYS include exact words in quotes with voice characteristics (e.g., "The man says in an excited voice: 'You won't believe what I just saw!'").
+  - Specify language if not English and accent if relevant.
+- Style: Include visual style at the beginning: "Style: