From b1c7291569daaeec34ccf6df25d57886eb73d98e Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 30 Apr 2025 11:18:20 -0700 Subject: [PATCH 01/16] Test updater in the windows release workflow. (#7886) --- .github/workflows/windows_release_package.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/windows_release_package.yml b/.github/workflows/windows_release_package.yml index 80a45b321..3926a65f3 100644 --- a/.github/workflows/windows_release_package.yml +++ b/.github/workflows/windows_release_package.yml @@ -88,6 +88,8 @@ jobs: cd ComfyUI_windows_portable python_embeded/python.exe -s ComfyUI/main.py --quick-test-for-ci --cpu + python_embeded/python.exe -s ./update/update.py ComfyUI/ + ls - name: Upload binaries to release From 39c27a37052b5e18850a6c63cd0a4b92131db3ba Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Wed, 30 Apr 2025 11:42:18 -0700 Subject: [PATCH 02/16] Add updater test to stable release workflow. (#7887) --- .github/workflows/stable-release.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/stable-release.yml b/.github/workflows/stable-release.yml index c4302cdd6..a046ff9ea 100644 --- a/.github/workflows/stable-release.yml +++ b/.github/workflows/stable-release.yml @@ -91,6 +91,8 @@ jobs: cd ComfyUI_windows_portable python_embeded/python.exe -s ComfyUI/main.py --quick-test-for-ci --cpu + python_embeded/python.exe -s ./update/update.py ComfyUI/ + ls - name: Upload binaries to release From 4ca3d842774421968cc19c319ee96daa58a98fbb Mon Sep 17 00:00:00 2001 From: Silver <65376327+silveroxides@users.noreply.github.com> Date: Thu, 1 May 2025 02:57:00 +0200 Subject: [PATCH 03/16] Support for Chroma - Flux1 Schnell distilled with CFG (#7355) * Upload files for Chroma Implementation * Remove trailing whitespace * trim more trailing whitespace..oops * remove unused imports * Add supported_inference_dtypes * Set min_length to 0 and remove attention_mask=True * Set min_length to 1 * get_mdulations added from blepping and minor changes * Add lora conversion if statement in lora.py * Update supported_models.py * update model_base.py * add uptream commits * set modelType.FLOW, will cause beta scheduler to work properly * Adjust memory usage factor and remove unnecessary code * fix mistake * reduce code duplication * remove unused imports * refactor for upstream sync * sync chroma-support with upstream via syncbranch patch * Update sd.py * Add Chroma as option for the OptimalStepsScheduler node --- comfy/ldm/chroma/layers.py | 183 +++++++++++++++++++ comfy/ldm/chroma/math.py | 44 +++++ comfy/ldm/chroma/model.py | 271 +++++++++++++++++++++++++++++ comfy/lora.py | 2 +- comfy/model_base.py | 62 +++++++ comfy/model_detection.py | 26 +++ comfy/sd.py | 5 + comfy/supported_models.py | 30 +++- comfy/text_encoders/chroma.py | 43 +++++ comfy_extras/nodes_optimalsteps.py | 3 +- nodes.py | 2 +- 11 files changed, 667 insertions(+), 4 deletions(-) create mode 100644 comfy/ldm/chroma/layers.py create mode 100644 comfy/ldm/chroma/math.py create mode 100644 comfy/ldm/chroma/model.py create mode 100644 comfy/text_encoders/chroma.py diff --git a/comfy/ldm/chroma/layers.py b/comfy/ldm/chroma/layers.py new file mode 100644 index 000000000..dd0b72f70 --- /dev/null +++ b/comfy/ldm/chroma/layers.py @@ -0,0 +1,183 @@ +import torch +from torch import Tensor, nn + +from .math import attention +from comfy.ldm.flux.layers import ( + MLPEmbedder, + RMSNorm, + 
QKNorm, + SelfAttention, + ModulationOut, +) + + + +class ChromaModulationOut(ModulationOut): + @classmethod + def from_offset(cls, tensor: torch.Tensor, offset: int = 0) -> ModulationOut: + return cls( + shift=tensor[:, offset : offset + 1, :], + scale=tensor[:, offset + 1 : offset + 2, :], + gate=tensor[:, offset + 2 : offset + 3, :], + ) + + + + +class Approximator(nn.Module): + def __init__(self, in_dim: int, out_dim: int, hidden_dim: int, n_layers = 5, dtype=None, device=None, operations=None): + super().__init__() + self.in_proj = operations.Linear(in_dim, hidden_dim, bias=True, dtype=dtype, device=device) + self.layers = nn.ModuleList([MLPEmbedder(hidden_dim, hidden_dim, dtype=dtype, device=device, operations=operations) for x in range( n_layers)]) + self.norms = nn.ModuleList([RMSNorm(hidden_dim, dtype=dtype, device=device, operations=operations) for x in range( n_layers)]) + self.out_proj = operations.Linear(hidden_dim, out_dim, dtype=dtype, device=device) + + @property + def device(self): + # Get the device of the module (assumes all parameters are on the same device) + return next(self.parameters()).device + + def forward(self, x: Tensor) -> Tensor: + x = self.in_proj(x) + + for layer, norms in zip(self.layers, self.norms): + x = x + layer(norms(x)) + + x = self.out_proj(x) + + return x + + +class DoubleStreamBlock(nn.Module): + def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, dtype=None, device=None, operations=None): + super().__init__() + + mlp_hidden_dim = int(hidden_size * mlp_ratio) + self.num_heads = num_heads + self.hidden_size = hidden_size + self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) + self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations) + + self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) + self.img_mlp = nn.Sequential( + operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device), + nn.GELU(approximate="tanh"), + operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device), + ) + + self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) + self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations) + + self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) + self.txt_mlp = nn.Sequential( + operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device), + nn.GELU(approximate="tanh"), + operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device), + ) + self.flipped_img_txt = flipped_img_txt + + def forward(self, img: Tensor, txt: Tensor, pe: Tensor, vec: Tensor, attn_mask=None): + (img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec + + # prepare image for attention + img_modulated = self.img_norm1(img) + img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift + img_qkv = self.img_attn.qkv(img_modulated) + img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + img_q, img_k = self.img_attn.norm(img_q, img_k, img_v) + + # prepare txt for attention + txt_modulated = self.txt_norm1(txt) + txt_modulated = (1 + 
txt_mod1.scale) * txt_modulated + txt_mod1.shift + txt_qkv = self.txt_attn.qkv(txt_modulated) + txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v) + + # run actual attention + attn = attention(torch.cat((txt_q, img_q), dim=2), + torch.cat((txt_k, img_k), dim=2), + torch.cat((txt_v, img_v), dim=2), + pe=pe, mask=attn_mask) + + txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :] + + # calculate the img bloks + img = img + img_mod1.gate * self.img_attn.proj(img_attn) + img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift) + + # calculate the txt bloks + txt += txt_mod1.gate * self.txt_attn.proj(txt_attn) + txt += txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift) + + if txt.dtype == torch.float16: + txt = torch.nan_to_num(txt, nan=0.0, posinf=65504, neginf=-65504) + + return img, txt + + +class SingleStreamBlock(nn.Module): + """ + A DiT block with parallel linear layers as described in + https://arxiv.org/abs/2302.05442 and adapted modulation interface. + """ + + def __init__( + self, + hidden_size: int, + num_heads: int, + mlp_ratio: float = 4.0, + qk_scale: float = None, + dtype=None, + device=None, + operations=None + ): + super().__init__() + self.hidden_dim = hidden_size + self.num_heads = num_heads + head_dim = hidden_size // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.mlp_hidden_dim = int(hidden_size * mlp_ratio) + # qkv and mlp_in + self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim, dtype=dtype, device=device) + # proj and mlp_out + self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, dtype=dtype, device=device) + + self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations) + + self.hidden_size = hidden_size + self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) + + self.mlp_act = nn.GELU(approximate="tanh") + + def forward(self, x: Tensor, pe: Tensor, vec: Tensor, attn_mask=None) -> Tensor: + mod = vec + x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift + qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1) + + q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k = self.norm(q, k, v) + + # compute attention + attn = attention(q, k, v, pe=pe, mask=attn_mask) + # compute activation in mlp stream, cat again and run second linear layer + output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)) + x += mod.gate * output + if x.dtype == torch.float16: + x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504) + return x + + +class LastLayer(nn.Module): + def __init__(self, hidden_size: int, patch_size: int, out_channels: int, dtype=None, device=None, operations=None): + super().__init__() + self.norm_final = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) + self.linear = operations.Linear(hidden_size, out_channels, bias=True, dtype=dtype, device=device) + + def forward(self, x: Tensor, vec: Tensor) -> Tensor: + shift, scale = vec + shift = shift.squeeze(1) + scale = scale.squeeze(1) + x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :] + x = self.linear(x) + return x diff --git a/comfy/ldm/chroma/math.py b/comfy/ldm/chroma/math.py new 
file mode 100644 index 000000000..36b67931c --- /dev/null +++ b/comfy/ldm/chroma/math.py @@ -0,0 +1,44 @@ +import torch +from einops import rearrange +from torch import Tensor + +from comfy.ldm.modules.attention import optimized_attention +import comfy.model_management + + +def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None) -> Tensor: + q_shape = q.shape + k_shape = k.shape + + q = q.float().reshape(*q.shape[:-1], -1, 1, 2) + k = k.float().reshape(*k.shape[:-1], -1, 1, 2) + q = (pe[..., 0] * q[..., 0] + pe[..., 1] * q[..., 1]).reshape(*q_shape).type_as(v) + k = (pe[..., 0] * k[..., 0] + pe[..., 1] * k[..., 1]).reshape(*k_shape).type_as(v) + + heads = q.shape[1] + x = optimized_attention(q, k, v, heads, skip_reshape=True, mask=mask) + return x + + +def rope(pos: Tensor, dim: int, theta: int) -> Tensor: + assert dim % 2 == 0 + if comfy.model_management.is_device_mps(pos.device) or comfy.model_management.is_intel_xpu() or comfy.model_management.is_directml_enabled(): + device = torch.device("cpu") + else: + device = pos.device + + scale = torch.linspace(0, (dim - 2) / dim, steps=dim//2, dtype=torch.float64, device=device) + omega = 1.0 / (theta**scale) + out = torch.einsum("...n,d->...nd", pos.to(dtype=torch.float32, device=device), omega) + out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1) + out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2) + return out.to(dtype=torch.float32, device=pos.device) + + +def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor): + xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2) + xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2) + xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1] + xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1] + return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk) + diff --git a/comfy/ldm/chroma/model.py b/comfy/ldm/chroma/model.py new file mode 100644 index 000000000..636748fc5 --- /dev/null +++ b/comfy/ldm/chroma/model.py @@ -0,0 +1,271 @@ +#Original code can be found on: https://github.com/black-forest-labs/flux + +from dataclasses import dataclass + +import torch +from torch import Tensor, nn +from einops import rearrange, repeat +import comfy.ldm.common_dit + +from comfy.ldm.flux.layers import ( + EmbedND, + timestep_embedding, +) + +from .layers import ( + DoubleStreamBlock, + LastLayer, + SingleStreamBlock, + Approximator, + ChromaModulationOut, +) + + +@dataclass +class ChromaParams: + in_channels: int + out_channels: int + context_in_dim: int + hidden_size: int + mlp_ratio: float + num_heads: int + depth: int + depth_single_blocks: int + axes_dim: list + theta: int + patch_size: int + qkv_bias: bool + in_dim: int + out_dim: int + hidden_dim: int + n_layers: int + + + + +class Chroma(nn.Module): + """ + Transformer model for flow matching on sequences. 
+ """ + + def __init__(self, image_model=None, final_layer=True, dtype=None, device=None, operations=None, **kwargs): + super().__init__() + self.dtype = dtype + params = ChromaParams(**kwargs) + self.params = params + self.patch_size = params.patch_size + self.in_channels = params.in_channels + self.out_channels = params.out_channels + if params.hidden_size % params.num_heads != 0: + raise ValueError( + f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}" + ) + pe_dim = params.hidden_size // params.num_heads + if sum(params.axes_dim) != pe_dim: + raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}") + self.hidden_size = params.hidden_size + self.num_heads = params.num_heads + self.in_dim = params.in_dim + self.out_dim = params.out_dim + self.hidden_dim = params.hidden_dim + self.n_layers = params.n_layers + self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim) + self.img_in = operations.Linear(self.in_channels, self.hidden_size, bias=True, dtype=dtype, device=device) + self.txt_in = operations.Linear(params.context_in_dim, self.hidden_size, dtype=dtype, device=device) + # set as nn identity for now, will overwrite it later. + self.distilled_guidance_layer = Approximator( + in_dim=self.in_dim, + hidden_dim=self.hidden_dim, + out_dim=self.out_dim, + n_layers=self.n_layers, + dtype=dtype, device=device, operations=operations + ) + + + self.double_blocks = nn.ModuleList( + [ + DoubleStreamBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=params.mlp_ratio, + qkv_bias=params.qkv_bias, + dtype=dtype, device=device, operations=operations + ) + for _ in range(params.depth) + ] + ) + + self.single_blocks = nn.ModuleList( + [ + SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, dtype=dtype, device=device, operations=operations) + for _ in range(params.depth_single_blocks) + ] + ) + + if final_layer: + self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels, dtype=dtype, device=device, operations=operations) + + self.skip_mmdit = [] + self.skip_dit = [] + self.lite = False + + def get_modulations(self, tensor: torch.Tensor, block_type: str, *, idx: int = 0): + # This function slices up the modulations tensor which has the following layout: + # single : num_single_blocks * 3 elements + # double_img : num_double_blocks * 6 elements + # double_txt : num_double_blocks * 6 elements + # final : 2 elements + if block_type == "final": + return (tensor[:, -2:-1, :], tensor[:, -1:, :]) + single_block_count = self.params.depth_single_blocks + double_block_count = self.params.depth + offset = 3 * idx + if block_type == "single": + return ChromaModulationOut.from_offset(tensor, offset) + # Double block modulations are 6 elements so we double 3 * idx. + offset *= 2 + if block_type in {"double_img", "double_txt"}: + # Advance past the single block modulations. + offset += 3 * single_block_count + if block_type == "double_txt": + # Advance past the double block img modulations. 
+ offset += 6 * double_block_count + return ( + ChromaModulationOut.from_offset(tensor, offset), + ChromaModulationOut.from_offset(tensor, offset + 3), + ) + raise ValueError("Bad block_type") + + + def forward_orig( + self, + img: Tensor, + img_ids: Tensor, + txt: Tensor, + txt_ids: Tensor, + timesteps: Tensor, + guidance: Tensor = None, + control = None, + transformer_options={}, + attn_mask: Tensor = None, + ) -> Tensor: + patches_replace = transformer_options.get("patches_replace", {}) + if img.ndim != 3 or txt.ndim != 3: + raise ValueError("Input img and txt tensors must have 3 dimensions.") + + # running on sequences img + img = self.img_in(img) + + # distilled vector guidance + mod_index_length = 344 + distill_timestep = timestep_embedding(timesteps.detach().clone(), 16).to(img.device, img.dtype) + # guidance = guidance * + distil_guidance = timestep_embedding(guidance.detach().clone(), 16).to(img.device, img.dtype) + + # get all modulation index + modulation_index = timestep_embedding(torch.arange(mod_index_length), 32).to(img.device, img.dtype) + # we need to broadcast the modulation index here so each batch has all of the index + modulation_index = modulation_index.unsqueeze(0).repeat(img.shape[0], 1, 1).to(img.device, img.dtype) + # and we need to broadcast timestep and guidance along too + timestep_guidance = torch.cat([distill_timestep, distil_guidance], dim=1).unsqueeze(1).repeat(1, mod_index_length, 1).to(img.dtype).to(img.device, img.dtype) + # then and only then we could concatenate it together + input_vec = torch.cat([timestep_guidance, modulation_index], dim=-1).to(img.device, img.dtype) + + mod_vectors = self.distilled_guidance_layer(input_vec) + + txt = self.txt_in(txt) + + ids = torch.cat((txt_ids, img_ids), dim=1) + pe = self.pe_embedder(ids) + + blocks_replace = patches_replace.get("dit", {}) + for i, block in enumerate(self.double_blocks): + if i not in self.skip_mmdit: + double_mod = ( + self.get_modulations(mod_vectors, "double_img", idx=i), + self.get_modulations(mod_vectors, "double_txt", idx=i), + ) + if ("double_block", i) in blocks_replace: + def block_wrap(args): + out = {} + out["img"], out["txt"] = block(img=args["img"], + txt=args["txt"], + vec=args["vec"], + pe=args["pe"], + attn_mask=args.get("attn_mask")) + return out + + out = blocks_replace[("double_block", i)]({"img": img, + "txt": txt, + "vec": double_mod, + "pe": pe, + "attn_mask": attn_mask}, + {"original_block": block_wrap}) + txt = out["txt"] + img = out["img"] + else: + img, txt = block(img=img, + txt=txt, + vec=double_mod, + pe=pe, + attn_mask=attn_mask) + + if control is not None: # Controlnet + control_i = control.get("input") + if i < len(control_i): + add = control_i[i] + if add is not None: + img += add + + img = torch.cat((txt, img), 1) + + for i, block in enumerate(self.single_blocks): + if i not in self.skip_dit: + single_mod = self.get_modulations(mod_vectors, "single", idx=i) + if ("single_block", i) in blocks_replace: + def block_wrap(args): + out = {} + out["img"] = block(args["img"], + vec=args["vec"], + pe=args["pe"], + attn_mask=args.get("attn_mask")) + return out + + out = blocks_replace[("single_block", i)]({"img": img, + "vec": single_mod, + "pe": pe, + "attn_mask": attn_mask}, + {"original_block": block_wrap}) + img = out["img"] + else: + img = block(img, vec=single_mod, pe=pe, attn_mask=attn_mask) + + if control is not None: # Controlnet + control_o = control.get("output") + if i < len(control_o): + add = control_o[i] + if add is not None: + img[:, txt.shape[1] :, ...] 
+= add + + img = img[:, txt.shape[1] :, ...] + final_mod = self.get_modulations(mod_vectors, "final") + img = self.final_layer(img, vec=final_mod) # (N, T, patch_size ** 2 * out_channels) + return img + + def forward(self, x, timestep, context, guidance, control=None, transformer_options={}, **kwargs): + bs, c, h, w = x.shape + patch_size = 2 + x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size)) + + img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size) + + h_len = ((h + (patch_size // 2)) // patch_size) + w_len = ((w + (patch_size // 2)) // patch_size) + img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) + img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(0, h_len - 1, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) + img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(0, w_len - 1, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) + img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs) + + txt_ids = torch.zeros((bs, context.shape[1], 3), device=x.device, dtype=x.dtype) + out = self.forward_orig(img, img_ids, context, txt_ids, timestep, guidance, control, transformer_options, attn_mask=kwargs.get("attention_mask", None)) + return rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)[:,:,:h,:w] diff --git a/comfy/lora.py b/comfy/lora.py index fff524be2..dbabd3335 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -252,7 +252,7 @@ def model_lora_keys_unet(model, key_map={}): key_lora = k[len("diffusion_model."):-len(".weight")] key_map["base_model.model.{}".format(key_lora)] = k #official hunyuan lora format - if isinstance(model, comfy.model_base.Flux): #Diffusers lora Flux + if isinstance(model, comfy.model_base.Flux) or isinstance(model, comfy.model_base.Chroma): #Diffusers lora Flux or a diffusers lora Chroma diffusers_keys = comfy.utils.flux_to_diffusers(model.model_config.unet_config, output_prefix="diffusion_model.") for k in diffusers_keys: if k.endswith(".weight"): diff --git a/comfy/model_base.py b/comfy/model_base.py index d2aa4ce7a..1a06bb503 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -38,6 +38,7 @@ import comfy.ldm.lumina.model import comfy.ldm.wan.model import comfy.ldm.hunyuan3d.model import comfy.ldm.hidream.model +import comfy.ldm.chroma.model import comfy.model_management import comfy.patcher_extension @@ -1108,3 +1109,64 @@ class HiDream(BaseModel): if image_cond is not None: out['image_cond'] = comfy.conds.CONDNoiseShape(self.process_latent_in(image_cond)) return out + +class Chroma(BaseModel): + def __init__(self, model_config, model_type=ModelType.FLOW, device=None): + super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.chroma.model.Chroma) + + def concat_cond(self, **kwargs): + try: + #Handle Flux control loras dynamically changing the img_in weight. 
+ num_channels = self.diffusion_model.img_in.weight.shape[1] + except: + #Some cases like tensorrt might not have the weights accessible + num_channels = self.model_config.unet_config["in_channels"] + + out_channels = self.model_config.unet_config["out_channels"] + + if num_channels <= out_channels: + return None + + image = kwargs.get("concat_latent_image", None) + noise = kwargs.get("noise", None) + device = kwargs["device"] + + if image is None: + image = torch.zeros_like(noise) + + image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center") + image = utils.resize_to_batch_size(image, noise.shape[0]) + image = self.process_latent_in(image) + if num_channels <= out_channels * 2: + return image + + #inpaint model + mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None)) + if mask is None: + mask = torch.ones_like(noise)[:, :1] + + mask = torch.mean(mask, dim=1, keepdim=True) + mask = utils.common_upscale(mask.to(device), noise.shape[-1] * 8, noise.shape[-2] * 8, "bilinear", "center") + mask = mask.view(mask.shape[0], mask.shape[2] // 8, 8, mask.shape[3] // 8, 8).permute(0, 2, 4, 1, 3).reshape(mask.shape[0], -1, mask.shape[2] // 8, mask.shape[3] // 8) + mask = utils.resize_to_batch_size(mask, noise.shape[0]) + return torch.cat((image, mask), dim=1) + + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + cross_attn = kwargs.get("cross_attn", None) + if cross_attn is not None: + out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) + # upscale the attention mask, since now we + attention_mask = kwargs.get("attention_mask", None) + if attention_mask is not None: + shape = kwargs["noise"].shape + mask_ref_size = kwargs["attention_mask_img_shape"] + # the model will pad to the patch size, and then divide + # essentially dividing and rounding up + (h_tok, w_tok) = (math.ceil(shape[2] / self.diffusion_model.patch_size), math.ceil(shape[3] / self.diffusion_model.patch_size)) + attention_mask = utils.upscale_dit_mask(attention_mask, mask_ref_size, (h_tok, w_tok)) + out['attention_mask'] = comfy.conds.CONDRegular(attention_mask) + guidance = 0.0 + out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor((guidance,))) + return out diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 76de78a8a..daf6d04e7 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -154,6 +154,32 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["guidance_embed"] = len(guidance_keys) > 0 return dit_config + if '{}distilled_guidance_layer.0.norms.0.scale'.format(key_prefix) in state_dict_keys or '{}distilled_guidance_layer.norms.0.scale'.format(key_prefix) in state_dict_keys: #Chroma + dit_config = {} + dit_config["image_model"] = "chroma" + dit_config["depth"] = 48 + dit_config["in_channels"] = 64 + patch_size = 2 + dit_config["patch_size"] = patch_size + in_key = "{}img_in.weight".format(key_prefix) + if in_key in state_dict_keys: + dit_config["in_channels"] = state_dict[in_key].shape[1] + dit_config["out_channels"] = 64 + dit_config["context_in_dim"] = 4096 + dit_config["hidden_size"] = 3072 + dit_config["mlp_ratio"] = 4.0 + dit_config["num_heads"] = 24 + dit_config["depth"] = count_blocks(state_dict_keys, '{}double_blocks.'.format(key_prefix) + '{}.') + dit_config["depth_single_blocks"] = count_blocks(state_dict_keys, '{}single_blocks.'.format(key_prefix) + '{}.') + dit_config["axes_dim"] = [16, 56, 56] + dit_config["theta"] = 10000 + dit_config["qkv_bias"] = True + 
dit_config["in_dim"] = 64 + dit_config["out_dim"] = 3072 + dit_config["hidden_dim"] = 5120 + dit_config["n_layers"] = 5 + return dit_config + if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in state_dict_keys and '{}img_in.weight'.format(key_prefix) in state_dict_keys: #Flux dit_config = {} dit_config["image_model"] = "flux" diff --git a/comfy/sd.py b/comfy/sd.py index 748f6c1ec..454b5929a 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -42,6 +42,7 @@ import comfy.text_encoders.cosmos import comfy.text_encoders.lumina2 import comfy.text_encoders.wan import comfy.text_encoders.hidream +import comfy.text_encoders.chroma import comfy.model_patcher import comfy.lora @@ -714,6 +715,7 @@ class CLIPType(Enum): LUMINA2 = 12 WAN = 13 HIDREAM = 14 + CHROMA = 15 def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}): @@ -829,6 +831,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**t5xxl_detect(clip_data), clip_l=False, clip_g=False, t5=True, llama=False, dtype_llama=None, llama_scaled_fp8=None) clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer + elif clip_type == CLIPType.CHROMA: + clip_target.clip = comfy.text_encoders.chroma.chroma_te(**t5xxl_detect(clip_data)) + clip_target.tokenizer = comfy.text_encoders.chroma.ChromaT5Tokenizer else: #CLIPType.MOCHI clip_target.clip = comfy.text_encoders.genmo.mochi_te(**t5xxl_detect(clip_data)) clip_target.tokenizer = comfy.text_encoders.genmo.MochiT5Tokenizer diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 69bcee1f7..f03f2790e 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -17,6 +17,7 @@ import comfy.text_encoders.hunyuan_video import comfy.text_encoders.cosmos import comfy.text_encoders.lumina2 import comfy.text_encoders.wan +import comfy.text_encoders.chroma from . import supported_models_base from . 
import latent_formats @@ -1068,7 +1069,34 @@ class HiDream(supported_models_base.BASE): def clip_target(self, state_dict={}): return None # TODO +class Chroma(supported_models_base.BASE): + unet_config = { + "image_model": "chroma", + } -models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, Lumina2, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream] + unet_extra_config = { + } + + sampling_settings = { + "multiplier": 1.0, + } + + latent_format = comfy.latent_formats.Flux + + memory_usage_factor = 3.2 + + supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32] + + + def get_model(self, state_dict, prefix="", device=None): + out = model_base.Chroma(self, device=device) + return out + + def clip_target(self, state_dict={}): + pref = self.text_encoder_key_prefix[0] + t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref)) + return supported_models_base.ClipTarget(comfy.text_encoders.chroma.ChromaTokenizer, comfy.text_encoders.chroma.chroma_te(**t5_detect)) + +models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, Lumina2, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma] models += [SVD_img2vid] diff --git a/comfy/text_encoders/chroma.py b/comfy/text_encoders/chroma.py new file mode 100644 index 000000000..aa8dffb25 --- /dev/null +++ b/comfy/text_encoders/chroma.py @@ -0,0 +1,43 @@ +from comfy import sd1_clip +import comfy.text_encoders.t5 +import os +from transformers import T5TokenizerFast + + +class T5XXLModel(sd1_clip.SDClipModel): + def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=False, model_options={}): + textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_config_xxl.json") + t5xxl_scaled_fp8 = model_options.get("t5xxl_scaled_fp8", None) + if t5xxl_scaled_fp8 is not None: + model_options = model_options.copy() + model_options["scaled_fp8"] = t5xxl_scaled_fp8 + + super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, "pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options) + + +class ChromaT5XXL(sd1_clip.SD1ClipModel): + def __init__(self, device="cpu", dtype=None, model_options={}): + super().__init__(device=device, dtype=dtype, name="t5xxl", clip_model=T5XXLModel, model_options=model_options) + + +class T5XXLTokenizer(sd1_clip.SDTokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 
"t5_tokenizer") + super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_data=tokenizer_data) + + +class ChromaT5Tokenizer(sd1_clip.SD1Tokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="t5xxl", tokenizer=T5XXLTokenizer) + + +def chroma_te(dtype_t5=None, t5xxl_scaled_fp8=None): + class ChromaTEModel_(ChromaT5XXL): + def __init__(self, device="cpu", dtype=None, model_options={}): + if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options: + model_options = model_options.copy() + model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8 + if dtype is None: + dtype = dtype_t5 + super().__init__(device=device, dtype=dtype, model_options=model_options) + return ChromaTEModel_ diff --git a/comfy_extras/nodes_optimalsteps.py b/comfy_extras/nodes_optimalsteps.py index f6928199b..102958734 100644 --- a/comfy_extras/nodes_optimalsteps.py +++ b/comfy_extras/nodes_optimalsteps.py @@ -20,13 +20,14 @@ def loglinear_interp(t_steps, num_steps): NOISE_LEVELS = {"FLUX": [0.9968, 0.9886, 0.9819, 0.975, 0.966, 0.9471, 0.9158, 0.8287, 0.5512, 0.2808, 0.001], "Wan":[1.0, 0.997, 0.995, 0.993, 0.991, 0.989, 0.987, 0.985, 0.98, 0.975, 0.973, 0.968, 0.96, 0.946, 0.927, 0.902, 0.864, 0.776, 0.539, 0.208, 0.001], +"Chroma": [0.9919999837875366, 0.9900000095367432, 0.9879999756813049, 0.9850000143051147, 0.9819999933242798, 0.9779999852180481, 0.9729999899864197, 0.9679999947547913, 0.9610000252723694, 0.953000009059906, 0.9430000185966492, 0.9309999942779541, 0.9169999957084656, 0.8999999761581421, 0.8809999823570251, 0.8579999804496765, 0.8320000171661377, 0.8019999861717224, 0.7689999938011169, 0.7310000061988831, 0.6899999976158142, 0.6460000276565552, 0.5989999771118164, 0.550000011920929, 0.5009999871253967, 0.45100000500679016, 0.4020000100135803, 0.35499998927116394, 0.3109999895095825, 0.27000001072883606, 0.23199999332427979, 0.19900000095367432, 0.16899999976158142, 0.14300000667572021, 0.11999999731779099, 0.10100000351667404, 0.08399999886751175, 0.07000000029802322, 0.057999998331069946, 0.04800000041723251, 0.0], } class OptimalStepsScheduler: @classmethod def INPUT_TYPES(s): return {"required": - {"model_type": (["FLUX", "Wan"], ), + {"model_type": (["FLUX", "Wan", "Chroma"], ), "steps": ("INT", {"default": 20, "min": 3, "max": 1000}), "denoise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}), } diff --git a/nodes.py b/nodes.py index 73a62d930..f2ced2c35 100644 --- a/nodes.py +++ b/nodes.py @@ -917,7 +917,7 @@ class CLIPLoader: @classmethod def INPUT_TYPES(s): return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ), - "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream"], ), + "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma"], ), }, "optional": { "device": (["default", "cpu"], {"advanced": True}), From 08ff5fa08a92e0b3f23b9abec979a830a6cffb03 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Wed, 30 Apr 2025 20:57:30 -0400 Subject: [PATCH 04/16] Cleanup chroma PR. 
--- comfy/ldm/chroma/layers.py | 2 +- comfy/ldm/chroma/math.py | 44 --------------------- comfy/lora.py | 2 +- comfy/model_base.py | 63 ++++-------------------------- comfy/model_detection.py | 41 ++++++------------- comfy/sd.py | 6 +-- comfy/supported_models.py | 3 +- comfy/text_encoders/chroma.py | 43 -------------------- comfy_extras/nodes_optimalsteps.py | 2 +- 9 files changed, 25 insertions(+), 181 deletions(-) delete mode 100644 comfy/ldm/chroma/math.py delete mode 100644 comfy/text_encoders/chroma.py diff --git a/comfy/ldm/chroma/layers.py b/comfy/ldm/chroma/layers.py index dd0b72f70..35da91ee2 100644 --- a/comfy/ldm/chroma/layers.py +++ b/comfy/ldm/chroma/layers.py @@ -1,7 +1,7 @@ import torch from torch import Tensor, nn -from .math import attention +from comfy.ldm.flux.math import attention from comfy.ldm.flux.layers import ( MLPEmbedder, RMSNorm, diff --git a/comfy/ldm/chroma/math.py b/comfy/ldm/chroma/math.py deleted file mode 100644 index 36b67931c..000000000 --- a/comfy/ldm/chroma/math.py +++ /dev/null @@ -1,44 +0,0 @@ -import torch -from einops import rearrange -from torch import Tensor - -from comfy.ldm.modules.attention import optimized_attention -import comfy.model_management - - -def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None) -> Tensor: - q_shape = q.shape - k_shape = k.shape - - q = q.float().reshape(*q.shape[:-1], -1, 1, 2) - k = k.float().reshape(*k.shape[:-1], -1, 1, 2) - q = (pe[..., 0] * q[..., 0] + pe[..., 1] * q[..., 1]).reshape(*q_shape).type_as(v) - k = (pe[..., 0] * k[..., 0] + pe[..., 1] * k[..., 1]).reshape(*k_shape).type_as(v) - - heads = q.shape[1] - x = optimized_attention(q, k, v, heads, skip_reshape=True, mask=mask) - return x - - -def rope(pos: Tensor, dim: int, theta: int) -> Tensor: - assert dim % 2 == 0 - if comfy.model_management.is_device_mps(pos.device) or comfy.model_management.is_intel_xpu() or comfy.model_management.is_directml_enabled(): - device = torch.device("cpu") - else: - device = pos.device - - scale = torch.linspace(0, (dim - 2) / dim, steps=dim//2, dtype=torch.float64, device=device) - omega = 1.0 / (theta**scale) - out = torch.einsum("...n,d->...nd", pos.to(dtype=torch.float32, device=device), omega) - out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1) - out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2) - return out.to(dtype=torch.float32, device=pos.device) - - -def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor): - xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2) - xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2) - xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1] - xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1] - return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk) - diff --git a/comfy/lora.py b/comfy/lora.py index dbabd3335..fff524be2 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -252,7 +252,7 @@ def model_lora_keys_unet(model, key_map={}): key_lora = k[len("diffusion_model."):-len(".weight")] key_map["base_model.model.{}".format(key_lora)] = k #official hunyuan lora format - if isinstance(model, comfy.model_base.Flux) or isinstance(model, comfy.model_base.Chroma): #Diffusers lora Flux or a diffusers lora Chroma + if isinstance(model, comfy.model_base.Flux): #Diffusers lora Flux diffusers_keys = comfy.utils.flux_to_diffusers(model.model_config.unet_config, output_prefix="diffusion_model.") for k in diffusers_keys: if k.endswith(".weight"): diff --git 
a/comfy/model_base.py b/comfy/model_base.py index 1a06bb503..3d33086d8 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -787,8 +787,8 @@ class PixArt(BaseModel): return out class Flux(BaseModel): - def __init__(self, model_config, model_type=ModelType.FLUX, device=None): - super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.flux.model.Flux) + def __init__(self, model_config, model_type=ModelType.FLUX, device=None, unet_model=comfy.ldm.flux.model.Flux): + super().__init__(model_config, model_type, device=device, unet_model=unet_model) def concat_cond(self, **kwargs): try: @@ -1110,63 +1110,14 @@ class HiDream(BaseModel): out['image_cond'] = comfy.conds.CONDNoiseShape(self.process_latent_in(image_cond)) return out -class Chroma(BaseModel): +class Chroma(Flux): def __init__(self, model_config, model_type=ModelType.FLOW, device=None): super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.chroma.model.Chroma) - def concat_cond(self, **kwargs): - try: - #Handle Flux control loras dynamically changing the img_in weight. - num_channels = self.diffusion_model.img_in.weight.shape[1] - except: - #Some cases like tensorrt might not have the weights accessible - num_channels = self.model_config.unet_config["in_channels"] - - out_channels = self.model_config.unet_config["out_channels"] - - if num_channels <= out_channels: - return None - - image = kwargs.get("concat_latent_image", None) - noise = kwargs.get("noise", None) - device = kwargs["device"] - - if image is None: - image = torch.zeros_like(noise) - - image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center") - image = utils.resize_to_batch_size(image, noise.shape[0]) - image = self.process_latent_in(image) - if num_channels <= out_channels * 2: - return image - - #inpaint model - mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None)) - if mask is None: - mask = torch.ones_like(noise)[:, :1] - - mask = torch.mean(mask, dim=1, keepdim=True) - mask = utils.common_upscale(mask.to(device), noise.shape[-1] * 8, noise.shape[-2] * 8, "bilinear", "center") - mask = mask.view(mask.shape[0], mask.shape[2] // 8, 8, mask.shape[3] // 8, 8).permute(0, 2, 4, 1, 3).reshape(mask.shape[0], -1, mask.shape[2] // 8, mask.shape[3] // 8) - mask = utils.resize_to_batch_size(mask, noise.shape[0]) - return torch.cat((image, mask), dim=1) - - def extra_conds(self, **kwargs): out = super().extra_conds(**kwargs) - cross_attn = kwargs.get("cross_attn", None) - if cross_attn is not None: - out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) - # upscale the attention mask, since now we - attention_mask = kwargs.get("attention_mask", None) - if attention_mask is not None: - shape = kwargs["noise"].shape - mask_ref_size = kwargs["attention_mask_img_shape"] - # the model will pad to the patch size, and then divide - # essentially dividing and rounding up - (h_tok, w_tok) = (math.ceil(shape[2] / self.diffusion_model.patch_size), math.ceil(shape[3] / self.diffusion_model.patch_size)) - attention_mask = utils.upscale_dit_mask(attention_mask, mask_ref_size, (h_tok, w_tok)) - out['attention_mask'] = comfy.conds.CONDRegular(attention_mask) - guidance = 0.0 - out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor((guidance,))) + + guidance = kwargs.get("guidance", 0) + if guidance is not None: + out['guidance'] = comfy.conds.CONDRegular(torch.FloatTensor([guidance])) return out diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 
daf6d04e7..9254843ea 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -154,32 +154,6 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["guidance_embed"] = len(guidance_keys) > 0 return dit_config - if '{}distilled_guidance_layer.0.norms.0.scale'.format(key_prefix) in state_dict_keys or '{}distilled_guidance_layer.norms.0.scale'.format(key_prefix) in state_dict_keys: #Chroma - dit_config = {} - dit_config["image_model"] = "chroma" - dit_config["depth"] = 48 - dit_config["in_channels"] = 64 - patch_size = 2 - dit_config["patch_size"] = patch_size - in_key = "{}img_in.weight".format(key_prefix) - if in_key in state_dict_keys: - dit_config["in_channels"] = state_dict[in_key].shape[1] - dit_config["out_channels"] = 64 - dit_config["context_in_dim"] = 4096 - dit_config["hidden_size"] = 3072 - dit_config["mlp_ratio"] = 4.0 - dit_config["num_heads"] = 24 - dit_config["depth"] = count_blocks(state_dict_keys, '{}double_blocks.'.format(key_prefix) + '{}.') - dit_config["depth_single_blocks"] = count_blocks(state_dict_keys, '{}single_blocks.'.format(key_prefix) + '{}.') - dit_config["axes_dim"] = [16, 56, 56] - dit_config["theta"] = 10000 - dit_config["qkv_bias"] = True - dit_config["in_dim"] = 64 - dit_config["out_dim"] = 3072 - dit_config["hidden_dim"] = 5120 - dit_config["n_layers"] = 5 - return dit_config - if '{}double_blocks.0.img_attn.norm.key_norm.scale'.format(key_prefix) in state_dict_keys and '{}img_in.weight'.format(key_prefix) in state_dict_keys: #Flux dit_config = {} dit_config["image_model"] = "flux" @@ -190,7 +164,9 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): if in_key in state_dict_keys: dit_config["in_channels"] = state_dict[in_key].shape[1] // (patch_size * patch_size) dit_config["out_channels"] = 16 - dit_config["vec_in_dim"] = 768 + vec_in_key = '{}vector_in.in_layer.weight'.format(key_prefix) + if vec_in_key in state_dict_keys: + dit_config["vec_in_dim"] = state_dict[vec_in_key].shape[1] dit_config["context_in_dim"] = 4096 dit_config["hidden_size"] = 3072 dit_config["mlp_ratio"] = 4.0 @@ -200,7 +176,16 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["axes_dim"] = [16, 56, 56] dit_config["theta"] = 10000 dit_config["qkv_bias"] = True - dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys + if '{}distilled_guidance_layer.0.norms.0.scale'.format(key_prefix) in state_dict_keys or '{}distilled_guidance_layer.norms.0.scale'.format(key_prefix) in state_dict_keys: #Chroma + dit_config["image_model"] = "chroma" + dit_config["in_channels"] = 64 + dit_config["out_channels"] = 64 + dit_config["in_dim"] = 64 + dit_config["out_dim"] = 3072 + dit_config["hidden_dim"] = 5120 + dit_config["n_layers"] = 5 + else: + dit_config["guidance_embed"] = "{}guidance_in.in_layer.weight".format(key_prefix) in state_dict_keys return dit_config if '{}t5_yproj.weight'.format(key_prefix) in state_dict_keys: #Genmo mochi preview diff --git a/comfy/sd.py b/comfy/sd.py index 454b5929a..da9b36d0e 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -42,7 +42,6 @@ import comfy.text_encoders.cosmos import comfy.text_encoders.lumina2 import comfy.text_encoders.wan import comfy.text_encoders.hidream -import comfy.text_encoders.chroma import comfy.model_patcher import comfy.lora @@ -820,7 +819,7 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip elif clip_type == CLIPType.LTXV: clip_target.clip = 
comfy.text_encoders.lt.ltxv_te(**t5xxl_detect(clip_data)) clip_target.tokenizer = comfy.text_encoders.lt.LTXVT5Tokenizer - elif clip_type == CLIPType.PIXART: + elif clip_type == CLIPType.PIXART or clip_type == CLIPType.CHROMA: clip_target.clip = comfy.text_encoders.pixart_t5.pixart_te(**t5xxl_detect(clip_data)) clip_target.tokenizer = comfy.text_encoders.pixart_t5.PixArtTokenizer elif clip_type == CLIPType.WAN: @@ -831,9 +830,6 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**t5xxl_detect(clip_data), clip_l=False, clip_g=False, t5=True, llama=False, dtype_llama=None, llama_scaled_fp8=None) clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer - elif clip_type == CLIPType.CHROMA: - clip_target.clip = comfy.text_encoders.chroma.chroma_te(**t5xxl_detect(clip_data)) - clip_target.tokenizer = comfy.text_encoders.chroma.ChromaT5Tokenizer else: #CLIPType.MOCHI clip_target.clip = comfy.text_encoders.genmo.mochi_te(**t5xxl_detect(clip_data)) clip_target.tokenizer = comfy.text_encoders.genmo.MochiT5Tokenizer diff --git a/comfy/supported_models.py b/comfy/supported_models.py index f03f2790e..d5210cfac 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -17,7 +17,6 @@ import comfy.text_encoders.hunyuan_video import comfy.text_encoders.cosmos import comfy.text_encoders.lumina2 import comfy.text_encoders.wan -import comfy.text_encoders.chroma from . import supported_models_base from . import latent_formats @@ -1095,7 +1094,7 @@ class Chroma(supported_models_base.BASE): def clip_target(self, state_dict={}): pref = self.text_encoder_key_prefix[0] t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}t5xxl.transformer.".format(pref)) - return supported_models_base.ClipTarget(comfy.text_encoders.chroma.ChromaTokenizer, comfy.text_encoders.chroma.chroma_te(**t5_detect)) + return supported_models_base.ClipTarget(comfy.text_encoders.pixart_t5.PixArtTokenizer, comfy.text_encoders.pixart_t5.pixart_te(**t5_detect)) models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, FluxSchnell, GenmoMochi, LTXV, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, Lumina2, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, Hunyuan3Dv2mini, Hunyuan3Dv2, HiDream, Chroma] diff --git a/comfy/text_encoders/chroma.py b/comfy/text_encoders/chroma.py deleted file mode 100644 index aa8dffb25..000000000 --- a/comfy/text_encoders/chroma.py +++ /dev/null @@ -1,43 +0,0 @@ -from comfy import sd1_clip -import comfy.text_encoders.t5 -import os -from transformers import T5TokenizerFast - - -class T5XXLModel(sd1_clip.SDClipModel): - def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=False, model_options={}): - textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_config_xxl.json") - t5xxl_scaled_fp8 = model_options.get("t5xxl_scaled_fp8", None) - if t5xxl_scaled_fp8 is not None: - model_options = model_options.copy() - model_options["scaled_fp8"] = t5xxl_scaled_fp8 - - super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config=textmodel_json_config, dtype=dtype, special_tokens={"end": 1, 
"pad": 0}, model_class=comfy.text_encoders.t5.T5, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options) - - -class ChromaT5XXL(sd1_clip.SD1ClipModel): - def __init__(self, device="cpu", dtype=None, model_options={}): - super().__init__(device=device, dtype=dtype, name="t5xxl", clip_model=T5XXLModel, model_options=model_options) - - -class T5XXLTokenizer(sd1_clip.SDTokenizer): - def __init__(self, embedding_directory=None, tokenizer_data={}): - tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_tokenizer") - super().__init__(tokenizer_path, embedding_directory=embedding_directory, pad_with_end=False, embedding_size=4096, embedding_key='t5xxl', tokenizer_class=T5TokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_data=tokenizer_data) - - -class ChromaT5Tokenizer(sd1_clip.SD1Tokenizer): - def __init__(self, embedding_directory=None, tokenizer_data={}): - super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="t5xxl", tokenizer=T5XXLTokenizer) - - -def chroma_te(dtype_t5=None, t5xxl_scaled_fp8=None): - class ChromaTEModel_(ChromaT5XXL): - def __init__(self, device="cpu", dtype=None, model_options={}): - if t5xxl_scaled_fp8 is not None and "t5xxl_scaled_fp8" not in model_options: - model_options = model_options.copy() - model_options["t5xxl_scaled_fp8"] = t5xxl_scaled_fp8 - if dtype is None: - dtype = dtype_t5 - super().__init__(device=device, dtype=dtype, model_options=model_options) - return ChromaTEModel_ diff --git a/comfy_extras/nodes_optimalsteps.py b/comfy_extras/nodes_optimalsteps.py index 102958734..e7c851ca2 100644 --- a/comfy_extras/nodes_optimalsteps.py +++ b/comfy_extras/nodes_optimalsteps.py @@ -20,7 +20,7 @@ def loglinear_interp(t_steps, num_steps): NOISE_LEVELS = {"FLUX": [0.9968, 0.9886, 0.9819, 0.975, 0.966, 0.9471, 0.9158, 0.8287, 0.5512, 0.2808, 0.001], "Wan":[1.0, 0.997, 0.995, 0.993, 0.991, 0.989, 0.987, 0.985, 0.98, 0.975, 0.973, 0.968, 0.96, 0.946, 0.927, 0.902, 0.864, 0.776, 0.539, 0.208, 0.001], -"Chroma": [0.9919999837875366, 0.9900000095367432, 0.9879999756813049, 0.9850000143051147, 0.9819999933242798, 0.9779999852180481, 0.9729999899864197, 0.9679999947547913, 0.9610000252723694, 0.953000009059906, 0.9430000185966492, 0.9309999942779541, 0.9169999957084656, 0.8999999761581421, 0.8809999823570251, 0.8579999804496765, 0.8320000171661377, 0.8019999861717224, 0.7689999938011169, 0.7310000061988831, 0.6899999976158142, 0.6460000276565552, 0.5989999771118164, 0.550000011920929, 0.5009999871253967, 0.45100000500679016, 0.4020000100135803, 0.35499998927116394, 0.3109999895095825, 0.27000001072883606, 0.23199999332427979, 0.19900000095367432, 0.16899999976158142, 0.14300000667572021, 0.11999999731779099, 0.10100000351667404, 0.08399999886751175, 0.07000000029802322, 0.057999998331069946, 0.04800000041723251, 0.0], +"Chroma": [0.992, 0.99, 0.988, 0.985, 0.982, 0.978, 0.973, 0.968, 0.961, 0.953, 0.943, 0.931, 0.917, 0.9, 0.881, 0.858, 0.832, 0.802, 0.769, 0.731, 0.69, 0.646, 0.599, 0.55, 0.501, 0.451, 0.402, 0.355, 0.311, 0.27, 0.232, 0.199, 0.169, 0.143, 0.12, 0.101, 0.084, 0.07, 0.058, 0.048, 0.001], } class OptimalStepsScheduler: From c6c19e99803a2538cee17e2f94314f77b7d48e98 Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Thu, 1 May 2025 00:24:32 -0700 Subject: [PATCH 05/16] fix bug (#7894) --- comfy_api_nodes/apis/client.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git 
a/comfy_api_nodes/apis/client.py b/comfy_api_nodes/apis/client.py index 384e559dc..d3cd9ad2f 100644 --- a/comfy_api_nodes/apis/client.py +++ b/comfy_api_nodes/apis/client.py @@ -297,6 +297,10 @@ class SynchronousOperation(Generic[T, R]): # Convert request model to dict, but use None for EmptyRequest request_dict = None if isinstance(self.request, EmptyRequest) else self.request.model_dump(exclude_none=True) + if request_dict: + for key, value in request_dict.items(): + if isinstance(value, Enum): + request_dict[key] = value.value # Debug log for request logging.debug(f"[DEBUG] API Request: {self.endpoint.method.value} {self.endpoint.path}") From aa9d759df36faa2e34cc5722463749a09a7f529b Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 1 May 2025 03:33:42 -0700 Subject: [PATCH 06/16] Switch ltxv to use the pytorch RMSNorm. (#7897) --- comfy/ldm/lightricks/model.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/comfy/ldm/lightricks/model.py b/comfy/ldm/lightricks/model.py index 6e8e06181..056e101a4 100644 --- a/comfy/ldm/lightricks/model.py +++ b/comfy/ldm/lightricks/model.py @@ -1,7 +1,6 @@ import torch from torch import nn import comfy.ldm.modules.attention -from comfy.ldm.genmo.joint_model.layers import RMSNorm import comfy.ldm.common_dit from einops import rearrange import math @@ -262,8 +261,8 @@ class CrossAttention(nn.Module): self.heads = heads self.dim_head = dim_head - self.q_norm = RMSNorm(inner_dim, dtype=dtype, device=device) - self.k_norm = RMSNorm(inner_dim, dtype=dtype, device=device) + self.q_norm = operations.RMSNorm(inner_dim, dtype=dtype, device=device) + self.k_norm = operations.RMSNorm(inner_dim, dtype=dtype, device=device) self.to_q = operations.Linear(query_dim, inner_dim, bias=True, dtype=dtype, device=device) self.to_k = operations.Linear(context_dim, inner_dim, bias=True, dtype=dtype, device=device) From 6d32dc049e676a1373dddc4fd912f2ba1311d4cb Mon Sep 17 00:00:00 2001 From: Chenlei Hu Date: Thu, 1 May 2025 10:57:54 -0400 Subject: [PATCH 07/16] Update frontend to v1.18 (#7898) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f64a05947..74a4ceb02 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.17.11 +comfyui-frontend-package==1.18.5 comfyui-workflow-templates==0.1.3 torch torchsde From 8d0661d0ba35fd54a72e4b49f68d4625febafc10 Mon Sep 17 00:00:00 2001 From: catboxanon <122327233+catboxanon@users.noreply.github.com> Date: Thu, 1 May 2025 19:32:04 -0400 Subject: [PATCH 08/16] Lint instance methods (#7903) --- comfy_extras/nodes_post_processing.py | 1 + comfy_extras/nodes_webcam.py | 2 +- pyproject.toml | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/comfy_extras/nodes_post_processing.py b/comfy_extras/nodes_post_processing.py index 5b9542015..cb1a0d883 100644 --- a/comfy_extras/nodes_post_processing.py +++ b/comfy_extras/nodes_post_processing.py @@ -141,6 +141,7 @@ class Quantize: CATEGORY = "image/postprocessing" + @staticmethod def bayer(im, pal_im, order): def normalized_bayer_matrix(n): if n == 0: diff --git a/comfy_extras/nodes_webcam.py b/comfy_extras/nodes_webcam.py index 31eddb2d6..062b15cf8 100644 --- a/comfy_extras/nodes_webcam.py +++ b/comfy_extras/nodes_webcam.py @@ -20,7 +20,7 @@ class WebcamCapture(nodes.LoadImage): CATEGORY = "image" - def load_capture(s, image, **kwargs): + def load_capture(self, image, **kwargs): return 
return super().load_image(folder_paths.get_annotated_filepath(image)) diff --git a/pyproject.toml b/pyproject.toml index eadca662e..a9c028c7e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,7 @@ documentation = "https://docs.comfy.org/" [tool.ruff] lint.select = [ + "N805", # invalid-first-argument-name-for-method "S307", # suspicious-eval-usage "S102", # exec "T", # print-usage From ff99861650a195234f2858b864663920ca811c55 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 2 May 2025 02:15:32 -0700 Subject: [PATCH 09/16] Make clipsave work with more TE models. (#7908) --- comfy_extras/nodes_model_merging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy_extras/nodes_model_merging.py b/comfy_extras/nodes_model_merging.py index 78d284889..f20beab7d 100644 --- a/comfy_extras/nodes_model_merging.py +++ b/comfy_extras/nodes_model_merging.py @@ -276,7 +276,7 @@ class CLIPSave: comfy.model_management.load_models_gpu([clip.load_model()], force_patch_weights=True) clip_sd = clip.get_sd() - for prefix in ["clip_l.", "clip_g.", ""]: + for prefix in ["clip_l.", "clip_g.", "clip_h.", "t5xxl.", "pile_t5xl.", "mt5xl.", "umt5xxl.", "t5base.", "gemma2_2b.", "llama.", "hydit_clip.", ""]: k = list(filter(lambda a: a.startswith(prefix), clip_sd.keys())) current_clip_sd = {} for x in k:
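PATCH 10 below adds an ExtendIntermediateSigmas node whose core is an eased spacing computed once and reused between every qualifying pair of adjacent sigmas. A condensed sketch of that interpolation with hypothetical values (note that, per the lambdas in the diff, the node's 'cosine' option is a sin() ease-out and 'sine' is a cos() ease-in):

import math
import torch

sigma_current, sigma_next = torch.tensor(14.6), torch.tensor(8.0)  # hypothetical adjacent sigmas
steps = 4
x = torch.linspace(0, 1, steps + 1)[1:-1]  # interior points only, as in the node
eased = torch.sin(x * math.pi / 2)         # the node's 'cosine' spacing
print(eased * (sigma_next - sigma_current) + sigma_current)
# approximately tensor([12.0743, 9.9331, 8.5024]): three new sigmas easing toward 8.0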
From 551fe8dceebb07abed486580d8326a6202d0cf7a Mon Sep 17 00:00:00 2001 From: catboxanon <122327233+catboxanon@users.noreply.github.com> Date: Fri, 2 May 2025 05:28:05 -0400 Subject: [PATCH 10/16] Add node to extend sigmas (#7901) * Add ExpandSigmas node * Rename, add interpolation functions Co-authored-by: liesen * Move computed interpolation outside loop * Add type hints --------- Co-authored-by: liesen --- comfy_extras/nodes_custom_sampler.py | 51 ++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index c9689b745..3e5be3d3c 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -1,3 +1,4 @@ +import math import comfy.samplers import comfy.sample from comfy.k_diffusion import sampling as k_diffusion_sampling @@ -249,6 +250,55 @@ class SetFirstSigma: sigmas[0] = sigma return (sigmas, ) +class ExtendIntermediateSigmas: + @classmethod + def INPUT_TYPES(s): + return {"required": + {"sigmas": ("SIGMAS", ), + "steps": ("INT", {"default": 2, "min": 1, "max": 100}), + "start_at_sigma": ("FLOAT", {"default": -1.0, "min": -1.0, "max": 20000.0, "step": 0.01, "round": False}), + "end_at_sigma": ("FLOAT", {"default": 12.0, "min": 0.0, "max": 20000.0, "step": 0.01, "round": False}), + "spacing": (['linear', 'cosine', 'sine'],), + } + } + RETURN_TYPES = ("SIGMAS",) + CATEGORY = "sampling/custom_sampling/sigmas" + + FUNCTION = "extend" + + def extend(self, sigmas: torch.Tensor, steps: int, start_at_sigma: float, end_at_sigma: float, spacing: str): + if start_at_sigma < 0: + start_at_sigma = float("inf") + + interpolator = { + 'linear': lambda x: x, + 'cosine': lambda x: torch.sin(x*math.pi/2), + 'sine': lambda x: 1 - torch.cos(x*math.pi/2) + }[spacing] + + # linear space for our interpolation function + x = torch.linspace(0, 1, steps + 1, device=sigmas.device)[1:-1] + computed_spacing = interpolator(x) + + extended_sigmas = [] + for i in range(len(sigmas) - 1): + sigma_current = sigmas[i] + sigma_next = sigmas[i+1] + + extended_sigmas.append(sigma_current) + + if end_at_sigma <= sigma_current <= start_at_sigma: + interpolated_steps = computed_spacing * (sigma_next - sigma_current) + sigma_current + extended_sigmas.extend(interpolated_steps.tolist()) + + # Add the last sigma value + if len(sigmas) > 0: + extended_sigmas.append(sigmas[-1]) + + extended_sigmas = torch.FloatTensor(extended_sigmas) + + return (extended_sigmas,) + class KSamplerSelect: @classmethod def INPUT_TYPES(s): @@ -735,6 +785,7 @@ NODE_CLASS_MAPPINGS = { "SplitSigmasDenoise": SplitSigmasDenoise, "FlipSigmas": FlipSigmas, "SetFirstSigma": SetFirstSigma, + "ExtendIntermediateSigmas": ExtendIntermediateSigmas, "CFGGuider": CFGGuider, "DualCFGGuider": DualCFGGuider, From d9a87c1e6a390eb0e915b544e35baf4d807919db Mon Sep 17 00:00:00 2001 From: catboxanon <122327233+catboxanon@users.noreply.github.com> Date: Fri, 2 May 2025 05:28:27 -0400 Subject: [PATCH 11/16] Fix outdated comment about Internet connectivity (#7827) --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index f3f56597a..5c21542b3 100644 --- a/main.py +++ b/main.py @@ -13,7 +13,7 @@ import logging import sys if __name__ == "__main__": - #NOTE: These do not do anything on core ComfyUI which should already have no communication with the internet, they are for custom nodes. + #NOTE: These do not do anything on core ComfyUI, they are for custom nodes. os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1' os.environ['DO_NOT_TRACK'] = '1' From 2ab9618732b738b52c96ae128597371b9b9fff96 Mon Sep 17 00:00:00 2001 From: Kohaku-Blueleaf <59680068+KohakuBlueleaf@users.noreply.github.com> Date: Sat, 3 May 2025 01:12:37 +0800 Subject: [PATCH 12/16] Fix the bugs in OFT/BOFT module (#7909) * Correct calculate_weight and load for OFT * Correct calculate_weight and loading for BOFT --- comfy/weight_adapter/boft.py | 34 +++++++++++++++++----------------- comfy/weight_adapter/oft.py | 20 +++++++++++--------- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/comfy/weight_adapter/boft.py b/comfy/weight_adapter/boft.py index c85adc7ab..b2a2f1bd4 100644 --- a/comfy/weight_adapter/boft.py +++ b/comfy/weight_adapter/boft.py @@ -24,7 +24,7 @@ class BOFTAdapter(WeightAdapterBase): ) -> Optional["BOFTAdapter"]: if loaded_keys is None: loaded_keys = set() - blocks_name = "{}.boft_blocks".format(x) + blocks_name = "{}.oft_blocks".format(x) rescale_name = "{}.rescale".format(x) blocks = None @@ -32,17 +32,18 @@ class BOFTAdapter(WeightAdapterBase): blocks = lora[blocks_name] if blocks.ndim == 4: loaded_keys.add(blocks_name) + else: + blocks = None + if blocks is None: + return None rescale = None if rescale_name in lora.keys(): rescale = lora[rescale_name] loaded_keys.add(rescale_name) - if blocks is not None: - weights = (blocks, rescale, alpha, dora_scale) - return cls(loaded_keys, weights) - else: - return None + weights = (blocks, rescale, alpha, dora_scale) + return cls(loaded_keys, weights) def calculate_weight( self, @@ -71,7 +72,7 @@ class BOFTAdapter(WeightAdapterBase): # Get r I = torch.eye(boft_b, device=blocks.device, dtype=blocks.dtype) # for Q = -Q^T - q = blocks - blocks.transpose(1, 2) + q = blocks - blocks.transpose(-1, -2) normed_q = q if alpha > 0: # alpha in boft/bboft is for constraint q_norm = torch.norm(q) + 1e-8 @@ -79,9 +80,8 @@ class BOFTAdapter(WeightAdapterBase): normed_q = q * alpha / q_norm # use float() to prevent unsupported type in .inverse() r = (I + normed_q) @ (I - normed_q).float().inverse() - r = r.to(original_weight) - - inp = org = original_weight + r = r.to(weight) + inp = org = weight r_b = boft_b//2
for i in range(boft_m): @@ -91,14 +91,14 @@ class BOFTAdapter(WeightAdapterBase): if strength != 1: bi = bi * strength + (1-strength) * I inp = ( - inp.unflatten(-1, (-1, g, k)) - .transpose(-2, -1) - .flatten(-3) - .unflatten(-1, (-1, boft_b)) + inp.unflatten(0, (-1, g, k)) + .transpose(1, 2) + .flatten(0, 2) + .unflatten(0, (-1, boft_b)) ) - inp = torch.einsum("b n m, b n ... -> b m ...", inp, bi) + inp = torch.einsum("b i j, b j ...-> b i ...", bi, inp) inp = ( - inp.flatten(-2).unflatten(-1, (-1, k, g)).transpose(-2, -1).flatten(-3) + inp.flatten(0, 1).unflatten(0, (-1, k, g)).transpose(1, 2).flatten(0, 2) ) if rescale is not None: @@ -109,7 +109,7 @@ class BOFTAdapter(WeightAdapterBase): if dora_scale is not None: weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function) else: - weight += function(((strength * alpha) * lora_diff).type(weight.dtype)) + weight += function((strength * lora_diff).type(weight.dtype)) except Exception as e: logging.error("ERROR {} {} {}".format(self.name, key, e)) return weight diff --git a/comfy/weight_adapter/oft.py b/comfy/weight_adapter/oft.py index 0ea229b79..25009eca3 100644 --- a/comfy/weight_adapter/oft.py +++ b/comfy/weight_adapter/oft.py @@ -32,17 +32,18 @@ class OFTAdapter(WeightAdapterBase): blocks = lora[blocks_name] if blocks.ndim == 3: loaded_keys.add(blocks_name) + else: + blocks = None + if blocks is None: + return None rescale = None if rescale_name in lora.keys(): rescale = lora[rescale_name] loaded_keys.add(rescale_name) - if blocks is not None: - weights = (blocks, rescale, alpha, dora_scale) - return cls(loaded_keys, weights) - else: - return None + weights = (blocks, rescale, alpha, dora_scale) + return cls(loaded_keys, weights) def calculate_weight( self, @@ -79,16 +80,17 @@ class OFTAdapter(WeightAdapterBase): normed_q = q * alpha / q_norm # use float() to prevent unsupported type in .inverse() r = (I + normed_q) @ (I - normed_q).float().inverse() - r = r.to(original_weight) + r = r.to(weight) + _, *shape = weight.shape lora_diff = torch.einsum( "k n m, k n ... -> k m ...", (r * strength) - strength * I, - original_weight, - ) + weight.view(block_num, block_size, *shape), + ).view(-1, *shape) if dora_scale is not None: weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function) else: - weight += function(((strength * alpha) * lora_diff).type(weight.dtype)) + weight += function((strength * lora_diff).type(weight.dtype)) except Exception as e: logging.error("ERROR {} {} {}".format(self.name, key, e)) return weight
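Worth noting why the (I + normed_q) @ (I - normed_q).inverse() construction in both adapters is safe: for a skew-symmetric q (q = -q^T, as built above), this Cayley transform always yields an orthogonal matrix, so the OFT/BOFT update rotates weights without rescaling them; dropping the extra alpha factor from the final weight update is consistent with alpha acting only as a norm constraint on normed_q. A standalone check of the orthogonality property (illustrative code, not part of ComfyUI):

import torch

blocks = torch.randn(8, 8)
q = blocks - blocks.T                    # skew-symmetric, mirroring both adapters
I = torch.eye(8)
r = (I + q) @ torch.linalg.inv(I - q)    # Cayley transform
print(torch.allclose(r @ r.T, I, atol=1e-4))  # True: r is orthogonal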
-> k m ...", (r * strength) - strength * I, - original_weight, - ) + weight.view(block_num, block_size, *shape), + ).view(-1, *shape) if dora_scale is not None: weight = weight_decompose(dora_scale, weight, lora_diff, alpha, strength, intermediate_dtype, function) else: - weight += function(((strength * alpha) * lora_diff).type(weight.dtype)) + weight += function((strength * lora_diff).type(weight.dtype)) except Exception as e: logging.error("ERROR {} {} {}".format(self.name, key, e)) return weight From 530494588d9e58a8bcaf55b471f4c3ce2b97ce65 Mon Sep 17 00:00:00 2001 From: Chenlei Hu Date: Fri, 2 May 2025 13:14:52 -0400 Subject: [PATCH 13/16] [BugFix] Update frontend 1.18.6 (#7910) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 74a4ceb02..05ceba00a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -comfyui-frontend-package==1.18.5 +comfyui-frontend-package==1.18.6 comfyui-workflow-templates==0.1.3 torch torchsde From 065d855f14968406051a1340e3f2f26461a00e5d Mon Sep 17 00:00:00 2001 From: Terry Jia Date: Fri, 2 May 2025 13:15:54 -0400 Subject: [PATCH 14/16] upstream Preview Any from rgthree-comfy (#7815) * upstream Preview Any from rgthree-comfy * use IO.ANY --- comfy_extras/nodes_preview_any.py | 43 +++++++++++++++++++++++++++++++ nodes.py | 1 + 2 files changed, 44 insertions(+) create mode 100644 comfy_extras/nodes_preview_any.py diff --git a/comfy_extras/nodes_preview_any.py b/comfy_extras/nodes_preview_any.py new file mode 100644 index 000000000..e6805696f --- /dev/null +++ b/comfy_extras/nodes_preview_any.py @@ -0,0 +1,43 @@ +import json +from comfy.comfy_types.node_typing import IO + +# Preview Any - original implement from +# https://github.com/rgthree/rgthree-comfy/blob/main/py/display_any.py +# upstream requested in https://github.com/Kosinkadink/rfcs/blob/main/rfcs/0000-corenodes.md#preview-nodes +class PreviewAny(): + @classmethod + def INPUT_TYPES(cls): + return { + "required": {"source": (IO.ANY, {})}, + } + + RETURN_TYPES = () + FUNCTION = "main" + OUTPUT_NODE = True + + CATEGORY = "utils" + + def main(self, source=None): + value = 'None' + if isinstance(source, str): + value = source + elif isinstance(source, (int, float, bool)): + value = str(source) + elif source is not None: + try: + value = json.dumps(source) + except Exception: + try: + value = str(source) + except Exception: + value = 'source exists, but could not be serialized.' + + return {"ui": {"text": (value,)}} + +NODE_CLASS_MAPPINGS = { + "PreviewAny": PreviewAny, +} + +NODE_DISPLAY_NAME_MAPPINGS = { + "PreviewAny": "Preview Any", +} diff --git a/nodes.py b/nodes.py index f2ced2c35..92b8ca6ae 100644 --- a/nodes.py +++ b/nodes.py @@ -2258,6 +2258,7 @@ def init_builtin_extra_nodes(): "nodes_optimalsteps.py", "nodes_hidream.py", "nodes_fresca.py", + "nodes_preview_any.py", ] api_nodes_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "comfy_api_nodes") From 486ad8fdc55495a705a211c22cc50bec9ec95643 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 2 May 2025 21:28:10 -0700 Subject: [PATCH 15/16] Fix updater issue with newer portable. 
From 486ad8fdc55495a705a211c22cc50bec9ec95643 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 2 May 2025 21:28:10 -0700 Subject: [PATCH 15/16] Fix updater issue with newer portable. (#7917) --- .ci/update_windows/update.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.ci/update_windows/update.py b/.ci/update_windows/update.py index 731b6bc53..51a263203 100755 --- a/.ci/update_windows/update.py +++ b/.ci/update_windows/update.py @@ -63,7 +63,12 @@ except: print("checking out master branch") # noqa: T201 branch = repo.lookup_branch('master') if branch is None: - ref = repo.lookup_reference('refs/remotes/origin/master') + try: + ref = repo.lookup_reference('refs/remotes/origin/master') + except: + print("pulling.") # noqa: T201 + pull(repo) + ref = repo.lookup_reference('refs/remotes/origin/master') repo.checkout(ref) branch = repo.lookup_branch('master') if branch is None: From 7689917113fe521adfaba2a4fff952ef1805ad2b Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Sat, 3 May 2025 00:34:01 -0400 Subject: [PATCH 16/16] ComfyUI version 0.3.31 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 67d27f942..8d2068de7 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.30" +__version__ = "0.3.31" diff --git a/pyproject.toml b/pyproject.toml index a9c028c7e..8b549a0b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.30" +version = "0.3.31" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9"
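For context on the PATCH 15 fix above: on newer portable builds the freshly created clone can lack a resolvable refs/remotes/origin/master, so the first lookup_reference raises and the updater now pulls before retrying. A condensed pygit2 sketch of that flow (the path is hypothetical, update.py's own pull() helper is approximated with a plain fetch, and the patch's bare except is narrowed here to KeyError, which is what pygit2 raises for a missing reference):

import pygit2

repo = pygit2.Repository("./ComfyUI")  # hypothetical clone path
if repo.lookup_branch("master") is None:
    try:
        ref = repo.lookup_reference("refs/remotes/origin/master")
    except KeyError:  # ref missing on a fresh clone
        repo.remotes["origin"].fetch()  # stand-in for update.py's pull(repo)
        ref = repo.lookup_reference("refs/remotes/origin/master")
    repo.checkout(ref)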