From 619b8cde74538a1dc62b85e47e34daa493705c06 Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Thu, 16 Jan 2025 14:54:48 -0500 Subject: [PATCH 1/5] Bump ComfyUI version to 0.3.11 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index 7cccc7535..fbe4747a7 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.10" +__version__ = "0.3.11" diff --git a/pyproject.toml b/pyproject.toml index b747d6ef7..db8967b6b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.10" +version = "0.3.11" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From cca96a85ae753c9a7de722884f43b81e4eb3abff Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Thu, 16 Jan 2025 16:30:06 -0500 Subject: [PATCH 2/5] Fix cosmos VAE failing with videos longer than 121 frames. --- comfy/ldm/cosmos/vae.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/comfy/ldm/cosmos/vae.py b/comfy/ldm/cosmos/vae.py index c8db68612..d64f292de 100644 --- a/comfy/ldm/cosmos/vae.py +++ b/comfy/ldm/cosmos/vae.py @@ -18,6 +18,7 @@ import logging import torch from torch import nn from enum import Enum +import math from .cosmos_tokenizer.layers3d import ( EncoderFactorized, @@ -105,17 +106,23 @@ class CausalContinuousVideoTokenizer(nn.Module): z, posteriors = self.distribution(moments) latent_ch = z.shape[1] latent_t = z.shape[2] - dtype = z.dtype - mean = self.latent_mean.view(latent_ch, -1)[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=dtype, device=z.device) - std = self.latent_std.view(latent_ch, -1)[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=dtype, device=z.device) + in_dtype = z.dtype + mean = self.latent_mean.view(latent_ch, -1) + std = self.latent_std.view(latent_ch, -1) + + mean = mean.repeat(1, math.ceil(latent_t / mean.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device) + std = std.repeat(1, math.ceil(latent_t / std.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device) return ((z - mean) / std) * self.sigma_data def decode(self, z): in_dtype = z.dtype latent_ch = z.shape[1] latent_t = z.shape[2] - mean = self.latent_mean.view(latent_ch, -1)[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device) - std = self.latent_std.view(latent_ch, -1)[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device) + mean = self.latent_mean.view(latent_ch, -1) + std = self.latent_std.view(latent_ch, -1) + + mean = mean.repeat(1, math.ceil(latent_t / mean.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device) + std = std.repeat(1, math.ceil(latent_t / std.shape[-1]))[:, : latent_t].reshape([1, latent_ch, -1, 1, 1]).to(dtype=in_dtype, device=z.device) z = z / self.sigma_data z = z * std + mean From 0aa2368e462664bb9a00e17660d786e69cc2e25c Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Thu, 16 Jan 2025 17:45:37 -0500 Subject: [PATCH 3/5] Fix some cosmos fp8 issues. --- comfy/ldm/cosmos/model.py | 2 +- comfy/ldm/cosmos/position_embedding.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/comfy/ldm/cosmos/model.py b/comfy/ldm/cosmos/model.py index 1205838b5..06d0baef3 100644 --- a/comfy/ldm/cosmos/model.py +++ b/comfy/ldm/cosmos/model.py @@ -293,7 +293,7 @@ class GeneralDIT(nn.Module): x_B_T_H_W_D = self.x_embedder(x_B_C_T_H_W) if self.extra_per_block_abs_pos_emb: - extra_pos_emb = self.extra_pos_embedder(x_B_T_H_W_D, fps=fps, device=x_B_C_T_H_W.device) + extra_pos_emb = self.extra_pos_embedder(x_B_T_H_W_D, fps=fps, device=x_B_C_T_H_W.device, dtype=x_B_C_T_H_W.dtype) else: extra_pos_emb = None diff --git a/comfy/ldm/cosmos/position_embedding.py b/comfy/ldm/cosmos/position_embedding.py index cf45ab0e3..4d6a58dba 100644 --- a/comfy/ldm/cosmos/position_embedding.py +++ b/comfy/ldm/cosmos/position_embedding.py @@ -41,12 +41,12 @@ def normalize(x: torch.Tensor, dim: Optional[List[int]] = None, eps: float = 0) class VideoPositionEmb(nn.Module): - def forward(self, x_B_T_H_W_C: torch.Tensor, fps=Optional[torch.Tensor], device=None) -> torch.Tensor: + def forward(self, x_B_T_H_W_C: torch.Tensor, fps=Optional[torch.Tensor], device=None, dtype=None) -> torch.Tensor: """ It delegates the embedding generation to generate_embeddings function. """ B_T_H_W_C = x_B_T_H_W_C.shape - embeddings = self.generate_embeddings(B_T_H_W_C, fps=fps, device=device) + embeddings = self.generate_embeddings(B_T_H_W_C, fps=fps, device=device, dtype=dtype) return embeddings @@ -104,6 +104,7 @@ class VideoRopePosition3DEmb(VideoPositionEmb): w_ntk_factor: Optional[float] = None, t_ntk_factor: Optional[float] = None, device=None, + dtype=None, ): """ Generate embeddings for the given input size. @@ -189,13 +190,12 @@ class LearnablePosEmbAxis(VideoPositionEmb): self.pos_emb_w = nn.Parameter(torch.empty(len_w, model_channels, device=device, dtype=dtype)) self.pos_emb_t = nn.Parameter(torch.empty(len_t, model_channels, device=device, dtype=dtype)) - - def generate_embeddings(self, B_T_H_W_C: torch.Size, fps=Optional[torch.Tensor], device=None) -> torch.Tensor: + def generate_embeddings(self, B_T_H_W_C: torch.Size, fps=Optional[torch.Tensor], device=None, dtype=None) -> torch.Tensor: B, T, H, W, _ = B_T_H_W_C if self.interpolation == "crop": - emb_h_H = self.pos_emb_h[:H].to(device=device) - emb_w_W = self.pos_emb_w[:W].to(device=device) - emb_t_T = self.pos_emb_t[:T].to(device=device) + emb_h_H = self.pos_emb_h[:H].to(device=device, dtype=dtype) + emb_w_W = self.pos_emb_w[:W].to(device=device, dtype=dtype) + emb_t_T = self.pos_emb_t[:T].to(device=device, dtype=dtype) emb = ( repeat(emb_t_T, "t d-> b t h w d", b=B, h=H, w=W) + repeat(emb_h_H, "h d-> b t h w d", b=B, t=T, w=W) From 55add502206ed5511a04215db4ab8f1cfa3d99ae Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Thu, 16 Jan 2025 18:11:57 -0500 Subject: [PATCH 4/5] Bump ComfyUI version to v0.3.12 --- comfyui_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comfyui_version.py b/comfyui_version.py index fbe4747a7..411243f6c 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.11" +__version__ = "0.3.12" diff --git a/pyproject.toml b/pyproject.toml index db8967b6b..0198d1b08 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.11" +version = "0.3.12" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" From 7fc3ccdcc2fb1f20c4b7dd4aca374db952fd66df Mon Sep 17 00:00:00 2001 From: comfyanonymous Date: Thu, 16 Jan 2025 21:17:18 -0500 Subject: [PATCH 5/5] Add that nvidia cosmos is supported to the README. --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 000d76801..fd21f5624 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,7 @@ This ui will let you design and execute advanced stable diffusion pipelines usin - [Mochi](https://comfyanonymous.github.io/ComfyUI_examples/mochi/) - [LTX-Video](https://comfyanonymous.github.io/ComfyUI_examples/ltxv/) - [Hunyuan Video](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_video/) + - [Nvidia Cosmos](https://comfyanonymous.github.io/ComfyUI_examples/cosmos/) - [Stable Audio](https://comfyanonymous.github.io/ComfyUI_examples/audio/) - Asynchronous Queue system - Many optimizations: Only re-executes the parts of the workflow that changes between executions.