From 4b77c4941c32aec4ff75ca6083383566afe7d4e8 Mon Sep 17 00:00:00 2001 From: doctorpangloss <@hiddenswitch.com> Date: Fri, 22 Nov 2024 17:13:19 -0800 Subject: [PATCH] LTXV tests --- comfy/model_downloader.py | 1 + comfy/nodes/base_nodes.py | 4 +- comfy/sd.py | 8 +- comfy_extras/nodes/nodes_images.py | 7 +- tests/inference/workflows/ltxv-0.json | 170 ++++++++++++++++++++ tests/inference/workflows/ltxv-1.json | 214 ++++++++++++++++++++++++++ tests/unit/test_base_nodes.py | 2 +- 7 files changed, 398 insertions(+), 8 deletions(-) create mode 100644 tests/inference/workflows/ltxv-0.json create mode 100644 tests/inference/workflows/ltxv-1.json diff --git a/comfy/model_downloader.py b/comfy/model_downloader.py index 1cfa8d4ee..fba8e9742 100644 --- a/comfy/model_downloader.py +++ b/comfy/model_downloader.py @@ -260,6 +260,7 @@ KNOWN_CHECKPOINTS: Final[KnownDownloadables] = KnownDownloadables([ HuggingFile("lllyasviel/flux1-dev-bnb-nf4", "flux1-dev-bnb-nf4.safetensors"), HuggingFile("lllyasviel/flux1-dev-bnb-nf4", "flux1-dev-bnb-nf4-v2.safetensors"), HuggingFile("silveroxides/flux1-nf4-weights", "flux1-schnell-bnb-nf4.safetensors"), + HuggingFile("Lightricks/LTX-Video", "ltx-video-2b-v0.9.safetensors"), ], folder_name="checkpoints") KNOWN_UNCLIP_CHECKPOINTS: Final[KnownDownloadables] = KnownDownloadables([ diff --git a/comfy/nodes/base_nodes.py b/comfy/nodes/base_nodes.py index 7b384fb5e..25e4085aa 100644 --- a/comfy/nodes/base_nodes.py +++ b/comfy/nodes/base_nodes.py @@ -391,7 +391,7 @@ class InpaintModelConditioning: CATEGORY = "conditioning/inpaint" - def encode(self, positive, negative, pixels, vae, mask, noise_mask): + def encode(self, positive, negative, pixels, vae, mask, noise_mask=True): x = (pixels.shape[1] // 8) * 8 y = (pixels.shape[2] // 8) * 8 mask = torch.nn.functional.interpolate(mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])), size=(pixels.shape[1], pixels.shape[2]), mode="bilinear") @@ -947,7 +947,7 @@ class CLIPLoader: elif type == "mochi": 
clip_type = sd.CLIPType.MOCHI elif type == "ltxv": - clip_type = comfy.sd.CLIPType.LTXV + clip_type = sd.CLIPType.LTXV else: logging.warning(f"Unknown clip type argument passed: {type} for model {clip_name}") diff --git a/comfy/sd.py b/comfy/sd.py index 07f1b4ab2..a47c50534 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -21,12 +21,14 @@ from . import model_sampling from . import sd1_clip from . import sdxl_clip from . import utils +from . import lora from .ldm.audio.autoencoder import AudioOobleckVAE from .ldm.cascade.stage_a import StageA from .ldm.cascade.stage_c_coder import StageC_coder -from .ldm.genmo.vae import model as genmo +from .ldm.genmo.vae import model as genmo_model from .ldm.lightricks.vae import causal_video_autoencoder as lightricks from .ldm.models.autoencoder import AutoencoderKL, AutoencodingEngine +from .lora_convert import convert_lora from .model_management import load_models_gpu from .t2i_adapter import adapter from .taesd import taesd @@ -47,7 +49,7 @@ def load_lora_for_models(model, clip, _lora, strength_model, strength_clip): if clip is not None: key_map = lora.model_lora_keys_clip(clip.cond_stage_model, key_map) - lora = comfy.lora_convert.convert_lora(lora) + _lora = convert_lora(_lora) loaded = lora.load_lora(_lora, key_map) if model is not None: new_modelpatcher = model.clone() @@ -265,7 +267,7 @@ class VAE: sd = utils.state_dict_prefix_replace(sd, {"": "decoder."}) if "layers.4.layers.1.attn_block.attn.qkv.weight" in sd: sd = utils.state_dict_prefix_replace(sd, {"": "encoder."}) - self.first_stage_model = genmo.VideoVAE() + self.first_stage_model = genmo_model.VideoVAE() self.latent_channels = 12 self.latent_dim = 3 self.memory_used_decode = lambda shape, dtype: (1000 * shape[2] * shape[3] * shape[4] * (6 * 8 * 8)) * model_management.dtype_size(dtype) diff --git a/comfy_extras/nodes/nodes_images.py b/comfy_extras/nodes/nodes_images.py index 5f281e641..151be6a84 100644 --- a/comfy_extras/nodes/nodes_images.py +++ 
b/comfy_extras/nodes/nodes_images.py @@ -57,7 +57,6 @@ def levels_adjustment(image: ImageBatch, black_level: float = 0.0, mid_level: fl return result - class ImageCrop: @classmethod def INPUT_TYPES(s): @@ -272,7 +271,7 @@ class ImageResize: "required": { "image": ("IMAGE",), "resize_mode": (["cover", "contain", "auto"], {"default": "cover"}), - "resolutions": (["SDXL/SD3/Flux", "SD1.5"], {"default": "SDXL/SD3/Flux"}), + "resolutions": (["SDXL/SD3/Flux", "SD1.5", "LTXV"], {"default": "SDXL/SD3/Flux"}), "interpolation": (ImageScale.upscale_methods, {"default": "bilinear"}), } } @@ -294,6 +293,10 @@ class ImageResize: (1344, 768), (1536, 640), ] + elif resolutions == "LTXV": + supported_resolutions = [ + (768, 512) + ] else: supported_resolutions = [ (512, 512), diff --git a/tests/inference/workflows/ltxv-0.json b/tests/inference/workflows/ltxv-0.json new file mode 100644 index 000000000..232fd96ae --- /dev/null +++ b/tests/inference/workflows/ltxv-0.json @@ -0,0 +1,170 @@ +{ + "6": { + "inputs": { + "text": "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. 
The scene appears to be real-life footage.", + "clip": [ + "38", + 0 + ] + }, + "class_type": "CLIPTextEncode", + "_meta": { + "title": "CLIP Text Encode (Positive Prompt)" + } + }, + "7": { + "inputs": { + "text": "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly", + "clip": [ + "38", + 0 + ] + }, + "class_type": "CLIPTextEncode", + "_meta": { + "title": "CLIP Text Encode (Negative Prompt)" + } + }, + "8": { + "inputs": { + "samples": [ + "72", + 0 + ], + "vae": [ + "44", + 2 + ] + }, + "class_type": "VAEDecode", + "_meta": { + "title": "VAE Decode" + } + }, + "38": { + "inputs": { + "clip_name": "t5xxl_fp16.safetensors", + "type": "ltxv" + }, + "class_type": "CLIPLoader", + "_meta": { + "title": "Load CLIP" + } + }, + "41": { + "inputs": { + "filename_prefix": "ComfyUI", + "fps": 24, + "lossless": false, + "quality": 90, + "method": "default", + "images": [ + "8", + 0 + ] + }, + "class_type": "SaveAnimatedWEBP", + "_meta": { + "title": "SaveAnimatedWEBP" + } + }, + "44": { + "inputs": { + "ckpt_name": "ltx-video-2b-v0.9.safetensors" + }, + "class_type": "CheckpointLoaderSimple", + "_meta": { + "title": "Load Checkpoint" + } + }, + "69": { + "inputs": { + "frame_rate": 25, + "positive": [ + "6", + 0 + ], + "negative": [ + "7", + 0 + ] + }, + "class_type": "LTXVConditioning", + "_meta": { + "title": "LTXVConditioning" + } + }, + "70": { + "inputs": { + "width": 768, + "height": 512, + "length": 9, + "batch_size": 1 + }, + "class_type": "EmptyLTXVLatentVideo", + "_meta": { + "title": "EmptyLTXVLatentVideo" + } + }, + "71": { + "inputs": { + "steps": 30, + "max_shift": 2.05, + "base_shift": 0.95, + "stretch": true, + "terminal": 0.1, + "latent": [ + "70", + 0 + ] + }, + "class_type": "LTXVScheduler", + "_meta": { + "title": "LTXVScheduler" + } + }, + "72": { + "inputs": { + "add_noise": true, + "noise_seed": 782373264718055, + "cfg": 3, + "model": [ + "44", + 0 + ], + "positive": 
[ + "69", + 0 + ], + "negative": [ + "69", + 1 + ], + "sampler": [ + "73", + 0 + ], + "sigmas": [ + "71", + 0 + ], + "latent_image": [ + "70", + 0 + ] + }, + "class_type": "SamplerCustom", + "_meta": { + "title": "SamplerCustom" + } + }, + "73": { + "inputs": { + "sampler_name": "euler" + }, + "class_type": "KSamplerSelect", + "_meta": { + "title": "KSamplerSelect" + } + } +} \ No newline at end of file diff --git a/tests/inference/workflows/ltxv-1.json b/tests/inference/workflows/ltxv-1.json new file mode 100644 index 000000000..fa330bd7b --- /dev/null +++ b/tests/inference/workflows/ltxv-1.json @@ -0,0 +1,214 @@ +{ + "6": { + "inputs": { + "text": "best quality, 4k, HDR, a tracking shot of a beautiful scene of the sea waves on the beach with a massive explosion in the water", + "clip": [ + "38", + 0 + ] + }, + "class_type": "CLIPTextEncode", + "_meta": { + "title": "CLIP Text Encode (Positive Prompt)" + } + }, + "7": { + "inputs": { + "text": "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly", + "clip": [ + "38", + 0 + ] + }, + "class_type": "CLIPTextEncode", + "_meta": { + "title": "CLIP Text Encode (Negative Prompt)" + } + }, + "8": { + "inputs": { + "samples": [ + "72", + 0 + ], + "vae": [ + "44", + 2 + ] + }, + "class_type": "VAEDecode", + "_meta": { + "title": "VAE Decode" + } + }, + "38": { + "inputs": { + "clip_name": "t5xxl_fp16.safetensors", + "type": "ltxv" + }, + "class_type": "CLIPLoader", + "_meta": { + "title": "Load CLIP" + } + }, + "41": { + "inputs": { + "filename_prefix": "ComfyUI", + "fps": 24, + "lossless": false, + "quality": 90, + "method": "default", + "images": [ + "8", + 0 + ] + }, + "class_type": "SaveAnimatedWEBP", + "_meta": { + "title": "SaveAnimatedWEBP" + } + }, + "44": { + "inputs": { + "ckpt_name": "ltx-video-2b-v0.9.safetensors" + }, + "class_type": "CheckpointLoaderSimple", + "_meta": { + "title": "Load Checkpoint" + } + }, + "69": { + 
"inputs": { + "frame_rate": 25, + "positive": [ + "77", + 0 + ], + "negative": [ + "77", + 1 + ] + }, + "class_type": "LTXVConditioning", + "_meta": { + "title": "LTXVConditioning" + } + }, + "71": { + "inputs": { + "steps": 30, + "max_shift": 2.05, + "base_shift": 0.95, + "stretch": true, + "terminal": 0.1, + "latent": [ + "77", + 2 + ] + }, + "class_type": "LTXVScheduler", + "_meta": { + "title": "LTXVScheduler" + } + }, + "72": { + "inputs": { + "add_noise": true, + "noise_seed": 775174607420296, + "cfg": 3, + "model": [ + "44", + 0 + ], + "positive": [ + "69", + 0 + ], + "negative": [ + "69", + 1 + ], + "sampler": [ + "73", + 0 + ], + "sigmas": [ + "71", + 0 + ], + "latent_image": [ + "77", + 2 + ] + }, + "class_type": "SamplerCustom", + "_meta": { + "title": "SamplerCustom" + } + }, + "73": { + "inputs": { + "sampler_name": "euler" + }, + "class_type": "KSamplerSelect", + "_meta": { + "title": "KSamplerSelect" + } + }, + "77": { + "inputs": { + "width": 768, + "height": 512, + "length": 97, + "batch_size": 1, + "positive": [ + "6", + 0 + ], + "negative": [ + "7", + 0 + ], + "vae": [ + "44", + 2 + ], + "image": [ + "80", + 0 + ] + }, + "class_type": "LTXVImgToVideo", + "_meta": { + "title": "LTXVImgToVideo" + } + }, + "79": { + "inputs": { + "value": "https://upload.wikimedia.org/wikipedia/commons/5/51/Havelock_Island%2C_Mangrove_tree_on_the_beach%2C_Andaman_Islands.jpg", + "name": "", + "title": "", + "description": "", + "__required": true + }, + "class_type": "ImageRequestParameter", + "_meta": { + "title": "ImageRequestParameter" + } + }, + "80": { + "inputs": { + "resize_mode": "cover", + "resolutions": "LTXV", + "interpolation": "bilinear", + "image": [ + "79", + 0 + ] + }, + "class_type": "ImageResize", + "_meta": { + "title": "Fit Image to Diffusion Size" + } + } +} \ No newline at end of file diff --git a/tests/unit/test_base_nodes.py b/tests/unit/test_base_nodes.py index 0d018172f..7b4c218ee 100644 --- a/tests/unit/test_base_nodes.py +++ 
b/tests/unit/test_base_nodes.py @@ -81,7 +81,7 @@ def test_inpaint_model_conditioning(model, vae, clip, has_gpu): pytest.skip("requires gpu for performant testing") cond_pos, = CLIPTextEncode().encode(clip, "test prompt") cond_neg, = CLIPTextEncode().encode(clip, "test negative prompt") - pos, neg, latent = InpaintModelConditioning().encode(cond_pos, cond_neg, _image_512x512, vae, torch.ones((1, 512, 512))) + pos, neg, latent = InpaintModelConditioning().encode(cond_pos, cond_neg, _image_512x512, vae, torch.ones((1, 512, 512)), noise_mask=True) assert len(pos) == len(cond_pos) assert len(neg) == len(cond_neg) assert "samples" in latent