From 4b77c4941c32aec4ff75ca6083383566afe7d4e8 Mon Sep 17 00:00:00 2001
From: doctorpangloss <@hiddenswitch.com>
Date: Fri, 22 Nov 2024 17:13:19 -0800
Subject: [PATCH] LTXV tests

---
 comfy/model_downloader.py             |   1 +
 comfy/nodes/base_nodes.py             |   4 +-
 comfy/sd.py                           |   8 +-
 comfy_extras/nodes/nodes_images.py    |   7 +-
 tests/inference/workflows/ltxv-0.json | 170 ++++++++++++++++++++
 tests/inference/workflows/ltxv-1.json | 214 ++++++++++++++++++++++++++
 tests/unit/test_base_nodes.py         |   2 +-
 7 files changed, 398 insertions(+), 8 deletions(-)
 create mode 100644 tests/inference/workflows/ltxv-0.json
 create mode 100644 tests/inference/workflows/ltxv-1.json

diff --git a/comfy/model_downloader.py b/comfy/model_downloader.py
index 1cfa8d4ee..fba8e9742 100644
--- a/comfy/model_downloader.py
+++ b/comfy/model_downloader.py
@@ -260,6 +260,7 @@ KNOWN_CHECKPOINTS: Final[KnownDownloadables] = KnownDownloadables([
     HuggingFile("lllyasviel/flux1-dev-bnb-nf4", "flux1-dev-bnb-nf4.safetensors"),
     HuggingFile("lllyasviel/flux1-dev-bnb-nf4", "flux1-dev-bnb-nf4-v2.safetensors"),
     HuggingFile("silveroxides/flux1-nf4-weights", "flux1-schnell-bnb-nf4.safetensors"),
+    HuggingFile("Lightricks/LTX-Video", "ltx-video-2b-v0.9.safetensors"),
 ], folder_name="checkpoints")
 
 KNOWN_UNCLIP_CHECKPOINTS: Final[KnownDownloadables] = KnownDownloadables([
diff --git a/comfy/nodes/base_nodes.py b/comfy/nodes/base_nodes.py
index 7b384fb5e..25e4085aa 100644
--- a/comfy/nodes/base_nodes.py
+++ b/comfy/nodes/base_nodes.py
@@ -391,7 +391,7 @@ class InpaintModelConditioning:
 
     CATEGORY = "conditioning/inpaint"
 
-    def encode(self, positive, negative, pixels, vae, mask, noise_mask):
+    def encode(self, positive, negative, pixels, vae, mask, noise_mask=True):
         x = (pixels.shape[1] // 8) * 8
         y = (pixels.shape[2] // 8) * 8
         mask = torch.nn.functional.interpolate(mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])), size=(pixels.shape[1], pixels.shape[2]), mode="bilinear")
@@ -947,7 +947,7 @@ class CLIPLoader:
         elif type == "mochi":
             clip_type = sd.CLIPType.MOCHI
         elif type == "ltxv":
-            clip_type = comfy.sd.CLIPType.LTXV
+            clip_type = sd.CLIPType.LTXV
         else:
             logging.warning(f"Unknown clip type argument passed: {type} for model {clip_name}")
 
diff --git a/comfy/sd.py b/comfy/sd.py
index 07f1b4ab2..a47c50534 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -21,12 +21,14 @@ from . import model_sampling
 from . import sd1_clip
 from . import sdxl_clip
 from . import utils
+from . import lora
 from .ldm.audio.autoencoder import AudioOobleckVAE
 from .ldm.cascade.stage_a import StageA
 from .ldm.cascade.stage_c_coder import StageC_coder
-from .ldm.genmo.vae import model as genmo
+from .ldm.genmo.vae import model as genmo_model
 from .ldm.lightricks.vae import causal_video_autoencoder as lightricks
 from .ldm.models.autoencoder import AutoencoderKL, AutoencodingEngine
+from .lora_convert import convert_lora
 from .model_management import load_models_gpu
 from .t2i_adapter import adapter
 from .taesd import taesd
@@ -47,7 +49,7 @@ def load_lora_for_models(model, clip, _lora, strength_model, strength_clip):
     if clip is not None:
         key_map = lora.model_lora_keys_clip(clip.cond_stage_model, key_map)
 
-    lora = comfy.lora_convert.convert_lora(lora)
+    _lora = convert_lora(_lora)
     loaded = lora.load_lora(_lora, key_map)
     if model is not None:
         new_modelpatcher = model.clone()
@@ -265,7 +267,7 @@ class VAE:
                     sd = utils.state_dict_prefix_replace(sd, {"": "decoder."})
                 if "layers.4.layers.1.attn_block.attn.qkv.weight" in sd:
                     sd = utils.state_dict_prefix_replace(sd, {"": "encoder."})
-                self.first_stage_model = genmo.VideoVAE()
+                self.first_stage_model = genmo_model.VideoVAE()
                 self.latent_channels = 12
                 self.latent_dim = 3
                 self.memory_used_decode = lambda shape, dtype: (1000 * shape[2] * shape[3] * shape[4] * (6 * 8 * 8)) * model_management.dtype_size(dtype)
diff --git a/comfy_extras/nodes/nodes_images.py b/comfy_extras/nodes/nodes_images.py
index 5f281e641..151be6a84 100644
--- a/comfy_extras/nodes/nodes_images.py
+++ b/comfy_extras/nodes/nodes_images.py
@@ -57,7 +57,6 @@ def levels_adjustment(image: ImageBatch, black_level: float = 0.0, mid_level: fl
     return result
 
 
-
 class ImageCrop:
     @classmethod
     def INPUT_TYPES(s):
@@ -272,7 +271,7 @@ class ImageResize:
             "required": {
                 "image": ("IMAGE",),
                 "resize_mode": (["cover", "contain", "auto"], {"default": "cover"}),
-                "resolutions": (["SDXL/SD3/Flux", "SD1.5"], {"default": "SDXL/SD3/Flux"}),
+                "resolutions": (["SDXL/SD3/Flux", "SD1.5", "LTXV"], {"default": "SDXL/SD3/Flux"}),
                 "interpolation": (ImageScale.upscale_methods, {"default": "bilinear"}),
             }
         }
@@ -294,6 +293,10 @@ class ImageResize:
                 (1344, 768),
                 (1536, 640),
             ]
+        elif resolutions.lower() == "ltxv":  # case-insensitive: the INPUT_TYPES choice and the workflows pass "LTXV"
+            supported_resolutions = [
+                (768, 512)
+            ]
         else:
             supported_resolutions = [
                 (512, 512),
diff --git a/tests/inference/workflows/ltxv-0.json b/tests/inference/workflows/ltxv-0.json
new file mode 100644
index 000000000..232fd96ae
--- /dev/null
+++ b/tests/inference/workflows/ltxv-0.json
@@ -0,0 +1,170 @@
+{
+  "6": {
+    "inputs": {
+      "text": "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
+      "clip": [
+        "38",
+        0
+      ]
+    },
+    "class_type": "CLIPTextEncode",
+    "_meta": {
+      "title": "CLIP Text Encode (Positive Prompt)"
+    }
+  },
+  "7": {
+    "inputs": {
+      "text": "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
+      "clip": [
+        "38",
+        0
+      ]
+    },
+    "class_type": "CLIPTextEncode",
+    "_meta": {
+      "title": "CLIP Text Encode (Negative Prompt)"
+    }
+  },
+  "8": {
+    "inputs": {
+      "samples": [
+        "72",
+        0
+      ],
+      "vae": [
+        "44",
+        2
+      ]
+    },
+    "class_type": "VAEDecode",
+    "_meta": {
+      "title": "VAE Decode"
+    }
+  },
+  "38": {
+    "inputs": {
+      "clip_name": "t5xxl_fp16.safetensors",
+      "type": "ltxv"
+    },
+    "class_type": "CLIPLoader",
+    "_meta": {
+      "title": "Load CLIP"
+    }
+  },
+  "41": {
+    "inputs": {
+      "filename_prefix": "ComfyUI",
+      "fps": 24,
+      "lossless": false,
+      "quality": 90,
+      "method": "default",
+      "images": [
+        "8",
+        0
+      ]
+    },
+    "class_type": "SaveAnimatedWEBP",
+    "_meta": {
+      "title": "SaveAnimatedWEBP"
+    }
+  },
+  "44": {
+    "inputs": {
+      "ckpt_name": "ltx-video-2b-v0.9.safetensors"
+    },
+    "class_type": "CheckpointLoaderSimple",
+    "_meta": {
+      "title": "Load Checkpoint"
+    }
+  },
+  "69": {
+    "inputs": {
+      "frame_rate": 25,
+      "positive": [
+        "6",
+        0
+      ],
+      "negative": [
+        "7",
+        0
+      ]
+    },
+    "class_type": "LTXVConditioning",
+    "_meta": {
+      "title": "LTXVConditioning"
+    }
+  },
+  "70": {
+    "inputs": {
+      "width": 768,
+      "height": 512,
+      "length": 9,
+      "batch_size": 1
+    },
+    "class_type": "EmptyLTXVLatentVideo",
+    "_meta": {
+      "title": "EmptyLTXVLatentVideo"
+    }
+  },
+  "71": {
+    "inputs": {
+      "steps": 30,
+      "max_shift": 2.05,
+      "base_shift": 0.95,
+      "stretch": true,
+      "terminal": 0.1,
+      "latent": [
+        "70",
+        0
+      ]
+    },
+    "class_type": "LTXVScheduler",
+    "_meta": {
+      "title": "LTXVScheduler"
+    }
+  },
+  "72": {
+    "inputs": {
+      "add_noise": true,
+      "noise_seed": 782373264718055,
+      "cfg": 3,
+      "model": [
+        "44",
+        0
+      ],
+      "positive": [
+        "69",
+        0
+      ],
+      "negative": [
+        "69",
+        1
+      ],
+      "sampler": [
+        "73",
+        0
+      ],
+      "sigmas": [
+        "71",
+        0
+      ],
+      "latent_image": [
+        "70",
+        0
+      ]
+    },
+    "class_type": "SamplerCustom",
+    "_meta": {
+      "title": "SamplerCustom"
+    }
+  },
+  "73": {
+    "inputs": {
+      "sampler_name": "euler"
+    },
+    "class_type": "KSamplerSelect",
+    "_meta": {
+      "title": "KSamplerSelect"
+    }
+  }
+}
\ No newline at end of file
diff --git a/tests/inference/workflows/ltxv-1.json b/tests/inference/workflows/ltxv-1.json
new file mode 100644
index 000000000..fa330bd7b
--- /dev/null
+++ b/tests/inference/workflows/ltxv-1.json
@@ -0,0 +1,214 @@
+{
+  "6": {
+    "inputs": {
+      "text": "best quality, 4k, HDR, a tracking shot of a beautiful scene of the sea waves on the beach with a massive explosion in the water",
+      "clip": [
+        "38",
+        0
+      ]
+    },
+    "class_type": "CLIPTextEncode",
+    "_meta": {
+      "title": "CLIP Text Encode (Positive Prompt)"
+    }
+  },
+  "7": {
+    "inputs": {
+      "text": "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
+      "clip": [
+        "38",
+        0
+      ]
+    },
+    "class_type": "CLIPTextEncode",
+    "_meta": {
+      "title": "CLIP Text Encode (Negative Prompt)"
+    }
+  },
+  "8": {
+    "inputs": {
+      "samples": [
+        "72",
+        0
+      ],
+      "vae": [
+        "44",
+        2
+      ]
+    },
+    "class_type": "VAEDecode",
+    "_meta": {
+      "title": "VAE Decode"
+    }
+  },
+  "38": {
+    "inputs": {
+      "clip_name": "t5xxl_fp16.safetensors",
+      "type": "ltxv"
+    },
+    "class_type": "CLIPLoader",
+    "_meta": {
+      "title": "Load CLIP"
+    }
+  },
+  "41": {
+    "inputs": {
+      "filename_prefix": "ComfyUI",
+      "fps": 24,
+      "lossless": false,
+      "quality": 90,
+      "method": "default",
+      "images": [
+        "8",
+        0
+      ]
+    },
+    "class_type": "SaveAnimatedWEBP",
+    "_meta": {
+      "title": "SaveAnimatedWEBP"
+    }
+  },
+  "44": {
+    "inputs": {
+      "ckpt_name": "ltx-video-2b-v0.9.safetensors"
+    },
+    "class_type": "CheckpointLoaderSimple",
+    "_meta": {
+      "title": "Load Checkpoint"
+    }
+  },
+  "69": {
+    "inputs": {
+      "frame_rate": 25,
+      "positive": [
+        "77",
+        0
+      ],
+      "negative": [
+        "77",
+        1
+      ]
+    },
+    "class_type": "LTXVConditioning",
+    "_meta": {
+      "title": "LTXVConditioning"
+    }
+  },
+  "71": {
+    "inputs": {
+      "steps": 30,
+      "max_shift": 2.05,
+      "base_shift": 0.95,
+      "stretch": true,
+      "terminal": 0.1,
+      "latent": [
+        "77",
+        2
+      ]
+    },
+    "class_type": "LTXVScheduler",
+    "_meta": {
+      "title": "LTXVScheduler"
+    }
+  },
+  "72": {
+    "inputs": {
+      "add_noise": true,
+      "noise_seed": 775174607420296,
+      "cfg": 3,
+      "model": [
+        "44",
+        0
+      ],
+      "positive": [
+        "69",
+        0
+      ],
+      "negative": [
+        "69",
+        1
+      ],
+      "sampler": [
+        "73",
+        0
+      ],
+      "sigmas": [
+        "71",
+        0
+      ],
+      "latent_image": [
+        "77",
+        2
+      ]
+    },
+    "class_type": "SamplerCustom",
+    "_meta": {
+      "title": "SamplerCustom"
+    }
+  },
+  "73": {
+    "inputs": {
+      "sampler_name": "euler"
+    },
+    "class_type": "KSamplerSelect",
+    "_meta": {
+      "title": "KSamplerSelect"
+    }
+  },
+  "77": {
+    "inputs": {
+      "width": 768,
+      "height": 512,
+      "length": 97,
+      "batch_size": 1,
+      "positive": [
+        "6",
+        0
+      ],
+      "negative": [
+        "7",
+        0
+      ],
+      "vae": [
+        "44",
+        2
+      ],
+      "image": [
+        "80",
+        0
+      ]
+    },
+    "class_type": "LTXVImgToVideo",
+    "_meta": {
+      "title": "LTXVImgToVideo"
+    }
+  },
+  "79": {
+    "inputs": {
+      "value": "https://upload.wikimedia.org/wikipedia/commons/5/51/Havelock_Island%2C_Mangrove_tree_on_the_beach%2C_Andaman_Islands.jpg",
+      "name": "",
+      "title": "",
+      "description": "",
+      "__required": true
+    },
+    "class_type": "ImageRequestParameter",
+    "_meta": {
+      "title": "ImageRequestParameter"
+    }
+  },
+  "80": {
+    "inputs": {
+      "resize_mode": "cover",
+      "resolutions": "LTXV",
+      "interpolation": "bilinear",
+      "image": [
+        "79",
+        0
+      ]
+    },
+    "class_type": "ImageResize",
+    "_meta": {
+      "title": "Fit Image to Diffusion Size"
+    }
+  }
+}
\ No newline at end of file
diff --git a/tests/unit/test_base_nodes.py b/tests/unit/test_base_nodes.py
index 0d018172f..7b4c218ee 100644
--- a/tests/unit/test_base_nodes.py
+++ b/tests/unit/test_base_nodes.py
@@ -81,7 +81,7 @@ def test_inpaint_model_conditioning(model, vae, clip, has_gpu):
         pytest.skip("requires gpu for performant testing")
     cond_pos, = CLIPTextEncode().encode(clip, "test prompt")
     cond_neg, = CLIPTextEncode().encode(clip, "test negative prompt")
-    pos, neg, latent = InpaintModelConditioning().encode(cond_pos, cond_neg, _image_512x512, vae, torch.ones((1, 512, 512)))
+    pos, neg, latent = InpaintModelConditioning().encode(cond_pos, cond_neg, _image_512x512, vae, torch.ones((1, 512, 512)), noise_mask=True)
     assert len(pos) == len(cond_pos)
     assert len(neg) == len(cond_neg)
     assert "samples" in latent