LTXV tests
commit 4b77c4941c (parent fe64070b41)
@@ -260,6 +260,7 @@ KNOWN_CHECKPOINTS: Final[KnownDownloadables] = KnownDownloadables([
     HuggingFile("lllyasviel/flux1-dev-bnb-nf4", "flux1-dev-bnb-nf4.safetensors"),
     HuggingFile("lllyasviel/flux1-dev-bnb-nf4", "flux1-dev-bnb-nf4-v2.safetensors"),
     HuggingFile("silveroxides/flux1-nf4-weights", "flux1-schnell-bnb-nf4.safetensors"),
+    HuggingFile("Lightricks/LTX-Video", "ltx-video-2b-v0.9.safetensors"),
 ], folder_name="checkpoints")
 
 KNOWN_UNCLIP_CHECKPOINTS: Final[KnownDownloadables] = KnownDownloadables([
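The new entry registers the LTX-Video 2B checkpoint as a known download. As a standalone sketch, the same file can be fetched directly with huggingface_hub (assuming that library is installed; the repo's own KnownDownloadables/HuggingFile machinery is not reproduced here):

# Hedged sketch, not the repo's downloader.
from huggingface_hub import hf_hub_download

ckpt_path = hf_hub_download(
    repo_id="Lightricks/LTX-Video",             # same repo id as the HuggingFile entry above
    filename="ltx-video-2b-v0.9.safetensors",   # same file name
)
print(ckpt_path)  # local cache path of the downloaded checkpoint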
@@ -391,7 +391,7 @@ class InpaintModelConditioning:
 
     CATEGORY = "conditioning/inpaint"
 
-    def encode(self, positive, negative, pixels, vae, mask, noise_mask):
+    def encode(self, positive, negative, pixels, vae, mask, noise_mask=True):
         x = (pixels.shape[1] // 8) * 8
         y = (pixels.shape[2] // 8) * 8
         mask = torch.nn.functional.interpolate(mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])), size=(pixels.shape[1], pixels.shape[2]), mode="bilinear")
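noise_mask now defaults to True, so existing positional callers keep working. Inside the method, the mask is reshaped to NCHW and bilinearly resized to the pixel resolution before VAE encoding; a self-contained sketch of just that resizing step (torch assumed installed; the tensor sizes are illustrative):

import torch

pixels = torch.zeros(1, 512, 512, 3)   # ComfyUI-style NHWC image batch
mask = torch.ones(1, 200, 300)         # arbitrary mask resolution
mask = torch.nn.functional.interpolate(
    mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])),  # -> (N, 1, H, W)
    size=(pixels.shape[1], pixels.shape[2]),
    mode="bilinear",
)
print(mask.shape)  # torch.Size([1, 1, 512, 512])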
@@ -947,7 +947,7 @@ class CLIPLoader:
         elif type == "mochi":
             clip_type = sd.CLIPType.MOCHI
         elif type == "ltxv":
-            clip_type = comfy.sd.CLIPType.LTXV
+            clip_type = sd.CLIPType.LTXV
         else:
             logging.warning(f"Unknown clip type argument passed: {type} for model {clip_name}")
 
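The fix drops the stray comfy. prefix so the "ltxv" branch uses the same sd alias as the other branches. The elif chain amounts to a string-to-enum lookup with a warning fallback; a hedged, standalone sketch with a stand-in enum (the real CLIPType lives in the sd module):

import enum
import logging

class CLIPTypeDemo(enum.Enum):   # stand-in for illustration, not comfy's enum
    STABLE_DIFFUSION = 1
    MOCHI = 2
    LTXV = 3

def resolve_clip_type(type_name: str) -> CLIPTypeDemo:
    table = {"mochi": CLIPTypeDemo.MOCHI, "ltxv": CLIPTypeDemo.LTXV}
    if type_name not in table:
        logging.warning(f"Unknown clip type argument passed: {type_name}")
        return CLIPTypeDemo.STABLE_DIFFUSION
    return table[type_name]

assert resolve_clip_type("ltxv") is CLIPTypeDemo.LTXV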
@@ -21,12 +21,14 @@ from . import model_sampling
 from . import sd1_clip
 from . import sdxl_clip
 from . import utils
+from . import lora
 from .ldm.audio.autoencoder import AudioOobleckVAE
 from .ldm.cascade.stage_a import StageA
 from .ldm.cascade.stage_c_coder import StageC_coder
-from .ldm.genmo.vae import model as genmo
+from .ldm.genmo.vae import model as genmo_model
 from .ldm.lightricks.vae import causal_video_autoencoder as lightricks
 from .ldm.models.autoencoder import AutoencoderKL, AutoencodingEngine
+from .lora_convert import convert_lora
 from .model_management import load_models_gpu
 from .t2i_adapter import adapter
 from .taesd import taesd
@@ -47,7 +49,7 @@ def load_lora_for_models(model, clip, _lora, strength_model, strength_clip):
     if clip is not None:
         key_map = lora.model_lora_keys_clip(clip.cond_stage_model, key_map)
 
-    lora = comfy.lora_convert.convert_lora(lora)
+    _lora = convert_lora(_lora)
     loaded = lora.load_lora(_lora, key_map)
     if model is not None:
         new_modelpatcher = model.clone()
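Because assigning to a name anywhere in a function makes that name local to the whole function, the replaced lora = ... line appears to have shadowed the lora module used two lines earlier; converting _lora via the newly imported convert_lora removes the shadowing. A minimal, self-contained illustration of the pitfall (demo objects only, not the repo's code):

import types

lora = types.SimpleNamespace(model_lora_keys_clip=lambda model, key_map: key_map)  # stands in for the module

def broken():
    key_map = lora.model_lora_keys_clip(None, {})   # fails: 'lora' is local here because of the line below
    lora = {"converted": True}                       # assignment makes 'lora' local to the whole function
    return key_map

try:
    broken()
except UnboundLocalError as exc:
    print("shadowing bug:", exc)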
@@ -265,7 +267,7 @@ class VAE:
                 sd = utils.state_dict_prefix_replace(sd, {"": "decoder."})
                 if "layers.4.layers.1.attn_block.attn.qkv.weight" in sd:
                     sd = utils.state_dict_prefix_replace(sd, {"": "encoder."})
-            self.first_stage_model = genmo.VideoVAE()
+            self.first_stage_model = genmo_model.VideoVAE()
             self.latent_channels = 12
             self.latent_dim = 3
             self.memory_used_decode = lambda shape, dtype: (1000 * shape[2] * shape[3] * shape[4] * (6 * 8 * 8)) * model_management.dtype_size(dtype)
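The memory_used_decode lambda gives a rough byte estimate for decoding a genmo/Mochi video latent laid out as (batch, channels, frames, height, width). A standalone sketch plugging in an example shape (both the shape and the 2-byte fp16 element size are assumptions for illustration, not values from this diff):

def memory_used_decode(shape, dtype_size=2):  # dtype_size: bytes per element, e.g. 2 for fp16
    return (1000 * shape[2] * shape[3] * shape[4] * (6 * 8 * 8)) * dtype_size

est = memory_used_decode((1, 12, 28, 60, 106))  # example latent shape
print(f"{est / 1024**3:.1f} GiB")               # ~127.4 GiB for this example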
@@ -57,7 +57,6 @@ def levels_adjustment(image: ImageBatch, black_level: float = 0.0, mid_level: fl
     return result
 
 
-
 class ImageCrop:
     @classmethod
     def INPUT_TYPES(s):
@@ -272,7 +271,7 @@ class ImageResize:
             "required": {
                 "image": ("IMAGE",),
                 "resize_mode": (["cover", "contain", "auto"], {"default": "cover"}),
-                "resolutions": (["SDXL/SD3/Flux", "SD1.5"], {"default": "SDXL/SD3/Flux"}),
+                "resolutions": (["SDXL/SD3/Flux", "SD1.5", "LTXV"], {"default": "SDXL/SD3/Flux"}),
                 "interpolation": (ImageScale.upscale_methods, {"default": "bilinear"}),
             }
         }
@@ -294,6 +293,10 @@ class ImageResize:
                 (1344, 768),
                 (1536, 640),
             ]
+        elif resolutions == "ltxv":
+            supported_resolutions = [
+                (768, 512)
+            ]
         else:
             supported_resolutions = [
                 (512, 512),
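With the new branch, selecting "LTXV" restricts the node to a single 768x512 target. A hedged sketch of one way such a resolution table can be used, picking the entry whose aspect ratio is closest to the input image (the helper name and selection rule are illustrative, not the node's actual logic):

def closest_resolution(width, height, supported):
    target = width / height
    return min(supported, key=lambda wh: abs(wh[0] / wh[1] - target))

sdxl_like = [(1024, 1024), (1152, 896), (1344, 768), (1536, 640)]
ltxv_like = [(768, 512)]

print(closest_resolution(1920, 1080, sdxl_like))  # (1344, 768)
print(closest_resolution(1920, 1080, ltxv_like))  # (768, 512): the only option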
tests/inference/workflows/ltxv-0.json (new file, 170 lines)
@@ -0,0 +1,170 @@
{
  "6": {
    "inputs": {
      "text": "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
      "clip": [
        "38",
        0
      ]
    },
    "class_type": "CLIPTextEncode",
    "_meta": {
      "title": "CLIP Text Encode (Positive Prompt)"
    }
  },
  "7": {
    "inputs": {
      "text": "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
      "clip": [
        "38",
        0
      ]
    },
    "class_type": "CLIPTextEncode",
    "_meta": {
      "title": "CLIP Text Encode (Negative Prompt)"
    }
  },
  "8": {
    "inputs": {
      "samples": [
        "72",
        0
      ],
      "vae": [
        "44",
        2
      ]
    },
    "class_type": "VAEDecode",
    "_meta": {
      "title": "VAE Decode"
    }
  },
  "38": {
    "inputs": {
      "clip_name": "t5xxl_fp16.safetensors",
      "type": "ltxv"
    },
    "class_type": "CLIPLoader",
    "_meta": {
      "title": "Load CLIP"
    }
  },
  "41": {
    "inputs": {
      "filename_prefix": "ComfyUI",
      "fps": 24,
      "lossless": false,
      "quality": 90,
      "method": "default",
      "images": [
        "8",
        0
      ]
    },
    "class_type": "SaveAnimatedWEBP",
    "_meta": {
      "title": "SaveAnimatedWEBP"
    }
  },
  "44": {
    "inputs": {
      "ckpt_name": "ltx-video-2b-v0.9.safetensors"
    },
    "class_type": "CheckpointLoaderSimple",
    "_meta": {
      "title": "Load Checkpoint"
    }
  },
  "69": {
    "inputs": {
      "frame_rate": 25,
      "positive": [
        "6",
        0
      ],
      "negative": [
        "7",
        0
      ]
    },
    "class_type": "LTXVConditioning",
    "_meta": {
      "title": "LTXVConditioning"
    }
  },
  "70": {
    "inputs": {
      "width": 768,
      "height": 512,
      "length": 9,
      "batch_size": 1
    },
    "class_type": "EmptyLTXVLatentVideo",
    "_meta": {
      "title": "EmptyLTXVLatentVideo"
    }
  },
  "71": {
    "inputs": {
      "steps": 30,
      "max_shift": 2.05,
      "base_shift": 0.95,
      "stretch": true,
      "terminal": 0.1,
      "latent": [
        "70",
        0
      ]
    },
    "class_type": "LTXVScheduler",
    "_meta": {
      "title": "LTXVScheduler"
    }
  },
  "72": {
    "inputs": {
      "add_noise": true,
      "noise_seed": 782373264718055,
      "cfg": 3,
      "model": [
        "44",
        0
      ],
      "positive": [
        "69",
        0
      ],
      "negative": [
        "69",
        1
      ],
      "sampler": [
        "73",
        0
      ],
      "sigmas": [
        "71",
        0
      ],
      "latent_image": [
        "70",
        0
      ]
    },
    "class_type": "SamplerCustom",
    "_meta": {
      "title": "SamplerCustom"
    }
  },
  "73": {
    "inputs": {
      "sampler_name": "euler"
    },
    "class_type": "KSamplerSelect",
    "_meta": {
      "title": "KSamplerSelect"
    }
  }
}
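ltxv-0.json is a plain text-to-video graph: CheckpointLoaderSimple (44) feeds SamplerCustom (72) and the VAE decode (8), the T5 CLIP loader (38) drives both prompts, LTXVConditioning (69) attaches the frame rate, and the result is written out as an animated WEBP (41). A hedged, stdlib-only sketch that loads the file and checks that every ["node id", output index] reference resolves; it does not execute the graph, and the path comes from this commit:

import json

with open("tests/inference/workflows/ltxv-0.json") as f:
    graph = json.load(f)

for node_id, node in graph.items():
    for input_name, value in node["inputs"].items():
        if isinstance(value, list) and len(value) == 2:  # ["source node id", output index]
            source, _output_index = value
            assert source in graph, f"node {node_id} input {input_name} points at missing node {source}"

print(f"{len(graph)} nodes, all cross-references resolved")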
tests/inference/workflows/ltxv-1.json (new file, 214 lines)
@@ -0,0 +1,214 @@
{
  "6": {
    "inputs": {
      "text": "best quality, 4k, HDR, a tracking shot of a beautiful scene of the sea waves on the beach with a massive explosion in the water",
      "clip": [
        "38",
        0
      ]
    },
    "class_type": "CLIPTextEncode",
    "_meta": {
      "title": "CLIP Text Encode (Positive Prompt)"
    }
  },
  "7": {
    "inputs": {
      "text": "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly",
      "clip": [
        "38",
        0
      ]
    },
    "class_type": "CLIPTextEncode",
    "_meta": {
      "title": "CLIP Text Encode (Negative Prompt)"
    }
  },
  "8": {
    "inputs": {
      "samples": [
        "72",
        0
      ],
      "vae": [
        "44",
        2
      ]
    },
    "class_type": "VAEDecode",
    "_meta": {
      "title": "VAE Decode"
    }
  },
  "38": {
    "inputs": {
      "clip_name": "t5xxl_fp16.safetensors",
      "type": "ltxv"
    },
    "class_type": "CLIPLoader",
    "_meta": {
      "title": "Load CLIP"
    }
  },
  "41": {
    "inputs": {
      "filename_prefix": "ComfyUI",
      "fps": 24,
      "lossless": false,
      "quality": 90,
      "method": "default",
      "images": [
        "8",
        0
      ]
    },
    "class_type": "SaveAnimatedWEBP",
    "_meta": {
      "title": "SaveAnimatedWEBP"
    }
  },
  "44": {
    "inputs": {
      "ckpt_name": "ltx-video-2b-v0.9.safetensors"
    },
    "class_type": "CheckpointLoaderSimple",
    "_meta": {
      "title": "Load Checkpoint"
    }
  },
  "69": {
    "inputs": {
      "frame_rate": 25,
      "positive": [
        "77",
        0
      ],
      "negative": [
        "77",
        1
      ]
    },
    "class_type": "LTXVConditioning",
    "_meta": {
      "title": "LTXVConditioning"
    }
  },
  "71": {
    "inputs": {
      "steps": 30,
      "max_shift": 2.05,
      "base_shift": 0.95,
      "stretch": true,
      "terminal": 0.1,
      "latent": [
        "77",
        2
      ]
    },
    "class_type": "LTXVScheduler",
    "_meta": {
      "title": "LTXVScheduler"
    }
  },
  "72": {
    "inputs": {
      "add_noise": true,
      "noise_seed": 775174607420296,
      "cfg": 3,
      "model": [
        "44",
        0
      ],
      "positive": [
        "69",
        0
      ],
      "negative": [
        "69",
        1
      ],
      "sampler": [
        "73",
        0
      ],
      "sigmas": [
        "71",
        0
      ],
      "latent_image": [
        "77",
        2
      ]
    },
    "class_type": "SamplerCustom",
    "_meta": {
      "title": "SamplerCustom"
    }
  },
  "73": {
    "inputs": {
      "sampler_name": "euler"
    },
    "class_type": "KSamplerSelect",
    "_meta": {
      "title": "KSamplerSelect"
    }
  },
  "77": {
    "inputs": {
      "width": 768,
      "height": 512,
      "length": 97,
      "batch_size": 1,
      "positive": [
        "6",
        0
      ],
      "negative": [
        "7",
        0
      ],
      "vae": [
        "44",
        2
      ],
      "image": [
        "80",
        0
      ]
    },
    "class_type": "LTXVImgToVideo",
    "_meta": {
      "title": "LTXVImgToVideo"
    }
  },
  "79": {
    "inputs": {
      "value": "https://upload.wikimedia.org/wikipedia/commons/5/51/Havelock_Island%2C_Mangrove_tree_on_the_beach%2C_Andaman_Islands.jpg",
      "name": "",
      "title": "",
      "description": "",
      "__required": true
    },
    "class_type": "ImageRequestParameter",
    "_meta": {
      "title": "ImageRequestParameter"
    }
  },
  "80": {
    "inputs": {
      "resize_mode": "cover",
      "resolutions": "LTXV",
      "interpolation": "bilinear",
      "image": [
        "79",
        0
      ]
    },
    "class_type": "ImageResize",
    "_meta": {
      "title": "Fit Image to Diffusion Size"
    }
  }
}
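ltxv-1.json is the image-to-video variant: ImageRequestParameter (79) supplies a source image URL, ImageResize (80) fits it to the new "LTXV" resolution with resize_mode "cover", and LTXVImgToVideo (77) builds the conditioned latent for a 97-frame clip. A hedged sketch of a cover-style fit to 768x512, i.e. scale until the target is fully covered and then center-crop (Pillow >= 9.1 assumed; this illustrates the idea, not the ImageResize node's implementation):

from PIL import Image

def cover_fit(img, target_w=768, target_h=512):
    scale = max(target_w / img.width, target_h / img.height)
    resized = img.resize((round(img.width * scale), round(img.height * scale)),
                         Image.Resampling.BILINEAR)
    left = (resized.width - target_w) // 2
    top = (resized.height - target_h) // 2
    return resized.crop((left, top, left + target_w, top + target_h))

fitted = cover_fit(Image.new("RGB", (4912, 2762)))  # stand-in dimensions, not the test image's
print(fitted.size)  # (768, 512)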
@@ -81,7 +81,7 @@ def test_inpaint_model_conditioning(model, vae, clip, has_gpu):
         pytest.skip("requires gpu for performant testing")
     cond_pos, = CLIPTextEncode().encode(clip, "test prompt")
     cond_neg, = CLIPTextEncode().encode(clip, "test negative prompt")
-    pos, neg, latent = InpaintModelConditioning().encode(cond_pos, cond_neg, _image_512x512, vae, torch.ones((1, 512, 512)))
+    pos, neg, latent = InpaintModelConditioning().encode(cond_pos, cond_neg, _image_512x512, vae, torch.ones((1, 512, 512)), noise_mask=True)
     assert len(pos) == len(cond_pos)
     assert len(neg) == len(cond_neg)
     assert "samples" in latent
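The existing inpaint-conditioning test now passes noise_mask=True explicitly, matching the new keyword argument. Assuming the new workflow files are collected by the inference test suite under tests/inference (how they are parametrized is not shown in this diff), a pytest keyword filter along these lines would target just the LTXV cases:

pytest tests/inference -k ltxv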