From b1ad9cad371a3c900f3ca791458e35041a23e872 Mon Sep 17 00:00:00 2001
From: doctorpangloss <@hiddenswitch.com>
Date: Fri, 22 Nov 2024 18:00:29 -0800
Subject: [PATCH] Known Flux controlnet models

---
 comfy/clip_vision.py                         |  32 ++-
 comfy/model_downloader.py                    |  13 +-
 .../workflows/flux-controlnet-1.json         | 193 +++++++++++++
 .../workflows/flux-inpainting-0.json         | 201 ++++++++++++++
 tests/inference/workflows/flux-redux-0.json  | 258 ++++++++++++++++++
 5 files changed, 684 insertions(+), 13 deletions(-)
 create mode 100644 tests/inference/workflows/flux-controlnet-1.json
 create mode 100644 tests/inference/workflows/flux-inpainting-0.json
 create mode 100644 tests/inference/workflows/flux-redux-0.json

diff --git a/comfy/clip_vision.py b/comfy/clip_vision.py
index a2e5262bb..d35005410 100644
--- a/comfy/clip_vision.py
+++ b/comfy/clip_vision.py
@@ -1,22 +1,25 @@
-from .component_model import files
-from .model_management import load_models_gpu
-from .utils import load_torch_file, transformers_convert, state_dict_prefix_replace
-import torch
 import json
 import logging
-from . import ops
-from . import model_patcher
-from . import model_management
+
+import torch
+
 from . import clip_model
+from . import model_management
+from . import model_patcher
+from . import ops
+from .component_model import files
+from .model_management import load_models_gpu
+from .utils import load_torch_file, transformers_convert, state_dict_prefix_replace
 
 
 class Output:
     def __getitem__(self, key):
         return getattr(self, key)
+
     def __setitem__(self, key, item):
         setattr(self, key, item)
 
+
 def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]):
     mean = torch.tensor(mean, device=image.device, dtype=image.dtype)
     std = torch.tensor(std, device=image.device, dtype=image.dtype)
@@ -24,11 +27,12 @@ def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], s
     if not (image.shape[2] == size and image.shape[3] == size):
         scale = (size / min(image.shape[2], image.shape[3]))
         image = torch.nn.functional.interpolate(image, size=(round(scale * image.shape[2]), round(scale * image.shape[3])), mode="bicubic", antialias=True)
-        h = (image.shape[2] - size)//2
-        w = (image.shape[3] - size)//2
-        image = image[:,:,h:h+size,w:w+size]
+        h = (image.shape[2] - size) // 2
+        w = (image.shape[3] - size) // 2
+        image = image[:, :, h:h + size, w:w + size]
     image = torch.clip((255. * image), 0, 255).round() / 255.0
-    return (image - mean.view([3,1,1])) / std.view([3,1,1])
+    return (image - mean.view([3, 1, 1])) / std.view([3, 1, 1])
+
 
 class ClipVisionModel():
     def __init__(self, json_config: dict | str):
@@ -53,6 +57,7 @@ class ClipVisionModel():
         self.model.eval()
 
         self.patcher = model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
+
     def load_sd(self, sd):
         return self.model.load_state_dict(sd, strict=False)
 
@@ -70,6 +75,7 @@ class ClipVisionModel():
         outputs["penultimate_hidden_states"] = out[1].to(model_management.intermediate_device())
         return outputs
 
+
 def convert_to_transformers(sd, prefix):
     sd_k = sd.keys()
     if "{}transformer.resblocks.0.attn.in_proj_weight".format(prefix) in sd_k:
@@ -96,6 +102,7 @@ def convert_to_transformers(sd, prefix):
         sd = state_dict_prefix_replace(sd, replace_prefix)
     return sd
 
+
 def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
     if convert_keys:
         sd = convert_to_transformers(sd, prefix)
@@ -105,7 +112,7 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
         json_config = files.get_path_as_dict(None, "clip_vision_config_h.json")
     elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
         if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
-            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
+            json_config = files.get_path_as_dict(None, "clip_vision_siglip_384.json")
         elif sd["vision_model.embeddings.position_embedding.weight"].shape[0] == 577:
             json_config = files.get_path_as_dict(None, "clip_vision_config_vitl_336.json")
         else:
@@ -124,6 +131,7 @@
             sd.pop(k)
     return clip
 
+
 def load(ckpt_path):
     sd = load_torch_file(ckpt_path)
     if "visual.transformer.resblocks.0.attn.in_proj_weight" in sd:
diff --git a/comfy/model_downloader.py b/comfy/model_downloader.py
index fba8e9742..8c5de3096 100644
--- a/comfy/model_downloader.py
+++ b/comfy/model_downloader.py
@@ -283,7 +283,8 @@ KNOWN_GLIGEN_MODELS: Final[KnownDownloadables] = KnownDownloadables([
 ], folder_name="gligen")
 
 KNOWN_CLIP_VISION_MODELS: Final[KnownDownloadables] = KnownDownloadables([
-    HuggingFile("comfyanonymous/clip_vision_g", "clip_vision_g.safetensors")
+    HuggingFile("comfyanonymous/clip_vision_g", "clip_vision_g.safetensors"),
+    HuggingFile("Comfy-Org/sigclip_vision_384", "sigclip_vision_patch14_384.safetensors"),
 ], folder_name="clip_vision")
 
 KNOWN_LORAS: Final[KnownDownloadables] = KnownDownloadables([
@@ -292,6 +293,8 @@ KNOWN_LORAS: Final[KnownDownloadables] = KnownDownloadables([
     CivitFile(model_id=47085, model_version_id=55199, filename="GoodHands-beta2.safetensors"),
     HuggingFile("ByteDance/Hyper-SD", "Hyper-SDXL-12steps-CFG-lora.safetensors"),
     HuggingFile("ByteDance/Hyper-SD", "Hyper-SD15-12steps-CFG-lora.safetensors"),
+    HuggingFile("black-forest-labs/FLUX.1-Canny-dev-lora", "flux1-canny-dev-lora.safetensors"),
+    HuggingFile("black-forest-labs/FLUX.1-Depth-dev-lora", "flux1-depth-dev-lora.safetensors"),
 ], folder_name="loras")
 
 KNOWN_CONTROLNETS: Final[KnownDownloadables] = KnownDownloadables([
@@ -434,6 +437,9 @@ KNOWN_UNET_MODELS: Final[KnownDownloadables] = KnownDownloadables([
     HuggingFile("ByteDance/Hyper-SD", "Hyper-SDXL-1step-Unet-Comfyui.fp16.safetensors"),
     HuggingFile("black-forest-labs/FLUX.1-schnell", "flux1-schnell.safetensors"),
     HuggingFile("black-forest-labs/FLUX.1-dev", "flux1-dev.safetensors"),
+    HuggingFile("black-forest-labs/FLUX.1-Fill-dev", "flux1-fill-dev.safetensors"),
"flux1-fill-dev.safetensors"), + HuggingFile("black-forest-labs/FLUX.1-Canny-dev", "flux1-canny-dev.safetensors"), + HuggingFile("black-forest-labs/FLUX.1-Depth-dev", "flux1-depth-dev.safetensors"), HuggingFile("Kijai/flux-fp8", "flux1-dev-fp8.safetensors"), HuggingFile("Kijai/flux-fp8", "flux1-schnell-fp8.safetensors"), HuggingFile("Comfy-Org/mochi_preview_repackaged", "split_files/diffusion_models/mochi_preview_bf16.safetensors"), @@ -452,6 +458,10 @@ KNOWN_CLIP_MODELS: Final[KnownDownloadables] = KnownDownloadables([ HuggingFile("zer0int/CLIP-GmP-ViT-L-14", "ViT-L-14-TEXT-detail-improved-hiT-GmP-TE-only-HF.safetensors"), ], folder_name="clip") +KNOWN_STYLE_MODELS: Final[KnownDownloadables] = KnownDownloadables([ + HuggingFile("black-forest-labs/FLUX.1-Redux-dev", "flux1-redux-dev.safetensors"), +], folder_name="style_models") + _known_models_db: list[KnownDownloadables] = [ KNOWN_CHECKPOINTS, KNOWN_VAES, @@ -466,6 +476,7 @@ _known_models_db: list[KnownDownloadables] = [ KNOWN_IMAGE_ONLY_CHECKPOINTS, KNOWN_UNCLIP_CHECKPOINTS, KNOWN_UPSCALERS, + KNOWN_STYLE_MODELS, ] diff --git a/tests/inference/workflows/flux-controlnet-1.json b/tests/inference/workflows/flux-controlnet-1.json new file mode 100644 index 000000000..18eb7e7e2 --- /dev/null +++ b/tests/inference/workflows/flux-controlnet-1.json @@ -0,0 +1,193 @@ +{ + "3": { + "inputs": { + "seed": 432318046789205, + "steps": 20, + "cfg": 1, + "sampler_name": "euler", + "scheduler": "normal", + "denoise": 1, + "model": [ + "31", + 0 + ], + "positive": [ + "35", + 0 + ], + "negative": [ + "35", + 1 + ], + "latent_image": [ + "35", + 2 + ] + }, + "class_type": "KSampler", + "_meta": { + "title": "KSampler" + } + }, + "7": { + "inputs": { + "text": "", + "clip": [ + "34", + 0 + ] + }, + "class_type": "CLIPTextEncode", + "_meta": { + "title": "CLIP Text Encode (Negative Prompt)" + } + }, + "8": { + "inputs": { + "samples": [ + "3", + 0 + ], + "vae": [ + "32", + 0 + ] + }, + "class_type": "VAEDecode", + "_meta": { + "title": "VAE Decode" + } + }, + "9": { + "inputs": { + "filename_prefix": "ComfyUI", + "images": [ + "8", + 0 + ] + }, + "class_type": "SaveImage", + "_meta": { + "title": "Save Image" + } + }, + "18": { + "inputs": { + "low_threshold": 0.15, + "high_threshold": 0.3, + "image": [ + "36", + 0 + ] + }, + "class_type": "Canny", + "_meta": { + "title": "Canny" + } + }, + "19": { + "inputs": { + "images": [ + "18", + 0 + ] + }, + "class_type": "PreviewImage", + "_meta": { + "title": "Preview Image" + } + }, + "23": { + "inputs": { + "text": "cute anime girl with massive fluffy fennec ears and a big fluffy tail blonde messy long hair blue eyes wearing a pink sweater and jeans", + "clip": [ + "34", + 0 + ] + }, + "class_type": "CLIPTextEncode", + "_meta": { + "title": "CLIP Text Encode (Positive Prompt)" + } + }, + "26": { + "inputs": { + "guidance": 30, + "conditioning": [ + "23", + 0 + ] + }, + "class_type": "FluxGuidance", + "_meta": { + "title": "FluxGuidance" + } + }, + "31": { + "inputs": { + "unet_name": "flux1-canny-dev.safetensors", + "weight_dtype": "default" + }, + "class_type": "UNETLoader", + "_meta": { + "title": "Load Diffusion Model" + } + }, + "32": { + "inputs": { + "vae_name": "ae.safetensors" + }, + "class_type": "VAELoader", + "_meta": { + "title": "Load VAE" + } + }, + "34": { + "inputs": { + "clip_name1": "clip_l.safetensors", + "clip_name2": "t5xxl_fp16.safetensors", + "type": "flux" + }, + "class_type": "DualCLIPLoader", + "_meta": { + "title": "DualCLIPLoader" + } + }, + "35": { + "inputs": { + "positive": [ + 
"26", + 0 + ], + "negative": [ + "7", + 0 + ], + "vae": [ + "32", + 0 + ], + "pixels": [ + "18", + 0 + ] + }, + "class_type": "InstructPixToPixConditioning", + "_meta": { + "title": "InstructPixToPixConditioning" + } + }, + "36": { + "inputs": { + "value": "https://comfyanonymous.github.io/ComfyUI_examples/flux/flux_fill_inpaint_example.png", + "name": "", + "title": "", + "description": "", + "__required": true + }, + "class_type": "ImageRequestParameter", + "_meta": { + "title": "ImageRequestParameter" + } + } +} \ No newline at end of file diff --git a/tests/inference/workflows/flux-inpainting-0.json b/tests/inference/workflows/flux-inpainting-0.json new file mode 100644 index 000000000..a76543d55 --- /dev/null +++ b/tests/inference/workflows/flux-inpainting-0.json @@ -0,0 +1,201 @@ +{ + "3": { + "inputs": { + "seed": 164211176398261, + "steps": 20, + "cfg": 1, + "sampler_name": "euler", + "scheduler": "normal", + "denoise": 1, + "model": [ + "39", + 0 + ], + "positive": [ + "38", + 0 + ], + "negative": [ + "38", + 1 + ], + "latent_image": [ + "38", + 2 + ] + }, + "class_type": "KSampler", + "_meta": { + "title": "KSampler" + } + }, + "7": { + "inputs": { + "text": "", + "clip": [ + "34", + 0 + ] + }, + "class_type": "CLIPTextEncode", + "_meta": { + "title": "CLIP Text Encode (Negative Prompt)" + } + }, + "8": { + "inputs": { + "samples": [ + "3", + 0 + ], + "vae": [ + "32", + 0 + ] + }, + "class_type": "VAEDecode", + "_meta": { + "title": "VAE Decode" + } + }, + "9": { + "inputs": { + "filename_prefix": "ComfyUI", + "images": [ + "8", + 0 + ] + }, + "class_type": "SaveImage", + "_meta": { + "title": "Save Image" + } + }, + "23": { + "inputs": { + "text": "beautiful scenery", + "clip": [ + "34", + 0 + ] + }, + "class_type": "CLIPTextEncode", + "_meta": { + "title": "CLIP Text Encode (Positive Prompt)" + } + }, + "26": { + "inputs": { + "guidance": 30, + "conditioning": [ + "23", + 0 + ] + }, + "class_type": "FluxGuidance", + "_meta": { + "title": "FluxGuidance" + } + }, + "31": { + "inputs": { + "unet_name": "flux1-fill-dev.safetensors", + "weight_dtype": "default" + }, + "class_type": "UNETLoader", + "_meta": { + "title": "Load Diffusion Model" + } + }, + "32": { + "inputs": { + "vae_name": "ae.safetensors" + }, + "class_type": "VAELoader", + "_meta": { + "title": "Load VAE" + } + }, + "34": { + "inputs": { + "clip_name1": "clip_l.safetensors", + "clip_name2": "t5xxl_fp16.safetensors", + "type": "flux" + }, + "class_type": "DualCLIPLoader", + "_meta": { + "title": "DualCLIPLoader" + } + }, + "38": { + "inputs": { + "noise_mask": false, + "positive": [ + "26", + 0 + ], + "negative": [ + "7", + 0 + ], + "vae": [ + "32", + 0 + ], + "pixels": [ + "44", + 0 + ], + "mask": [ + "44", + 1 + ] + }, + "class_type": "InpaintModelConditioning", + "_meta": { + "title": "InpaintModelConditioning" + } + }, + "39": { + "inputs": { + "model": [ + "31", + 0 + ] + }, + "class_type": "DifferentialDiffusion", + "_meta": { + "title": "Differential Diffusion" + } + }, + "44": { + "inputs": { + "left": 400, + "top": 0, + "right": 400, + "bottom": 400, + "feathering": 24, + "image": [ + "45", + 0 + ] + }, + "class_type": "ImagePadForOutpaint", + "_meta": { + "title": "Pad Image for Outpainting" + } + }, + "45": { + "inputs": { + "value": "https://comfyanonymous.github.io/ComfyUI_examples/flux/flux_fill_inpaint_example.png", + "name": "", + "title": "", + "description": "", + "__required": true + }, + "class_type": "ImageRequestParameter", + "_meta": { + "title": "ImageRequestParameter" + } + } +} \ No newline 
diff --git a/tests/inference/workflows/flux-redux-0.json b/tests/inference/workflows/flux-redux-0.json
new file mode 100644
index 000000000..97cae67a3
--- /dev/null
+++ b/tests/inference/workflows/flux-redux-0.json
@@ -0,0 +1,258 @@
+{
+  "6": {
+    "inputs": {
+      "text": "cute anime girl with massive fluffy fennec ears",
+      "clip": [
+        "11",
+        0
+      ]
+    },
+    "class_type": "CLIPTextEncode",
+    "_meta": {
+      "title": "CLIP Text Encode (Positive Prompt)"
+    }
+  },
+  "8": {
+    "inputs": {
+      "samples": [
+        "13",
+        0
+      ],
+      "vae": [
+        "10",
+        0
+      ]
+    },
+    "class_type": "VAEDecode",
+    "_meta": {
+      "title": "VAE Decode"
+    }
+  },
+  "9": {
+    "inputs": {
+      "filename_prefix": "ComfyUI",
+      "images": [
+        "8",
+        0
+      ]
+    },
+    "class_type": "SaveImage",
+    "_meta": {
+      "title": "Save Image"
+    }
+  },
+  "10": {
+    "inputs": {
+      "vae_name": "ae.safetensors"
+    },
+    "class_type": "VAELoader",
+    "_meta": {
+      "title": "Load VAE"
+    }
+  },
+  "11": {
+    "inputs": {
+      "clip_name1": "t5xxl_fp16.safetensors",
+      "clip_name2": "clip_l.safetensors",
+      "type": "flux"
+    },
+    "class_type": "DualCLIPLoader",
+    "_meta": {
+      "title": "DualCLIPLoader"
+    }
+  },
+  "12": {
+    "inputs": {
+      "unet_name": "flux1-dev.safetensors",
+      "weight_dtype": "default"
+    },
+    "class_type": "UNETLoader",
+    "_meta": {
+      "title": "Load Diffusion Model"
+    }
+  },
+  "13": {
+    "inputs": {
+      "noise": [
+        "25",
+        0
+      ],
+      "guider": [
+        "22",
+        0
+      ],
+      "sampler": [
+        "16",
+        0
+      ],
+      "sigmas": [
+        "17",
+        0
+      ],
+      "latent_image": [
+        "27",
+        0
+      ]
+    },
+    "class_type": "SamplerCustomAdvanced",
+    "_meta": {
+      "title": "SamplerCustomAdvanced"
+    }
+  },
+  "16": {
+    "inputs": {
+      "sampler_name": "euler"
+    },
+    "class_type": "KSamplerSelect",
+    "_meta": {
+      "title": "KSamplerSelect"
+    }
+  },
+  "17": {
+    "inputs": {
+      "scheduler": "simple",
+      "steps": 1,
+      "denoise": 1,
+      "model": [
+        "30",
+        0
+      ]
+    },
+    "class_type": "BasicScheduler",
+    "_meta": {
+      "title": "BasicScheduler"
+    }
+  },
+  "22": {
+    "inputs": {
+      "model": [
+        "30",
+        0
+      ],
+      "conditioning": [
+        "41",
+        0
+      ]
+    },
+    "class_type": "BasicGuider",
+    "_meta": {
+      "title": "BasicGuider"
+    }
+  },
+  "25": {
+    "inputs": {
+      "noise_seed": 895731728473880
+    },
+    "class_type": "RandomNoise",
+    "_meta": {
+      "title": "RandomNoise"
+    }
+  },
+  "26": {
+    "inputs": {
+      "guidance": 3.5,
+      "conditioning": [
+        "6",
+        0
+      ]
+    },
+    "class_type": "FluxGuidance",
+    "_meta": {
+      "title": "FluxGuidance"
+    }
+  },
+  "27": {
+    "inputs": {
+      "width": 1024,
+      "height": 1024,
+      "batch_size": 1
+    },
+    "class_type": "EmptySD3LatentImage",
+    "_meta": {
+      "title": "EmptySD3LatentImage"
+    }
+  },
+  "30": {
+    "inputs": {
+      "max_shift": 1.15,
+      "base_shift": 0.5,
+      "width": 1024,
+      "height": 1024,
+      "model": [
+        "12",
+        0
+      ]
+    },
+    "class_type": "ModelSamplingFlux",
+    "_meta": {
+      "title": "ModelSamplingFlux"
+    }
+  },
+  "38": {
+    "inputs": {
+      "clip_name": "sigclip_vision_patch14_384.safetensors"
+    },
+    "class_type": "CLIPVisionLoader",
+    "_meta": {
+      "title": "Load CLIP Vision"
+    }
+  },
+  "39": {
+    "inputs": {
+      "clip_vision": [
+        "38",
+        0
+      ],
+      "image": [
+        "44",
+        0
+      ]
+    },
+    "class_type": "CLIPVisionEncode",
+    "_meta": {
+      "title": "CLIP Vision Encode"
+    }
+  },
+  "41": {
+    "inputs": {
+      "conditioning": [
+        "26",
+        0
+      ],
+      "style_model": [
+        "42",
+        0
+      ],
+      "clip_vision_output": [
+        "39",
+        0
+      ]
+    },
+    "class_type": "StyleModelApply",
+    "_meta": {
+      "title": "Apply Style Model"
+    }
+  },
+  "42": {
+    "inputs": {
+      "style_model_name": "flux1-redux-dev.safetensors"
+    },
+    "class_type": "StyleModelLoader",
+    "_meta": {
+      "title": "Load Style Model"
"Load Style Model" + } + }, + "44": { + "inputs": { + "value": "https://comfyanonymous.github.io/ComfyUI_examples/flux/flux_fill_inpaint_example.png", + "name": "", + "title": "", + "description": "", + "__required": true + }, + "class_type": "ImageRequestParameter", + "_meta": { + "title": "ImageRequestParameter" + } + } +} \ No newline at end of file