From b1ad9cad371a3c900f3ca791458e35041a23e872 Mon Sep 17 00:00:00 2001
From: doctorpangloss <@hiddenswitch.com>
Date: Fri, 22 Nov 2024 18:00:29 -0800
Subject: [PATCH] Known Flux controlnet models

---
 comfy/clip_vision.py                         |  32 ++-
 comfy/model_downloader.py                    |  13 +-
 .../workflows/flux-controlnet-1.json         | 193 +++++++++++++
 .../workflows/flux-inpainting-0.json         | 201 ++++++++++++++
 tests/inference/workflows/flux-redux-0.json  | 258 ++++++++++++++++++
 5 files changed, 684 insertions(+), 13 deletions(-)
 create mode 100644 tests/inference/workflows/flux-controlnet-1.json
 create mode 100644 tests/inference/workflows/flux-inpainting-0.json
 create mode 100644 tests/inference/workflows/flux-redux-0.json

diff --git a/comfy/clip_vision.py b/comfy/clip_vision.py
index a2e5262bb..d35005410 100644
--- a/comfy/clip_vision.py
+++ b/comfy/clip_vision.py
@@ -1,22 +1,25 @@
-from .component_model import files
-from .model_management import load_models_gpu
-from .utils import load_torch_file, transformers_convert, state_dict_prefix_replace
-import torch
 import json
 import logging
-from . import ops
-from . import model_patcher
-from . import model_management
+
+import torch
+
 from . import clip_model
+from . import model_management
+from . import model_patcher
+from . import ops
+from .component_model import files
+from .model_management import load_models_gpu
+from .utils import load_torch_file, transformers_convert, state_dict_prefix_replace
 
 
 class Output:
     def __getitem__(self, key):
         return getattr(self, key)
+
     def __setitem__(self, key, item):
         setattr(self, key, item)
 
+
 def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]):
     mean = torch.tensor(mean, device=image.device, dtype=image.dtype)
     std = torch.tensor(std, device=image.device, dtype=image.dtype)
@@ -24,11 +27,12 @@ def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], s
     if not (image.shape[2] == size and image.shape[3] == size):
         scale = (size / min(image.shape[2], image.shape[3]))
         image = torch.nn.functional.interpolate(image, size=(round(scale * image.shape[2]), round(scale * image.shape[3])), mode="bicubic", antialias=True)
-        h = (image.shape[2] - size)//2
-        w = (image.shape[3] - size)//2
-        image = image[:,:,h:h+size,w:w+size]
+        h = (image.shape[2] - size) // 2
+        w = (image.shape[3] - size) // 2
+        image = image[:, :, h:h + size, w:w + size]
     image = torch.clip((255. * image), 0, 255).round() / 255.0
-    return (image - mean.view([3,1,1])) / std.view([3,1,1])
+    return (image - mean.view([3, 1, 1])) / std.view([3, 1, 1])
+
 
 class ClipVisionModel():
     def __init__(self, json_config: dict | str):
@@ -53,6 +57,7 @@ class ClipVisionModel():
         self.model.eval()
 
         self.patcher = model_patcher.ModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
+
     def load_sd(self, sd):
         return self.model.load_state_dict(sd, strict=False)
 
@@ -70,6 +75,7 @@ class ClipVisionModel():
         outputs["penultimate_hidden_states"] = out[1].to(model_management.intermediate_device())
         return outputs
 
+
 def convert_to_transformers(sd, prefix):
     sd_k = sd.keys()
     if "{}transformer.resblocks.0.attn.in_proj_weight".format(prefix) in sd_k:
@@ -96,6 +102,7 @@ def convert_to_transformers(sd, prefix):
         sd = state_dict_prefix_replace(sd, replace_prefix)
     return sd
 
+
 def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
     if convert_keys:
         sd = convert_to_transformers(sd, prefix)
@@ -105,7 +112,7 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
         json_config = files.get_path_as_dict(None, "clip_vision_config_h.json")
     elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
         if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
-            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
+            json_config = files.get_path_as_dict(None, "clip_vision_siglip_384.json")
         elif sd["vision_model.embeddings.position_embedding.weight"].shape[0] == 577:
             json_config = files.get_path_as_dict(None, "clip_vision_config_vitl_336.json")
         else:
@@ -124,6 +131,7 @@
             sd.pop(k)
     return clip
 
+
 def load(ckpt_path):
     sd = load_torch_file(ckpt_path)
     if "visual.transformer.resblocks.0.attn.in_proj_weight" in sd:
diff --git a/comfy/model_downloader.py b/comfy/model_downloader.py
index fba8e9742..8c5de3096 100644
--- a/comfy/model_downloader.py
+++ b/comfy/model_downloader.py
@@ -283,7 +283,8 @@ KNOWN_GLIGEN_MODELS: Final[KnownDownloadables] = KnownDownloadables([
 ], folder_name="gligen")
 
 KNOWN_CLIP_VISION_MODELS: Final[KnownDownloadables] = KnownDownloadables([
-    HuggingFile("comfyanonymous/clip_vision_g", "clip_vision_g.safetensors")
+    HuggingFile("comfyanonymous/clip_vision_g", "clip_vision_g.safetensors"),
+    HuggingFile("Comfy-Org/sigclip_vision_384", "sigclip_vision_patch14_384.safetensors"),
 ], folder_name="clip_vision")
 
 KNOWN_LORAS: Final[KnownDownloadables] = KnownDownloadables([
@@ -292,6 +293,8 @@ KNOWN_LORAS: Final[KnownDownloadables] = KnownDownloadables([
     CivitFile(model_id=47085, model_version_id=55199, filename="GoodHands-beta2.safetensors"),
     HuggingFile("ByteDance/Hyper-SD", "Hyper-SDXL-12steps-CFG-lora.safetensors"),
     HuggingFile("ByteDance/Hyper-SD", "Hyper-SD15-12steps-CFG-lora.safetensors"),
+    HuggingFile("black-forest-labs/FLUX.1-Canny-dev-lora", "flux1-canny-dev-lora.safetensors"),
+    HuggingFile("black-forest-labs/FLUX.1-Depth-dev-lora", "flux1-depth-dev-lora.safetensors"),
 ], folder_name="loras")
 
 KNOWN_CONTROLNETS: Final[KnownDownloadables] = KnownDownloadables([
@@ -434,6 +437,9 @@ KNOWN_UNET_MODELS: Final[KnownDownloadables] = KnownDownloadables([
     HuggingFile("ByteDance/Hyper-SD", "Hyper-SDXL-1step-Unet-Comfyui.fp16.safetensors"),
     HuggingFile("black-forest-labs/FLUX.1-schnell", "flux1-schnell.safetensors"),
     HuggingFile("black-forest-labs/FLUX.1-dev", "flux1-dev.safetensors"),
+    HuggingFile("black-forest-labs/FLUX.1-Fill-dev", "flux1-fill-dev.safetensors"),
"flux1-fill-dev.safetensors"), + HuggingFile("black-forest-labs/FLUX.1-Canny-dev", "flux1-canny-dev.safetensors"), + HuggingFile("black-forest-labs/FLUX.1-Depth-dev", "flux1-depth-dev.safetensors"), HuggingFile("Kijai/flux-fp8", "flux1-dev-fp8.safetensors"), HuggingFile("Kijai/flux-fp8", "flux1-schnell-fp8.safetensors"), HuggingFile("Comfy-Org/mochi_preview_repackaged", "split_files/diffusion_models/mochi_preview_bf16.safetensors"), @@ -452,6 +458,10 @@ KNOWN_CLIP_MODELS: Final[KnownDownloadables] = KnownDownloadables([ HuggingFile("zer0int/CLIP-GmP-ViT-L-14", "ViT-L-14-TEXT-detail-improved-hiT-GmP-TE-only-HF.safetensors"), ], folder_name="clip") +KNOWN_STYLE_MODELS: Final[KnownDownloadables] = KnownDownloadables([ + HuggingFile("black-forest-labs/FLUX.1-Redux-dev", "flux1-redux-dev.safetensors"), +], folder_name="style_models") + _known_models_db: list[KnownDownloadables] = [ KNOWN_CHECKPOINTS, KNOWN_VAES, @@ -466,6 +476,7 @@ _known_models_db: list[KnownDownloadables] = [ KNOWN_IMAGE_ONLY_CHECKPOINTS, KNOWN_UNCLIP_CHECKPOINTS, KNOWN_UPSCALERS, + KNOWN_STYLE_MODELS, ] diff --git a/tests/inference/workflows/flux-controlnet-1.json b/tests/inference/workflows/flux-controlnet-1.json new file mode 100644 index 000000000..18eb7e7e2 --- /dev/null +++ b/tests/inference/workflows/flux-controlnet-1.json @@ -0,0 +1,193 @@ +{ + "3": { + "inputs": { + "seed": 432318046789205, + "steps": 20, + "cfg": 1, + "sampler_name": "euler", + "scheduler": "normal", + "denoise": 1, + "model": [ + "31", + 0 + ], + "positive": [ + "35", + 0 + ], + "negative": [ + "35", + 1 + ], + "latent_image": [ + "35", + 2 + ] + }, + "class_type": "KSampler", + "_meta": { + "title": "KSampler" + } + }, + "7": { + "inputs": { + "text": "", + "clip": [ + "34", + 0 + ] + }, + "class_type": "CLIPTextEncode", + "_meta": { + "title": "CLIP Text Encode (Negative Prompt)" + } + }, + "8": { + "inputs": { + "samples": [ + "3", + 0 + ], + "vae": [ + "32", + 0 + ] + }, + "class_type": "VAEDecode", + "_meta": { + "title": "VAE Decode" + } + }, + "9": { + "inputs": { + "filename_prefix": "ComfyUI", + "images": [ + "8", + 0 + ] + }, + "class_type": "SaveImage", + "_meta": { + "title": "Save Image" + } + }, + "18": { + "inputs": { + "low_threshold": 0.15, + "high_threshold": 0.3, + "image": [ + "36", + 0 + ] + }, + "class_type": "Canny", + "_meta": { + "title": "Canny" + } + }, + "19": { + "inputs": { + "images": [ + "18", + 0 + ] + }, + "class_type": "PreviewImage", + "_meta": { + "title": "Preview Image" + } + }, + "23": { + "inputs": { + "text": "cute anime girl with massive fluffy fennec ears and a big fluffy tail blonde messy long hair blue eyes wearing a pink sweater and jeans", + "clip": [ + "34", + 0 + ] + }, + "class_type": "CLIPTextEncode", + "_meta": { + "title": "CLIP Text Encode (Positive Prompt)" + } + }, + "26": { + "inputs": { + "guidance": 30, + "conditioning": [ + "23", + 0 + ] + }, + "class_type": "FluxGuidance", + "_meta": { + "title": "FluxGuidance" + } + }, + "31": { + "inputs": { + "unet_name": "flux1-canny-dev.safetensors", + "weight_dtype": "default" + }, + "class_type": "UNETLoader", + "_meta": { + "title": "Load Diffusion Model" + } + }, + "32": { + "inputs": { + "vae_name": "ae.safetensors" + }, + "class_type": "VAELoader", + "_meta": { + "title": "Load VAE" + } + }, + "34": { + "inputs": { + "clip_name1": "clip_l.safetensors", + "clip_name2": "t5xxl_fp16.safetensors", + "type": "flux" + }, + "class_type": "DualCLIPLoader", + "_meta": { + "title": "DualCLIPLoader" + } + }, + "35": { + "inputs": { + "positive": [ + 
"26", + 0 + ], + "negative": [ + "7", + 0 + ], + "vae": [ + "32", + 0 + ], + "pixels": [ + "18", + 0 + ] + }, + "class_type": "InstructPixToPixConditioning", + "_meta": { + "title": "InstructPixToPixConditioning" + } + }, + "36": { + "inputs": { + "value": "https://comfyanonymous.github.io/ComfyUI_examples/flux/flux_fill_inpaint_example.png", + "name": "", + "title": "", + "description": "", + "__required": true + }, + "class_type": "ImageRequestParameter", + "_meta": { + "title": "ImageRequestParameter" + } + } +} \ No newline at end of file diff --git a/tests/inference/workflows/flux-inpainting-0.json b/tests/inference/workflows/flux-inpainting-0.json new file mode 100644 index 000000000..a76543d55 --- /dev/null +++ b/tests/inference/workflows/flux-inpainting-0.json @@ -0,0 +1,201 @@ +{ + "3": { + "inputs": { + "seed": 164211176398261, + "steps": 20, + "cfg": 1, + "sampler_name": "euler", + "scheduler": "normal", + "denoise": 1, + "model": [ + "39", + 0 + ], + "positive": [ + "38", + 0 + ], + "negative": [ + "38", + 1 + ], + "latent_image": [ + "38", + 2 + ] + }, + "class_type": "KSampler", + "_meta": { + "title": "KSampler" + } + }, + "7": { + "inputs": { + "text": "", + "clip": [ + "34", + 0 + ] + }, + "class_type": "CLIPTextEncode", + "_meta": { + "title": "CLIP Text Encode (Negative Prompt)" + } + }, + "8": { + "inputs": { + "samples": [ + "3", + 0 + ], + "vae": [ + "32", + 0 + ] + }, + "class_type": "VAEDecode", + "_meta": { + "title": "VAE Decode" + } + }, + "9": { + "inputs": { + "filename_prefix": "ComfyUI", + "images": [ + "8", + 0 + ] + }, + "class_type": "SaveImage", + "_meta": { + "title": "Save Image" + } + }, + "23": { + "inputs": { + "text": "beautiful scenery", + "clip": [ + "34", + 0 + ] + }, + "class_type": "CLIPTextEncode", + "_meta": { + "title": "CLIP Text Encode (Positive Prompt)" + } + }, + "26": { + "inputs": { + "guidance": 30, + "conditioning": [ + "23", + 0 + ] + }, + "class_type": "FluxGuidance", + "_meta": { + "title": "FluxGuidance" + } + }, + "31": { + "inputs": { + "unet_name": "flux1-fill-dev.safetensors", + "weight_dtype": "default" + }, + "class_type": "UNETLoader", + "_meta": { + "title": "Load Diffusion Model" + } + }, + "32": { + "inputs": { + "vae_name": "ae.safetensors" + }, + "class_type": "VAELoader", + "_meta": { + "title": "Load VAE" + } + }, + "34": { + "inputs": { + "clip_name1": "clip_l.safetensors", + "clip_name2": "t5xxl_fp16.safetensors", + "type": "flux" + }, + "class_type": "DualCLIPLoader", + "_meta": { + "title": "DualCLIPLoader" + } + }, + "38": { + "inputs": { + "noise_mask": false, + "positive": [ + "26", + 0 + ], + "negative": [ + "7", + 0 + ], + "vae": [ + "32", + 0 + ], + "pixels": [ + "44", + 0 + ], + "mask": [ + "44", + 1 + ] + }, + "class_type": "InpaintModelConditioning", + "_meta": { + "title": "InpaintModelConditioning" + } + }, + "39": { + "inputs": { + "model": [ + "31", + 0 + ] + }, + "class_type": "DifferentialDiffusion", + "_meta": { + "title": "Differential Diffusion" + } + }, + "44": { + "inputs": { + "left": 400, + "top": 0, + "right": 400, + "bottom": 400, + "feathering": 24, + "image": [ + "45", + 0 + ] + }, + "class_type": "ImagePadForOutpaint", + "_meta": { + "title": "Pad Image for Outpainting" + } + }, + "45": { + "inputs": { + "value": "https://comfyanonymous.github.io/ComfyUI_examples/flux/flux_fill_inpaint_example.png", + "name": "", + "title": "", + "description": "", + "__required": true + }, + "class_type": "ImageRequestParameter", + "_meta": { + "title": "ImageRequestParameter" + } + } +} \ No newline 
diff --git a/tests/inference/workflows/flux-redux-0.json b/tests/inference/workflows/flux-redux-0.json
new file mode 100644
index 000000000..97cae67a3
--- /dev/null
+++ b/tests/inference/workflows/flux-redux-0.json
@@ -0,0 +1,258 @@
+{
+  "6": {
+    "inputs": {
+      "text": "cute anime girl with massive fluffy fennec ears",
+      "clip": [
+        "11",
+        0
+      ]
+    },
+    "class_type": "CLIPTextEncode",
+    "_meta": {
+      "title": "CLIP Text Encode (Positive Prompt)"
+    }
+  },
+  "8": {
+    "inputs": {
+      "samples": [
+        "13",
+        0
+      ],
+      "vae": [
+        "10",
+        0
+      ]
+    },
+    "class_type": "VAEDecode",
+    "_meta": {
+      "title": "VAE Decode"
+    }
+  },
+  "9": {
+    "inputs": {
+      "filename_prefix": "ComfyUI",
+      "images": [
+        "8",
+        0
+      ]
+    },
+    "class_type": "SaveImage",
+    "_meta": {
+      "title": "Save Image"
+    }
+  },
+  "10": {
+    "inputs": {
+      "vae_name": "ae.safetensors"
+    },
+    "class_type": "VAELoader",
+    "_meta": {
+      "title": "Load VAE"
+    }
+  },
+  "11": {
+    "inputs": {
+      "clip_name1": "t5xxl_fp16.safetensors",
+      "clip_name2": "clip_l.safetensors",
+      "type": "flux"
+    },
+    "class_type": "DualCLIPLoader",
+    "_meta": {
+      "title": "DualCLIPLoader"
+    }
+  },
+  "12": {
+    "inputs": {
+      "unet_name": "flux1-dev.safetensors",
+      "weight_dtype": "default"
+    },
+    "class_type": "UNETLoader",
+    "_meta": {
+      "title": "Load Diffusion Model"
+    }
+  },
+  "13": {
+    "inputs": {
+      "noise": [
+        "25",
+        0
+      ],
+      "guider": [
+        "22",
+        0
+      ],
+      "sampler": [
+        "16",
+        0
+      ],
+      "sigmas": [
+        "17",
+        0
+      ],
+      "latent_image": [
+        "27",
+        0
+      ]
+    },
+    "class_type": "SamplerCustomAdvanced",
+    "_meta": {
+      "title": "SamplerCustomAdvanced"
+    }
+  },
+  "16": {
+    "inputs": {
+      "sampler_name": "euler"
+    },
+    "class_type": "KSamplerSelect",
+    "_meta": {
+      "title": "KSamplerSelect"
+    }
+  },
+  "17": {
+    "inputs": {
+      "scheduler": "simple",
+      "steps": 1,
+      "denoise": 1,
+      "model": [
+        "30",
+        0
+      ]
+    },
+    "class_type": "BasicScheduler",
+    "_meta": {
+      "title": "BasicScheduler"
+    }
+  },
+  "22": {
+    "inputs": {
+      "model": [
+        "30",
+        0
+      ],
+      "conditioning": [
+        "41",
+        0
+      ]
+    },
+    "class_type": "BasicGuider",
+    "_meta": {
+      "title": "BasicGuider"
+    }
+  },
+  "25": {
+    "inputs": {
+      "noise_seed": 895731728473880
+    },
+    "class_type": "RandomNoise",
+    "_meta": {
+      "title": "RandomNoise"
+    }
+  },
+  "26": {
+    "inputs": {
+      "guidance": 3.5,
+      "conditioning": [
+        "6",
+        0
+      ]
+    },
+    "class_type": "FluxGuidance",
+    "_meta": {
+      "title": "FluxGuidance"
+    }
+  },
+  "27": {
+    "inputs": {
+      "width": 1024,
+      "height": 1024,
+      "batch_size": 1
+    },
+    "class_type": "EmptySD3LatentImage",
+    "_meta": {
+      "title": "EmptySD3LatentImage"
+    }
+  },
+  "30": {
+    "inputs": {
+      "max_shift": 1.15,
+      "base_shift": 0.5,
+      "width": 1024,
+      "height": 1024,
+      "model": [
+        "12",
+        0
+      ]
+    },
+    "class_type": "ModelSamplingFlux",
+    "_meta": {
+      "title": "ModelSamplingFlux"
+    }
+  },
+  "38": {
+    "inputs": {
+      "clip_name": "sigclip_vision_patch14_384.safetensors"
+    },
+    "class_type": "CLIPVisionLoader",
+    "_meta": {
+      "title": "Load CLIP Vision"
+    }
+  },
+  "39": {
+    "inputs": {
+      "clip_vision": [
+        "38",
+        0
+      ],
+      "image": [
+        "44",
+        0
+      ]
+    },
+    "class_type": "CLIPVisionEncode",
+    "_meta": {
+      "title": "CLIP Vision Encode"
+    }
+  },
+  "41": {
+    "inputs": {
+      "conditioning": [
+        "26",
+        0
+      ],
+      "style_model": [
+        "42",
+        0
+      ],
+      "clip_vision_output": [
+        "39",
+        0
+      ]
+    },
+    "class_type": "StyleModelApply",
+    "_meta": {
+      "title": "Apply Style Model"
+    }
+  },
+  "42": {
+    "inputs": {
+      "style_model_name": "flux1-redux-dev.safetensors"
+    },
+    "class_type": "StyleModelLoader",
+    "_meta": {
+      "title": "Load Style Model"
"Load Style Model" + } + }, + "44": { + "inputs": { + "value": "https://comfyanonymous.github.io/ComfyUI_examples/flux/flux_fill_inpaint_example.png", + "name": "", + "title": "", + "description": "", + "__required": true + }, + "class_type": "ImageRequestParameter", + "_meta": { + "title": "ImageRequestParameter" + } + } +} \ No newline at end of file