mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-07-03 21:20:49 +08:00
JoyImageEdit is an image-edit diffusion transformer from JD (jd-opensource),
Apache 2.0. This adds native ComfyUI support so it loads and runs like other
edit models (load checkpoint -> TextEncode + ReferenceLatent -> KSampler ->
VAEDecode), with no diffusers dependency.
Architecture:
- Transformer (comfy/ldm/joyimage/model.py): dual-stream (img/txt) DiT with a
  Conv3d patch embed (patch_size [1,2,2]), Wan-style learnable modulation,
  and 3D RoPE (rope_dim_list [16,56,56]). All attention goes through
  comfy.ldm.modules.attention.optimized_attention.
- Text encoder (comfy/text_encoders/{qwen3_vl,joyimage}.py): a reusable
  Qwen3-VL multimodal stack (vision tower + LM) in qwen3_vl.py, plus a thin
  JoyImage-specific layer (prompt templates, drop_idx, tokenizer, te() factory)
  in joyimage.py that depends on it. text_dim 4096.
- VAE: reuses the existing Wan 2.1 latent format (AutoencoderKLWan), no new
  latent format.
- Edit conditioning: reuses the reference_latents mechanism. Reference and
  noise latents are stacked on a new n-slot dimension and rotated at the model
  boundary (model_base.JoyImage), so the transformer stays 5D-in/5D-out.
  Guidance-rescale is built into the CFG path.
Model wiring:
- model_base.JoyImage uses ModelType.FLOW with sampling_settings
  multiplier=1000 (the time embedding is trained on t in [0,1000]) and
  shift=1.5; FLOW's linear time_snr_shift matches the diffusers
  FlowMatchEuler sigma schedule.
- model_detection sniffs the transformer state-dict (double_blocks.*,
  condition_embedder.*, 5D img_in Conv3d) to route image_model="joyimage".
- supported_models.JoyImage and the CLIPLoader "joyimage" type register it.
User-facing node TextEncodeJoyImageEdit (comfy_extras/nodes_joyimage.py)
bucket-resizes the input image to the nearest 1024-base bucket, encodes the
prompt with the image, and emits both the conditioning and the bucketed image
so the same pixels feed VAEEncode and the negative encode (JoyImage requires
noise and reference latents to share spatial dims).
89 lines
2.6 KiB
Python
import node_helpers
|
|
import comfy.utils
|
|
from typing_extensions import override
|
|
from comfy_api.latest import ComfyExtension, io
|
|
|
|
|
|
# fmt: off
# Target resolutions as (height, width) pairs; every dimension is a multiple
# of 64. Entry order matters: _find_best_bucket picks with min(), which keeps
# the earliest entry when two buckets are equally close.
BUCKETS_1024 = [
    # height < width
    (512, 1792), (512, 1856), (512, 1920), (512, 1984), (512, 2048),
    (576, 1600), (576, 1664), (576, 1728), (576, 1792),
    (640, 1472), (640, 1536), (640, 1600),
    (704, 1344), (704, 1408), (704, 1472),
    (768, 1216), (768, 1280), (768, 1344),
    (832, 1152), (832, 1216),
    (896, 1088), (896, 1152),
    (960, 1024), (960, 1088),
    # height >= width
    (1024, 960), (1024, 1024),
    (1088, 896), (1088, 960),
    (1152, 832), (1152, 896),
    (1216, 768), (1216, 832),
    (1280, 768),
    (1344, 704), (1344, 768),
    (1408, 704),
    (1472, 640), (1472, 704),
    (1536, 640),
    (1600, 576), (1600, 640),
    (1664, 576),
    (1728, 576),
    (1792, 512), (1792, 576),
    (1856, 512),
    (1920, 512),
    (1984, 512),
    (2048, 512),
]
# fmt: on
|
|
|
|
|
|
def _find_best_bucket(height: int, width: int) -> tuple[int, int]:
|
|
target_ratio = height / width
|
|
return min(BUCKETS_1024, key=lambda hw: abs(hw[0] / hw[1] - target_ratio))
|
|
|
|
|
|
class TextEncodeJoyImageEdit(io.ComfyNode):
    """Encode a prompt together with a bucket-resized reference image.

    The node resizes the input image to the closest-aspect bucket, encodes
    the prompt with that image, attaches the VAE latent of the same image to
    the conditioning under ``reference_latents``, and also returns the
    resized image.
    """

    @classmethod
    def define_schema(cls):
        """Describe the node's id, category, inputs and outputs."""
        node_inputs = [
            io.Clip.Input("clip"),
            io.String.Input("prompt", multiline=True, dynamic_prompts=True),
            io.Vae.Input("vae"),
            io.Image.Input("image"),
        ]
        node_outputs = [
            io.Conditioning.Output(),
            io.Image.Output(display_name="image"),
        ]
        return io.Schema(
            node_id="TextEncodeJoyImageEdit",
            category="advanced/conditioning",
            inputs=node_inputs,
            outputs=node_outputs,
        )

    @classmethod
    def execute(cls, clip, prompt, vae, image) -> io.NodeOutput:
        """Build the conditioning and the bucketed image for one edit request.

        Returns a ``NodeOutput`` holding the conditioning (with the reference
        latent appended) followed by the resized image.
        """
        # assumes image is (batch, height, width, channels) -- TODO confirm;
        # the last axis is moved to position 1 and axes 2/3 are then read as
        # height/width.
        channels_first = image.movedim(-1, 1)
        target_h, target_w = _find_best_bucket(channels_first.shape[2], channels_first.shape[3])

        # common_upscale takes width before height.
        scaled = comfy.utils.common_upscale(channels_first, target_w, target_h, "bilinear", "center")
        # Restore the original axis order and keep only the first three channels.
        bucketed_image = scaled.movedim(1, -1)[:, :, :, :3]

        # The same resized pixels feed both the text encoder and the VAE.
        prompt_tokens = clip.tokenize(prompt, images=[bucketed_image])
        cond = clip.encode_from_tokens_scheduled(prompt_tokens)

        reference = vae.encode(bucketed_image)
        extra_values = {"reference_latents": [reference]}
        cond = node_helpers.conditioning_set_values(cond, extra_values, append=True)

        return io.NodeOutput(cond, bucketed_image)
|
|
|
|
|
|
class JoyImageExtension(ComfyExtension):
    """Extension object that exposes the JoyImage nodes defined in this module."""

    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
        """Return the node classes provided by this extension."""
        node_classes = [TextEncodeJoyImageEdit]
        return node_classes
|
|
|
|
|
|
async def comfy_entrypoint() -> JoyImageExtension:
    """Create and return this module's extension instance."""
    extension = JoyImageExtension()
    return extension
|