diff --git a/blueprints/Text to Image (LongCat-Image).json b/blueprints/Text to Image (LongCat-Image).json index 1a01ec30a..eda148949 100644 --- a/blueprints/Text to Image (LongCat-Image).json +++ b/blueprints/Text to Image (LongCat-Image).json @@ -1 +1 @@ -{"id": "a7e3b1c0-4f2d-4e8a-9b1c-longcat00001", "revision": 0, "last_node_id": 20, "last_link_id": 20, "nodes": [{"id": 1, "type": "lc-subgraph-001", "pos": [0, 1230], "size": [400, 470], "flags": {}, "order": 0, "mode": 0, "inputs": [{"label": "prompt", "name": "text", "type": "STRING", "widget": {"name": "text"}, "link": null}, {"name": "width", "type": "INT", "widget": {"name": "width"}, "link": null}, {"name": "height", "type": "INT", "widget": {"name": "height"}, "link": null}, {"name": "unet_name", "type": "COMBO", "widget": {"name": "unet_name"}, "link": null}, {"name": "clip_name", "type": "COMBO", "widget": {"name": "clip_name"}, "link": null}, {"name": "vae_name", "type": "COMBO", "widget": {"name": "vae_name"}, "link": null}], "outputs": [{"localized_name": "IMAGE", "name": "IMAGE", "type": "IMAGE", "links": []}], "properties": {"proxyWidgets": [["-1", "text"], ["-1", "width"], ["-1", "height"], ["7", "seed"], ["7", "control_after_generate"], ["-1", "unet_name"], ["-1", "clip_name"], ["-1", "vae_name"]], "cnr_id": "comfy-core", "ver": "0.3.73", "enableTabs": false}, "widgets_values": ["A young Asian woman wearing a yellow knit sweater with a white necklace, sitting with her hands on her knees and a serene expression. 
The background is a rough brick wall with warm afternoon sunlight.", 768, 1344, null, null, "longcat_image.safetensors", "qwen_2.5_vl_7b.safetensors", "ae.safetensors"]}], "links": [], "groups": [], "definitions": {"subgraphs": [{"id": "lc-subgraph-001", "version": 1, "state": {"lastGroupId": 4, "lastNodeId": 20, "lastLinkId": 20, "lastRerouteId": 0}, "revision": 0, "config": {}, "name": "local-Text to Image (LongCat-Image)", "inputNode": {"id": -10, "bounding": [-80, 425, 120, 160]}, "outputNode": {"id": -20, "bounding": [1490, 415, 120, 60]}, "inputs": [{"id": "inp-text", "name": "text", "type": "STRING", "linkIds": [10], "label": "prompt", "pos": [20, 445]}, {"id": "inp-width", "name": "width", "type": "INT", "linkIds": [11], "pos": [20, 465]}, {"id": "inp-height", "name": "height", "type": "INT", "linkIds": [12], "pos": [20, 485]}, {"id": "inp-unet", "name": "unet_name", "type": "COMBO", "linkIds": [13], "pos": [20, 505]}, {"id": "inp-clip", "name": "clip_name", "type": "COMBO", "linkIds": [14], "pos": [20, 525]}, {"id": "inp-vae", "name": "vae_name", "type": "COMBO", "linkIds": [15], "pos": [20, 545]}], "outputs": [{"id": "out-image", "name": "IMAGE", "type": "IMAGE", "linkIds": [9], "localized_name": "IMAGE", "pos": [1510, 435]}], "widgets": [], "nodes": [{"id": 1, "type": "UNETLoader", "pos": [110, 200], "size": [270, 82], "flags": {}, "order": 0, "mode": 0, "inputs": [{"name": "unet_name", "type": "COMBO", "widget": {"name": "unet_name"}, "link": 13}, {"name": "weight_dtype", "type": "COMBO", "widget": {"name": "weight_dtype"}, "link": null}], "outputs": [{"name": "MODEL", "type": "MODEL", "links": [1]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "UNETLoader", "models": [{"name": "longcat_image.safetensors", "url": "https://huggingface.co/meituan-longcat/LongCat-Image/resolve/main/transformer/diffusion_pytorch_model.safetensors", "directory": "diffusion_models"}]}, "widgets_values": ["longcat_image.safetensors", 
"default"]}, {"id": 2, "type": "CLIPLoader", "pos": [110, 330], "size": [270, 106], "flags": {}, "order": 1, "mode": 0, "inputs": [{"name": "clip_name", "type": "COMBO", "widget": {"name": "clip_name"}, "link": 14}, {"name": "type", "type": "COMBO", "widget": {"name": "type"}, "link": null}, {"name": "device", "shape": 7, "type": "COMBO", "widget": {"name": "device"}, "link": null}], "outputs": [{"name": "CLIP", "type": "CLIP", "links": [2, 16]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "CLIPLoader", "models": [{"name": "qwen_2.5_vl_7b.safetensors", "url": "https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/resolve/main/split_files/text_encoders/qwen_2.5_vl_7b.safetensors", "directory": "text_encoders"}]}, "widgets_values": ["qwen_2.5_vl_7b.safetensors", "longcat_image", "default"]}, {"id": 3, "type": "VAELoader", "pos": [110, 480], "size": [270, 58], "flags": {}, "order": 2, "mode": 0, "inputs": [{"name": "vae_name", "type": "COMBO", "widget": {"name": "vae_name"}, "link": 15}], "outputs": [{"name": "VAE", "type": "VAE", "links": [3]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "VAELoader", "models": [{"name": "ae.safetensors", "url": "https://huggingface.co/Comfy-Org/z_image_turbo/resolve/main/split_files/vae/ae.safetensors", "directory": "vae"}]}, "widgets_values": ["ae.safetensors"]}, {"id": 4, "type": "CLIPTextEncodeLongCatImage", "pos": [430, 200], "size": [410, 250], "flags": {}, "order": 3, "mode": 0, "inputs": [{"name": "clip", "type": "CLIP", "link": 2}, {"name": "text", "type": "STRING", "widget": {"name": "text"}, "link": 10}, {"name": "guidance", "type": "FLOAT", "widget": {"name": "guidance"}, "link": null}], "outputs": [{"name": "CONDITIONING", "type": "CONDITIONING", "links": [4]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "CLIPTextEncodeLongCatImage"}, "widgets_values": ["", 4.0]}, {"id": 5, "type": "CLIPTextEncodeLongCatImage", "pos": [430, 
510], "size": [410, 120], "flags": {}, "order": 4, "mode": 0, "inputs": [{"name": "clip", "type": "CLIP", "link": 16}, {"name": "text", "type": "STRING", "widget": {"name": "text"}, "link": null}, {"name": "guidance", "type": "FLOAT", "widget": {"name": "guidance"}, "link": null}], "outputs": [{"name": "CONDITIONING", "type": "CONDITIONING", "links": [6]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "CLIPTextEncodeLongCatImage"}, "widgets_values": ["", 4.0]}, {"id": 10, "type": "CFGRenormLongCatImage", "pos": [880, 160], "size": [280, 26], "flags": {}, "order": 5, "mode": 0, "inputs": [{"name": "model", "type": "MODEL", "link": 1}], "outputs": [{"name": "MODEL", "type": "MODEL", "links": [17]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "CFGRenormLongCatImage"}, "widgets_values": []}, {"id": 6, "type": "EmptySD3LatentImage", "pos": [110, 630], "size": [260, 106], "flags": {}, "order": 6, "mode": 0, "inputs": [{"name": "width", "type": "INT", "widget": {"name": "width"}, "link": 11}, {"name": "height", "type": "INT", "widget": {"name": "height"}, "link": 12}, {"name": "batch_size", "type": "INT", "widget": {"name": "batch_size"}, "link": null}], "outputs": [{"name": "LATENT", "type": "LATENT", "links": [7]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "EmptySD3LatentImage"}, "widgets_values": [768, 1344, 1]}, {"id": 7, "type": "KSampler", "pos": [880, 230], "size": [315, 262], "flags": {}, "order": 7, "mode": 0, "inputs": [{"name": "model", "type": "MODEL", "link": 17}, {"name": "positive", "type": "CONDITIONING", "link": 4}, {"name": "negative", "type": "CONDITIONING", "link": 6}, {"name": "latent_image", "type": "LATENT", "link": 7}, {"name": "seed", "type": "INT", "widget": {"name": "seed"}, "link": null}, {"name": "steps", "type": "INT", "widget": {"name": "steps"}, "link": null}, {"name": "cfg", "type": "FLOAT", "widget": {"name": "cfg"}, "link": null}, 
{"name": "sampler_name", "type": "COMBO", "widget": {"name": "sampler_name"}, "link": null}, {"name": "scheduler", "type": "COMBO", "widget": {"name": "scheduler"}, "link": null}, {"name": "denoise", "type": "FLOAT", "widget": {"name": "denoise"}, "link": null}], "outputs": [{"name": "LATENT", "type": "LATENT", "links": [8]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "KSampler"}, "widgets_values": [0, "randomize", 20, 4.0, "euler", "simple", 1.0]}, {"id": 8, "type": "VAEDecode", "pos": [1220, 160], "size": [210, 46], "flags": {}, "order": 8, "mode": 0, "inputs": [{"name": "samples", "type": "LATENT", "link": 8}, {"name": "vae", "type": "VAE", "link": 3}], "outputs": [{"name": "IMAGE", "type": "IMAGE", "links": [9]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "VAEDecode"}, "widgets_values": []}], "groups": [{"id": 1, "title": "Image size", "bounding": [100, 560, 290, 200], "color": "#3f789e", "font_size": 24, "flags": {}}, {"id": 2, "title": "Prompt", "bounding": [410, 130, 450, 540], "color": "#3f789e", "font_size": 24, "flags": {}}, {"id": 3, "title": "Models", "bounding": [100, 130, 290, 413], "color": "#3f789e", "font_size": 24, "flags": {}}], "links": [{"id": 1, "origin_id": 1, "origin_slot": 0, "target_id": 10, "target_slot": 0, "type": "MODEL"}, {"id": 2, "origin_id": 2, "origin_slot": 0, "target_id": 4, "target_slot": 0, "type": "CLIP"}, {"id": 3, "origin_id": 3, "origin_slot": 0, "target_id": 8, "target_slot": 1, "type": "VAE"}, {"id": 4, "origin_id": 4, "origin_slot": 0, "target_id": 7, "target_slot": 1, "type": "CONDITIONING"}, {"id": 6, "origin_id": 5, "origin_slot": 0, "target_id": 7, "target_slot": 2, "type": "CONDITIONING"}, {"id": 7, "origin_id": 6, "origin_slot": 0, "target_id": 7, "target_slot": 3, "type": "LATENT"}, {"id": 8, "origin_id": 7, "origin_slot": 0, "target_id": 8, "target_slot": 0, "type": "LATENT"}, {"id": 9, "origin_id": 8, "origin_slot": 0, "target_id": -20, 
"target_slot": 0, "type": "IMAGE"}, {"id": 10, "origin_id": -10, "origin_slot": 0, "target_id": 4, "target_slot": 1, "type": "STRING"}, {"id": 11, "origin_id": -10, "origin_slot": 1, "target_id": 6, "target_slot": 0, "type": "INT"}, {"id": 12, "origin_id": -10, "origin_slot": 2, "target_id": 6, "target_slot": 1, "type": "INT"}, {"id": 13, "origin_id": -10, "origin_slot": 3, "target_id": 1, "target_slot": 0, "type": "COMBO"}, {"id": 14, "origin_id": -10, "origin_slot": 4, "target_id": 2, "target_slot": 0, "type": "COMBO"}, {"id": 15, "origin_id": -10, "origin_slot": 5, "target_id": 3, "target_slot": 0, "type": "COMBO"}, {"id": 16, "origin_id": 2, "origin_slot": 0, "target_id": 5, "target_slot": 0, "type": "CLIP"}, {"id": 17, "origin_id": 10, "origin_slot": 0, "target_id": 7, "target_slot": 0, "type": "MODEL"}], "extra": {"workflowRendererVersion": "LG"}, "category": "Image generation and editing/Text to image"}]}, "config": {}, "extra": {"frontendVersion": "1.37.10", "workflowRendererVersion": "LG"}, "version": 0.4} +{"id": "a7e3b1c0-4f2d-4e8a-9b1c-longcat00001", "revision": 0, "last_node_id": 20, "last_link_id": 20, "nodes": [{"id": 1, "type": "lc-subgraph-001", "pos": [0, 1230], "size": [400, 470], "flags": {}, "order": 0, "mode": 0, "inputs": [{"label": "prompt", "name": "text", "type": "STRING", "widget": {"name": "text"}, "link": null}, {"name": "width", "type": "INT", "widget": {"name": "width"}, "link": null}, {"name": "height", "type": "INT", "widget": {"name": "height"}, "link": null}, {"name": "unet_name", "type": "COMBO", "widget": {"name": "unet_name"}, "link": null}, {"name": "clip_name", "type": "COMBO", "widget": {"name": "clip_name"}, "link": null}, {"name": "vae_name", "type": "COMBO", "widget": {"name": "vae_name"}, "link": null}], "outputs": [{"localized_name": "IMAGE", "name": "IMAGE", "type": "IMAGE", "links": []}], "properties": {"proxyWidgets": [["-1", "text"], ["-1", "width"], ["-1", "height"], ["7", "seed"], ["7", "control_after_generate"], 
["-1", "unet_name"], ["-1", "clip_name"], ["-1", "vae_name"]], "cnr_id": "comfy-core", "ver": "0.3.73", "enableTabs": false}, "widgets_values": ["A young Asian woman wearing a yellow knit sweater with a white necklace, sitting with her hands on her knees and a serene expression. The background is a rough brick wall with warm afternoon sunlight.", 768, 1344, null, null, "longcat_image_bf16.safetensors", "qwen_2.5_vl_7b.safetensors", "ae.safetensors"]}], "links": [], "groups": [], "definitions": {"subgraphs": [{"id": "lc-subgraph-001", "version": 1, "state": {"lastGroupId": 4, "lastNodeId": 20, "lastLinkId": 20, "lastRerouteId": 0}, "revision": 0, "config": {}, "name": "local-Text to Image (LongCat-Image)", "inputNode": {"id": -10, "bounding": [-80, 425, 120, 160]}, "outputNode": {"id": -20, "bounding": [1490, 415, 120, 60]}, "inputs": [{"id": "inp-text", "name": "text", "type": "STRING", "linkIds": [10], "label": "prompt", "pos": [20, 445]}, {"id": "inp-width", "name": "width", "type": "INT", "linkIds": [11], "pos": [20, 465]}, {"id": "inp-height", "name": "height", "type": "INT", "linkIds": [12], "pos": [20, 485]}, {"id": "inp-unet", "name": "unet_name", "type": "COMBO", "linkIds": [13], "pos": [20, 505]}, {"id": "inp-clip", "name": "clip_name", "type": "COMBO", "linkIds": [14], "pos": [20, 525]}, {"id": "inp-vae", "name": "vae_name", "type": "COMBO", "linkIds": [15], "pos": [20, 545]}], "outputs": [{"id": "out-image", "name": "IMAGE", "type": "IMAGE", "linkIds": [9], "localized_name": "IMAGE", "pos": [1510, 435]}], "widgets": [], "nodes": [{"id": 1, "type": "UNETLoader", "pos": [110, 200], "size": [270, 82], "flags": {}, "order": 0, "mode": 0, "inputs": [{"name": "unet_name", "type": "COMBO", "widget": {"name": "unet_name"}, "link": 13}, {"name": "weight_dtype", "type": "COMBO", "widget": {"name": "weight_dtype"}, "link": null}], "outputs": [{"name": "MODEL", "type": "MODEL", "links": [1]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for 
S&R": "UNETLoader", "models": [{"name": "longcat_image_bf16.safetensors", "url": "https://huggingface.co/TalmajM/LongCat-Image_ComfyUI_repackaged/resolve/main/split_files/diffusion_models/longcat_image_bf16.safetensors", "directory": "diffusion_models"}]}, "widgets_values": ["longcat_image_bf16.safetensors", "default"]}, {"id": 2, "type": "CLIPLoader", "pos": [110, 330], "size": [270, 106], "flags": {}, "order": 1, "mode": 0, "inputs": [{"name": "clip_name", "type": "COMBO", "widget": {"name": "clip_name"}, "link": 14}, {"name": "type", "type": "COMBO", "widget": {"name": "type"}, "link": null}, {"name": "device", "shape": 7, "type": "COMBO", "widget": {"name": "device"}, "link": null}], "outputs": [{"name": "CLIP", "type": "CLIP", "links": [2, 16]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "CLIPLoader", "models": [{"name": "qwen_2.5_vl_7b.safetensors", "url": "https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/resolve/main/split_files/text_encoders/qwen_2.5_vl_7b.safetensors", "directory": "text_encoders"}]}, "widgets_values": ["qwen_2.5_vl_7b.safetensors", "longcat_image", "default"]}, {"id": 3, "type": "VAELoader", "pos": [110, 480], "size": [270, 58], "flags": {}, "order": 2, "mode": 0, "inputs": [{"name": "vae_name", "type": "COMBO", "widget": {"name": "vae_name"}, "link": 15}], "outputs": [{"name": "VAE", "type": "VAE", "links": [3]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "VAELoader", "models": [{"name": "ae.safetensors", "url": "https://huggingface.co/Comfy-Org/z_image_turbo/resolve/main/split_files/vae/ae.safetensors", "directory": "vae"}]}, "widgets_values": ["ae.safetensors"]}, {"id": 4, "type": "CLIPTextEncodeLongCatImage", "pos": [430, 200], "size": [410, 250], "flags": {}, "order": 3, "mode": 0, "inputs": [{"name": "clip", "type": "CLIP", "link": 2}, {"name": "text", "type": "STRING", "widget": {"name": "text"}, "link": 10}, {"name": "guidance", "type": "FLOAT", "widget": {"name": 
"guidance"}, "link": null}], "outputs": [{"name": "CONDITIONING", "type": "CONDITIONING", "links": [4]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "CLIPTextEncodeLongCatImage"}, "widgets_values": ["", 4.0]}, {"id": 5, "type": "CLIPTextEncodeLongCatImage", "pos": [430, 510], "size": [410, 120], "flags": {}, "order": 4, "mode": 0, "inputs": [{"name": "clip", "type": "CLIP", "link": 16}, {"name": "text", "type": "STRING", "widget": {"name": "text"}, "link": null}, {"name": "guidance", "type": "FLOAT", "widget": {"name": "guidance"}, "link": null}], "outputs": [{"name": "CONDITIONING", "type": "CONDITIONING", "links": [6]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "CLIPTextEncodeLongCatImage"}, "widgets_values": ["", 4.0]}, {"id": 10, "type": "CFGRenormLongCatImage", "pos": [880, 160], "size": [280, 26], "flags": {}, "order": 5, "mode": 0, "inputs": [{"name": "model", "type": "MODEL", "link": 1}], "outputs": [{"name": "MODEL", "type": "MODEL", "links": [17]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "CFGRenormLongCatImage"}, "widgets_values": []}, {"id": 6, "type": "EmptySD3LatentImage", "pos": [110, 630], "size": [260, 106], "flags": {}, "order": 6, "mode": 0, "inputs": [{"name": "width", "type": "INT", "widget": {"name": "width"}, "link": 11}, {"name": "height", "type": "INT", "widget": {"name": "height"}, "link": 12}, {"name": "batch_size", "type": "INT", "widget": {"name": "batch_size"}, "link": null}], "outputs": [{"name": "LATENT", "type": "LATENT", "links": [7]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "EmptySD3LatentImage"}, "widgets_values": [768, 1344, 1]}, {"id": 7, "type": "KSampler", "pos": [880, 230], "size": [315, 262], "flags": {}, "order": 7, "mode": 0, "inputs": [{"name": "model", "type": "MODEL", "link": 17}, {"name": "positive", "type": "CONDITIONING", "link": 4}, {"name": "negative", "type": 
"CONDITIONING", "link": 6}, {"name": "latent_image", "type": "LATENT", "link": 7}, {"name": "seed", "type": "INT", "widget": {"name": "seed"}, "link": null}, {"name": "steps", "type": "INT", "widget": {"name": "steps"}, "link": null}, {"name": "cfg", "type": "FLOAT", "widget": {"name": "cfg"}, "link": null}, {"name": "sampler_name", "type": "COMBO", "widget": {"name": "sampler_name"}, "link": null}, {"name": "scheduler", "type": "COMBO", "widget": {"name": "scheduler"}, "link": null}, {"name": "denoise", "type": "FLOAT", "widget": {"name": "denoise"}, "link": null}], "outputs": [{"name": "LATENT", "type": "LATENT", "links": [8]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "KSampler"}, "widgets_values": [0, "randomize", 20, 4.0, "euler", "simple", 1.0]}, {"id": 8, "type": "VAEDecode", "pos": [1220, 160], "size": [210, 46], "flags": {}, "order": 8, "mode": 0, "inputs": [{"name": "samples", "type": "LATENT", "link": 8}, {"name": "vae", "type": "VAE", "link": 3}], "outputs": [{"name": "IMAGE", "type": "IMAGE", "links": [9]}], "properties": {"cnr_id": "comfy-core", "ver": "0.3.73", "Node name for S&R": "VAEDecode"}, "widgets_values": []}], "groups": [{"id": 1, "title": "Image size", "bounding": [100, 560, 290, 200], "color": "#3f789e", "font_size": 24, "flags": {}}, {"id": 2, "title": "Prompt", "bounding": [410, 130, 450, 540], "color": "#3f789e", "font_size": 24, "flags": {}}, {"id": 3, "title": "Models", "bounding": [100, 130, 290, 413], "color": "#3f789e", "font_size": 24, "flags": {}}], "links": [{"id": 1, "origin_id": 1, "origin_slot": 0, "target_id": 10, "target_slot": 0, "type": "MODEL"}, {"id": 2, "origin_id": 2, "origin_slot": 0, "target_id": 4, "target_slot": 0, "type": "CLIP"}, {"id": 3, "origin_id": 3, "origin_slot": 0, "target_id": 8, "target_slot": 1, "type": "VAE"}, {"id": 4, "origin_id": 4, "origin_slot": 0, "target_id": 7, "target_slot": 1, "type": "CONDITIONING"}, {"id": 6, "origin_id": 5, "origin_slot": 0, 
"target_id": 7, "target_slot": 2, "type": "CONDITIONING"}, {"id": 7, "origin_id": 6, "origin_slot": 0, "target_id": 7, "target_slot": 3, "type": "LATENT"}, {"id": 8, "origin_id": 7, "origin_slot": 0, "target_id": 8, "target_slot": 0, "type": "LATENT"}, {"id": 9, "origin_id": 8, "origin_slot": 0, "target_id": -20, "target_slot": 0, "type": "IMAGE"}, {"id": 10, "origin_id": -10, "origin_slot": 0, "target_id": 4, "target_slot": 1, "type": "STRING"}, {"id": 11, "origin_id": -10, "origin_slot": 1, "target_id": 6, "target_slot": 0, "type": "INT"}, {"id": 12, "origin_id": -10, "origin_slot": 2, "target_id": 6, "target_slot": 1, "type": "INT"}, {"id": 13, "origin_id": -10, "origin_slot": 3, "target_id": 1, "target_slot": 0, "type": "COMBO"}, {"id": 14, "origin_id": -10, "origin_slot": 4, "target_id": 2, "target_slot": 0, "type": "COMBO"}, {"id": 15, "origin_id": -10, "origin_slot": 5, "target_id": 3, "target_slot": 0, "type": "COMBO"}, {"id": 16, "origin_id": 2, "origin_slot": 0, "target_id": 5, "target_slot": 0, "type": "CLIP"}, {"id": 17, "origin_id": 10, "origin_slot": 0, "target_id": 7, "target_slot": 0, "type": "MODEL"}], "extra": {"workflowRendererVersion": "LG"}, "category": "Image generation and editing/Text to image"}]}, "config": {}, "extra": {"frontendVersion": "1.37.10", "workflowRendererVersion": "LG"}, "version": 0.4} diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 5a3f2e540..4206b831b 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -279,36 +279,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["txt_norm"] = any_suffix_in(state_dict_keys, key_prefix, 'txt_norm.', ["weight", "scale"]) if dit_config["yak_mlp"] and dit_config["txt_norm"]: # Ovis model dit_config["txt_ids_dims"] = [1, 2] - - return dit_config - - if '{}x_embedder.weight'.format(key_prefix) in state_dict_keys and '{}transformer_blocks.0.attn.to_q.weight'.format(key_prefix) in state_dict_keys and 
'{}single_transformer_blocks.0.attn.to_q.weight'.format(key_prefix) in state_dict_keys and '{}context_embedder.weight'.format(key_prefix) in state_dict_keys and '{}time_embed.timestep_embedder.linear_1.weight'.format(key_prefix) in state_dict_keys: #LongCat-Image (diffusers format, Flux variant) - dit_config = {} - dit_config["image_model"] = "flux" - dit_config["axes_dim"] = [16, 56, 56] - dit_config["theta"] = 10000 - dit_config["qkv_bias"] = True - dit_config["txt_ids_dims"] = [1, 2] - - w = state_dict['{}x_embedder.weight'.format(key_prefix)] - dit_config["hidden_size"] = w.shape[0] - dit_config["in_channels"] = w.shape[1] // 4 - dit_config["out_channels"] = dit_config["in_channels"] - dit_config["patch_size"] = 2 - - ctx_key = '{}context_embedder.weight'.format(key_prefix) - if ctx_key in state_dict_keys: - dit_config["context_in_dim"] = state_dict[ctx_key].shape[1] - else: - dit_config["context_in_dim"] = 3584 - - dit_config["vec_in_dim"] = None - dit_config["guidance_embed"] = False - dit_config["mlp_ratio"] = 4.0 - dit_config["num_heads"] = dit_config["hidden_size"] // sum(dit_config["axes_dim"]) - - dit_config["depth"] = count_blocks(state_dict_keys, '{}transformer_blocks.'.format(key_prefix) + '{}.') - dit_config["depth_single_blocks"] = count_blocks(state_dict_keys, '{}single_transformer_blocks.'.format(key_prefix) + '{}.') + if dit_config.get("context_in_dim") == 3584 and dit_config["vec_in_dim"] is None: # LongCat-Image + dit_config["txt_ids_dims"] = [1, 2] return dit_config diff --git a/comfy/supported_models.py b/comfy/supported_models.py index ca25ffdcf..eebbee80a 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1687,13 +1687,6 @@ class LongCatImage(supported_models_base.BASE): "txt_ids_dims": [1, 2], } - required_keys = { - "x_embedder.weight": None, - "context_embedder.weight": None, - "single_transformer_blocks.10.attn.to_q.weight": None, - "time_embed.timestep_embedder.linear_1.weight": None, - } - sampling_settings = { 
} @@ -1707,115 +1700,6 @@ class LongCatImage(supported_models_base.BASE): vae_key_prefix = ["vae."] text_encoder_key_prefix = ["text_encoders."] - def process_unet_state_dict(self, state_dict): - out_sd = {} - double_q, double_k, double_v = {}, {}, {} - double_tq, double_tk, double_tv = {}, {}, {} - single_q, single_k, single_v, single_mlp = {}, {}, {}, {} - - for k, v in state_dict.items(): - if k.startswith("transformer_blocks."): - idx = k.split(".")[1] - rest = ".".join(k.split(".")[2:]) - prefix = "double_blocks.{}.".format(idx) - - if rest.startswith("norm1.linear."): - out_sd[prefix + "img_mod.lin." + rest.split(".")[-1]] = v - elif rest.startswith("norm1_context.linear."): - out_sd[prefix + "txt_mod.lin." + rest.split(".")[-1]] = v - elif rest.startswith("attn.to_q."): - double_q[idx + "." + rest.split(".")[-1]] = v - elif rest.startswith("attn.to_k."): - double_k[idx + "." + rest.split(".")[-1]] = v - elif rest.startswith("attn.to_v."): - double_v[idx + "." + rest.split(".")[-1]] = v - elif rest == "attn.norm_q.weight": - out_sd[prefix + "img_attn.norm.query_norm.weight"] = v - elif rest == "attn.norm_k.weight": - out_sd[prefix + "img_attn.norm.key_norm.weight"] = v - elif rest.startswith("attn.to_out.0."): - out_sd[prefix + "img_attn.proj." + rest.split(".")[-1]] = v - elif rest.startswith("attn.add_q_proj."): - double_tq[idx + "." + rest.split(".")[-1]] = v - elif rest.startswith("attn.add_k_proj."): - double_tk[idx + "." + rest.split(".")[-1]] = v - elif rest.startswith("attn.add_v_proj."): - double_tv[idx + "." + rest.split(".")[-1]] = v - elif rest == "attn.norm_added_q.weight": - out_sd[prefix + "txt_attn.norm.query_norm.weight"] = v - elif rest == "attn.norm_added_k.weight": - out_sd[prefix + "txt_attn.norm.key_norm.weight"] = v - elif rest.startswith("attn.to_add_out."): - out_sd[prefix + "txt_attn.proj." + rest.split(".")[-1]] = v - elif rest.startswith("ff.net.0.proj."): - out_sd[prefix + "img_mlp.0." 
+ rest.split(".")[-1]] = v - elif rest.startswith("ff.net.2."): - out_sd[prefix + "img_mlp.2." + rest.split(".")[-1]] = v - elif rest.startswith("ff_context.net.0.proj."): - out_sd[prefix + "txt_mlp.0." + rest.split(".")[-1]] = v - elif rest.startswith("ff_context.net.2."): - out_sd[prefix + "txt_mlp.2." + rest.split(".")[-1]] = v - else: - out_sd["double_blocks.{}.{}".format(idx, rest)] = v - - elif k.startswith("single_transformer_blocks."): - idx = k.split(".")[1] - rest = ".".join(k.split(".")[2:]) - prefix = "single_blocks.{}.".format(idx) - - if rest.startswith("norm.linear."): - out_sd[prefix + "modulation.lin." + rest.split(".")[-1]] = v - elif rest.startswith("attn.to_q."): - single_q[idx + "." + rest.split(".")[-1]] = v - elif rest.startswith("attn.to_k."): - single_k[idx + "." + rest.split(".")[-1]] = v - elif rest.startswith("attn.to_v."): - single_v[idx + "." + rest.split(".")[-1]] = v - elif rest == "attn.norm_q.weight": - out_sd[prefix + "norm.query_norm.weight"] = v - elif rest == "attn.norm_k.weight": - out_sd[prefix + "norm.key_norm.weight"] = v - elif rest.startswith("proj_mlp."): - single_mlp[idx + "." + rest.split(".")[-1]] = v - elif rest.startswith("proj_out."): - out_sd[prefix + "linear2." + rest.split(".")[-1]] = v - else: - out_sd["single_blocks.{}.{}".format(idx, rest)] = v - - elif k == "x_embedder.weight" or k == "x_embedder.bias": - out_sd["img_in." + k.split(".")[-1]] = v - elif k == "context_embedder.weight" or k == "context_embedder.bias": - out_sd["txt_in." + k.split(".")[-1]] = v - elif k.startswith("time_embed.timestep_embedder.linear_1."): - out_sd["time_in.in_layer." + k.split(".")[-1]] = v - elif k.startswith("time_embed.timestep_embedder.linear_2."): - out_sd["time_in.out_layer." + k.split(".")[-1]] = v - elif k.startswith("norm_out.linear."): - # HF AdaLayerNormContinuous stores [scale | shift] but ComfyUI - # LastLayer expects [shift | scale], so swap the two halves. 
- half = v.shape[0] // 2 - v = torch.cat([v[half:], v[:half]], dim=0) - out_sd["final_layer.adaLN_modulation.1." + k.split(".")[-1]] = v - elif k == "proj_out.weight" or k == "proj_out.bias": - out_sd["final_layer.linear." + k.split(".")[-1]] = v - else: - out_sd[k] = v - - for suffix in ["weight", "bias"]: - for idx in sorted(set(x.split(".")[0] for x in double_q)): - qk = idx + "." + suffix - if qk in double_q and qk in double_k and qk in double_v: - out_sd["double_blocks.{}.img_attn.qkv.{}".format(idx, suffix)] = torch.cat([double_q[qk], double_k[qk], double_v[qk]], dim=0) - if qk in double_tq and qk in double_tk and qk in double_tv: - out_sd["double_blocks.{}.txt_attn.qkv.{}".format(idx, suffix)] = torch.cat([double_tq[qk], double_tk[qk], double_tv[qk]], dim=0) - - for idx in sorted(set(x.split(".")[0] for x in single_q)): - qk = idx + "." + suffix - if qk in single_q and qk in single_k and qk in single_v and qk in single_mlp: - out_sd["single_blocks.{}.linear1.{}".format(idx, suffix)] = torch.cat([single_q[qk], single_k[qk], single_v[qk], single_mlp[qk]], dim=0) - - return out_sd - def get_model(self, state_dict, prefix="", device=None): out = model_base.LongCatImage(self, device=device) return out diff --git a/tests-unit/comfy_test/model_detection_test.py b/tests-unit/comfy_test/model_detection_test.py index 60f247264..e28542744 100644 --- a/tests-unit/comfy_test/model_detection_test.py +++ b/tests-unit/comfy_test/model_detection_test.py @@ -5,42 +5,40 @@ from comfy.model_detection import detect_unet_config, model_config_from_unet_con import comfy.supported_models -def _make_longcat_diffusers_sd(): - """Minimal Diffusers-format state dict that triggers the LongCat-Image detection path.""" +def _make_longcat_comfyui_sd(): + """Minimal ComfyUI-format state dict for pre-converted LongCat-Image weights.""" sd = {} - H = 3072 # hidden_size (matches real LongCat-Image) + H = 3072 C_IN = 16 - C_CTX = 3584 # context_in_dim that distinguishes LongCat from standard 
Flux (4096) + C_CTX = 3584 - sd["x_embedder.weight"] = torch.empty(H, C_IN * 4) - sd["x_embedder.bias"] = torch.empty(H) - sd["context_embedder.weight"] = torch.empty(H, C_CTX) - sd["context_embedder.bias"] = torch.empty(H) + sd["img_in.weight"] = torch.empty(H, C_IN * 4) + sd["img_in.bias"] = torch.empty(H) + sd["txt_in.weight"] = torch.empty(H, C_CTX) + sd["txt_in.bias"] = torch.empty(H) - sd["time_embed.timestep_embedder.linear_1.weight"] = torch.empty(H, 256) - sd["time_embed.timestep_embedder.linear_1.bias"] = torch.empty(H) - sd["time_embed.timestep_embedder.linear_2.weight"] = torch.empty(H, H) - sd["time_embed.timestep_embedder.linear_2.bias"] = torch.empty(H) + sd["time_in.in_layer.weight"] = torch.empty(H, 256) + sd["time_in.in_layer.bias"] = torch.empty(H) + sd["time_in.out_layer.weight"] = torch.empty(H, H) + sd["time_in.out_layer.bias"] = torch.empty(H) - sd["norm_out.linear.weight"] = torch.empty(2 * H, H) - sd["norm_out.linear.bias"] = torch.empty(2 * H) - sd["proj_out.weight"] = torch.empty(C_IN * 4, H) - sd["proj_out.bias"] = torch.empty(C_IN * 4) + sd["final_layer.adaLN_modulation.1.weight"] = torch.empty(2 * H, H) + sd["final_layer.adaLN_modulation.1.bias"] = torch.empty(2 * H) + sd["final_layer.linear.weight"] = torch.empty(C_IN * 4, H) + sd["final_layer.linear.bias"] = torch.empty(C_IN * 4) - # Need enough transformer_blocks and single_transformer_blocks for count_blocks - # and for the required_keys check (single_transformer_blocks.10.*) for i in range(19): - sd[f"transformer_blocks.{i}.attn.to_q.weight"] = torch.empty(H, H) - sd[f"transformer_blocks.{i}.norm1.linear.weight"] = torch.empty(H) + sd[f"double_blocks.{i}.img_attn.norm.key_norm.weight"] = torch.empty(128) + sd[f"double_blocks.{i}.img_attn.qkv.weight"] = torch.empty(3 * H, H) + sd[f"double_blocks.{i}.img_mod.lin.weight"] = torch.empty(H, H) for i in range(38): - sd[f"single_transformer_blocks.{i}.attn.to_q.weight"] = torch.empty(H, H) - 
sd[f"single_transformer_blocks.{i}.norm.linear.weight"] = torch.empty(H) + sd[f"single_blocks.{i}.modulation.lin.weight"] = torch.empty(H, H) return sd def _make_flux_schnell_comfyui_sd(): - """Minimal ComfyUI-format state dict that triggers the standard Flux detection path.""" + """Minimal ComfyUI-format state dict for standard Flux Schnell.""" sd = {} H = 3072 C_IN = 16 @@ -67,18 +65,16 @@ class TestModelDetectionSpecificity: def test_longcat_wins_regardless_of_list_order(self): """Specificity logic must pick LongCatImage even when FluxSchnell appears first.""" - sd = _make_longcat_diffusers_sd() + sd = _make_longcat_comfyui_sd() unet_config = detect_unet_config(sd, "") longcat_cls = comfy.supported_models.LongCatImage schnell_cls = comfy.supported_models.FluxSchnell - # Order A: FluxSchnell before LongCatImage - order_a = [schnell_cls, longcat_cls] - # Order B: LongCatImage before FluxSchnell - order_b = [longcat_cls, schnell_cls] - - for label, order in [("schnell-first", order_a), ("longcat-first", order_b)]: + for label, order in [ + ("schnell-first", [schnell_cls, longcat_cls]), + ("longcat-first", [longcat_cls, schnell_cls]), + ]: with patch.object(comfy.supported_models, "models", order): result = model_config_from_unet_config(unet_config, sd) assert result is not None, f"No match with order {label}" @@ -86,29 +82,31 @@ class TestModelDetectionSpecificity: f"Expected LongCatImage with order {label}, got {type(result).__name__}" ) - def test_longcat_diffusers_detected_as_longcat(self): - sd = _make_longcat_diffusers_sd() + def test_longcat_comfyui_detected_as_longcat(self): + sd = _make_longcat_comfyui_sd() unet_config = detect_unet_config(sd, "") assert unet_config is not None assert unet_config["image_model"] == "flux" assert unet_config["context_in_dim"] == 3584 + assert unet_config["vec_in_dim"] is None + assert unet_config["guidance_embed"] is False assert unet_config["txt_ids_dims"] == [1, 2] model_config = model_config_from_unet_config(unet_config, 
sd) assert model_config is not None assert type(model_config).__name__ == "LongCatImage" - def test_longcat_process_unet_state_dict_converts_keys(self): - sd = _make_longcat_diffusers_sd() + def test_longcat_comfyui_keys_pass_through_unchanged(self): + """Pre-converted weights should not be transformed by process_unet_state_dict.""" + sd = _make_longcat_comfyui_sd() unet_config = detect_unet_config(sd, "") model_config = model_config_from_unet_config(unet_config, sd) - converted = model_config.process_unet_state_dict(dict(sd)) - assert "img_in.weight" in converted - assert "img_in.bias" in converted - assert "txt_in.weight" in converted - assert "x_embedder.weight" not in converted - assert "context_embedder.weight" not in converted + processed = model_config.process_unet_state_dict(dict(sd)) + assert "img_in.weight" in processed + assert "txt_in.weight" in processed + assert "time_in.in_layer.weight" in processed + assert "final_layer.linear.weight" in processed def test_flux_schnell_comfyui_detected_as_flux_schnell(self): sd = _make_flux_schnell_comfyui_sd()