Merge branch 'master' into dr-support-pip-cm

2025-12-19 11:03:00 +08:00 · 2025-09-23 07:28:52 +09:00 · 2025-09-23 07:28:52 +09:00 · 74c1a58566
commit 74c1a58566
parent 316aa125c9 707b2638ec
5 changed files with 179 additions and 10 deletions
--- a/comfy/text_encoders/llama.py
+++ b/comfy/text_encoders/llama.py
@@ -400,21 +400,25 @@ class Qwen25_7BVLI(BaseLlama, torch.nn.Module):

    def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, embeds_info=[]):
        grid = None
+        position_ids = None
+        offset = 0
        for e in embeds_info:
            if e.get("type") == "image":
                grid = e.get("extra", None)
-                position_ids = torch.zeros((3, embeds.shape[1]), device=embeds.device)
                start = e.get("index")
-                position_ids[:, :start] = torch.arange(0, start, device=embeds.device)
+                if position_ids is None:
+                    position_ids = torch.zeros((3, embeds.shape[1]), device=embeds.device)
+                    position_ids[:, :start] = torch.arange(0, start, device=embeds.device)
                end = e.get("size") + start
                len_max = int(grid.max()) // 2
                start_next = len_max + start
-                position_ids[:, end:] = torch.arange(start_next, start_next + (embeds.shape[1] - end), device=embeds.device)
-                position_ids[0, start:end] = start
+                position_ids[:, end:] = torch.arange(start_next + offset, start_next + (embeds.shape[1] - end) + offset, device=embeds.device)
+                position_ids[0, start:end] = start + offset
                max_d = int(grid[0][1]) // 2
-                position_ids[1, start:end] = torch.arange(start, start + max_d, device=embeds.device).unsqueeze(1).repeat(1, math.ceil((end - start) / max_d)).flatten(0)[:end - start]
+                position_ids[1, start:end] = torch.arange(start + offset, start + max_d + offset, device=embeds.device).unsqueeze(1).repeat(1, math.ceil((end - start) / max_d)).flatten(0)[:end - start]
                max_d = int(grid[0][2]) // 2
-                position_ids[2, start:end] = torch.arange(start, start + max_d, device=embeds.device).unsqueeze(0).repeat(math.ceil((end - start) / max_d), 1).flatten(0)[:end - start]
+                position_ids[2, start:end] = torch.arange(start + offset, start + max_d + offset, device=embeds.device).unsqueeze(0).repeat(math.ceil((end - start) / max_d), 1).flatten(0)[:end - start]
+                offset += len_max - (end - start)

        if grid is None:
            position_ids = None
--- a/comfy_extras/nodes_qwen.py
+++ b/comfy_extras/nodes_qwen.py
@@ -43,6 +43,61 @@ class TextEncodeQwenImageEdit:
        return (conditioning, )


+class TextEncodeQwenImageEditPlus:
+    @classmethod
+    def INPUT_TYPES(s):
+        return {"required": {
+            "clip": ("CLIP", ),
+            "prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}),
+            },
+            "optional": {"vae": ("VAE", ),
+                         "image1": ("IMAGE", ),
+                         "image2": ("IMAGE", ),
+                         "image3": ("IMAGE", ),
+                         }}
+
+    RETURN_TYPES = ("CONDITIONING",)
+    FUNCTION = "encode"
+
+    CATEGORY = "advanced/conditioning"
+
+    def encode(self, clip, prompt, vae=None, image1=None, image2=None, image3=None):
+        ref_latents = []
+        images = [image1, image2, image3]
+        images_vl = []
+        llama_template = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
+        image_prompt = ""
+
+        for i, image in enumerate(images):
+            if image is not None:
+                samples = image.movedim(-1, 1)
+                total = int(384 * 384)
+
+                scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2]))
+                width = round(samples.shape[3] * scale_by)
+                height = round(samples.shape[2] * scale_by)
+
+                s = comfy.utils.common_upscale(samples, width, height, "area", "disabled")
+                images_vl.append(s.movedim(1, -1))
+                if vae is not None:
+                    total = int(1024 * 1024)
+                    scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2]))
+                    width = round(samples.shape[3] * scale_by / 8.0) * 8
+                    height = round(samples.shape[2] * scale_by / 8.0) * 8
+
+                    s = comfy.utils.common_upscale(samples, width, height, "area", "disabled")
+                    ref_latents.append(vae.encode(s.movedim(1, -1)[:, :, :, :3]))
+
+                image_prompt += "Picture {}: <|vision_start|><|image_pad|><|vision_end|>".format(i + 1)
+
+        tokens = clip.tokenize(image_prompt + prompt, images=images_vl, llama_template=llama_template)
+        conditioning = clip.encode_from_tokens_scheduled(tokens)
+        if len(ref_latents) > 0:
+            conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_latents": ref_latents}, append=True)
+        return (conditioning, )
+
+
 NODE_CLASS_MAPPINGS = {
    "TextEncodeQwenImageEdit": TextEncodeQwenImageEdit,
+    "TextEncodeQwenImageEditPlus": TextEncodeQwenImageEditPlus,
 }
--- a/comfy_extras/nodes_wan.py
+++ b/comfy_extras/nodes_wan.py
@@ -1210,7 +1210,7 @@ class WanAnimateToVideo(io.ComfyNode):
                background_video = background_video[video_frame_offset:]
                background_video = comfy.utils.common_upscale(background_video[:length].movedim(-1, 1), width, height, "area", "center").movedim(1, -1)
                if background_video.shape[0] > ref_images_num:
-                    image[ref_images_num:background_video.shape[0] - ref_images_num] = background_video[ref_images_num:]
+                    image[ref_images_num:background_video.shape[0]] = background_video[ref_images_num:]

        mask_refmotion = torch.ones((1, 1, latent_length * 4, concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=mask.device, dtype=mask.dtype)
        if continue_motion is not None:
@@ -1229,7 +1229,7 @@ class WanAnimateToVideo(io.ComfyNode):
                    character_mask = character_mask.unsqueeze(1)
                character_mask = comfy.utils.common_upscale(character_mask[:, :, :length], concat_latent_image.shape[-1], concat_latent_image.shape[-2], "nearest-exact", "center")
                if character_mask.shape[2] > ref_images_num:
-                    mask_refmotion[:, :, ref_images_num:character_mask.shape[2] + ref_images_num] = character_mask[:, :, ref_images_num:]
+                    mask_refmotion[:, :, ref_images_num:character_mask.shape[2]] = character_mask[:, :, ref_images_num:]

        concat_latent_image = torch.cat((concat_latent_image, vae.encode(image[:, :, :, :3])), dim=2)

--- a/server.py
+++ b/server.py
@@ -651,7 +651,14 @@ class PromptServer():
            max_items = request.rel_url.query.get("max_items", None)
            if max_items is not None:
                max_items = int(max_items)
-            return web.json_response(self.prompt_queue.get_history(max_items=max_items))
+
+            offset = request.rel_url.query.get("offset", None)
+            if offset is not None:
+                offset = int(offset)
+            else:
+                offset = -1
+
+            return web.json_response(self.prompt_queue.get_history(max_items=max_items, offset=offset))

        @routes.get("/history/{prompt_id}")
        async def get_history_prompt_id(request):
--- a/tests/execution/test_execution.py
+++ b/tests/execution/test_execution.py
@@ -84,6 +84,21 @@ class ComfyClient:
        with urllib.request.urlopen("http://{}/history/{}".format(self.server_address, prompt_id)) as response:
            return json.loads(response.read())

+    def get_all_history(self, max_items=None, offset=None):
+        url = "http://{}/history".format(self.server_address)
+        params = {}
+        if max_items is not None:
+            params["max_items"] = max_items
+        if offset is not None:
+            params["offset"] = offset
+
+        if params:
+            url_values = urllib.parse.urlencode(params)
+            url = "{}?{}".format(url, url_values)
+
+        with urllib.request.urlopen(url) as response:
+            return json.loads(response.read())
+
    def set_test_name(self, name):
        self.test_name = name

@@ -498,7 +513,6 @@ class TestExecution:
        assert len(images1) == 1, "Should have 1 image"
        assert len(images2) == 1, "Should have 1 image"

-
    # This tests that only constant outputs are used in the call to `IS_CHANGED`
    def test_is_changed_with_outputs(self, client: ComfyClient, builder: GraphBuilder):
        g = builder
@@ -762,3 +776,92 @@ class TestExecution:
        except urllib.error.HTTPError:
            pass  # Expected behavior

+    def _create_history_item(self, client, builder):
+        g = GraphBuilder(prefix="offset_test")
+        input_node = g.node(
+            "StubImage", content="BLACK", height=32, width=32, batch_size=1
+        )
+        g.node("SaveImage", images=input_node.out(0))
+        return client.run(g)
+
+    def test_offset_returns_different_items_than_beginning_of_history(
+        self, client: ComfyClient, builder: GraphBuilder
+    ):
+        """Test that offset skips items at the beginning"""
+        for _ in range(5):
+            self._create_history_item(client, builder)
+
+        first_two = client.get_all_history(max_items=2, offset=0)
+        next_two = client.get_all_history(max_items=2, offset=2)
+
+        assert set(first_two.keys()).isdisjoint(
+            set(next_two.keys())
+        ), "Offset should skip initial items"
+
+    def test_offset_beyond_history_length_returns_empty(
+        self, client: ComfyClient, builder: GraphBuilder
+    ):
+        """Test offset larger than total history returns empty result"""
+        self._create_history_item(client, builder)
+
+        result = client.get_all_history(offset=100)
+        assert len(result) == 0, "Large offset should return no items"
+
+    def test_offset_at_exact_history_length_returns_empty(
+        self, client: ComfyClient, builder: GraphBuilder
+    ):
+        """Test offset equal to history length returns empty"""
+        for _ in range(3):
+            self._create_history_item(client, builder)
+
+        all_history = client.get_all_history()
+        result = client.get_all_history(offset=len(all_history))
+        assert len(result) == 0, "Offset at history length should return empty"
+
+    def test_offset_zero_equals_no_offset_parameter(
+        self, client: ComfyClient, builder: GraphBuilder
+    ):
+        """Test offset=0 behaves same as omitting offset"""
+        self._create_history_item(client, builder)
+
+        with_zero = client.get_all_history(offset=0)
+        without_offset = client.get_all_history()
+
+        assert with_zero == without_offset, "offset=0 should equal no offset"
+
+    def test_offset_without_max_items_skips_from_beginning(
+        self, client: ComfyClient, builder: GraphBuilder
+    ):
+        """Test offset alone (no max_items) returns remaining items"""
+        for _ in range(4):
+            self._create_history_item(client, builder)
+
+        all_items = client.get_all_history()
+        offset_items = client.get_all_history(offset=2)
+
+        assert (
+            len(offset_items) == len(all_items) - 2
+        ), "Offset should skip specified number of items"
+
+    def test_offset_with_max_items_returns_correct_window(
+        self, client: ComfyClient, builder: GraphBuilder
+    ):
+        """Test offset + max_items returns correct slice of history"""
+        for _ in range(6):
+            self._create_history_item(client, builder)
+
+        window = client.get_all_history(max_items=2, offset=1)
+        assert len(window) <= 2, "Should respect max_items limit"
+
+    def test_offset_near_end_returns_remaining_items_only(
+        self, client: ComfyClient, builder: GraphBuilder
+    ):
+        """Test offset near end of history returns only remaining items"""
+        for _ in range(3):
+            self._create_history_item(client, builder)
+
+        all_history = client.get_all_history()
+        # Offset to near the end
+        result = client.get_all_history(max_items=5, offset=len(all_history) - 1)
+
+        assert len(result) <= 1, "Should return at most 1 item when offset is near end"