diff --git a/comfy/ldm/ideogram4/model.py b/comfy/ldm/ideogram4/model.py index 3b02a243a..b86c65bf0 100644 --- a/comfy/ldm/ideogram4/model.py +++ b/comfy/ldm/ideogram4/model.py @@ -174,7 +174,7 @@ class Ideogram4Transformer(nn.Module): llm = self.llm_cond_proj(llm) * text_mask h[:, :L_text] = h[:, :L_text] + llm - h = h + self.embed_image_indicator((indicator == OUTPUT_IMAGE_INDICATOR).to(torch.long)) + h = h + self.embed_image_indicator((indicator == OUTPUT_IMAGE_INDICATOR).to(torch.long), out_dtype=h.dtype) # Qwen3-VL interleaved MRoPE; position_ids (B, L, 3) -> (3, L) (same across batch). freqs_cis = precompute_freqs_cis( @@ -235,7 +235,7 @@ class Ideogram4Transformer2DModel(Ideogram4Transformer): def _run_conditional(self, x_chunk, context_chunk, attn_mask_chunk, t_chunk, gh, gw, transformer_options): B = x_chunk.shape[0] device = x_chunk.device - img_tokens = self._img_to_tokens(x_chunk).to(self.dtype) + img_tokens = self._img_to_tokens(x_chunk) L_img = img_tokens.shape[1] L_text = context_chunk.shape[1] L = L_text + L_img @@ -268,7 +268,7 @@ class Ideogram4Transformer2DModel(Ideogram4Transformer): def _run_image_only(self, x_chunk, t_chunk, gh, gw, transformer_options): B = x_chunk.shape[0] device = x_chunk.device - img_tokens = self._img_to_tokens(x_chunk).to(self.dtype) + img_tokens = self._img_to_tokens(x_chunk) L_img = img_tokens.shape[1] position_ids = self._image_position_ids(gh, gw, device).unsqueeze(0).expand(B, L_img, 3)