From 34751fe9f9ade0c715768202c19211dc0c72e760 Mon Sep 17 00:00:00 2001
From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com>
Date: Wed, 7 Jan 2026 16:12:15 -0800
Subject: [PATCH] Lower ltxv text encoder vram use. (#11713)

---
 comfy/text_encoders/lt.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/comfy/text_encoders/lt.py b/comfy/text_encoders/lt.py
index 130ebaeae..dc0694e0e 100644
--- a/comfy/text_encoders/lt.py
+++ b/comfy/text_encoders/lt.py
@@ -98,10 +98,13 @@ class LTXAVTEModel(torch.nn.Module):
         out, pooled, extra = self.gemma3_12b.encode_token_weights(token_weight_pairs)
         out_device = out.device
+        if comfy.model_management.should_use_bf16(self.execution_device):
+            out = out.to(device=self.execution_device, dtype=torch.bfloat16)
         out = out.movedim(1, -1).to(self.execution_device)
         out = 8.0 * (out - out.mean(dim=(1, 2), keepdim=True)) / (out.amax(dim=(1, 2), keepdim=True) - out.amin(dim=(1, 2), keepdim=True) + 1e-6)
         out = out.reshape((out.shape[0], out.shape[1], -1))
         out = self.text_embedding_projection(out)
+        out = out.float()
         out_vid = self.video_embeddings_connector(out)[0]
         out_audio = self.audio_embeddings_connector(out)[0]
         out = torch.concat((out_vid, out_audio), dim=-1)
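
The added lines cast the Gemma-3 encoder output to bfloat16 on the execution device when that device supports it, so the large intermediate tensors created by the range normalization and text_embedding_projection carry roughly half the VRAM footprint, and the result is cast back to float32 before the video/audio connector modules. A minimal standalone sketch of that pattern follows; it is an illustration under assumptions, not ComfyUI code: supports_bf16 is a hypothetical stand-in for comfy.model_management.should_use_bf16, the tensor shape is made up, and the projection and connector modules are omitted.

    import torch

    def supports_bf16(device: torch.device) -> bool:
        # Hypothetical stand-in for comfy.model_management.should_use_bf16:
        # only report bf16 support on CUDA devices that expose it.
        return device.type == "cuda" and torch.cuda.is_bf16_supported()

    def normalize_low_vram(out: torch.Tensor, device: torch.device) -> torch.Tensor:
        if supports_bf16(device):
            # Cast during the transfer so the intermediates below
            # (mean/amax/amin and the normalized tensor) use half the memory.
            out = out.to(device=device, dtype=torch.bfloat16)
        out = out.movedim(1, -1).to(device)
        out = 8.0 * (out - out.mean(dim=(1, 2), keepdim=True)) / (
            out.amax(dim=(1, 2), keepdim=True) - out.amin(dim=(1, 2), keepdim=True) + 1e-6)
        out = out.reshape((out.shape[0], out.shape[1], -1))
        # Upcast before handing the result to downstream float32 modules,
        # mirroring the `out = out.float()` added after the projection in the patch.
        return out.float()

    if __name__ == "__main__":
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        x = torch.randn(1, 256, 32, 8)  # made-up (batch, tokens, dim_a, dim_b) layout
        y = normalize_low_vram(x, device)
        print(y.dtype, y.shape)  # torch.float32 torch.Size([1, 32, 2048])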