diff --git a/comfy/k_diffusion/sampling.py b/comfy/k_diffusion/sampling.py
index cd8337a7e..7646b8b68 100644
--- a/comfy/k_diffusion/sampling.py
+++ b/comfy/k_diffusion/sampling.py
@@ -2016,12 +2016,16 @@ def sample_cube(model, x, sigmas, extra_args=None, callback=None, disable=None,
         bbox = torch.zeros((c.shape[0], 3), device=device, dtype=c.dtype)
         return torch.cat([c, cube.bbox_proj(bbox).unsqueeze(1)], dim=1)
 
-    with torch.autocast(device_type=device.type, dtype=torch.bfloat16, enabled=autocast_enabled):
-        cond = add_bbox(cube.encode_text(pos.to(device=device, dtype=weight_dtype)))
-        if use_cfg:
-            ucond = add_bbox(cube.encode_text(neg.to(device=device, dtype=weight_dtype)))
-            cond = torch.cat([cond, ucond], dim=0)
+    # Conditioning (text_proj + bbox_proj) is computed in the model's weight dtype
+    # OUTSIDE the bf16 autocast block, matching upstream cube's Engine.prepare_inputs
+    # (run_clip/encode_text run in full precision). The autocast only covers the
+    # autoregressive transformer forward, exactly like Engine.run_gpt.
+    cond = add_bbox(cube.encode_text(pos.to(device=device, dtype=weight_dtype)))
+    if use_cfg:
+        ucond = add_bbox(cube.encode_text(neg.to(device=device, dtype=weight_dtype)))
+        cond = torch.cat([cond, ucond], dim=0)
 
+    with torch.autocast(device_type=device.type, dtype=torch.bfloat16, enabled=autocast_enabled):
         bos = torch.full((cond.shape[0], 1), cube.shape_bos_id, dtype=torch.long, device=device)
         embed = cube.encode_token(bos)
         Bp, input_seq_len, dim = embed.shape
diff --git a/comfy/sd.py b/comfy/sd.py
index 54ab5570d..dcafb87b9 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -783,6 +783,10 @@ class VAE:
             elif "bottleneck.block.codebook.weight" in sd:
                 self.cube3d = True
                 self.latent_dim = 1
+                # VAEDecodeCube calls first_stage_model.decode_indices/extract_geometry
+                # directly (not through the patcher-managed forward), so the weights must
+                # be fully resident on-device. Disable dynamic streaming offload.
+                self.disable_offload = True
                 embed_dim = sd["bottleneck.block.codebook.weight"].shape[1]
                 num_codes = sd["bottleneck.block.codebook.weight"].shape[0]
                 width = sd["bottleneck.block.c_out.weight"].shape[0]
diff --git a/comfy_extras/nodes_cube.py b/comfy_extras/nodes_cube.py
index 625f09e13..919810545 100644
--- a/comfy_extras/nodes_cube.py
+++ b/comfy_extras/nodes_cube.py
@@ -38,7 +38,10 @@ class EmptyCubeLatent(IO.ComfyNode):
 
     @classmethod
     def execute(cls, num_tokens, batch_size) -> IO.NodeOutput:
-        latent = torch.zeros([batch_size, num_tokens], device=comfy.model_management.intermediate_device())
+        # Trailing singleton dim keeps this a 3D latent so it flows through ComfyUI's
+        # conds/noise pipeline (encode_model_conds reads noise.shape[2]); the sampler
+        # only uses dim 1 (num_tokens).
+        latent = torch.zeros([batch_size, num_tokens, 1], device=comfy.model_management.intermediate_device())
         return IO.NodeOutput({"samples": latent, "type": "cube_tokens"})
 
 
@@ -121,7 +124,8 @@ class VAEDecodeCube(IO.ComfyNode):
     def execute(cls, vae, samples, resolution_base, chunk_size) -> IO.NodeOutput:
         comfy.model_management.load_models_gpu([vae.patcher])
         tok = vae.first_stage_model
-        ids = samples["samples"][:, :tok.cfg_num_encoder_latents].long()
+        ids = samples["samples"]
+        ids = ids.reshape(ids.shape[0], -1)[:, :tok.cfg_num_encoder_latents].long()
         ids = ids.clamp(0, tok.cfg_num_codes - 1).to(vae.device)
 
         latents = tok.decode_indices(ids)
diff --git a/requirements.txt b/requirements.txt
index a49d968af..5c0f739cc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -31,6 +31,7 @@ blake3
 #non essential dependencies:
 kornia>=0.7.1
 spandrel
+scikit-image  # marching cubes for Cube3D (VAEDecodeCube)
 pydantic~=2.0
 pydantic-settings~=2.0
 PyOpenGL