From e7f99168ae240547f7261018f0c22185027784c8 Mon Sep 17 00:00:00 2001
From: Jedrzej Kosinski <kosinkadink1@gmail.com>
Date: Sun, 14 Jun 2026 23:58:14 -0700
Subject: [PATCH] Cube3D: document convention deviations + drop unused VAE flag
 (review aid)

- Remove the unused self.cube3d VAE flag (set but never read).
- Comment why the Cube3D VAE's working_dtypes is fp32-only (VQ lookup +
  occupancy query parity with upstream), unlike most VAEs that allow fp16/bf16.
- Comment why Cube3D.clip_target() returns None (GPT-only checkpoint; graph
  wires a standard CLIPLoader/CLIPTextEncode).
- Note rope_theta=10000 is upstream's fixed constant, not in the state dict.

No behaviour change: only comments are added and a dead attribute is removed.

Amp-Thread-ID: https://ampcode.com/threads/T-019ec361-addb-70d8-a74b-438ce8a1e096
Co-authored-by: Amp <amp@ampcode.com>
---
 comfy/model_detection.py  | 2 +-
 comfy/sd.py               | 5 +++--
 comfy/supported_models.py | 2 ++
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/comfy/model_detection.py b/comfy/model_detection.py
index 0f7750f29..a9c359226 100644
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@@ -668,7 +668,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
         dit_config["text_model_embed_dim"] = state_dict['{}text_proj.weight'.format(key_prefix)].shape[1]
         dit_config["use_bbox"] = '{}bbox_proj.weight'.format(key_prefix) in state_dict_keys
         dit_config["bias"] = '{}text_proj.bias'.format(key_prefix) in state_dict_keys
-        dit_config["rope_theta"] = 10000
+        dit_config["rope_theta"] = 10000  # not stored in the state dict; upstream's fixed constant
         return dit_config
 
     if '{}latent_in.weight'.format(key_prefix) in state_dict_keys:  # Hunyuan 3D
diff --git a/comfy/sd.py b/comfy/sd.py
index 74e388553..4ab0d34bd 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -490,7 +490,6 @@ class VAE:
         self.disable_offload = False
         self.not_video = False
         self.size = None
-        self.cube3d = False
 
         self.downscale_index_formula = None
         self.upscale_index_formula = None
@@ -781,7 +780,6 @@ class VAE:
 
             # Roblox Cube3D shape tokenizer (OneDAutoEncoder, decode-only)
             elif "bottleneck.block.codebook.weight" in sd:
-                self.cube3d = True
                 self.latent_dim = 1
                 # The VQ bottleneck (get_codebook/lookup_codebook) reads raw parameters
                 # outside any hooked forward, so the streaming-offload cast hooks can't
@@ -809,6 +807,9 @@ class VAE:
                 self.process_input = lambda image: image
                 # shape is the token-ID latent (B, 1, num_tokens); size by num_tokens.
                 self.memory_used_decode = lambda shape, dtype: (1000 * shape[-1] * 768) * model_management.dtype_size(dtype)
+                # fp32-only (unlike most VAEs that allow fp16/bf16): the VQ codebook lookup
+                # and occupancy-grid query must run in fp32 to match upstream and keep the
+                # isosurface stable.
                 self.working_dtypes = [torch.float32]
 
             elif "vocoder.backbone.channel_layers.0.0.bias" in sd: #Ace Step Audio
diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index 2c9d86328..4cc27de25 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -1572,6 +1572,8 @@ class Cube3D(supported_models_base.BASE):
         return model_base.Cube3D(self, device=device)
 
     def clip_target(self, state_dict={}):
+        # No bundled text encoder: the cube checkpoint is GPT-only. The graph wires a
+        # standard CLIPLoader(clip-l)/CLIPTextEncode, so there is no clip_target to build.
         return None
 
 class TripoSplat(supported_models_base.BASE):