diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 0f7750f29..a9c359226 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -668,7 +668,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["text_model_embed_dim"] = state_dict['{}text_proj.weight'.format(key_prefix)].shape[1] dit_config["use_bbox"] = '{}bbox_proj.weight'.format(key_prefix) in state_dict_keys dit_config["bias"] = '{}text_proj.bias'.format(key_prefix) in state_dict_keys - dit_config["rope_theta"] = 10000 + dit_config["rope_theta"] = 10000 # not stored in the state dict; upstream's fixed constant return dit_config if '{}latent_in.weight'.format(key_prefix) in state_dict_keys: # Hunyuan 3D diff --git a/comfy/sd.py b/comfy/sd.py index 74e388553..4ab0d34bd 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -490,7 +490,6 @@ class VAE: self.disable_offload = False self.not_video = False self.size = None - self.cube3d = False self.downscale_index_formula = None self.upscale_index_formula = None @@ -781,7 +780,6 @@ class VAE: # Roblox Cube3D shape tokenizer (OneDAutoEncoder, decode-only) elif "bottleneck.block.codebook.weight" in sd: - self.cube3d = True self.latent_dim = 1 # The VQ bottleneck (get_codebook/lookup_codebook) reads raw parameters # outside any hooked forward, so the streaming-offload cast hooks can't @@ -809,6 +807,9 @@ class VAE: self.process_input = lambda image: image # shape is the token-ID latent (B, 1, num_tokens); size by num_tokens. self.memory_used_decode = lambda shape, dtype: (1000 * shape[-1] * 768) * model_management.dtype_size(dtype) + # fp32-only (unlike most VAEs that allow fp16/bf16): the VQ codebook lookup + # and occupancy-grid query must run in fp32 to match upstream and keep the + # isosurface stable. self.working_dtypes = [torch.float32] elif "vocoder.backbone.channel_layers.0.0.bias" in sd: #Ace Step Audio diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 2c9d86328..4cc27de25 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1572,6 +1572,8 @@ class Cube3D(supported_models_base.BASE): return model_base.Cube3D(self, device=device) def clip_target(self, state_dict={}): + # No bundled text encoder: the cube checkpoint is GPT-only. The graph wires a + # standard CLIPLoader(clip-l)/CLIPTextEncode, so there is no clip_target to build. return None class TripoSplat(supported_models_base.BASE):