mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-06-23 16:29:25 +08:00
Cube3D: document convention deviations + drop unused VAE flag (review aid)
- Remove the unused self.cube3d VAE flag (set but never read).
- Comment why VAE working_dtypes is fp32-only (VQ lookup + occupancy query parity), unlike most VAEs that allow fp16/bf16.
- Comment why Cube3D.clip_target() returns None (GPT-only checkpoint; graph wires a standard CLIPLoader/CLIPTextEncode).
- Note rope_theta=10000 is upstream's fixed constant, not in the state dict.

No behaviour change; comments/cleanup only.

Amp-Thread-ID: https://ampcode.com/threads/T-019ec361-addb-70d8-a74b-438ce8a1e096
Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
parent
029b782936
commit
e7f99168ae
@@ -668,7 +668,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
||||
dit_config["text_model_embed_dim"] = state_dict['{}text_proj.weight'.format(key_prefix)].shape[1]
|
||||
dit_config["use_bbox"] = '{}bbox_proj.weight'.format(key_prefix) in state_dict_keys
|
||||
dit_config["bias"] = '{}text_proj.bias'.format(key_prefix) in state_dict_keys
|
||||
dit_config["rope_theta"] = 10000
|
||||
dit_config["rope_theta"] = 10000 # not stored in the state dict; upstream's fixed constant
|
||||
return dit_config
|
||||
|
||||
if '{}latent_in.weight'.format(key_prefix) in state_dict_keys: # Hunyuan 3D
|
||||
|
||||
@@ -490,7 +490,6 @@ class VAE:
|
||||
self.disable_offload = False
|
||||
self.not_video = False
|
||||
self.size = None
|
||||
self.cube3d = False
|
||||
|
||||
self.downscale_index_formula = None
|
||||
self.upscale_index_formula = None
|
||||
@@ -781,7 +780,6 @@ class VAE:
|
||||
|
||||
# Roblox Cube3D shape tokenizer (OneDAutoEncoder, decode-only)
|
||||
elif "bottleneck.block.codebook.weight" in sd:
|
||||
self.cube3d = True
|
||||
self.latent_dim = 1
|
||||
# The VQ bottleneck (get_codebook/lookup_codebook) reads raw parameters
|
||||
# outside any hooked forward, so the streaming-offload cast hooks can't
|
||||
@@ -809,6 +807,9 @@ class VAE:
|
||||
self.process_input = lambda image: image
|
||||
# shape is the token-ID latent (B, 1, num_tokens); size by num_tokens.
|
||||
self.memory_used_decode = lambda shape, dtype: (1000 * shape[-1] * 768) * model_management.dtype_size(dtype)
|
||||
# fp32-only (unlike most VAEs that allow fp16/bf16): the VQ codebook lookup
|
||||
# and occupancy-grid query must run in fp32 to match upstream and keep the
|
||||
# isosurface stable.
|
||||
self.working_dtypes = [torch.float32]
|
||||
|
||||
elif "vocoder.backbone.channel_layers.0.0.bias" in sd: #Ace Step Audio
|
||||
|
||||
@@ -1572,6 +1572,8 @@ class Cube3D(supported_models_base.BASE):
|
||||
return model_base.Cube3D(self, device=device)
|
||||
|
||||
def clip_target(self, state_dict={}):
|
||||
# No bundled text encoder: the cube checkpoint is GPT-only. The graph wires a
|
||||
# standard CLIPLoader(clip-l)/CLIPTextEncode, so there is no clip_target to build.
|
||||
return None
|
||||
|
||||
class TripoSplat(supported_models_base.BASE):
|
||||
|
||||
Loading…
Reference in New Issue
Block a user