Cube3D: use channels-first 1D latent (B,1,L) like Hunyuan3Dv2

Replaces the old (B, num_tokens, 1) latent, whose trailing dim was a
dummy, with a channels-first 1D latent (B, 1, num_tokens) and a dedicated
latent_formats.Cube3D (latent_channels=1, latent_dimensions=1). This
mirrors the (B, C, L) convention of the existing native 3D model
Hunyuan3Dv2 and stops fix_empty_latent_channels from truncating the
token sequence: it narrows dim=1 to latent_channels for empty latents,
and the old layout kept the tokens in dim=1. Requires no core sampler
changes: encode_model_conds sees a valid noise.shape[2].

- latent_formats.Cube3D added; wired into supported_models.Cube3D
- EmptyCubeLatent emits (B, 1, num_tokens)
- sample_cube takes T from x.shape[-1], returns (B, 1, T), and repeats
  conditioning to the latent batch size

Amp-Thread-ID: https://ampcode.com/threads/T-019ec361-addb-70d8-a74b-438ce8a1e096
Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
Jedrzej Kosinski 2026-06-14 23:14:17 -07:00
parent 871f7bc390
commit a6c7397b71
4 changed files with 23 additions and 8 deletions

View File

@ -1976,7 +1976,7 @@ def sample_cube(model, x, sigmas, extra_args=None, callback=None, disable=None,
Autoregressive sampler for Roblox Cube3D shape GPT (DualStreamRoformer).
Not a diffusion sampler: the noised input `x` and `sigmas` values are ignored;
only x's shape (batch, num_tokens) is used. Generates a 1024-long sequence of VQ
only x's shape (batch, 1, num_tokens) is used. Generates a 1024-long sequence of VQ
token IDs from CLIP text conditioning, with upstream's linearly-decaying CFG and
optional top-p. Plugs into SamplerCustomAdvanced via the SamplerCube node.
@ -2005,7 +2005,12 @@ def sample_cube(model, x, sigmas, extra_args=None, callback=None, disable=None,
device = x.device
weight_dtype = base_model.get_dtype()
T = x.shape[1]
T = x.shape[-1] # sequence length; latent is (batch, 1, num_tokens)
batch = x.shape[0]
import comfy.utils
pos = comfy.utils.repeat_to_batch_size(pos, batch)
if neg is not None:
neg = comfy.utils.repeat_to_batch_size(neg, batch)
use_cfg = (cfg is not None) and (cfg > 0.0) and (neg is not None)
autocast_enabled = (device.type == "cuda")
cache_dtype = torch.bfloat16 if autocast_enabled else weight_dtype
@ -2065,4 +2070,5 @@ def sample_cube(model, x, sigmas, extra_args=None, callback=None, disable=None,
if callback is not None:
callback({"x": x, "i": i, "sigma": sigmas[0], "sigma_hat": sigmas[0], "denoised": x})
return torch.cat(output_ids, dim=1).to(torch.float32)
# (B, T) token IDs -> (B, 1, T) to keep the channels-first 1D latent layout.
return torch.cat(output_ids, dim=1).to(torch.float32).unsqueeze(1)

View File

@ -775,6 +775,16 @@ class Hunyuan3Dv2mini(LatentFormat):
latent_dimensions = 1
scale_factor = 1.0188137142395404
class Cube3D(LatentFormat):
# Roblox Cube3D shape "latent" is a flat sequence of VQ token IDs (one scalar per
# position), so it maps to a channels-first 1D latent (B, 1, num_tokens), mirroring
# Hunyuan3Dv2's (B, C, L) convention. latent_channels=1 keeps fix_empty_latent_channels
# from truncating the token sequence. scale_factor=1.0 since IDs must pass through
# process_latent_in/out unchanged.
latent_channels = 1
latent_dimensions = 1
scale_factor = 1.0
class ACEAudio(LatentFormat):
latent_channels = 8
latent_dimensions = 2

View File

@ -1560,7 +1560,7 @@ class Cube3D(supported_models_base.BASE):
sampling_settings = {}
latent_format = latent_formats.LatentFormat
latent_format = latent_formats.Cube3D
memory_usage_factor = 1.0

View File

@ -38,10 +38,9 @@ class EmptyCubeLatent(IO.ComfyNode):
@classmethod
def execute(cls, num_tokens, batch_size) -> IO.NodeOutput:
# Trailing singleton dim keeps this a 3D latent so it flows through ComfyUI's
# conds/noise pipeline (encode_model_conds reads noise.shape[2]); the sampler
# only uses dim 1 (num_tokens).
latent = torch.zeros([batch_size, num_tokens, 1], device=comfy.model_management.intermediate_device())
# Channels-first 1D latent (B, 1, num_tokens) with a single channel (latent_channels=1),
# mirroring Hunyuan3Dv2's (B, C, L) convention. The sampler uses only the latent's shape
# (batch size and sequence length).
latent = torch.zeros([batch_size, 1, num_tokens], device=comfy.model_management.intermediate_device())
return IO.NodeOutput({"samples": latent, "type": "cube_tokens"})