diff --git a/comfy/k_diffusion/sampling.py b/comfy/k_diffusion/sampling.py index 7646b8b68..7de8d4216 100644 --- a/comfy/k_diffusion/sampling.py +++ b/comfy/k_diffusion/sampling.py @@ -1976,7 +1976,7 @@ def sample_cube(model, x, sigmas, extra_args=None, callback=None, disable=None, Autoregressive sampler for Roblox Cube3D shape GPT (DualStreamRoformer). Not a diffusion sampler: the noised input `x` and `sigmas` values are ignored; - only x's shape (batch, num_tokens) is used. Generates a 1024-long sequence of VQ + only x's shape (batch, 1, num_tokens) is used. Generates a 1024-long sequence of VQ token IDs from CLIP text conditioning, with upstream's linearly-decaying CFG and optional top-p. Plugs into SamplerCustomAdvanced via the SamplerCube node. @@ -2005,7 +2005,12 @@ def sample_cube(model, x, sigmas, extra_args=None, callback=None, disable=None, device = x.device weight_dtype = base_model.get_dtype() - T = x.shape[1] + T = x.shape[-1] # sequence length; latent is (batch, 1, num_tokens) + batch = x.shape[0] + import comfy.utils + pos = comfy.utils.repeat_to_batch_size(pos, batch) + if neg is not None: + neg = comfy.utils.repeat_to_batch_size(neg, batch) use_cfg = (cfg is not None) and (cfg > 0.0) and (neg is not None) autocast_enabled = (device.type == "cuda") cache_dtype = torch.bfloat16 if autocast_enabled else weight_dtype @@ -2065,4 +2070,5 @@ def sample_cube(model, x, sigmas, extra_args=None, callback=None, disable=None, if callback is not None: callback({"x": x, "i": i, "sigma": sigmas[0], "sigma_hat": sigmas[0], "denoised": x}) - return torch.cat(output_ids, dim=1).to(torch.float32) + # (B, T) token IDs -> (B, 1, T) to keep the channels-first 1D latent layout. 
+ return torch.cat(output_ids, dim=1).to(torch.float32).unsqueeze(1) diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py index bbdfd4bc2..17588262b 100644 --- a/comfy/latent_formats.py +++ b/comfy/latent_formats.py @@ -775,6 +775,16 @@ class Hunyuan3Dv2mini(LatentFormat): latent_dimensions = 1 scale_factor = 1.0188137142395404 +class Cube3D(LatentFormat): + # Roblox Cube3D shape "latent" is a flat sequence of VQ token IDs (one scalar per + # position), so it maps to a channels-first 1D latent (B, 1, num_tokens), mirroring + # Hunyuan3Dv2's (B, C, L) convention. latent_channels=1 keeps fix_empty_latent_channels + # from truncating the token sequence. scale_factor=1.0 since IDs must pass through + # process_latent_in/out unchanged. + latent_channels = 1 + latent_dimensions = 1 + scale_factor = 1.0 + class ACEAudio(LatentFormat): latent_channels = 8 latent_dimensions = 2 diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 388e88a1a..2c9d86328 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1560,7 +1560,7 @@ class Cube3D(supported_models_base.BASE): sampling_settings = {} - latent_format = latent_formats.LatentFormat + latent_format = latent_formats.Cube3D memory_usage_factor = 1.0 diff --git a/comfy_extras/nodes_cube.py b/comfy_extras/nodes_cube.py index 919810545..b10813805 100644 --- a/comfy_extras/nodes_cube.py +++ b/comfy_extras/nodes_cube.py @@ -38,10 +38,9 @@ class EmptyCubeLatent(IO.ComfyNode): @classmethod def execute(cls, num_tokens, batch_size) -> IO.NodeOutput: - # Trailing singleton dim keeps this a 3D latent so it flows through ComfyUI's - # conds/noise pipeline (encode_model_conds reads noise.shape[2]); the sampler - # only uses dim 1 (num_tokens). - latent = torch.zeros([batch_size, num_tokens, 1], device=comfy.model_management.intermediate_device()) + # Channels-first 1D latent (B, 1, num_tokens), mirroring Hunyuan3Dv2's (B, C, L) + # convention (latent_channels=1). 
The sampler uses only the batch size and sequence length from its shape. + latent = torch.zeros([batch_size, 1, num_tokens], device=comfy.model_management.intermediate_device()) return IO.NodeOutput({"samples": latent, "type": "cube_tokens"})