mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-04-21 07:52:39 +08:00
feat(api-nodes): add automatic downscaling of videos for ByteDance 2 nodes
Signed-off-by: bigcat88 <bigcat88@icloud.com>
This commit is contained in:
parent
3086026401
commit
7b0a0e2817
@ -158,10 +158,17 @@ RECOMMENDED_PRESETS_SEEDREAM_4 = [
|
||||
("Custom", None, None),
|
||||
]
|
||||
|
||||
# Seedance 2.0 reference video pixel count limits per model.
|
||||
# Seedance 2.0 reference video pixel count limits per model and output resolution.
|
||||
SEEDANCE2_REF_VIDEO_PIXEL_LIMITS = {
|
||||
"dreamina-seedance-2-0-260128": {"min": 409_600, "max": 927_408},
|
||||
"dreamina-seedance-2-0-fast-260128": {"min": 409_600, "max": 927_408},
|
||||
"dreamina-seedance-2-0-260128": {
|
||||
"480p": {"min": 409_600, "max": 927_408},
|
||||
"720p": {"min": 409_600, "max": 927_408},
|
||||
"1080p": {"min": 409_600, "max": 2_073_600},
|
||||
},
|
||||
"dreamina-seedance-2-0-fast-260128": {
|
||||
"480p": {"min": 409_600, "max": 927_408},
|
||||
"720p": {"min": 409_600, "max": 927_408},
|
||||
},
|
||||
}
|
||||
|
||||
# The time in this dictionary are given for 10 seconds duration.
|
||||
|
||||
@ -35,6 +35,7 @@ from comfy_api_nodes.util import (
|
||||
get_number_of_images,
|
||||
image_tensor_pair_to_batch,
|
||||
poll_op,
|
||||
resize_video_to_pixel_budget,
|
||||
sync_op,
|
||||
upload_audio_to_comfyapi,
|
||||
upload_image_to_comfyapi,
|
||||
@ -69,9 +70,12 @@ DEPRECATED_MODELS = {"seedance-1-0-lite-t2v-250428", "seedance-1-0-lite-i2v-2504
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _validate_ref_video_pixels(video: Input.Video, model_id: str, index: int) -> None:
|
||||
"""Validate reference video pixel count against Seedance 2.0 model limits."""
|
||||
limits = SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id)
|
||||
def _validate_ref_video_pixels(video: Input.Video, model_id: str, resolution: str, index: int) -> None:
|
||||
"""Validate reference video pixel count against Seedance 2.0 model limits for the selected resolution."""
|
||||
model_limits = SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id)
|
||||
if not model_limits:
|
||||
return
|
||||
limits = model_limits.get(resolution)
|
||||
if not limits:
|
||||
return
|
||||
try:
|
||||
@ -1373,6 +1377,14 @@ def _seedance2_reference_inputs(resolutions: list[str]):
|
||||
min=0,
|
||||
),
|
||||
),
|
||||
IO.Boolean.Input(
|
||||
"auto_downscale",
|
||||
default=False,
|
||||
advanced=True,
|
||||
optional=True,
|
||||
tooltip="Automatically downscale reference videos that exceed the model's pixel budget "
|
||||
"for the selected resolution. Aspect ratio is preserved; videos already within limits are untouched.",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@ -1480,10 +1492,23 @@ class ByteDance2ReferenceNode(IO.ComfyNode):
|
||||
|
||||
model_id = SEEDANCE_MODELS[model["model"]]
|
||||
has_video_input = len(reference_videos) > 0
|
||||
|
||||
if model.get("auto_downscale") and reference_videos:
|
||||
max_px = (
|
||||
SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id, {})
|
||||
.get(model["resolution"], {})
|
||||
.get("max")
|
||||
)
|
||||
if max_px:
|
||||
for key in reference_videos:
|
||||
reference_videos[key] = resize_video_to_pixel_budget(
|
||||
reference_videos[key], max_px
|
||||
)
|
||||
|
||||
total_video_duration = 0.0
|
||||
for i, key in enumerate(reference_videos, 1):
|
||||
video = reference_videos[key]
|
||||
_validate_ref_video_pixels(video, model_id, i)
|
||||
_validate_ref_video_pixels(video, model_id, model["resolution"], i)
|
||||
try:
|
||||
dur = video.get_duration()
|
||||
if dur < 1.8:
|
||||
|
||||
@ -19,6 +19,7 @@ from .conversions import (
|
||||
image_tensor_pair_to_batch,
|
||||
pil_to_bytesio,
|
||||
resize_mask_to_image,
|
||||
resize_video_to_pixel_budget,
|
||||
tensor_to_base64_string,
|
||||
tensor_to_bytesio,
|
||||
tensor_to_pil,
|
||||
@ -90,6 +91,7 @@ __all__ = [
|
||||
"image_tensor_pair_to_batch",
|
||||
"pil_to_bytesio",
|
||||
"resize_mask_to_image",
|
||||
"resize_video_to_pixel_budget",
|
||||
"tensor_to_base64_string",
|
||||
"tensor_to_bytesio",
|
||||
"tensor_to_pil",
|
||||
|
||||
@ -129,22 +129,38 @@ def pil_to_bytesio(img: Image.Image, mime_type: str = "image/png") -> BytesIO:
|
||||
return img_byte_arr
|
||||
|
||||
|
||||
def _compute_downscale_dims(src_w: int, src_h: int, total_pixels: int) -> tuple[int, int] | None:
|
||||
"""Return downscaled (w, h) with even dims fitting ``total_pixels``, or None if already fits.
|
||||
|
||||
Source aspect ratio is preserved; output may drift by a fraction of a percent because both dimensions
|
||||
are rounded down to even values (many codecs require divisible-by-2).
|
||||
"""
|
||||
pixels = src_w * src_h
|
||||
if pixels <= total_pixels:
|
||||
return None
|
||||
scale = math.sqrt(total_pixels / pixels)
|
||||
new_w = max(2, int(src_w * scale))
|
||||
new_h = max(2, int(src_h * scale))
|
||||
new_w -= new_w % 2
|
||||
new_h -= new_h % 2
|
||||
return new_w, new_h
|
||||
|
||||
|
||||
def downscale_image_tensor(image: torch.Tensor, total_pixels: int = 1536 * 1024) -> torch.Tensor:
|
||||
"""Downscale input image tensor to roughly the specified total pixels."""
|
||||
"""Downscale input image tensor to roughly the specified total pixels.
|
||||
|
||||
Output dimensions are rounded down to even values so that the result is guaranteed to fit within ``total_pixels``
|
||||
and is compatible with codecs that require even dimensions (e.g. yuv420p).
|
||||
"""
|
||||
samples = image.movedim(-1, 1)
|
||||
total = int(total_pixels)
|
||||
scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2]))
|
||||
if scale_by >= 1:
|
||||
dims = _compute_downscale_dims(samples.shape[3], samples.shape[2], int(total_pixels))
|
||||
if dims is None:
|
||||
return image
|
||||
width = round(samples.shape[3] * scale_by)
|
||||
height = round(samples.shape[2] * scale_by)
|
||||
|
||||
s = common_upscale(samples, width, height, "lanczos", "disabled")
|
||||
s = s.movedim(1, -1)
|
||||
return s
|
||||
new_w, new_h = dims
|
||||
return common_upscale(samples, new_w, new_h, "lanczos", "disabled").movedim(1, -1)
|
||||
|
||||
|
||||
def downscale_image_tensor_by_max_side(image: torch.Tensor, *, max_side: int) -> torch.Tensor:
|
||||
def downscale_image_tensor_by_max_side(image: torch.Tensor, *, max_side: int) -> torch.Tensor:
|
||||
"""Downscale input image tensor so the largest dimension is at most max_side pixels."""
|
||||
samples = image.movedim(-1, 1)
|
||||
height, width = samples.shape[2], samples.shape[3]
|
||||
@ -399,6 +415,72 @@ def trim_video(video: Input.Video, duration_sec: float) -> Input.Video:
|
||||
raise RuntimeError(f"Failed to trim video: {str(e)}") from e
|
||||
|
||||
|
||||
def resize_video_to_pixel_budget(video: Input.Video, total_pixels: int) -> Input.Video:
|
||||
"""Downscale a video to fit within ``total_pixels`` (w * h), preserving aspect ratio.
|
||||
|
||||
Returns the original video object untouched when it already fits. Preserves frame rate, duration, and audio.
|
||||
Aspect ratio is preserved up to a fraction of a percent (even-dim rounding).
|
||||
"""
|
||||
src_w, src_h = video.get_dimensions()
|
||||
scale_dims = _compute_downscale_dims(src_w, src_h, total_pixels)
|
||||
if scale_dims is None:
|
||||
return video
|
||||
return _apply_video_scale(video, scale_dims)
|
||||
|
||||
|
||||
def _apply_video_scale(video: Input.Video, scale_dims: tuple[int, int]) -> Input.Video:
|
||||
"""Re-encode ``video`` scaled to ``scale_dims`` with a single decode/encode pass."""
|
||||
out_w, out_h = scale_dims
|
||||
output_buffer = BytesIO()
|
||||
input_container = None
|
||||
output_container = None
|
||||
|
||||
try:
|
||||
input_source = video.get_stream_source()
|
||||
input_container = av.open(input_source, mode="r")
|
||||
output_container = av.open(output_buffer, mode="w", format="mp4")
|
||||
|
||||
video_stream = output_container.add_stream("h264", rate=video.get_frame_rate())
|
||||
video_stream.width = out_w
|
||||
video_stream.height = out_h
|
||||
video_stream.pix_fmt = "yuv420p"
|
||||
|
||||
audio_stream = None
|
||||
for stream in input_container.streams:
|
||||
if isinstance(stream, av.AudioStream):
|
||||
audio_stream = output_container.add_stream("aac", rate=stream.sample_rate)
|
||||
audio_stream.sample_rate = stream.sample_rate
|
||||
audio_stream.layout = stream.layout
|
||||
break
|
||||
|
||||
for frame in input_container.decode(video=0):
|
||||
frame = frame.reformat(width=out_w, height=out_h, format="yuv420p")
|
||||
for packet in video_stream.encode(frame):
|
||||
output_container.mux(packet)
|
||||
for packet in video_stream.encode():
|
||||
output_container.mux(packet)
|
||||
|
||||
if audio_stream is not None:
|
||||
input_container.seek(0)
|
||||
for audio_frame in input_container.decode(audio=0):
|
||||
for packet in audio_stream.encode(audio_frame):
|
||||
output_container.mux(packet)
|
||||
for packet in audio_stream.encode():
|
||||
output_container.mux(packet)
|
||||
|
||||
output_container.close()
|
||||
input_container.close()
|
||||
output_buffer.seek(0)
|
||||
return InputImpl.VideoFromFile(output_buffer)
|
||||
|
||||
except Exception as e:
|
||||
if input_container is not None:
|
||||
input_container.close()
|
||||
if output_container is not None:
|
||||
output_container.close()
|
||||
raise RuntimeError(f"Failed to resize video: {str(e)}") from e
|
||||
|
||||
|
||||
def _f32_pcm(wav: torch.Tensor) -> torch.Tensor:
|
||||
"""Convert audio to float 32 bits PCM format. Copy-paste from nodes_audio.py file."""
|
||||
if wav.dtype.is_floating_point:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user