diff --git a/.ci/windows_amd_base_files/README_VERY_IMPORTANT.txt b/.ci/windows_amd_base_files/README_VERY_IMPORTANT.txt new file mode 100755 index 000000000..96a500be2 --- /dev/null +++ b/.ci/windows_amd_base_files/README_VERY_IMPORTANT.txt @@ -0,0 +1,27 @@ +As of the time of writing this you need this preview driver for best results: +https://www.amd.com/en/resources/support-articles/release-notes/RN-AMDGPU-WINDOWS-PYTORCH-PREVIEW.html + +HOW TO RUN: + +If you have a AMD gpu: + +run_amd_gpu.bat + +If you have memory issues you can try disabling the smart memory management by running comfyui with: + +run_amd_gpu_disable_smart_memory.bat + +IF YOU GET A RED ERROR IN THE UI MAKE SURE YOU HAVE A MODEL/CHECKPOINT IN: ComfyUI\models\checkpoints + +You can download the stable diffusion XL one from: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0_0.9vae.safetensors + + +RECOMMENDED WAY TO UPDATE: +To update the ComfyUI code: update\update_comfyui.bat + + +TO SHARE MODELS BETWEEN COMFYUI AND ANOTHER UI: +In the ComfyUI directory you will find a file: extra_model_paths.yaml.example +Rename this file to: extra_model_paths.yaml and edit it with your favorite text editor. + + diff --git a/.ci/windows_base_files/run_nvidia_gpu.bat b/.ci/windows_amd_base_files/run_amd_gpu.bat similarity index 100% rename from .ci/windows_base_files/run_nvidia_gpu.bat rename to .ci/windows_amd_base_files/run_amd_gpu.bat diff --git a/.ci/windows_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat b/.ci/windows_amd_base_files/run_amd_gpu_disable_smart_memory.bat old mode 100644 new mode 100755 similarity index 65% rename from .ci/windows_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat rename to .ci/windows_amd_base_files/run_amd_gpu_disable_smart_memory.bat index 38f06ecb2..cece0aeb2 --- a/.ci/windows_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat +++ b/.ci/windows_amd_base_files/run_amd_gpu_disable_smart_memory.bat @@ -1,2 +1,2 @@ -.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --fast fp16_accumulation +.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --disable-smart-memory pause diff --git a/.ci/windows_base_files/README_VERY_IMPORTANT.txt b/.ci/windows_nvidia_base_files/README_VERY_IMPORTANT.txt similarity index 100% rename from .ci/windows_base_files/README_VERY_IMPORTANT.txt rename to .ci/windows_nvidia_base_files/README_VERY_IMPORTANT.txt diff --git a/.ci/windows_nvidia_base_files/advanced/run_nvidia_gpu_disable_api_nodes.bat b/.ci/windows_nvidia_base_files/advanced/run_nvidia_gpu_disable_api_nodes.bat new file mode 100644 index 000000000..ed00583b6 --- /dev/null +++ b/.ci/windows_nvidia_base_files/advanced/run_nvidia_gpu_disable_api_nodes.bat @@ -0,0 +1,3 @@ +..\python_embeded\python.exe -s ..\ComfyUI\main.py --windows-standalone-build --disable-api-nodes +echo If you see this and ComfyUI did not start try updating your Nvidia Drivers to the latest. +pause diff --git a/.ci/windows_base_files/run_cpu.bat b/.ci/windows_nvidia_base_files/run_cpu.bat similarity index 100% rename from .ci/windows_base_files/run_cpu.bat rename to .ci/windows_nvidia_base_files/run_cpu.bat diff --git a/.ci/windows_nvidia_base_files/run_nvidia_gpu.bat b/.ci/windows_nvidia_base_files/run_nvidia_gpu.bat new file mode 100755 index 000000000..4898a424f --- /dev/null +++ b/.ci/windows_nvidia_base_files/run_nvidia_gpu.bat @@ -0,0 +1,3 @@ +.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build +echo If you see this and ComfyUI did not start try updating your Nvidia Drivers to the latest. +pause diff --git a/.ci/windows_nvidia_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat b/.ci/windows_nvidia_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat new file mode 100644 index 000000000..32611e4af --- /dev/null +++ b/.ci/windows_nvidia_base_files/run_nvidia_gpu_fast_fp16_accumulation.bat @@ -0,0 +1,3 @@ +.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --fast fp16_accumulation +echo If you see this and ComfyUI did not start try updating your Nvidia Drivers to the latest. +pause diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index 3cf2717b7..6556677e0 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -8,13 +8,15 @@ body: Before submitting a **Bug Report**, please ensure the following: - **1:** You are running the latest version of ComfyUI. - - **2:** You have looked at the existing bug reports and made sure this isn't already reported. + - **2:** You have your ComfyUI logs and relevant workflow on hand and will post them in this bug report. - **3:** You confirmed that the bug is not caused by a custom node. You can disable all custom nodes by passing - `--disable-all-custom-nodes` command line argument. + `--disable-all-custom-nodes` command line argument. If you have custom node try updating them to the latest version. - **4:** This is an actual bug in ComfyUI, not just a support question. A bug is when you can specify exact steps to replicate what went wrong and others will be able to repeat your steps and see the same issue happen. - If unsure, ask on the [ComfyUI Matrix Space](https://app.element.io/#/room/%23comfyui_space%3Amatrix.org) or the [Comfy Org Discord](https://discord.gg/comfyorg) first. + ## Very Important + + Please make sure that you post ALL your ComfyUI logs in the bug report. A bug report without logs will likely be ignored. - type: checkboxes id: custom-nodes-test attributes: diff --git a/.github/PULL_REQUEST_TEMPLATE/api-node.md b/.github/PULL_REQUEST_TEMPLATE/api-node.md new file mode 100644 index 000000000..c1f1bafb1 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE/api-node.md @@ -0,0 +1,21 @@ + + +## API Node PR Checklist + +### Scope +- [ ] **Is API Node Change** + +### Pricing & Billing +- [ ] **Need pricing update** +- [ ] **No pricing update** + +If **Need pricing update**: +- [ ] Metronome rate cards updated +- [ ] Auto‑billing tests updated and passing + +### QA +- [ ] **QA done** +- [ ] **QA not required** + +### Comms +- [ ] Informed **Kosinkadink** diff --git a/.github/workflows/api-node-template.yml b/.github/workflows/api-node-template.yml new file mode 100644 index 000000000..fdb81c0c5 --- /dev/null +++ b/.github/workflows/api-node-template.yml @@ -0,0 +1,58 @@ +name: Append API Node PR template + +on: + pull_request_target: + types: [opened, reopened, synchronize, ready_for_review] + paths: + - 'comfy_api_nodes/**' # only run if these files changed + +permissions: + contents: read + pull-requests: write + +jobs: + inject: + runs-on: ubuntu-latest + steps: + - name: Ensure template exists and append to PR body + uses: actions/github-script@v7 + with: + script: | + const { owner, repo } = context.repo; + const number = context.payload.pull_request.number; + const templatePath = '.github/PULL_REQUEST_TEMPLATE/api-node.md'; + const marker = ''; + + const { data: pr } = await github.rest.pulls.get({ owner, repo, pull_number: number }); + + let templateText; + try { + const res = await github.rest.repos.getContent({ + owner, + repo, + path: templatePath, + ref: pr.base.ref + }); + const buf = Buffer.from(res.data.content, res.data.encoding || 'base64'); + templateText = buf.toString('utf8'); + } catch (e) { + core.setFailed(`Required PR template not found at "${templatePath}" on ${pr.base.ref}. Please add it to the repo.`); + return; + } + + // Enforce the presence of the marker inside the template (for idempotence) + if (!templateText.includes(marker)) { + core.setFailed(`Template at "${templatePath}" does not contain the required marker:\n${marker}\nAdd it so we can detect duplicates safely.`); + return; + } + + // If the PR already contains the marker, do not append again. + const body = pr.body || ''; + if (body.includes(marker)) { + core.info('Template already present in PR body; nothing to inject.'); + return; + } + + const newBody = (body ? body + '\n\n' : '') + templateText + '\n'; + await github.rest.pulls.update({ owner, repo, pull_number: number, body: newBody }); + core.notice('API Node template appended to PR description.'); diff --git a/.github/workflows/release-stable-all.yml b/.github/workflows/release-stable-all.yml new file mode 100644 index 000000000..f7de3a7c3 --- /dev/null +++ b/.github/workflows/release-stable-all.yml @@ -0,0 +1,78 @@ +name: "Release Stable All Portable Versions" + +on: + workflow_dispatch: + inputs: + git_tag: + description: 'Git tag' + required: true + type: string + +jobs: + release_nvidia_default: + permissions: + contents: "write" + packages: "write" + pull-requests: "read" + name: "Release NVIDIA Default (cu129)" + uses: ./.github/workflows/stable-release.yml + with: + git_tag: ${{ inputs.git_tag }} + cache_tag: "cu130" + python_minor: "13" + python_patch: "9" + rel_name: "nvidia" + rel_extra_name: "" + test_release: true + secrets: inherit + + release_nvidia_cu128: + permissions: + contents: "write" + packages: "write" + pull-requests: "read" + name: "Release NVIDIA cu128" + uses: ./.github/workflows/stable-release.yml + with: + git_tag: ${{ inputs.git_tag }} + cache_tag: "cu128" + python_minor: "12" + python_patch: "10" + rel_name: "nvidia" + rel_extra_name: "_cu128" + test_release: true + secrets: inherit + + release_nvidia_cu126: + permissions: + contents: "write" + packages: "write" + pull-requests: "read" + name: "Release NVIDIA cu126" + uses: ./.github/workflows/stable-release.yml + with: + git_tag: ${{ inputs.git_tag }} + cache_tag: "cu126" + python_minor: "12" + python_patch: "10" + rel_name: "nvidia" + rel_extra_name: "_cu126" + test_release: true + secrets: inherit + + release_amd_rocm: + permissions: + contents: "write" + packages: "write" + pull-requests: "read" + name: "Release AMD ROCm 6.4.4" + uses: ./.github/workflows/stable-release.yml + with: + git_tag: ${{ inputs.git_tag }} + cache_tag: "rocm644" + python_minor: "12" + python_patch: "10" + rel_name: "amd" + rel_extra_name: "" + test_release: false + secrets: inherit diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index 4c1a02594..b24d86a6b 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -21,3 +21,28 @@ jobs: - name: Run Ruff run: ruff check . + + pylint: + name: Run Pylint + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.12' + + - name: Install requirements + run: | + python -m pip install --upgrade pip + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + pip install -r requirements.txt + + - name: Install Pylint + run: pip install pylint + + - name: Run Pylint + run: pylint comfy_api_nodes diff --git a/.github/workflows/stable-release.yml b/.github/workflows/stable-release.yml index 2bc8e5905..28484a9d1 100644 --- a/.github/workflows/stable-release.yml +++ b/.github/workflows/stable-release.yml @@ -2,17 +2,17 @@ name: "Release Stable Version" on: - workflow_dispatch: + workflow_call: inputs: git_tag: description: 'Git tag' required: true type: string - cu: - description: 'CUDA version' + cache_tag: + description: 'Cached dependencies tag' required: true type: string - default: "129" + default: "cu129" python_minor: description: 'Python minor version' required: true @@ -23,7 +23,57 @@ on: required: true type: string default: "6" - + rel_name: + description: 'Release name' + required: true + type: string + default: "nvidia" + rel_extra_name: + description: 'Release extra name' + required: false + type: string + default: "" + test_release: + description: 'Test Release' + required: true + type: boolean + default: true + workflow_dispatch: + inputs: + git_tag: + description: 'Git tag' + required: true + type: string + cache_tag: + description: 'Cached dependencies tag' + required: true + type: string + default: "cu129" + python_minor: + description: 'Python minor version' + required: true + type: string + default: "13" + python_patch: + description: 'Python patch version' + required: true + type: string + default: "6" + rel_name: + description: 'Release name' + required: true + type: string + default: "nvidia" + rel_extra_name: + description: 'Release extra name' + required: false + type: string + default: "" + test_release: + description: 'Test Release' + required: true + type: boolean + default: true jobs: package_comfy_windows: @@ -42,15 +92,15 @@ jobs: id: cache with: path: | - cu${{ inputs.cu }}_python_deps.tar + ${{ inputs.cache_tag }}_python_deps.tar update_comfyui_and_python_dependencies.bat - key: ${{ runner.os }}-build-cu${{ inputs.cu }}-${{ inputs.python_minor }} + key: ${{ runner.os }}-build-${{ inputs.cache_tag }}-${{ inputs.python_minor }} - shell: bash run: | - mv cu${{ inputs.cu }}_python_deps.tar ../ + mv ${{ inputs.cache_tag }}_python_deps.tar ../ mv update_comfyui_and_python_dependencies.bat ../ cd .. - tar xf cu${{ inputs.cu }}_python_deps.tar + tar xf ${{ inputs.cache_tag }}_python_deps.tar pwd ls @@ -65,12 +115,19 @@ jobs: echo 'import site' >> ./python3${{ inputs.python_minor }}._pth curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py ./python.exe get-pip.py - ./python.exe -s -m pip install ../cu${{ inputs.cu }}_python_deps/* + ./python.exe -s -m pip install ../${{ inputs.cache_tag }}_python_deps/* + + grep comfyui ../ComfyUI/requirements.txt > ./requirements_comfyui.txt + ./python.exe -s -m pip install -r requirements_comfyui.txt + rm requirements_comfyui.txt + sed -i '1i../ComfyUI' ./python3${{ inputs.python_minor }}._pth - rm ./Lib/site-packages/torch/lib/dnnl.lib #I don't think this is actually used and I need the space - rm ./Lib/site-packages/torch/lib/libprotoc.lib - rm ./Lib/site-packages/torch/lib/libprotobuf.lib + if test -f ./Lib/site-packages/torch/lib/dnnl.lib; then + rm ./Lib/site-packages/torch/lib/dnnl.lib #I don't think this is actually used and I need the space + rm ./Lib/site-packages/torch/lib/libprotoc.lib + rm ./Lib/site-packages/torch/lib/libprotobuf.lib + fi cd .. @@ -85,14 +142,18 @@ jobs: mkdir update cp -r ComfyUI/.ci/update_windows/* ./update/ - cp -r ComfyUI/.ci/windows_base_files/* ./ + cp -r ComfyUI/.ci/windows_${{ inputs.rel_name }}_base_files/* ./ cp ../update_comfyui_and_python_dependencies.bat ./update/ cd .. "C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=768m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable - mv ComfyUI_windows_portable.7z ComfyUI/ComfyUI_windows_portable_nvidia.7z + mv ComfyUI_windows_portable.7z ComfyUI/ComfyUI_windows_portable_${{ inputs.rel_name }}${{ inputs.rel_extra_name }}.7z + - shell: bash + if: ${{ inputs.test_release }} + run: | + cd .. cd ComfyUI_windows_portable python_embeded/python.exe -s ComfyUI/main.py --quick-test-for-ci --cpu @@ -101,10 +162,9 @@ jobs: ls - name: Upload binaries to release - uses: svenstaro/upload-release-action@v2 + uses: softprops/action-gh-release@v2 with: - repo_token: ${{ secrets.GITHUB_TOKEN }} - file: ComfyUI_windows_portable_nvidia.7z - tag: ${{ inputs.git_tag }} - overwrite: true + files: ComfyUI_windows_portable_${{ inputs.rel_name }}${{ inputs.rel_extra_name }}.7z + tag_name: ${{ inputs.git_tag }} draft: true + overwrite_files: true diff --git a/.github/workflows/test-ci.yml b/.github/workflows/test-ci.yml index 418dca0ab..1660ec8e3 100644 --- a/.github/workflows/test-ci.yml +++ b/.github/workflows/test-ci.yml @@ -21,14 +21,15 @@ jobs: fail-fast: false matrix: # os: [macos, linux, windows] - os: [macos, linux] - python_version: ["3.9", "3.10", "3.11", "3.12"] + # os: [macos, linux] + os: [linux] + python_version: ["3.10", "3.11", "3.12"] cuda_version: ["12.1"] torch_version: ["stable"] include: - - os: macos - runner_label: [self-hosted, macOS] - flags: "--use-pytorch-cross-attention" + # - os: macos + # runner_label: [self-hosted, macOS] + # flags: "--use-pytorch-cross-attention" - os: linux runner_label: [self-hosted, Linux] flags: "" @@ -73,14 +74,15 @@ jobs: strategy: fail-fast: false matrix: - os: [macos, linux] + # os: [macos, linux] + os: [linux] python_version: ["3.11"] cuda_version: ["12.1"] torch_version: ["nightly"] include: - - os: macos - runner_label: [self-hosted, macOS] - flags: "--use-pytorch-cross-attention" + # - os: macos + # runner_label: [self-hosted, macOS] + # flags: "--use-pytorch-cross-attention" - os: linux runner_label: [self-hosted, Linux] flags: "" diff --git a/.github/workflows/windows_release_dependencies.yml b/.github/workflows/windows_release_dependencies.yml index 7761cc1ed..f61ee21a2 100644 --- a/.github/workflows/windows_release_dependencies.yml +++ b/.github/workflows/windows_release_dependencies.yml @@ -17,7 +17,7 @@ on: description: 'cuda version' required: true type: string - default: "129" + default: "130" python_minor: description: 'python minor version' @@ -29,7 +29,7 @@ on: description: 'python patch version' required: true type: string - default: "6" + default: "9" # push: # branches: # - master @@ -56,7 +56,8 @@ jobs: ..\python_embeded\python.exe -s -m pip install --upgrade torch torchvision torchaudio ${{ inputs.xformers }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r ../ComfyUI/requirements.txt pygit2 pause" > update_comfyui_and_python_dependencies.bat - python -m pip wheel --no-cache-dir torch torchvision torchaudio ${{ inputs.xformers }} ${{ inputs.extra_dependencies }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r requirements.txt pygit2 -w ./temp_wheel_dir + grep -v comfyui requirements.txt > requirements_nocomfyui.txt + python -m pip wheel --no-cache-dir torch torchvision torchaudio ${{ inputs.xformers }} ${{ inputs.extra_dependencies }} --extra-index-url https://download.pytorch.org/whl/cu${{ inputs.cu }} -r requirements_nocomfyui.txt pygit2 -w ./temp_wheel_dir python -m pip install --no-cache-dir ./temp_wheel_dir/* echo installed basic ls -lah temp_wheel_dir diff --git a/.github/workflows/windows_release_dependencies_manual.yml b/.github/workflows/windows_release_dependencies_manual.yml new file mode 100644 index 000000000..0799feef1 --- /dev/null +++ b/.github/workflows/windows_release_dependencies_manual.yml @@ -0,0 +1,64 @@ +name: "Windows Release dependencies Manual" + +on: + workflow_dispatch: + inputs: + torch_dependencies: + description: 'torch dependencies' + required: false + type: string + default: "torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu128" + cache_tag: + description: 'Cached dependencies tag' + required: true + type: string + default: "cu128" + + python_minor: + description: 'python minor version' + required: true + type: string + default: "12" + + python_patch: + description: 'python patch version' + required: true + type: string + default: "10" + +jobs: + build_dependencies: + runs-on: windows-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.${{ inputs.python_minor }}.${{ inputs.python_patch }} + + - shell: bash + run: | + echo "@echo off + call update_comfyui.bat nopause + echo - + echo This will try to update pytorch and all python dependencies. + echo - + echo If you just want to update normally, close this and run update_comfyui.bat instead. + echo - + pause + ..\python_embeded\python.exe -s -m pip install --upgrade ${{ inputs.torch_dependencies }} -r ../ComfyUI/requirements.txt pygit2 + pause" > update_comfyui_and_python_dependencies.bat + + grep -v comfyui requirements.txt > requirements_nocomfyui.txt + python -m pip wheel --no-cache-dir ${{ inputs.torch_dependencies }} -r requirements_nocomfyui.txt pygit2 -w ./temp_wheel_dir + python -m pip install --no-cache-dir ./temp_wheel_dir/* + echo installed basic + ls -lah temp_wheel_dir + mv temp_wheel_dir ${{ inputs.cache_tag }}_python_deps + tar cf ${{ inputs.cache_tag }}_python_deps.tar ${{ inputs.cache_tag }}_python_deps + + - uses: actions/cache/save@v4 + with: + path: | + ${{ inputs.cache_tag }}_python_deps.tar + update_comfyui_and_python_dependencies.bat + key: ${{ runner.os }}-build-${{ inputs.cache_tag }}-${{ inputs.python_minor }} diff --git a/.github/workflows/windows_release_nightly_pytorch.yml b/.github/workflows/windows_release_nightly_pytorch.yml index 5bdc940de..ca1ef71ae 100644 --- a/.github/workflows/windows_release_nightly_pytorch.yml +++ b/.github/workflows/windows_release_nightly_pytorch.yml @@ -68,7 +68,7 @@ jobs: mkdir update cp -r ComfyUI/.ci/update_windows/* ./update/ - cp -r ComfyUI/.ci/windows_base_files/* ./ + cp -r ComfyUI/.ci/windows_nvidia_base_files/* ./ cp -r ComfyUI/.ci/windows_nightly_base_files/* ./ echo "call update_comfyui.bat nopause diff --git a/.github/workflows/windows_release_package.yml b/.github/workflows/windows_release_package.yml index 46375698e..7955325fc 100644 --- a/.github/workflows/windows_release_package.yml +++ b/.github/workflows/windows_release_package.yml @@ -81,7 +81,7 @@ jobs: mkdir update cp -r ComfyUI/.ci/update_windows/* ./update/ - cp -r ComfyUI/.ci/windows_base_files/* ./ + cp -r ComfyUI/.ci/windows_nvidia_base_files/* ./ cp ../update_comfyui_and_python_dependencies.bat ./update/ cd .. diff --git a/QUANTIZATION.md b/QUANTIZATION.md new file mode 100644 index 000000000..1693e13f3 --- /dev/null +++ b/QUANTIZATION.md @@ -0,0 +1,168 @@ +# The Comfy guide to Quantization + + +## How does quantization work? + +Quantization aims to map a high-precision value x_f to a lower precision format with minimal loss in accuracy. These smaller formats then serve to reduce the models memory footprint and increase throughput by using specialized hardware. + +When simply converting a value from FP16 to FP8 using the round-nearest method we might hit two issues: +- The dynamic range of FP16 (-65,504, 65,504) far exceeds FP8 formats like E4M3 (-448, 448) or E5M2 (-57,344, 57,344), potentially resulting in clipped values +- The original values are concentrated in a small range (e.g. -1,1) leaving many FP8-bits "unused" + +By using a scaling factor, we aim to map these values into the quantized-dtype range, making use of the full spectrum. One of the easiest approaches, and common, is using per-tensor absolute-maximum scaling. + +``` +absmax = max(abs(tensor)) +scale = amax / max_dynamic_range_low_precision + +# Quantization +tensor_q = (tensor / scale).to(low_precision_dtype) + +# De-Quantization +tensor_dq = tensor_q.to(fp16) * scale + +tensor_dq ~ tensor +``` + +Given that additional information (scaling factor) is needed to "interpret" the quantized values, we describe those as derived datatypes. + + +## Quantization in Comfy + +``` +QuantizedTensor (torch.Tensor subclass) + ↓ __torch_dispatch__ +Two-Level Registry (generic + layout handlers) + ↓ +MixedPrecisionOps + Metadata Detection +``` + +### Representation + +To represent these derived datatypes, ComfyUI uses a subclass of torch.Tensor to implements these using the `QuantizedTensor` class found in `comfy/quant_ops.py` + +A `Layout` class defines how a specific quantization format behaves: +- Required parameters +- Quantize method +- De-Quantize method + +```python +from comfy.quant_ops import QuantizedLayout + +class MyLayout(QuantizedLayout): + @classmethod + def quantize(cls, tensor, **kwargs): + # Convert to quantized format + qdata = ... + params = {'scale': ..., 'orig_dtype': tensor.dtype} + return qdata, params + + @staticmethod + def dequantize(qdata, scale, orig_dtype, **kwargs): + return qdata.to(orig_dtype) * scale +``` + +To then run operations using these QuantizedTensors we use two registry systems to define supported operations. +The first is a **generic registry** that handles operations common to all quantized formats (e.g., `.to()`, `.clone()`, `.reshape()`). + +The second registry is layout-specific and allows to implement fast-paths like nn.Linear. +```python +from comfy.quant_ops import register_layout_op + +@register_layout_op(torch.ops.aten.linear.default, MyLayout) +def my_linear(func, args, kwargs): + # Extract tensors, call optimized kernel + ... +``` +When `torch.nn.functional.linear()` is called with QuantizedTensor arguments, `__torch_dispatch__` automatically routes to the registered implementation. +For any unsupported operation, QuantizedTensor will fallback to call `dequantize` and dispatch using the high-precision implementation. + + +### Mixed Precision + +The `MixedPrecisionOps` class (lines 542-648 in `comfy/ops.py`) enables per-layer quantization decisions, allowing different layers in a model to use different precisions. This is activated when a model config contains a `layer_quant_config` dictionary that specifies which layers should be quantized and how. + +**Architecture:** + +```python +class MixedPrecisionOps(disable_weight_init): + _layer_quant_config = {} # Maps layer names to quantization configs + _compute_dtype = torch.bfloat16 # Default compute / dequantize precision +``` + +**Key mechanism:** + +The custom `Linear._load_from_state_dict()` method inspects each layer during model loading: +- If the layer name is **not** in `_layer_quant_config`: load weight as regular tensor in `_compute_dtype` +- If the layer name **is** in `_layer_quant_config`: + - Load weight as `QuantizedTensor` with the specified layout (e.g., `TensorCoreFP8Layout`) + - Load associated quantization parameters (scales, block_size, etc.) + +**Why it's needed:** + +Not all layers tolerate quantization equally. Sensitive operations like final projections can be kept in higher precision, while compute-heavy matmuls are quantized. This provides most of the performance benefits while maintaining quality. + +The system is selected in `pick_operations()` when `model_config.layer_quant_config` is present, making it the highest-priority operation mode. + + +## Checkpoint Format + +Quantized checkpoints are stored as standard safetensors files with quantized weight tensors and associated scaling parameters, plus a `_quantization_metadata` JSON entry describing the quantization scheme. + +The quantized checkpoint will contain the same layers as the original checkpoint but: +- The weights are stored as quantized values, sometimes using a different storage datatype. E.g. uint8 container for fp8. +- For each quantized weight a number of additional scaling parameters are stored alongside depending on the recipe. +- We store a metadata.json in the metadata of the final safetensor containing the `_quantization_metadata` describing which layers are quantized and what layout has been used. + +### Scaling Parameters details +We define 4 possible scaling parameters that should cover most recipes in the near-future: +- **weight_scale**: quantization scalers for the weights +- **weight_scale_2**: global scalers in the context of double scaling +- **pre_quant_scale**: scalers used for smoothing salient weights +- **input_scale**: quantization scalers for the activations + +| Format | Storage dtype | weight_scale | weight_scale_2 | pre_quant_scale | input_scale | +|--------|---------------|--------------|----------------|-----------------|-------------| +| float8_e4m3fn | float32 | float32 (scalar) | - | - | float32 (scalar) | + +You can find the defined formats in `comfy/quant_ops.py` (QUANT_ALGOS). + +### Quantization Metadata + +The metadata stored alongside the checkpoint contains: +- **format_version**: String to define a version of the standard +- **layers**: A dictionary mapping layer names to their quantization format. The format string maps to the definitions found in `QUANT_ALGOS`. + +Example: +```json +{ + "_quantization_metadata": { + "format_version": "1.0", + "layers": { + "model.layers.0.mlp.up_proj": "float8_e4m3fn", + "model.layers.0.mlp.down_proj": "float8_e4m3fn", + "model.layers.1.mlp.up_proj": "float8_e4m3fn" + } + } +} +``` + + +## Creating Quantized Checkpoints + +To create compatible checkpoints, use any quantization tool provided the output follows the checkpoint format described above and uses a layout defined in `QUANT_ALGOS`. + +### Weight Quantization + +Weight quantization is straightforward - compute the scaling factor directly from the weight tensor using the absolute maximum method described earlier. Each layer's weights are quantized independently and stored with their corresponding `weight_scale` parameter. + +### Calibration (for Activation Quantization) + +Activation quantization (e.g., for FP8 Tensor Core operations) requires `input_scale` parameters that cannot be determined from static weights alone. Since activation values depend on actual inputs, we use **post-training calibration (PTQ)**: + +1. **Collect statistics**: Run inference on N representative samples +2. **Track activations**: Record the absolute maximum (`amax`) of inputs to each quantized layer +3. **Compute scales**: Derive `input_scale` from collected statistics +4. **Store in checkpoint**: Save `input_scale` parameters alongside weights + +The calibration dataset should be representative of your target use case. For diffusion models, this typically means a diverse set of prompts and generation parameters. \ No newline at end of file diff --git a/README.md b/README.md index 3f6cfc2ed..28beec427 100644 --- a/README.md +++ b/README.md @@ -112,10 +112,11 @@ Workflow examples can be found on the [Examples page](https://comfyanonymous.git ## Release Process -ComfyUI follows a weekly release cycle targeting Friday but this regularly changes because of model releases or large changes to the codebase. There are three interconnected repositories: +ComfyUI follows a weekly release cycle targeting Monday but this regularly changes because of model releases or large changes to the codebase. There are three interconnected repositories: 1. **[ComfyUI Core](https://github.com/comfyanonymous/ComfyUI)** - - Releases a new stable version (e.g., v0.7.0) + - Releases a new stable version (e.g., v0.7.0) roughly every week. + - Commits outside of the stable release tags may be very unstable and break many custom nodes. - Serves as the foundation for the desktop release 2. **[ComfyUI Desktop](https://github.com/Comfy-Org/desktop)** @@ -172,10 +173,20 @@ There is a portable standalone build for Windows that should work for running on ### [Direct link to download](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia.7z) -Simply download, extract with [7-Zip](https://7-zip.org) and run. Make sure you put your Stable Diffusion checkpoints/models (the huge ckpt/safetensors files) in: ComfyUI\models\checkpoints +Simply download, extract with [7-Zip](https://7-zip.org) or with the windows explorer on recent windows versions and run. For smaller models you normally only need to put the checkpoints (the huge ckpt/safetensors files) in: ComfyUI\models\checkpoints but many of the larger models have multiple files. Make sure to follow the instructions to know which subfolder to put them in ComfyUI\models\ If you have trouble extracting it, right click the file -> properties -> unblock +Update your Nvidia drivers if it doesn't start. + +#### Alternative Downloads: + +[Experimental portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z) + +[Portable with pytorch cuda 12.8 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu128.7z). + +[Portable with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs). + #### How do I share models between another UI and ComfyUI? See the [Config file](extra_model_paths.yaml.example) to set the search paths for models. In the standalone windows build you can find this file in the ComfyUI directory. Rename this file to extra_model_paths.yaml and edit it with your favorite text editor. @@ -191,7 +202,11 @@ comfy install ## Manual Install (Windows, Linux) -Python 3.13 is very well supported. If you have trouble with some custom node dependencies you can try 3.12 +Python 3.14 works but you may encounter issues with the torch compile node. The free threaded variant is still missing some dependencies. + +Python 3.13 is very well supported. If you have trouble with some custom node dependencies on 3.13 you can try 3.12 + +### Instructions: Git clone this repo. @@ -200,18 +215,36 @@ Put your SD checkpoints (the huge ckpt/safetensors files) in: models/checkpoints Put your VAE in: models/vae -### AMD GPUs (Linux only) +### AMD GPUs (Linux) + AMD users can install rocm and pytorch with pip if you don't have it already installed, this is the command to install the stable version: ```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.4``` -This is the command to install the nightly with ROCm 6.4 which might have some performance improvements: +This is the command to install the nightly with ROCm 7.0 which might have some performance improvements: -```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.4``` +```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm7.1``` + + +### AMD GPUs (Experimental: Windows and Linux), RDNA 3, 3.5 and 4 only. + +These have less hardware support than the builds above but they work on windows. You also need to install the pytorch version specific to your hardware. + +RDNA 3 (RX 7000 series): + +```pip install --pre torch torchvision torchaudio --index-url https://rocm.nightlies.amd.com/v2/gfx110X-dgpu/``` + +RDNA 3.5 (Strix halo/Ryzen AI Max+ 365): + +```pip install --pre torch torchvision torchaudio --index-url https://rocm.nightlies.amd.com/v2/gfx1151/``` + +RDNA 4 (RX 9000 series): + +```pip install --pre torch torchvision torchaudio --index-url https://rocm.nightlies.amd.com/v2/gfx120X-all/``` ### Intel GPUs (Windows and Linux) -(Option 1) Intel Arc GPU users can install native PyTorch with torch.xpu support using pip. More information can be found [here](https://pytorch.org/docs/main/notes/get_start_xpu.html) +Intel Arc GPU users can install native PyTorch with torch.xpu support using pip. More information can be found [here](https://pytorch.org/docs/main/notes/get_start_xpu.html) 1. To install PyTorch xpu, use the following command: @@ -221,19 +254,15 @@ This is the command to install the Pytorch xpu nightly which might have some per ```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/xpu``` -(Option 2) Alternatively, Intel GPUs supported by Intel Extension for PyTorch (IPEX) can leverage IPEX for improved performance. - -1. visit [Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu) for more information. - ### NVIDIA Nvidia users should install stable pytorch using this command: -```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu129``` +```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu130``` This is the command to install pytorch nightly instead which might have performance improvements. -```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu129``` +```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu130``` #### Troubleshooting @@ -264,12 +293,6 @@ You can install ComfyUI in Apple Mac silicon (M1 or M2) with any recent macOS ve > **Note**: Remember to add your models, VAE, LoRAs etc. to the corresponding Comfy folders, as discussed in [ComfyUI manual installation](#manual-install-windows-linux). -#### DirectML (AMD Cards on Windows) - -This is very badly supported and is not recommended. There are some unofficial builds of pytorch ROCm on windows that exist that will give you a much better experience than this. This readme will be updated once official pytorch ROCm builds for windows come out. - -```pip install torch-directml``` Then you can launch ComfyUI with: ```python main.py --directml``` - #### Ascend NPUs For models compatible with Ascend Extension for PyTorch (torch_npu). To get started, ensure your environment meets the prerequisites outlined on the [installation](https://ascend.github.io/docs/sources/ascend/quick_install.html) page. Here's a step-by-step guide tailored to your platform and installation method: diff --git a/app/subgraph_manager.py b/app/subgraph_manager.py new file mode 100644 index 000000000..dbe404541 --- /dev/null +++ b/app/subgraph_manager.py @@ -0,0 +1,112 @@ +from __future__ import annotations + +from typing import TypedDict +import os +import folder_paths +import glob +from aiohttp import web +import hashlib + + +class Source: + custom_node = "custom_node" + +class SubgraphEntry(TypedDict): + source: str + """ + Source of subgraph - custom_nodes vs templates. + """ + path: str + """ + Relative path of the subgraph file. + For custom nodes, will be the relative directory like /subgraphs/.json + """ + name: str + """ + Name of subgraph file. + """ + info: CustomNodeSubgraphEntryInfo + """ + Additional info about subgraph; in the case of custom_nodes, will contain nodepack name + """ + data: str + +class CustomNodeSubgraphEntryInfo(TypedDict): + node_pack: str + """Node pack name.""" + +class SubgraphManager: + def __init__(self): + self.cached_custom_node_subgraphs: dict[SubgraphEntry] | None = None + + async def load_entry_data(self, entry: SubgraphEntry): + with open(entry['path'], 'r') as f: + entry['data'] = f.read() + return entry + + async def sanitize_entry(self, entry: SubgraphEntry | None, remove_data=False) -> SubgraphEntry | None: + if entry is None: + return None + entry = entry.copy() + entry.pop('path', None) + if remove_data: + entry.pop('data', None) + return entry + + async def sanitize_entries(self, entries: dict[str, SubgraphEntry], remove_data=False) -> dict[str, SubgraphEntry]: + entries = entries.copy() + for key in list(entries.keys()): + entries[key] = await self.sanitize_entry(entries[key], remove_data) + return entries + + async def get_custom_node_subgraphs(self, loadedModules, force_reload=False): + # if not forced to reload and cached, return cache + if not force_reload and self.cached_custom_node_subgraphs is not None: + return self.cached_custom_node_subgraphs + # Load subgraphs from custom nodes + subfolder = "subgraphs" + subgraphs_dict: dict[SubgraphEntry] = {} + + for folder in folder_paths.get_folder_paths("custom_nodes"): + pattern = os.path.join(folder, f"*/{subfolder}/*.json") + matched_files = glob.glob(pattern) + for file in matched_files: + # replace backslashes with forward slashes + file = file.replace('\\', '/') + info: CustomNodeSubgraphEntryInfo = { + "node_pack": "custom_nodes." + file.split('/')[-3] + } + source = Source.custom_node + # hash source + path to make sure id will be as unique as possible, but + # reproducible across backend reloads + id = hashlib.sha256(f"{source}{file}".encode()).hexdigest() + entry: SubgraphEntry = { + "source": Source.custom_node, + "name": os.path.splitext(os.path.basename(file))[0], + "path": file, + "info": info, + } + subgraphs_dict[id] = entry + self.cached_custom_node_subgraphs = subgraphs_dict + return subgraphs_dict + + async def get_custom_node_subgraph(self, id: str, loadedModules): + subgraphs = await self.get_custom_node_subgraphs(loadedModules) + entry: SubgraphEntry = subgraphs.get(id, None) + if entry is not None and entry.get('data', None) is None: + await self.load_entry_data(entry) + return entry + + def add_routes(self, routes, loadedModules): + @routes.get("/global_subgraphs") + async def get_global_subgraphs(request): + subgraphs_dict = await self.get_custom_node_subgraphs(loadedModules) + # NOTE: we may want to include other sources of global subgraphs such as templates in the future; + # that's the reasoning for the current implementation + return web.json_response(await self.sanitize_entries(subgraphs_dict, remove_data=True)) + + @routes.get("/global_subgraphs/{id}") + async def get_global_subgraph(request): + id = request.match_info.get("id", None) + subgraph = await self.get_custom_node_subgraph(id, loadedModules) + return web.json_response(await self.sanitize_entry(subgraph)) diff --git a/comfy/cli_args.py b/comfy/cli_args.py index cc1f12482..2f30b72d2 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -105,6 +105,7 @@ cache_group = parser.add_mutually_exclusive_group() cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.") cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.") cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.") +cache_group.add_argument("--cache-ram", nargs='?', const=4.0, type=float, default=0, help="Use RAM pressure caching with the specified headroom threshold. If available RAM drops below the threhold the cache remove large items to free RAM. Default 4GB") attn_group = parser.add_mutually_exclusive_group() attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.") @@ -145,7 +146,9 @@ class PerformanceFeature(enum.Enum): CublasOps = "cublas_ops" AutoTune = "autotune" -parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: {}".format(" ".join(map(lambda c: c.value, PerformanceFeature)))) +parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. This is used to test new features so using it might crash your comfyui. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: {}".format(" ".join(map(lambda c: c.value, PerformanceFeature)))) + +parser.add_argument("--disable-pinned-memory", action="store_true", help="Disable pinned memory use.") parser.add_argument("--mmap-torch-files", action="store_true", help="Use mmap when loading ckpt/pt files.") parser.add_argument("--disable-mmap", action="store_true", help="Don't use mmap when loading safetensors.") diff --git a/comfy/controlnet.py b/comfy/controlnet.py index f08ff4b36..0b5e30f52 100644 --- a/comfy/controlnet.py +++ b/comfy/controlnet.py @@ -310,11 +310,13 @@ class ControlLoraOps: self.bias = None def forward(self, input): - weight, bias = comfy.ops.cast_bias_weight(self, input) + weight, bias, offload_stream = comfy.ops.cast_bias_weight(self, input, offloadable=True) if self.up is not None: - return torch.nn.functional.linear(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias) + x = torch.nn.functional.linear(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias) else: - return torch.nn.functional.linear(input, weight, bias) + x = torch.nn.functional.linear(input, weight, bias) + comfy.ops.uncast_bias_weight(self, weight, bias, offload_stream) + return x class Conv2d(torch.nn.Module, comfy.ops.CastWeightBiasOp): def __init__( @@ -350,12 +352,13 @@ class ControlLoraOps: def forward(self, input): - weight, bias = comfy.ops.cast_bias_weight(self, input) + weight, bias, offload_stream = comfy.ops.cast_bias_weight(self, input, offloadable=True) if self.up is not None: - return torch.nn.functional.conv2d(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias, self.stride, self.padding, self.dilation, self.groups) + x = torch.nn.functional.conv2d(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias, self.stride, self.padding, self.dilation, self.groups) else: - return torch.nn.functional.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups) - + x = torch.nn.functional.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups) + comfy.ops.uncast_bias_weight(self, weight, bias, offload_stream) + return x class ControlLora(ControlNet): def __init__(self, control_weights, global_average_pooling=False, model_options={}): #TODO? model_options diff --git a/comfy/ldm/ace/vae/music_dcae_pipeline.py b/comfy/ldm/ace/vae/music_dcae_pipeline.py index af81280eb..3c8830c17 100644 --- a/comfy/ldm/ace/vae/music_dcae_pipeline.py +++ b/comfy/ldm/ace/vae/music_dcae_pipeline.py @@ -23,8 +23,6 @@ class MusicDCAE(torch.nn.Module): else: self.source_sample_rate = source_sample_rate - # self.resampler = torchaudio.transforms.Resample(source_sample_rate, 44100) - self.transform = transforms.Compose([ transforms.Normalize(0.5, 0.5), ]) @@ -37,10 +35,6 @@ class MusicDCAE(torch.nn.Module): self.scale_factor = 0.1786 self.shift_factor = -1.9091 - def load_audio(self, audio_path): - audio, sr = torchaudio.load(audio_path) - return audio, sr - def forward_mel(self, audios): mels = [] for i in range(len(audios)): @@ -73,10 +67,8 @@ class MusicDCAE(torch.nn.Module): latent = self.dcae.encoder(mel.unsqueeze(0)) latents.append(latent) latents = torch.cat(latents, dim=0) - # latent_lengths = (audio_lengths / sr * 44100 / 512 / self.time_dimention_multiple).long() latents = (latents - self.shift_factor) * self.scale_factor return latents - # return latents, latent_lengths @torch.no_grad() def decode(self, latents, audio_lengths=None, sr=None): @@ -91,9 +83,7 @@ class MusicDCAE(torch.nn.Module): wav = self.vocoder.decode(mels[0]).squeeze(1) if sr is not None: - # resampler = torchaudio.transforms.Resample(44100, sr).to(latents.device).to(latents.dtype) wav = torchaudio.functional.resample(wav, 44100, sr) - # wav = resampler(wav) else: sr = 44100 pred_wavs.append(wav) @@ -101,7 +91,6 @@ class MusicDCAE(torch.nn.Module): if audio_lengths is not None: pred_wavs = [wav[:, :length].cpu() for wav, length in zip(pred_wavs, audio_lengths)] return torch.stack(pred_wavs) - # return sr, pred_wavs def forward(self, audios, audio_lengths=None, sr=None): latents, latent_lengths = self.encode(audios=audios, audio_lengths=audio_lengths, sr=sr) diff --git a/comfy/ldm/chroma/layers.py b/comfy/ldm/chroma/layers.py index fc7110cce..2d5684348 100644 --- a/comfy/ldm/chroma/layers.py +++ b/comfy/ldm/chroma/layers.py @@ -1,15 +1,15 @@ import torch from torch import Tensor, nn -from comfy.ldm.flux.math import attention from comfy.ldm.flux.layers import ( MLPEmbedder, RMSNorm, - QKNorm, - SelfAttention, ModulationOut, ) +# TODO: remove this in a few months +SingleStreamBlock = None +DoubleStreamBlock = None class ChromaModulationOut(ModulationOut): @@ -48,124 +48,6 @@ class Approximator(nn.Module): return x -class DoubleStreamBlock(nn.Module): - def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, dtype=None, device=None, operations=None): - super().__init__() - - mlp_hidden_dim = int(hidden_size * mlp_ratio) - self.num_heads = num_heads - self.hidden_size = hidden_size - self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) - self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations) - - self.img_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) - self.img_mlp = nn.Sequential( - operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device), - nn.GELU(approximate="tanh"), - operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device), - ) - - self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) - self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations) - - self.txt_norm2 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) - self.txt_mlp = nn.Sequential( - operations.Linear(hidden_size, mlp_hidden_dim, bias=True, dtype=dtype, device=device), - nn.GELU(approximate="tanh"), - operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device), - ) - self.flipped_img_txt = flipped_img_txt - - def forward(self, img: Tensor, txt: Tensor, pe: Tensor, vec: Tensor, attn_mask=None, transformer_options={}): - (img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec - - # prepare image for attention - img_modulated = torch.addcmul(img_mod1.shift, 1 + img_mod1.scale, self.img_norm1(img)) - img_qkv = self.img_attn.qkv(img_modulated) - img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) - img_q, img_k = self.img_attn.norm(img_q, img_k, img_v) - - # prepare txt for attention - txt_modulated = torch.addcmul(txt_mod1.shift, 1 + txt_mod1.scale, self.txt_norm1(txt)) - txt_qkv = self.txt_attn.qkv(txt_modulated) - txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) - txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v) - - # run actual attention - attn = attention(torch.cat((txt_q, img_q), dim=2), - torch.cat((txt_k, img_k), dim=2), - torch.cat((txt_v, img_v), dim=2), - pe=pe, mask=attn_mask, transformer_options=transformer_options) - - txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :] - - # calculate the img bloks - img.addcmul_(img_mod1.gate, self.img_attn.proj(img_attn)) - img.addcmul_(img_mod2.gate, self.img_mlp(torch.addcmul(img_mod2.shift, 1 + img_mod2.scale, self.img_norm2(img)))) - - # calculate the txt bloks - txt.addcmul_(txt_mod1.gate, self.txt_attn.proj(txt_attn)) - txt.addcmul_(txt_mod2.gate, self.txt_mlp(torch.addcmul(txt_mod2.shift, 1 + txt_mod2.scale, self.txt_norm2(txt)))) - - if txt.dtype == torch.float16: - txt = torch.nan_to_num(txt, nan=0.0, posinf=65504, neginf=-65504) - - return img, txt - - -class SingleStreamBlock(nn.Module): - """ - A DiT block with parallel linear layers as described in - https://arxiv.org/abs/2302.05442 and adapted modulation interface. - """ - - def __init__( - self, - hidden_size: int, - num_heads: int, - mlp_ratio: float = 4.0, - qk_scale: float = None, - dtype=None, - device=None, - operations=None - ): - super().__init__() - self.hidden_dim = hidden_size - self.num_heads = num_heads - head_dim = hidden_size // num_heads - self.scale = qk_scale or head_dim**-0.5 - - self.mlp_hidden_dim = int(hidden_size * mlp_ratio) - # qkv and mlp_in - self.linear1 = operations.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim, dtype=dtype, device=device) - # proj and mlp_out - self.linear2 = operations.Linear(hidden_size + self.mlp_hidden_dim, hidden_size, dtype=dtype, device=device) - - self.norm = QKNorm(head_dim, dtype=dtype, device=device, operations=operations) - - self.hidden_size = hidden_size - self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) - - self.mlp_act = nn.GELU(approximate="tanh") - - def forward(self, x: Tensor, pe: Tensor, vec: Tensor, attn_mask=None, transformer_options={}) -> Tensor: - mod = vec - x_mod = torch.addcmul(mod.shift, 1 + mod.scale, self.pre_norm(x)) - qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1) - - q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) - q, k = self.norm(q, k, v) - - # compute attention - attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options) - # compute activation in mlp stream, cat again and run second linear layer - output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)) - x.addcmul_(mod.gate, output) - if x.dtype == torch.float16: - x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504) - return x - - class LastLayer(nn.Module): def __init__(self, hidden_size: int, patch_size: int, out_channels: int, dtype=None, device=None, operations=None): super().__init__() diff --git a/comfy/ldm/chroma/model.py b/comfy/ldm/chroma/model.py index ad1c523fe..67bf70eb1 100644 --- a/comfy/ldm/chroma/model.py +++ b/comfy/ldm/chroma/model.py @@ -11,12 +11,12 @@ import comfy.ldm.common_dit from comfy.ldm.flux.layers import ( EmbedND, timestep_embedding, + DoubleStreamBlock, + SingleStreamBlock, ) from .layers import ( - DoubleStreamBlock, LastLayer, - SingleStreamBlock, Approximator, ChromaModulationOut, ) @@ -90,6 +90,7 @@ class Chroma(nn.Module): self.num_heads, mlp_ratio=params.mlp_ratio, qkv_bias=params.qkv_bias, + modulation=False, dtype=dtype, device=device, operations=operations ) for _ in range(params.depth) @@ -98,7 +99,7 @@ class Chroma(nn.Module): self.single_blocks = nn.ModuleList( [ - SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, dtype=dtype, device=device, operations=operations) + SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, modulation=False, dtype=dtype, device=device, operations=operations) for _ in range(params.depth_single_blocks) ] ) diff --git a/comfy/ldm/chroma_radiance/model.py b/comfy/ldm/chroma_radiance/model.py index 47aa11b04..e643b4414 100644 --- a/comfy/ldm/chroma_radiance/model.py +++ b/comfy/ldm/chroma_radiance/model.py @@ -10,12 +10,10 @@ from torch import Tensor, nn from einops import repeat import comfy.ldm.common_dit -from comfy.ldm.flux.layers import EmbedND +from comfy.ldm.flux.layers import EmbedND, DoubleStreamBlock, SingleStreamBlock from comfy.ldm.chroma.model import Chroma, ChromaParams from comfy.ldm.chroma.layers import ( - DoubleStreamBlock, - SingleStreamBlock, Approximator, ) from .layers import ( @@ -89,7 +87,6 @@ class ChromaRadiance(Chroma): dtype=dtype, device=device, operations=operations ) - self.double_blocks = nn.ModuleList( [ DoubleStreamBlock( @@ -97,6 +94,7 @@ class ChromaRadiance(Chroma): self.num_heads, mlp_ratio=params.mlp_ratio, qkv_bias=params.qkv_bias, + modulation=False, dtype=dtype, device=device, operations=operations ) for _ in range(params.depth) @@ -109,6 +107,7 @@ class ChromaRadiance(Chroma): self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio, + modulation=False, dtype=dtype, device=device, operations=operations, ) for _ in range(params.depth_single_blocks) @@ -189,15 +188,15 @@ class ChromaRadiance(Chroma): nerf_pixels = nn.functional.unfold(img_orig, kernel_size=patch_size, stride=patch_size) nerf_pixels = nerf_pixels.transpose(1, 2) # -> [B, NumPatches, C * P * P] + # Reshape for per-patch processing + nerf_hidden = img_out.reshape(B * num_patches, params.hidden_size) + nerf_pixels = nerf_pixels.reshape(B * num_patches, C, patch_size**2).transpose(1, 2) + if params.nerf_tile_size > 0 and num_patches > params.nerf_tile_size: # Enable tiling if nerf_tile_size isn't 0 and we actually have more patches than # the tile size. - img_dct = self.forward_tiled_nerf(img_out, nerf_pixels, B, C, num_patches, patch_size, params) + img_dct = self.forward_tiled_nerf(nerf_hidden, nerf_pixels, B, C, num_patches, patch_size, params) else: - # Reshape for per-patch processing - nerf_hidden = img_out.reshape(B * num_patches, params.hidden_size) - nerf_pixels = nerf_pixels.reshape(B * num_patches, C, patch_size**2).transpose(1, 2) - # Get DCT-encoded pixel embeddings [pixel-dct] img_dct = self.nerf_image_embedder(nerf_pixels) @@ -240,17 +239,8 @@ class ChromaRadiance(Chroma): end = min(i + tile_size, num_patches) # Slice the current tile from the input tensors - nerf_hidden_tile = nerf_hidden[:, i:end, :] - nerf_pixels_tile = nerf_pixels[:, i:end, :] - - # Get the actual number of patches in this tile (can be smaller for the last tile) - num_patches_tile = nerf_hidden_tile.shape[1] - - # Reshape the tile for per-patch processing - # [B, NumPatches_tile, D] -> [B * NumPatches_tile, D] - nerf_hidden_tile = nerf_hidden_tile.reshape(batch * num_patches_tile, params.hidden_size) - # [B, NumPatches_tile, C*P*P] -> [B*NumPatches_tile, C, P*P] -> [B*NumPatches_tile, P*P, C] - nerf_pixels_tile = nerf_pixels_tile.reshape(batch * num_patches_tile, channels, patch_size**2).transpose(1, 2) + nerf_hidden_tile = nerf_hidden[i * batch:end * batch] + nerf_pixels_tile = nerf_pixels[i * batch:end * batch] # get DCT-encoded pixel embeddings [pixel-dct] img_dct_tile = self.nerf_image_embedder(nerf_pixels_tile) diff --git a/comfy/ldm/flux/layers.py b/comfy/ldm/flux/layers.py index ef21b416b..23150a712 100644 --- a/comfy/ldm/flux/layers.py +++ b/comfy/ldm/flux/layers.py @@ -130,13 +130,17 @@ def apply_mod(tensor, m_mult, m_add=None, modulation_dims=None): class DoubleStreamBlock(nn.Module): - def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, dtype=None, device=None, operations=None): + def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False, flipped_img_txt=False, modulation=True, dtype=None, device=None, operations=None): super().__init__() mlp_hidden_dim = int(hidden_size * mlp_ratio) self.num_heads = num_heads self.hidden_size = hidden_size - self.img_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations) + self.modulation = modulation + + if self.modulation: + self.img_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations) + self.img_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations) @@ -147,7 +151,9 @@ class DoubleStreamBlock(nn.Module): operations.Linear(mlp_hidden_dim, hidden_size, bias=True, dtype=dtype, device=device), ) - self.txt_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations) + if self.modulation: + self.txt_mod = Modulation(hidden_size, double=True, dtype=dtype, device=device, operations=operations) + self.txt_norm1 = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias, dtype=dtype, device=device, operations=operations) @@ -160,46 +166,65 @@ class DoubleStreamBlock(nn.Module): self.flipped_img_txt = flipped_img_txt def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims_img=None, modulation_dims_txt=None, transformer_options={}): - img_mod1, img_mod2 = self.img_mod(vec) - txt_mod1, txt_mod2 = self.txt_mod(vec) + if self.modulation: + img_mod1, img_mod2 = self.img_mod(vec) + txt_mod1, txt_mod2 = self.txt_mod(vec) + else: + (img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec # prepare image for attention img_modulated = self.img_norm1(img) img_modulated = apply_mod(img_modulated, (1 + img_mod1.scale), img_mod1.shift, modulation_dims_img) img_qkv = self.img_attn.qkv(img_modulated) + del img_modulated img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + del img_qkv img_q, img_k = self.img_attn.norm(img_q, img_k, img_v) # prepare txt for attention txt_modulated = self.txt_norm1(txt) txt_modulated = apply_mod(txt_modulated, (1 + txt_mod1.scale), txt_mod1.shift, modulation_dims_txt) txt_qkv = self.txt_attn.qkv(txt_modulated) + del txt_modulated txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + del txt_qkv txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v) if self.flipped_img_txt: + q = torch.cat((img_q, txt_q), dim=2) + del img_q, txt_q + k = torch.cat((img_k, txt_k), dim=2) + del img_k, txt_k + v = torch.cat((img_v, txt_v), dim=2) + del img_v, txt_v # run actual attention - attn = attention(torch.cat((img_q, txt_q), dim=2), - torch.cat((img_k, txt_k), dim=2), - torch.cat((img_v, txt_v), dim=2), + attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options) + del q, k, v img_attn, txt_attn = attn[:, : img.shape[1]], attn[:, img.shape[1]:] else: + q = torch.cat((txt_q, img_q), dim=2) + del txt_q, img_q + k = torch.cat((txt_k, img_k), dim=2) + del txt_k, img_k + v = torch.cat((txt_v, img_v), dim=2) + del txt_v, img_v # run actual attention - attn = attention(torch.cat((txt_q, img_q), dim=2), - torch.cat((txt_k, img_k), dim=2), - torch.cat((txt_v, img_v), dim=2), + attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options) + del q, k, v txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1]:] # calculate the img bloks - img = img + apply_mod(self.img_attn.proj(img_attn), img_mod1.gate, None, modulation_dims_img) - img = img + apply_mod(self.img_mlp(apply_mod(self.img_norm2(img), (1 + img_mod2.scale), img_mod2.shift, modulation_dims_img)), img_mod2.gate, None, modulation_dims_img) + img += apply_mod(self.img_attn.proj(img_attn), img_mod1.gate, None, modulation_dims_img) + del img_attn + img += apply_mod(self.img_mlp(apply_mod(self.img_norm2(img), (1 + img_mod2.scale), img_mod2.shift, modulation_dims_img)), img_mod2.gate, None, modulation_dims_img) # calculate the txt bloks txt += apply_mod(self.txt_attn.proj(txt_attn), txt_mod1.gate, None, modulation_dims_txt) + del txt_attn txt += apply_mod(self.txt_mlp(apply_mod(self.txt_norm2(txt), (1 + txt_mod2.scale), txt_mod2.shift, modulation_dims_txt)), txt_mod2.gate, None, modulation_dims_txt) if txt.dtype == torch.float16: @@ -220,6 +245,7 @@ class SingleStreamBlock(nn.Module): num_heads: int, mlp_ratio: float = 4.0, qk_scale: float = None, + modulation=True, dtype=None, device=None, operations=None @@ -242,19 +268,29 @@ class SingleStreamBlock(nn.Module): self.pre_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6, dtype=dtype, device=device) self.mlp_act = nn.GELU(approximate="tanh") - self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations) + if modulation: + self.modulation = Modulation(hidden_size, double=False, dtype=dtype, device=device, operations=operations) + else: + self.modulation = None def forward(self, x: Tensor, vec: Tensor, pe: Tensor, attn_mask=None, modulation_dims=None, transformer_options={}) -> Tensor: - mod, _ = self.modulation(vec) + if self.modulation: + mod, _ = self.modulation(vec) + else: + mod = vec + qkv, mlp = torch.split(self.linear1(apply_mod(self.pre_norm(x), (1 + mod.scale), mod.shift, modulation_dims)), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1) q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + del qkv q, k = self.norm(q, k, v) # compute attention attn = attention(q, k, v, pe=pe, mask=attn_mask, transformer_options=transformer_options) + del q, k, v # compute activation in mlp stream, cat again and run second linear layer - output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)) + mlp = self.mlp_act(mlp) + output = self.linear2(torch.cat((attn, mlp), 2)) x += apply_mod(output, mod.gate, None, modulation_dims) if x.dtype == torch.float16: x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504) diff --git a/comfy/ldm/flux/math.py b/comfy/ldm/flux/math.py index fb7cd7586..158420290 100644 --- a/comfy/ldm/flux/math.py +++ b/comfy/ldm/flux/math.py @@ -7,15 +7,7 @@ import comfy.model_management def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor, mask=None, transformer_options={}) -> Tensor: - q_shape = q.shape - k_shape = k.shape - - if pe is not None: - q = q.to(dtype=pe.dtype).reshape(*q.shape[:-1], -1, 1, 2) - k = k.to(dtype=pe.dtype).reshape(*k.shape[:-1], -1, 1, 2) - q = (pe[..., 0] * q[..., 0] + pe[..., 1] * q[..., 1]).reshape(*q_shape).type_as(v) - k = (pe[..., 0] * k[..., 0] + pe[..., 1] * k[..., 1]).reshape(*k_shape).type_as(v) - + q, k = apply_rope(q, k, pe) heads = q.shape[1] x = optimized_attention(q, k, v, heads, skip_reshape=True, mask=mask, transformer_options=transformer_options) return x @@ -37,7 +29,10 @@ def rope(pos: Tensor, dim: int, theta: int) -> Tensor: def apply_rope1(x: Tensor, freqs_cis: Tensor): x_ = x.to(dtype=freqs_cis.dtype).reshape(*x.shape[:-1], -1, 1, 2) - x_out = freqs_cis[..., 0] * x_[..., 0] + freqs_cis[..., 1] * x_[..., 1] + + x_out = freqs_cis[..., 0] * x_[..., 0] + x_out.addcmul_(freqs_cis[..., 1], x_[..., 1]) + return x_out.reshape(*x.shape).type_as(x) def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor): diff --git a/comfy/ldm/flux/model.py b/comfy/ldm/flux/model.py index 14f90cea5..b9d36f202 100644 --- a/comfy/ldm/flux/model.py +++ b/comfy/ldm/flux/model.py @@ -210,7 +210,7 @@ class Flux(nn.Module): img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels) return img - def process_img(self, x, index=0, h_offset=0, w_offset=0): + def process_img(self, x, index=0, h_offset=0, w_offset=0, transformer_options={}): bs, c, h, w = x.shape patch_size = self.patch_size x = comfy.ldm.common_dit.pad_to_patch_size(x, (patch_size, patch_size)) @@ -222,10 +222,22 @@ class Flux(nn.Module): h_offset = ((h_offset + (patch_size // 2)) // patch_size) w_offset = ((w_offset + (patch_size // 2)) // patch_size) - img_ids = torch.zeros((h_len, w_len, 3), device=x.device, dtype=x.dtype) + steps_h = h_len + steps_w = w_len + + rope_options = transformer_options.get("rope_options", None) + if rope_options is not None: + h_len = (h_len - 1.0) * rope_options.get("scale_y", 1.0) + 1.0 + w_len = (w_len - 1.0) * rope_options.get("scale_x", 1.0) + 1.0 + + index += rope_options.get("shift_t", 0.0) + h_offset += rope_options.get("shift_y", 0.0) + w_offset += rope_options.get("shift_x", 0.0) + + img_ids = torch.zeros((steps_h, steps_w, 3), device=x.device, dtype=x.dtype) img_ids[:, :, 0] = img_ids[:, :, 1] + index - img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=h_len, device=x.device, dtype=x.dtype).unsqueeze(1) - img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=w_len, device=x.device, dtype=x.dtype).unsqueeze(0) + img_ids[:, :, 1] = img_ids[:, :, 1] + torch.linspace(h_offset, h_len - 1 + h_offset, steps=steps_h, device=x.device, dtype=x.dtype).unsqueeze(1) + img_ids[:, :, 2] = img_ids[:, :, 2] + torch.linspace(w_offset, w_len - 1 + w_offset, steps=steps_w, device=x.device, dtype=x.dtype).unsqueeze(0) return img, repeat(img_ids, "h w c -> b (h w) c", b=bs) def forward(self, x, timestep, context, y=None, guidance=None, ref_latents=None, control=None, transformer_options={}, **kwargs): @@ -241,7 +253,7 @@ class Flux(nn.Module): h_len = ((h_orig + (patch_size // 2)) // patch_size) w_len = ((w_orig + (patch_size // 2)) // patch_size) - img, img_ids = self.process_img(x) + img, img_ids = self.process_img(x, transformer_options=transformer_options) img_tokens = img.shape[1] if ref_latents is not None: h = 0 diff --git a/comfy/ldm/hunyuan_video/vae_refiner.py b/comfy/ldm/hunyuan_video/vae_refiner.py index c6f742710..c2a0b507d 100644 --- a/comfy/ldm/hunyuan_video/vae_refiner.py +++ b/comfy/ldm/hunyuan_video/vae_refiner.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock, VideoConv3d +from comfy.ldm.modules.diffusionmodules.model import ResnetBlock, AttnBlock, VideoConv3d, Normalize import comfy.ops import comfy.ldm.models.autoencoder ops = comfy.ops.disable_weight_init @@ -17,11 +17,12 @@ class RMS_norm(nn.Module): return F.normalize(x, dim=1) * self.scale * self.gamma class DnSmpl(nn.Module): - def __init__(self, ic, oc, tds=True): + def __init__(self, ic, oc, tds=True, refiner_vae=True, op=VideoConv3d): super().__init__() fct = 2 * 2 * 2 if tds else 1 * 2 * 2 assert oc % fct == 0 - self.conv = VideoConv3d(ic, oc // fct, kernel_size=3) + self.conv = op(ic, oc // fct, kernel_size=3, stride=1, padding=1) + self.refiner_vae = refiner_vae self.tds = tds self.gs = fct * ic // oc @@ -30,7 +31,7 @@ class DnSmpl(nn.Module): r1 = 2 if self.tds else 1 h = self.conv(x) - if self.tds: + if self.tds and self.refiner_vae: hf = h[:, :, :1, :, :] b, c, f, ht, wd = hf.shape hf = hf.reshape(b, c, f, ht // 2, 2, wd // 2, 2) @@ -66,6 +67,7 @@ class DnSmpl(nn.Module): sc = torch.cat([xf, xn], dim=2) else: b, c, frms, ht, wd = h.shape + nf = frms // r1 h = h.reshape(b, c, nf, r1, ht // 2, 2, wd // 2, 2) h = h.permute(0, 3, 5, 7, 1, 2, 4, 6) @@ -83,10 +85,11 @@ class DnSmpl(nn.Module): class UpSmpl(nn.Module): - def __init__(self, ic, oc, tus=True): + def __init__(self, ic, oc, tus=True, refiner_vae=True, op=VideoConv3d): super().__init__() fct = 2 * 2 * 2 if tus else 1 * 2 * 2 - self.conv = VideoConv3d(ic, oc * fct, kernel_size=3) + self.conv = op(ic, oc * fct, kernel_size=3, stride=1, padding=1) + self.refiner_vae = refiner_vae self.tus = tus self.rp = fct * oc // ic @@ -95,7 +98,7 @@ class UpSmpl(nn.Module): r1 = 2 if self.tus else 1 h = self.conv(x) - if self.tus: + if self.tus and self.refiner_vae: hf = h[:, :, :1, :, :] b, c, f, ht, wd = hf.shape nc = c // (2 * 2) @@ -148,43 +151,56 @@ class UpSmpl(nn.Module): class Encoder(nn.Module): def __init__(self, in_channels, z_channels, block_out_channels, num_res_blocks, - ffactor_spatial, ffactor_temporal, downsample_match_channel=True, **_): + ffactor_spatial, ffactor_temporal, downsample_match_channel=True, refiner_vae=True, **_): super().__init__() self.z_channels = z_channels self.block_out_channels = block_out_channels self.num_res_blocks = num_res_blocks - self.conv_in = VideoConv3d(in_channels, block_out_channels[0], 3, 1, 1) + self.ffactor_temporal = ffactor_temporal + + self.refiner_vae = refiner_vae + if self.refiner_vae: + conv_op = VideoConv3d + norm_op = RMS_norm + else: + conv_op = ops.Conv3d + norm_op = Normalize + + self.conv_in = conv_op(in_channels, block_out_channels[0], 3, 1, 1) self.down = nn.ModuleList() ch = block_out_channels[0] depth = (ffactor_spatial >> 1).bit_length() - depth_temporal = ((ffactor_spatial // ffactor_temporal) >> 1).bit_length() + depth_temporal = ((ffactor_spatial // self.ffactor_temporal) >> 1).bit_length() for i, tgt in enumerate(block_out_channels): stage = nn.Module() stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt, out_channels=tgt, temb_channels=0, - conv_op=VideoConv3d, norm_op=RMS_norm) + conv_op=conv_op, norm_op=norm_op) for j in range(num_res_blocks)]) ch = tgt if i < depth: nxt = block_out_channels[i + 1] if i + 1 < len(block_out_channels) and downsample_match_channel else ch - stage.downsample = DnSmpl(ch, nxt, tds=i >= depth_temporal) + stage.downsample = DnSmpl(ch, nxt, tds=i >= depth_temporal, refiner_vae=self.refiner_vae, op=conv_op) ch = nxt self.down.append(stage) self.mid = nn.Module() - self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm) - self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=RMS_norm) - self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm) + self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op) + self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op) + self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op) - self.norm_out = RMS_norm(ch) - self.conv_out = VideoConv3d(ch, z_channels << 1, 3, 1, 1) + self.norm_out = norm_op(ch) + self.conv_out = conv_op(ch, z_channels << 1, 3, 1, 1) self.regul = comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer() def forward(self, x): + if not self.refiner_vae and x.shape[2] == 1: + x = x.expand(-1, -1, self.ffactor_temporal, -1, -1) + x = self.conv_in(x) for stage in self.down: @@ -200,31 +216,42 @@ class Encoder(nn.Module): skip = x.view(b, c // grp, grp, t, h, w).mean(2) out = self.conv_out(F.silu(self.norm_out(x))) + skip - out = self.regul(out)[0] - out = torch.cat((out[:, :, :1], out), dim=2) - out = out.permute(0, 2, 1, 3, 4) - b, f_times_2, c, h, w = out.shape - out = out.reshape(b, f_times_2 // 2, 2 * c, h, w) - out = out.permute(0, 2, 1, 3, 4).contiguous() + if self.refiner_vae: + out = self.regul(out)[0] + + out = torch.cat((out[:, :, :1], out), dim=2) + out = out.permute(0, 2, 1, 3, 4) + b, f_times_2, c, h, w = out.shape + out = out.reshape(b, f_times_2 // 2, 2 * c, h, w) + out = out.permute(0, 2, 1, 3, 4).contiguous() + return out class Decoder(nn.Module): def __init__(self, z_channels, out_channels, block_out_channels, num_res_blocks, - ffactor_spatial, ffactor_temporal, upsample_match_channel=True, **_): + ffactor_spatial, ffactor_temporal, upsample_match_channel=True, refiner_vae=True, **_): super().__init__() block_out_channels = block_out_channels[::-1] self.z_channels = z_channels self.block_out_channels = block_out_channels self.num_res_blocks = num_res_blocks + self.refiner_vae = refiner_vae + if self.refiner_vae: + conv_op = VideoConv3d + norm_op = RMS_norm + else: + conv_op = ops.Conv3d + norm_op = Normalize + ch = block_out_channels[0] - self.conv_in = VideoConv3d(z_channels, ch, 3) + self.conv_in = conv_op(z_channels, ch, kernel_size=3, stride=1, padding=1) self.mid = nn.Module() - self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm) - self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=RMS_norm) - self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=VideoConv3d, norm_op=RMS_norm) + self.mid.block_1 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op) + self.mid.attn_1 = AttnBlock(ch, conv_op=ops.Conv3d, norm_op=norm_op) + self.mid.block_2 = ResnetBlock(in_channels=ch, out_channels=ch, temb_channels=0, conv_op=conv_op, norm_op=norm_op) self.up = nn.ModuleList() depth = (ffactor_spatial >> 1).bit_length() @@ -235,25 +262,26 @@ class Decoder(nn.Module): stage.block = nn.ModuleList([ResnetBlock(in_channels=ch if j == 0 else tgt, out_channels=tgt, temb_channels=0, - conv_op=VideoConv3d, norm_op=RMS_norm) + conv_op=conv_op, norm_op=norm_op) for j in range(num_res_blocks + 1)]) ch = tgt if i < depth: nxt = block_out_channels[i + 1] if i + 1 < len(block_out_channels) and upsample_match_channel else ch - stage.upsample = UpSmpl(ch, nxt, tus=i < depth_temporal) + stage.upsample = UpSmpl(ch, nxt, tus=i < depth_temporal, refiner_vae=self.refiner_vae, op=conv_op) ch = nxt self.up.append(stage) - self.norm_out = RMS_norm(ch) - self.conv_out = VideoConv3d(ch, out_channels, 3) + self.norm_out = norm_op(ch) + self.conv_out = conv_op(ch, out_channels, 3, stride=1, padding=1) def forward(self, z): - z = z.permute(0, 2, 1, 3, 4) - b, f, c, h, w = z.shape - z = z.reshape(b, f, 2, c // 2, h, w) - z = z.permute(0, 1, 2, 3, 4, 5).reshape(b, f * 2, c // 2, h, w) - z = z.permute(0, 2, 1, 3, 4) - z = z[:, :, 1:] + if self.refiner_vae: + z = z.permute(0, 2, 1, 3, 4) + b, f, c, h, w = z.shape + z = z.reshape(b, f, 2, c // 2, h, w) + z = z.permute(0, 1, 2, 3, 4, 5).reshape(b, f * 2, c // 2, h, w) + z = z.permute(0, 2, 1, 3, 4) + z = z[:, :, 1:] x = self.conv_in(z) + z.repeat_interleave(self.block_out_channels[0] // self.z_channels, 1) x = self.mid.block_2(self.mid.attn_1(self.mid.block_1(x))) @@ -264,4 +292,10 @@ class Decoder(nn.Module): if hasattr(stage, 'upsample'): x = stage.upsample(x) - return self.conv_out(F.silu(self.norm_out(x))) + out = self.conv_out(F.silu(self.norm_out(x))) + + if not self.refiner_vae: + if z.shape[-3] == 1: + out = out[:, :, -1:] + + return out diff --git a/comfy/ldm/lightricks/model.py b/comfy/ldm/lightricks/model.py index def365ba7..593f7940f 100644 --- a/comfy/ldm/lightricks/model.py +++ b/comfy/ldm/lightricks/model.py @@ -3,12 +3,11 @@ from torch import nn import comfy.patcher_extension import comfy.ldm.modules.attention import comfy.ldm.common_dit -from einops import rearrange import math from typing import Dict, Optional, Tuple from .symmetric_patchifier import SymmetricPatchifier, latent_to_pixel_coords - +from comfy.ldm.flux.math import apply_rope1 def get_timestep_embedding( timesteps: torch.Tensor, @@ -238,20 +237,6 @@ class FeedForward(nn.Module): return self.net(x) -def apply_rotary_emb(input_tensor, freqs_cis): #TODO: remove duplicate funcs and pick the best/fastest one - cos_freqs = freqs_cis[0] - sin_freqs = freqs_cis[1] - - t_dup = rearrange(input_tensor, "... (d r) -> ... d r", r=2) - t1, t2 = t_dup.unbind(dim=-1) - t_dup = torch.stack((-t2, t1), dim=-1) - input_tensor_rot = rearrange(t_dup, "... d r -> ... (d r)") - - out = input_tensor * cos_freqs + input_tensor_rot * sin_freqs - - return out - - class CrossAttention(nn.Module): def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0., attn_precision=None, dtype=None, device=None, operations=None): super().__init__() @@ -281,8 +266,8 @@ class CrossAttention(nn.Module): k = self.k_norm(k) if pe is not None: - q = apply_rotary_emb(q, pe) - k = apply_rotary_emb(k, pe) + q = apply_rope1(q.unsqueeze(1), pe).squeeze(1) + k = apply_rope1(k.unsqueeze(1), pe).squeeze(1) if mask is None: out = comfy.ldm.modules.attention.optimized_attention(q, k, v, self.heads, attn_precision=self.attn_precision, transformer_options=transformer_options) @@ -306,12 +291,17 @@ class BasicTransformerBlock(nn.Module): def forward(self, x, context=None, attention_mask=None, timestep=None, pe=None, transformer_options={}): shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (self.scale_shift_table[None, None].to(device=x.device, dtype=x.dtype) + timestep.reshape(x.shape[0], timestep.shape[1], self.scale_shift_table.shape[0], -1)).unbind(dim=2) - x += self.attn1(comfy.ldm.common_dit.rms_norm(x) * (1 + scale_msa) + shift_msa, pe=pe, transformer_options=transformer_options) * gate_msa + attn1_input = comfy.ldm.common_dit.rms_norm(x) + attn1_input = torch.addcmul(attn1_input, attn1_input, scale_msa).add_(shift_msa) + attn1_input = self.attn1(attn1_input, pe=pe, transformer_options=transformer_options) + x.addcmul_(attn1_input, gate_msa) + del attn1_input x += self.attn2(x, context=context, mask=attention_mask, transformer_options=transformer_options) - y = comfy.ldm.common_dit.rms_norm(x) * (1 + scale_mlp) + shift_mlp - x += self.ff(y) * gate_mlp + y = comfy.ldm.common_dit.rms_norm(x) + y = torch.addcmul(y, y, scale_mlp).add_(shift_mlp) + x.addcmul_(self.ff(y), gate_mlp) return x @@ -327,41 +317,35 @@ def get_fractional_positions(indices_grid, max_pos): def precompute_freqs_cis(indices_grid, dim, out_dtype, theta=10000.0, max_pos=[20, 2048, 2048]): - dtype = torch.float32 #self.dtype + dtype = torch.float32 + device = indices_grid.device + # Get fractional positions and compute frequency indices fractional_positions = get_fractional_positions(indices_grid, max_pos) + indices = theta ** torch.linspace(0, 1, dim // 6, device=device, dtype=dtype) * math.pi / 2 - start = 1 - end = theta - device = fractional_positions.device + # Compute frequencies and apply cos/sin + freqs = (indices * (fractional_positions.unsqueeze(-1) * 2 - 1)).transpose(-1, -2).flatten(2) + cos_vals = freqs.cos().repeat_interleave(2, dim=-1) + sin_vals = freqs.sin().repeat_interleave(2, dim=-1) - indices = theta ** ( - torch.linspace( - math.log(start, theta), - math.log(end, theta), - dim // 6, - device=device, - dtype=dtype, - ) - ) - indices = indices.to(dtype=dtype) - - indices = indices * math.pi / 2 - - freqs = ( - (indices * (fractional_positions.unsqueeze(-1) * 2 - 1)) - .transpose(-1, -2) - .flatten(2) - ) - - cos_freq = freqs.cos().repeat_interleave(2, dim=-1) - sin_freq = freqs.sin().repeat_interleave(2, dim=-1) + # Pad if dim is not divisible by 6 if dim % 6 != 0: - cos_padding = torch.ones_like(cos_freq[:, :, : dim % 6]) - sin_padding = torch.zeros_like(cos_freq[:, :, : dim % 6]) - cos_freq = torch.cat([cos_padding, cos_freq], dim=-1) - sin_freq = torch.cat([sin_padding, sin_freq], dim=-1) - return cos_freq.to(out_dtype), sin_freq.to(out_dtype) + padding_size = dim % 6 + cos_vals = torch.cat([torch.ones_like(cos_vals[:, :, :padding_size]), cos_vals], dim=-1) + sin_vals = torch.cat([torch.zeros_like(sin_vals[:, :, :padding_size]), sin_vals], dim=-1) + + # Reshape and extract one value per pair (since repeat_interleave duplicates each value) + cos_vals = cos_vals.reshape(*cos_vals.shape[:2], -1, 2)[..., 0].to(out_dtype) # [B, N, dim//2] + sin_vals = sin_vals.reshape(*sin_vals.shape[:2], -1, 2)[..., 0].to(out_dtype) # [B, N, dim//2] + + # Build rotation matrix [[cos, -sin], [sin, cos]] and add heads dimension + freqs_cis = torch.stack([ + torch.stack([cos_vals, -sin_vals], dim=-1), + torch.stack([sin_vals, cos_vals], dim=-1) + ], dim=-2).unsqueeze(1) # [B, 1, N, dim//2, 2, 2] + + return freqs_cis class LTXVModel(torch.nn.Module): @@ -501,7 +485,7 @@ class LTXVModel(torch.nn.Module): shift, scale = scale_shift_values[:, :, 0], scale_shift_values[:, :, 1] x = self.norm_out(x) # Modulation - x = x * (1 + scale) + shift + x = torch.addcmul(x, x, scale).add_(shift) x = self.proj_out(x) x = self.patchifier.unpatchify( diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py index f87d98ac0..b4494a51d 100644 --- a/comfy/ldm/lumina/model.py +++ b/comfy/ldm/lumina/model.py @@ -522,7 +522,7 @@ class NextDiT(nn.Module): max_cap_len = max(l_effective_cap_len) max_img_len = max(l_effective_img_len) - position_ids = torch.zeros(bsz, max_seq_len, 3, dtype=torch.int32, device=device) + position_ids = torch.zeros(bsz, max_seq_len, 3, dtype=torch.float32, device=device) for i in range(bsz): cap_len = l_effective_cap_len[i] @@ -531,10 +531,22 @@ class NextDiT(nn.Module): H_tokens, W_tokens = H // pH, W // pW assert H_tokens * W_tokens == img_len - position_ids[i, :cap_len, 0] = torch.arange(cap_len, dtype=torch.int32, device=device) + rope_options = transformer_options.get("rope_options", None) + h_scale = 1.0 + w_scale = 1.0 + h_start = 0 + w_start = 0 + if rope_options is not None: + h_scale = rope_options.get("scale_y", 1.0) + w_scale = rope_options.get("scale_x", 1.0) + + h_start = rope_options.get("shift_y", 0.0) + w_start = rope_options.get("shift_x", 0.0) + + position_ids[i, :cap_len, 0] = torch.arange(cap_len, dtype=torch.float32, device=device) position_ids[i, cap_len:cap_len+img_len, 0] = cap_len - row_ids = torch.arange(H_tokens, dtype=torch.int32, device=device).view(-1, 1).repeat(1, W_tokens).flatten() - col_ids = torch.arange(W_tokens, dtype=torch.int32, device=device).view(1, -1).repeat(H_tokens, 1).flatten() + row_ids = (torch.arange(H_tokens, dtype=torch.float32, device=device) * h_scale + h_start).view(-1, 1).repeat(1, W_tokens).flatten() + col_ids = (torch.arange(W_tokens, dtype=torch.float32, device=device) * w_scale + w_start).view(1, -1).repeat(H_tokens, 1).flatten() position_ids[i, cap_len:cap_len+img_len, 1] = row_ids position_ids[i, cap_len:cap_len+img_len, 2] = col_ids diff --git a/comfy/ldm/mmaudio/vae/__init__.py b/comfy/ldm/mmaudio/vae/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/comfy/ldm/mmaudio/vae/activations.py b/comfy/ldm/mmaudio/vae/activations.py new file mode 100644 index 000000000..db9192e3e --- /dev/null +++ b/comfy/ldm/mmaudio/vae/activations.py @@ -0,0 +1,120 @@ +# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license. +# LICENSE is in incl_licenses directory. + +import torch +from torch import nn, sin, pow +from torch.nn import Parameter +import comfy.model_management + +class Snake(nn.Module): + ''' + Implementation of a sine-based periodic activation function + Shape: + - Input: (B, C, T) + - Output: (B, C, T), same shape as the input + Parameters: + - alpha - trainable parameter + References: + - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: + https://arxiv.org/abs/2006.08195 + Examples: + >>> a1 = snake(256) + >>> x = torch.randn(256) + >>> x = a1(x) + ''' + def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): + ''' + Initialization. + INPUT: + - in_features: shape of the input + - alpha: trainable parameter + alpha is initialized to 1 by default, higher values = higher-frequency. + alpha will be trained along with the rest of your model. + ''' + super(Snake, self).__init__() + self.in_features = in_features + + # initialize alpha + self.alpha_logscale = alpha_logscale + if self.alpha_logscale: + self.alpha = Parameter(torch.empty(in_features)) + else: + self.alpha = Parameter(torch.empty(in_features)) + + self.alpha.requires_grad = alpha_trainable + + self.no_div_by_zero = 0.000000001 + + def forward(self, x): + ''' + Forward pass of the function. + Applies the function to the input elementwise. + Snake ∶= x + 1/a * sin^2 (xa) + ''' + alpha = comfy.model_management.cast_to(self.alpha, dtype=x.dtype, device=x.device).unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] + if self.alpha_logscale: + alpha = torch.exp(alpha) + x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2) + + return x + + +class SnakeBeta(nn.Module): + ''' + A modified Snake function which uses separate parameters for the magnitude of the periodic components + Shape: + - Input: (B, C, T) + - Output: (B, C, T), same shape as the input + Parameters: + - alpha - trainable parameter that controls frequency + - beta - trainable parameter that controls magnitude + References: + - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda: + https://arxiv.org/abs/2006.08195 + Examples: + >>> a1 = snakebeta(256) + >>> x = torch.randn(256) + >>> x = a1(x) + ''' + def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False): + ''' + Initialization. + INPUT: + - in_features: shape of the input + - alpha - trainable parameter that controls frequency + - beta - trainable parameter that controls magnitude + alpha is initialized to 1 by default, higher values = higher-frequency. + beta is initialized to 1 by default, higher values = higher-magnitude. + alpha will be trained along with the rest of your model. + ''' + super(SnakeBeta, self).__init__() + self.in_features = in_features + + # initialize alpha + self.alpha_logscale = alpha_logscale + if self.alpha_logscale: + self.alpha = Parameter(torch.empty(in_features)) + self.beta = Parameter(torch.empty(in_features)) + else: + self.alpha = Parameter(torch.empty(in_features)) + self.beta = Parameter(torch.empty(in_features)) + + self.alpha.requires_grad = alpha_trainable + self.beta.requires_grad = alpha_trainable + + self.no_div_by_zero = 0.000000001 + + def forward(self, x): + ''' + Forward pass of the function. + Applies the function to the input elementwise. + SnakeBeta ∶= x + 1/b * sin^2 (xa) + ''' + alpha = comfy.model_management.cast_to(self.alpha, dtype=x.dtype, device=x.device).unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T] + beta = comfy.model_management.cast_to(self.beta, dtype=x.dtype, device=x.device).unsqueeze(0).unsqueeze(-1) + if self.alpha_logscale: + alpha = torch.exp(alpha) + beta = torch.exp(beta) + x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2) + + return x diff --git a/comfy/ldm/mmaudio/vae/alias_free_torch.py b/comfy/ldm/mmaudio/vae/alias_free_torch.py new file mode 100644 index 000000000..35c70b897 --- /dev/null +++ b/comfy/ldm/mmaudio/vae/alias_free_torch.py @@ -0,0 +1,157 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import math +import comfy.model_management + +if 'sinc' in dir(torch): + sinc = torch.sinc +else: + # This code is adopted from adefossez's julius.core.sinc under the MIT License + # https://adefossez.github.io/julius/julius/core.html + # LICENSE is in incl_licenses directory. + def sinc(x: torch.Tensor): + """ + Implementation of sinc, i.e. sin(pi * x) / (pi * x) + __Warning__: Different to julius.sinc, the input is multiplied by `pi`! + """ + return torch.where(x == 0, + torch.tensor(1., device=x.device, dtype=x.dtype), + torch.sin(math.pi * x) / math.pi / x) + + +# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License +# https://adefossez.github.io/julius/julius/lowpass.html +# LICENSE is in incl_licenses directory. +def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size] + even = (kernel_size % 2 == 0) + half_size = kernel_size // 2 + + #For kaiser window + delta_f = 4 * half_width + A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95 + if A > 50.: + beta = 0.1102 * (A - 8.7) + elif A >= 21.: + beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.) + else: + beta = 0. + window = torch.kaiser_window(kernel_size, beta=beta, periodic=False) + + # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio + if even: + time = (torch.arange(-half_size, half_size) + 0.5) + else: + time = torch.arange(kernel_size) - half_size + if cutoff == 0: + filter_ = torch.zeros_like(time) + else: + filter_ = 2 * cutoff * window * sinc(2 * cutoff * time) + # Normalize filter to have sum = 1, otherwise we will have a small leakage + # of the constant component in the input signal. + filter_ /= filter_.sum() + filter = filter_.view(1, 1, kernel_size) + + return filter + + +class LowPassFilter1d(nn.Module): + def __init__(self, + cutoff=0.5, + half_width=0.6, + stride: int = 1, + padding: bool = True, + padding_mode: str = 'replicate', + kernel_size: int = 12): + # kernel_size should be even number for stylegan3 setup, + # in this implementation, odd number is also possible. + super().__init__() + if cutoff < -0.: + raise ValueError("Minimum cutoff must be larger than zero.") + if cutoff > 0.5: + raise ValueError("A cutoff above 0.5 does not make sense.") + self.kernel_size = kernel_size + self.even = (kernel_size % 2 == 0) + self.pad_left = kernel_size // 2 - int(self.even) + self.pad_right = kernel_size // 2 + self.stride = stride + self.padding = padding + self.padding_mode = padding_mode + filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size) + self.register_buffer("filter", filter) + + #input [B, C, T] + def forward(self, x): + _, C, _ = x.shape + + if self.padding: + x = F.pad(x, (self.pad_left, self.pad_right), + mode=self.padding_mode) + out = F.conv1d(x, comfy.model_management.cast_to(self.filter.expand(C, -1, -1), dtype=x.dtype, device=x.device), + stride=self.stride, groups=C) + + return out + + +class UpSample1d(nn.Module): + def __init__(self, ratio=2, kernel_size=None): + super().__init__() + self.ratio = ratio + self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size + self.stride = ratio + self.pad = self.kernel_size // ratio - 1 + self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2 + self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2 + filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio, + half_width=0.6 / ratio, + kernel_size=self.kernel_size) + self.register_buffer("filter", filter) + + # x: [B, C, T] + def forward(self, x): + _, C, _ = x.shape + + x = F.pad(x, (self.pad, self.pad), mode='replicate') + x = self.ratio * F.conv_transpose1d( + x, comfy.model_management.cast_to(self.filter.expand(C, -1, -1), dtype=x.dtype, device=x.device), stride=self.stride, groups=C) + x = x[..., self.pad_left:-self.pad_right] + + return x + + +class DownSample1d(nn.Module): + def __init__(self, ratio=2, kernel_size=None): + super().__init__() + self.ratio = ratio + self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size + self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio, + half_width=0.6 / ratio, + stride=ratio, + kernel_size=self.kernel_size) + + def forward(self, x): + xx = self.lowpass(x) + + return xx + +class Activation1d(nn.Module): + def __init__(self, + activation, + up_ratio: int = 2, + down_ratio: int = 2, + up_kernel_size: int = 12, + down_kernel_size: int = 12): + super().__init__() + self.up_ratio = up_ratio + self.down_ratio = down_ratio + self.act = activation + self.upsample = UpSample1d(up_ratio, up_kernel_size) + self.downsample = DownSample1d(down_ratio, down_kernel_size) + + # x: [B,C,T] + def forward(self, x): + x = self.upsample(x) + x = self.act(x) + x = self.downsample(x) + + return x diff --git a/comfy/ldm/mmaudio/vae/autoencoder.py b/comfy/ldm/mmaudio/vae/autoencoder.py new file mode 100644 index 000000000..cbb9de302 --- /dev/null +++ b/comfy/ldm/mmaudio/vae/autoencoder.py @@ -0,0 +1,156 @@ +from typing import Literal + +import torch +import torch.nn as nn + +from .distributions import DiagonalGaussianDistribution +from .vae import VAE_16k +from .bigvgan import BigVGANVocoder +import logging + +try: + import torchaudio +except: + logging.warning("torchaudio missing, MMAudio VAE model will be broken") + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5, *, norm_fn): + return norm_fn(torch.clamp(x, min=clip_val) * C) + + +def spectral_normalize_torch(magnitudes, norm_fn): + output = dynamic_range_compression_torch(magnitudes, norm_fn=norm_fn) + return output + +class MelConverter(nn.Module): + + def __init__( + self, + *, + sampling_rate: float, + n_fft: int, + num_mels: int, + hop_size: int, + win_size: int, + fmin: float, + fmax: float, + norm_fn, + ): + super().__init__() + self.sampling_rate = sampling_rate + self.n_fft = n_fft + self.num_mels = num_mels + self.hop_size = hop_size + self.win_size = win_size + self.fmin = fmin + self.fmax = fmax + self.norm_fn = norm_fn + + # mel = librosa_mel_fn(sr=self.sampling_rate, + # n_fft=self.n_fft, + # n_mels=self.num_mels, + # fmin=self.fmin, + # fmax=self.fmax) + # mel_basis = torch.from_numpy(mel).float() + mel_basis = torch.empty((num_mels, 1 + n_fft // 2)) + hann_window = torch.hann_window(self.win_size) + + self.register_buffer('mel_basis', mel_basis) + self.register_buffer('hann_window', hann_window) + + @property + def device(self): + return self.mel_basis.device + + def forward(self, waveform: torch.Tensor, center: bool = False) -> torch.Tensor: + waveform = waveform.clamp(min=-1., max=1.).to(self.device) + + waveform = torch.nn.functional.pad( + waveform.unsqueeze(1), + [int((self.n_fft - self.hop_size) / 2), + int((self.n_fft - self.hop_size) / 2)], + mode='reflect') + waveform = waveform.squeeze(1) + + spec = torch.stft(waveform, + self.n_fft, + hop_length=self.hop_size, + win_length=self.win_size, + window=self.hann_window, + center=center, + pad_mode='reflect', + normalized=False, + onesided=True, + return_complex=True) + + spec = torch.view_as_real(spec) + spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9)) + spec = torch.matmul(self.mel_basis, spec) + spec = spectral_normalize_torch(spec, self.norm_fn) + + return spec + +class AudioAutoencoder(nn.Module): + + def __init__( + self, + *, + # ckpt_path: str, + mode=Literal['16k', '44k'], + need_vae_encoder: bool = True, + ): + super().__init__() + + assert mode == "16k", "Only 16k mode is supported currently." + self.mel_converter = MelConverter(sampling_rate=16_000, + n_fft=1024, + num_mels=80, + hop_size=256, + win_size=1024, + fmin=0, + fmax=8_000, + norm_fn=torch.log10) + + self.vae = VAE_16k().eval() + + bigvgan_config = { + "resblock": "1", + "num_mels": 80, + "upsample_rates": [4, 4, 2, 2, 2, 2], + "upsample_kernel_sizes": [8, 8, 4, 4, 4, 4], + "upsample_initial_channel": 1536, + "resblock_kernel_sizes": [3, 7, 11], + "resblock_dilation_sizes": [ + [1, 3, 5], + [1, 3, 5], + [1, 3, 5], + ], + "activation": "snakebeta", + "snake_logscale": True, + } + + self.vocoder = BigVGANVocoder( + bigvgan_config + ).eval() + + @torch.inference_mode() + def encode_audio(self, x) -> DiagonalGaussianDistribution: + # x: (B * L) + mel = self.mel_converter(x) + dist = self.vae.encode(mel) + + return dist + + @torch.no_grad() + def decode(self, z): + mel_decoded = self.vae.decode(z) + audio = self.vocoder(mel_decoded) + + audio = torchaudio.functional.resample(audio, 16000, 44100) + return audio + + @torch.no_grad() + def encode(self, audio): + audio = audio.mean(dim=1) + audio = torchaudio.functional.resample(audio, 44100, 16000) + dist = self.encode_audio(audio) + return dist.mean diff --git a/comfy/ldm/mmaudio/vae/bigvgan.py b/comfy/ldm/mmaudio/vae/bigvgan.py new file mode 100644 index 000000000..3a24337f6 --- /dev/null +++ b/comfy/ldm/mmaudio/vae/bigvgan.py @@ -0,0 +1,219 @@ +# Copyright (c) 2022 NVIDIA CORPORATION. +# Licensed under the MIT license. + +# Adapted from https://github.com/jik876/hifi-gan under the MIT license. +# LICENSE is in incl_licenses directory. + +import torch +import torch.nn as nn +from types import SimpleNamespace +from . import activations +from .alias_free_torch import Activation1d +import comfy.ops +ops = comfy.ops.disable_weight_init + +def get_padding(kernel_size, dilation=1): + return int((kernel_size * dilation - dilation) / 2) + +class AMPBlock1(torch.nn.Module): + + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5), activation=None): + super(AMPBlock1, self).__init__() + self.h = h + + self.convs1 = nn.ModuleList([ + ops.Conv1d(channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0])), + ops.Conv1d(channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1])), + ops.Conv1d(channels, + channels, + kernel_size, + 1, + dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2])) + ]) + + self.convs2 = nn.ModuleList([ + ops.Conv1d(channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1)), + ops.Conv1d(channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1)), + ops.Conv1d(channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1)) + ]) + + self.num_layers = len(self.convs1) + len(self.convs2) # total number of conv layers + + if activation == 'snake': # periodic nonlinearity with snake function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.Snake(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + elif activation == 'snakebeta': # periodic nonlinearity with snakebeta function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + else: + raise NotImplementedError( + "activation incorrectly specified. check the config file and look for 'activation'." + ) + + def forward(self, x): + acts1, acts2 = self.activations[::2], self.activations[1::2] + for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2): + xt = a1(x) + xt = c1(xt) + xt = a2(xt) + xt = c2(xt) + x = xt + x + + return x + + +class AMPBlock2(torch.nn.Module): + + def __init__(self, h, channels, kernel_size=3, dilation=(1, 3), activation=None): + super(AMPBlock2, self).__init__() + self.h = h + + self.convs = nn.ModuleList([ + ops.Conv1d(channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0])), + ops.Conv1d(channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1])) + ]) + + self.num_layers = len(self.convs) # total number of conv layers + + if activation == 'snake': # periodic nonlinearity with snake function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.Snake(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + elif activation == 'snakebeta': # periodic nonlinearity with snakebeta function and anti-aliasing + self.activations = nn.ModuleList([ + Activation1d( + activation=activations.SnakeBeta(channels, alpha_logscale=h.snake_logscale)) + for _ in range(self.num_layers) + ]) + else: + raise NotImplementedError( + "activation incorrectly specified. check the config file and look for 'activation'." + ) + + def forward(self, x): + for c, a in zip(self.convs, self.activations): + xt = a(x) + xt = c(xt) + x = xt + x + + return x + + +class BigVGANVocoder(torch.nn.Module): + # this is our main BigVGAN model. Applies anti-aliased periodic activation for resblocks. + def __init__(self, h): + super().__init__() + if isinstance(h, dict): + h = SimpleNamespace(**h) + self.h = h + + self.num_kernels = len(h.resblock_kernel_sizes) + self.num_upsamples = len(h.upsample_rates) + + # pre conv + self.conv_pre = ops.Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3) + + # define which AMPBlock to use. BigVGAN uses AMPBlock1 as default + resblock = AMPBlock1 if h.resblock == '1' else AMPBlock2 + + # transposed conv-based upsamplers. does not apply anti-aliasing + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)): + self.ups.append( + nn.ModuleList([ + ops.ConvTranspose1d(h.upsample_initial_channel // (2**i), + h.upsample_initial_channel // (2**(i + 1)), + k, + u, + padding=(k - u) // 2) + ])) + + # residual blocks using anti-aliased multi-periodicity composition modules (AMP) + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = h.upsample_initial_channel // (2**(i + 1)) + for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)): + self.resblocks.append(resblock(h, ch, k, d, activation=h.activation)) + + # post conv + if h.activation == "snake": # periodic nonlinearity with snake function and anti-aliasing + activation_post = activations.Snake(ch, alpha_logscale=h.snake_logscale) + self.activation_post = Activation1d(activation=activation_post) + elif h.activation == "snakebeta": # periodic nonlinearity with snakebeta function and anti-aliasing + activation_post = activations.SnakeBeta(ch, alpha_logscale=h.snake_logscale) + self.activation_post = Activation1d(activation=activation_post) + else: + raise NotImplementedError( + "activation incorrectly specified. check the config file and look for 'activation'." + ) + + self.conv_post = ops.Conv1d(ch, 1, 7, 1, padding=3) + + + def forward(self, x): + # pre conv + x = self.conv_pre(x) + + for i in range(self.num_upsamples): + # upsampling + for i_up in range(len(self.ups[i])): + x = self.ups[i][i_up](x) + # AMP blocks + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + + # post conv + x = self.activation_post(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x diff --git a/comfy/ldm/mmaudio/vae/distributions.py b/comfy/ldm/mmaudio/vae/distributions.py new file mode 100644 index 000000000..df987c5ec --- /dev/null +++ b/comfy/ldm/mmaudio/vae/distributions.py @@ -0,0 +1,92 @@ +import torch +import numpy as np + + +class AbstractDistribution: + def sample(self): + raise NotImplementedError() + + def mode(self): + raise NotImplementedError() + + +class DiracDistribution(AbstractDistribution): + def __init__(self, value): + self.value = value + + def sample(self): + return self.value + + def mode(self): + return self.value + + +class DiagonalGaussianDistribution(object): + def __init__(self, parameters, deterministic=False): + self.parameters = parameters + self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) + self.logvar = torch.clamp(self.logvar, -30.0, 20.0) + self.deterministic = deterministic + self.std = torch.exp(0.5 * self.logvar) + self.var = torch.exp(self.logvar) + if self.deterministic: + self.var = self.std = torch.zeros_like(self.mean, device=self.parameters.device) + + def sample(self): + x = self.mean + self.std * torch.randn(self.mean.shape, device=self.parameters.device) + return x + + def kl(self, other=None): + if self.deterministic: + return torch.Tensor([0.]) + else: + if other is None: + return 0.5 * torch.sum(torch.pow(self.mean, 2) + + self.var - 1.0 - self.logvar, + dim=[1, 2, 3]) + else: + return 0.5 * torch.sum( + torch.pow(self.mean - other.mean, 2) / other.var + + self.var / other.var - 1.0 - self.logvar + other.logvar, + dim=[1, 2, 3]) + + def nll(self, sample, dims=[1,2,3]): + if self.deterministic: + return torch.Tensor([0.]) + logtwopi = np.log(2.0 * np.pi) + return 0.5 * torch.sum( + logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, + dim=dims) + + def mode(self): + return self.mean + + +def normal_kl(mean1, logvar1, mean2, logvar2): + """ + source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12 + Compute the KL divergence between two gaussians. + Shapes are automatically broadcasted, so batches can be compared to + scalars, among other use cases. + """ + tensor = None + for obj in (mean1, logvar1, mean2, logvar2): + if isinstance(obj, torch.Tensor): + tensor = obj + break + assert tensor is not None, "at least one argument must be a Tensor" + + # Force variances to be Tensors. Broadcasting helps convert scalars to + # Tensors, but it does not work for torch.exp(). + logvar1, logvar2 = [ + x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) + for x in (logvar1, logvar2) + ] + + return 0.5 * ( + -1.0 + + logvar2 + - logvar1 + + torch.exp(logvar1 - logvar2) + + ((mean1 - mean2) ** 2) * torch.exp(-logvar2) + ) diff --git a/comfy/ldm/mmaudio/vae/vae.py b/comfy/ldm/mmaudio/vae/vae.py new file mode 100644 index 000000000..62f24606c --- /dev/null +++ b/comfy/ldm/mmaudio/vae/vae.py @@ -0,0 +1,358 @@ +import logging +from typing import Optional + +import torch +import torch.nn as nn + +from .vae_modules import (AttnBlock1D, Downsample1D, ResnetBlock1D, + Upsample1D, nonlinearity) +from .distributions import DiagonalGaussianDistribution + +import comfy.ops +ops = comfy.ops.disable_weight_init + +log = logging.getLogger() + +DATA_MEAN_80D = [ + -1.6058, -1.3676, -1.2520, -1.2453, -1.2078, -1.2224, -1.2419, -1.2439, -1.2922, -1.2927, + -1.3170, -1.3543, -1.3401, -1.3836, -1.3907, -1.3912, -1.4313, -1.4152, -1.4527, -1.4728, + -1.4568, -1.5101, -1.5051, -1.5172, -1.5623, -1.5373, -1.5746, -1.5687, -1.6032, -1.6131, + -1.6081, -1.6331, -1.6489, -1.6489, -1.6700, -1.6738, -1.6953, -1.6969, -1.7048, -1.7280, + -1.7361, -1.7495, -1.7658, -1.7814, -1.7889, -1.8064, -1.8221, -1.8377, -1.8417, -1.8643, + -1.8857, -1.8929, -1.9173, -1.9379, -1.9531, -1.9673, -1.9824, -2.0042, -2.0215, -2.0436, + -2.0766, -2.1064, -2.1418, -2.1855, -2.2319, -2.2767, -2.3161, -2.3572, -2.3954, -2.4282, + -2.4659, -2.5072, -2.5552, -2.6074, -2.6584, -2.7107, -2.7634, -2.8266, -2.8981, -2.9673 +] + +DATA_STD_80D = [ + 1.0291, 1.0411, 1.0043, 0.9820, 0.9677, 0.9543, 0.9450, 0.9392, 0.9343, 0.9297, 0.9276, 0.9263, + 0.9242, 0.9254, 0.9232, 0.9281, 0.9263, 0.9315, 0.9274, 0.9247, 0.9277, 0.9199, 0.9188, 0.9194, + 0.9160, 0.9161, 0.9146, 0.9161, 0.9100, 0.9095, 0.9145, 0.9076, 0.9066, 0.9095, 0.9032, 0.9043, + 0.9038, 0.9011, 0.9019, 0.9010, 0.8984, 0.8983, 0.8986, 0.8961, 0.8962, 0.8978, 0.8962, 0.8973, + 0.8993, 0.8976, 0.8995, 0.9016, 0.8982, 0.8972, 0.8974, 0.8949, 0.8940, 0.8947, 0.8936, 0.8939, + 0.8951, 0.8956, 0.9017, 0.9167, 0.9436, 0.9690, 1.0003, 1.0225, 1.0381, 1.0491, 1.0545, 1.0604, + 1.0761, 1.0929, 1.1089, 1.1196, 1.1176, 1.1156, 1.1117, 1.1070 +] + +DATA_MEAN_128D = [ + -3.3462, -2.6723, -2.4893, -2.3143, -2.2664, -2.3317, -2.1802, -2.4006, -2.2357, -2.4597, + -2.3717, -2.4690, -2.5142, -2.4919, -2.6610, -2.5047, -2.7483, -2.5926, -2.7462, -2.7033, + -2.7386, -2.8112, -2.7502, -2.9594, -2.7473, -3.0035, -2.8891, -2.9922, -2.9856, -3.0157, + -3.1191, -2.9893, -3.1718, -3.0745, -3.1879, -3.2310, -3.1424, -3.2296, -3.2791, -3.2782, + -3.2756, -3.3134, -3.3509, -3.3750, -3.3951, -3.3698, -3.4505, -3.4509, -3.5089, -3.4647, + -3.5536, -3.5788, -3.5867, -3.6036, -3.6400, -3.6747, -3.7072, -3.7279, -3.7283, -3.7795, + -3.8259, -3.8447, -3.8663, -3.9182, -3.9605, -3.9861, -4.0105, -4.0373, -4.0762, -4.1121, + -4.1488, -4.1874, -4.2461, -4.3170, -4.3639, -4.4452, -4.5282, -4.6297, -4.7019, -4.7960, + -4.8700, -4.9507, -5.0303, -5.0866, -5.1634, -5.2342, -5.3242, -5.4053, -5.4927, -5.5712, + -5.6464, -5.7052, -5.7619, -5.8410, -5.9188, -6.0103, -6.0955, -6.1673, -6.2362, -6.3120, + -6.3926, -6.4797, -6.5565, -6.6511, -6.8130, -6.9961, -7.1275, -7.2457, -7.3576, -7.4663, + -7.6136, -7.7469, -7.8815, -8.0132, -8.1515, -8.3071, -8.4722, -8.7418, -9.3975, -9.6628, + -9.7671, -9.8863, -9.9992, -10.0860, -10.1709, -10.5418, -11.2795, -11.3861 +] + +DATA_STD_128D = [ + 2.3804, 2.4368, 2.3772, 2.3145, 2.2803, 2.2510, 2.2316, 2.2083, 2.1996, 2.1835, 2.1769, 2.1659, + 2.1631, 2.1618, 2.1540, 2.1606, 2.1571, 2.1567, 2.1612, 2.1579, 2.1679, 2.1683, 2.1634, 2.1557, + 2.1668, 2.1518, 2.1415, 2.1449, 2.1406, 2.1350, 2.1313, 2.1415, 2.1281, 2.1352, 2.1219, 2.1182, + 2.1327, 2.1195, 2.1137, 2.1080, 2.1179, 2.1036, 2.1087, 2.1036, 2.1015, 2.1068, 2.0975, 2.0991, + 2.0902, 2.1015, 2.0857, 2.0920, 2.0893, 2.0897, 2.0910, 2.0881, 2.0925, 2.0873, 2.0960, 2.0900, + 2.0957, 2.0958, 2.0978, 2.0936, 2.0886, 2.0905, 2.0845, 2.0855, 2.0796, 2.0840, 2.0813, 2.0817, + 2.0838, 2.0840, 2.0917, 2.1061, 2.1431, 2.1976, 2.2482, 2.3055, 2.3700, 2.4088, 2.4372, 2.4609, + 2.4731, 2.4847, 2.5072, 2.5451, 2.5772, 2.6147, 2.6529, 2.6596, 2.6645, 2.6726, 2.6803, 2.6812, + 2.6899, 2.6916, 2.6931, 2.6998, 2.7062, 2.7262, 2.7222, 2.7158, 2.7041, 2.7485, 2.7491, 2.7451, + 2.7485, 2.7233, 2.7297, 2.7233, 2.7145, 2.6958, 2.6788, 2.6439, 2.6007, 2.4786, 2.2469, 2.1877, + 2.1392, 2.0717, 2.0107, 1.9676, 1.9140, 1.7102, 0.9101, 0.7164 +] + + +class VAE(nn.Module): + + def __init__( + self, + *, + data_dim: int, + embed_dim: int, + hidden_dim: int, + ): + super().__init__() + + if data_dim == 80: + self.data_mean = nn.Buffer(torch.tensor(DATA_MEAN_80D, dtype=torch.float32)) + self.data_std = nn.Buffer(torch.tensor(DATA_STD_80D, dtype=torch.float32)) + elif data_dim == 128: + self.data_mean = nn.Buffer(torch.tensor(DATA_MEAN_128D, dtype=torch.float32)) + self.data_std = nn.Buffer(torch.tensor(DATA_STD_128D, dtype=torch.float32)) + + self.data_mean = self.data_mean.view(1, -1, 1) + self.data_std = self.data_std.view(1, -1, 1) + + self.encoder = Encoder1D( + dim=hidden_dim, + ch_mult=(1, 2, 4), + num_res_blocks=2, + attn_layers=[3], + down_layers=[0], + in_dim=data_dim, + embed_dim=embed_dim, + ) + self.decoder = Decoder1D( + dim=hidden_dim, + ch_mult=(1, 2, 4), + num_res_blocks=2, + attn_layers=[3], + down_layers=[0], + in_dim=data_dim, + out_dim=data_dim, + embed_dim=embed_dim, + ) + + self.embed_dim = embed_dim + # self.quant_conv = nn.Conv1d(2 * embed_dim, 2 * embed_dim, 1) + # self.post_quant_conv = nn.Conv1d(embed_dim, embed_dim, 1) + + self.initialize_weights() + + def initialize_weights(self): + pass + + def encode(self, x: torch.Tensor, normalize: bool = True) -> DiagonalGaussianDistribution: + if normalize: + x = self.normalize(x) + moments = self.encoder(x) + posterior = DiagonalGaussianDistribution(moments) + return posterior + + def decode(self, z: torch.Tensor, unnormalize: bool = True) -> torch.Tensor: + dec = self.decoder(z) + if unnormalize: + dec = self.unnormalize(dec) + return dec + + def normalize(self, x: torch.Tensor) -> torch.Tensor: + return (x - comfy.model_management.cast_to(self.data_mean, dtype=x.dtype, device=x.device)) / comfy.model_management.cast_to(self.data_std, dtype=x.dtype, device=x.device) + + def unnormalize(self, x: torch.Tensor) -> torch.Tensor: + return x * comfy.model_management.cast_to(self.data_std, dtype=x.dtype, device=x.device) + comfy.model_management.cast_to(self.data_mean, dtype=x.dtype, device=x.device) + + def forward( + self, + x: torch.Tensor, + sample_posterior: bool = True, + rng: Optional[torch.Generator] = None, + normalize: bool = True, + unnormalize: bool = True, + ) -> tuple[torch.Tensor, DiagonalGaussianDistribution]: + + posterior = self.encode(x, normalize=normalize) + if sample_posterior: + z = posterior.sample(rng) + else: + z = posterior.mode() + dec = self.decode(z, unnormalize=unnormalize) + return dec, posterior + + def load_weights(self, src_dict) -> None: + self.load_state_dict(src_dict, strict=True) + + @property + def device(self) -> torch.device: + return next(self.parameters()).device + + def get_last_layer(self): + return self.decoder.conv_out.weight + + def remove_weight_norm(self): + return self + + +class Encoder1D(nn.Module): + + def __init__(self, + *, + dim: int, + ch_mult: tuple[int] = (1, 2, 4, 8), + num_res_blocks: int, + attn_layers: list[int] = [], + down_layers: list[int] = [], + resamp_with_conv: bool = True, + in_dim: int, + embed_dim: int, + double_z: bool = True, + kernel_size: int = 3, + clip_act: float = 256.0): + super().__init__() + self.dim = dim + self.num_layers = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.in_channels = in_dim + self.clip_act = clip_act + self.down_layers = down_layers + self.attn_layers = attn_layers + self.conv_in = ops.Conv1d(in_dim, self.dim, kernel_size=kernel_size, padding=kernel_size // 2, bias=False) + + in_ch_mult = (1, ) + tuple(ch_mult) + self.in_ch_mult = in_ch_mult + # downsampling + self.down = nn.ModuleList() + for i_level in range(self.num_layers): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = dim * in_ch_mult[i_level] + block_out = dim * ch_mult[i_level] + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock1D(in_dim=block_in, + out_dim=block_out, + kernel_size=kernel_size, + use_norm=True)) + block_in = block_out + if i_level in attn_layers: + attn.append(AttnBlock1D(block_in)) + down = nn.Module() + down.block = block + down.attn = attn + if i_level in down_layers: + down.downsample = Downsample1D(block_in, resamp_with_conv) + self.down.append(down) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock1D(in_dim=block_in, + out_dim=block_in, + kernel_size=kernel_size, + use_norm=True) + self.mid.attn_1 = AttnBlock1D(block_in) + self.mid.block_2 = ResnetBlock1D(in_dim=block_in, + out_dim=block_in, + kernel_size=kernel_size, + use_norm=True) + + # end + self.conv_out = ops.Conv1d(block_in, + 2 * embed_dim if double_z else embed_dim, + kernel_size=kernel_size, padding=kernel_size // 2, bias=False) + + self.learnable_gain = nn.Parameter(torch.zeros([])) + + def forward(self, x): + + # downsampling + h = self.conv_in(x) + for i_level in range(self.num_layers): + for i_block in range(self.num_res_blocks): + h = self.down[i_level].block[i_block](h) + if len(self.down[i_level].attn) > 0: + h = self.down[i_level].attn[i_block](h) + h = h.clamp(-self.clip_act, self.clip_act) + if i_level in self.down_layers: + h = self.down[i_level].downsample(h) + + # middle + h = self.mid.block_1(h) + h = self.mid.attn_1(h) + h = self.mid.block_2(h) + h = h.clamp(-self.clip_act, self.clip_act) + + # end + h = nonlinearity(h) + h = self.conv_out(h) * (self.learnable_gain + 1) + return h + + +class Decoder1D(nn.Module): + + def __init__(self, + *, + dim: int, + out_dim: int, + ch_mult: tuple[int] = (1, 2, 4, 8), + num_res_blocks: int, + attn_layers: list[int] = [], + down_layers: list[int] = [], + kernel_size: int = 3, + resamp_with_conv: bool = True, + in_dim: int, + embed_dim: int, + clip_act: float = 256.0): + super().__init__() + self.ch = dim + self.num_layers = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.in_channels = in_dim + self.clip_act = clip_act + self.down_layers = [i + 1 for i in down_layers] # each downlayer add one + + # compute in_ch_mult, block_in and curr_res at lowest res + block_in = dim * ch_mult[self.num_layers - 1] + + # z to block_in + self.conv_in = ops.Conv1d(embed_dim, block_in, kernel_size=kernel_size, padding=kernel_size // 2, bias=False) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock1D(in_dim=block_in, out_dim=block_in, use_norm=True) + self.mid.attn_1 = AttnBlock1D(block_in) + self.mid.block_2 = ResnetBlock1D(in_dim=block_in, out_dim=block_in, use_norm=True) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_layers)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = dim * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append(ResnetBlock1D(in_dim=block_in, out_dim=block_out, use_norm=True)) + block_in = block_out + if i_level in attn_layers: + attn.append(AttnBlock1D(block_in)) + up = nn.Module() + up.block = block + up.attn = attn + if i_level in self.down_layers: + up.upsample = Upsample1D(block_in, resamp_with_conv) + self.up.insert(0, up) # prepend to get consistent order + + # end + self.conv_out = ops.Conv1d(block_in, out_dim, kernel_size=kernel_size, padding=kernel_size // 2, bias=False) + self.learnable_gain = nn.Parameter(torch.zeros([])) + + def forward(self, z): + # z to block_in + h = self.conv_in(z) + + # middle + h = self.mid.block_1(h) + h = self.mid.attn_1(h) + h = self.mid.block_2(h) + h = h.clamp(-self.clip_act, self.clip_act) + + # upsampling + for i_level in reversed(range(self.num_layers)): + for i_block in range(self.num_res_blocks + 1): + h = self.up[i_level].block[i_block](h) + if len(self.up[i_level].attn) > 0: + h = self.up[i_level].attn[i_block](h) + h = h.clamp(-self.clip_act, self.clip_act) + if i_level in self.down_layers: + h = self.up[i_level].upsample(h) + + h = nonlinearity(h) + h = self.conv_out(h) * (self.learnable_gain + 1) + return h + + +def VAE_16k(**kwargs) -> VAE: + return VAE(data_dim=80, embed_dim=20, hidden_dim=384, **kwargs) + + +def VAE_44k(**kwargs) -> VAE: + return VAE(data_dim=128, embed_dim=40, hidden_dim=512, **kwargs) + + +def get_my_vae(name: str, **kwargs) -> VAE: + if name == '16k': + return VAE_16k(**kwargs) + if name == '44k': + return VAE_44k(**kwargs) + raise ValueError(f'Unknown model: {name}') + diff --git a/comfy/ldm/mmaudio/vae/vae_modules.py b/comfy/ldm/mmaudio/vae/vae_modules.py new file mode 100644 index 000000000..3ad05134b --- /dev/null +++ b/comfy/ldm/mmaudio/vae/vae_modules.py @@ -0,0 +1,121 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from comfy.ldm.modules.diffusionmodules.model import vae_attention +import math +import comfy.ops +ops = comfy.ops.disable_weight_init + +def nonlinearity(x): + # swish + return torch.nn.functional.silu(x) / 0.596 + +def mp_sum(a, b, t=0.5): + return a.lerp(b, t) / math.sqrt((1 - t)**2 + t**2) + +def normalize(x, dim=None, eps=1e-4): + if dim is None: + dim = list(range(1, x.ndim)) + norm = torch.linalg.vector_norm(x, dim=dim, keepdim=True, dtype=torch.float32) + norm = torch.add(eps, norm, alpha=math.sqrt(norm.numel() / x.numel())) + return x / norm.to(x.dtype) + +class ResnetBlock1D(nn.Module): + + def __init__(self, *, in_dim, out_dim=None, conv_shortcut=False, kernel_size=3, use_norm=True): + super().__init__() + self.in_dim = in_dim + out_dim = in_dim if out_dim is None else out_dim + self.out_dim = out_dim + self.use_conv_shortcut = conv_shortcut + self.use_norm = use_norm + + self.conv1 = ops.Conv1d(in_dim, out_dim, kernel_size=kernel_size, padding=kernel_size // 2, bias=False) + self.conv2 = ops.Conv1d(out_dim, out_dim, kernel_size=kernel_size, padding=kernel_size // 2, bias=False) + if self.in_dim != self.out_dim: + if self.use_conv_shortcut: + self.conv_shortcut = ops.Conv1d(in_dim, out_dim, kernel_size=kernel_size, padding=kernel_size // 2, bias=False) + else: + self.nin_shortcut = ops.Conv1d(in_dim, out_dim, kernel_size=1, padding=0, bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + + # pixel norm + if self.use_norm: + x = normalize(x, dim=1) + + h = x + h = nonlinearity(h) + h = self.conv1(h) + + h = nonlinearity(h) + h = self.conv2(h) + + if self.in_dim != self.out_dim: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return mp_sum(x, h, t=0.3) + + +class AttnBlock1D(nn.Module): + + def __init__(self, in_channels, num_heads=1): + super().__init__() + self.in_channels = in_channels + + self.num_heads = num_heads + self.qkv = ops.Conv1d(in_channels, in_channels * 3, kernel_size=1, padding=0, bias=False) + self.proj_out = ops.Conv1d(in_channels, in_channels, kernel_size=1, padding=0, bias=False) + self.optimized_attention = vae_attention() + + def forward(self, x): + h = x + y = self.qkv(h) + y = y.reshape(y.shape[0], -1, 3, y.shape[-1]) + q, k, v = normalize(y, dim=1).unbind(2) + + h = self.optimized_attention(q, k, v) + h = self.proj_out(h) + + return mp_sum(x, h, t=0.3) + + +class Upsample1D(nn.Module): + + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + self.conv = ops.Conv1d(in_channels, in_channels, kernel_size=3, padding=1, bias=False) + + def forward(self, x): + x = F.interpolate(x, scale_factor=2.0, mode='nearest-exact') # support 3D tensor(B,C,T) + if self.with_conv: + x = self.conv(x) + return x + + +class Downsample1D(nn.Module): + + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + # no asymmetric padding in torch conv, must do it ourselves + self.conv1 = ops.Conv1d(in_channels, in_channels, kernel_size=1, padding=0, bias=False) + self.conv2 = ops.Conv1d(in_channels, in_channels, kernel_size=1, padding=0, bias=False) + + def forward(self, x): + + if self.with_conv: + x = self.conv1(x) + + x = F.avg_pool1d(x, kernel_size=2, stride=2) + + if self.with_conv: + x = self.conv2(x) + + return x diff --git a/comfy/ldm/qwen_image/controlnet.py b/comfy/ldm/qwen_image/controlnet.py index 92ac3cf0a..a6d408104 100644 --- a/comfy/ldm/qwen_image/controlnet.py +++ b/comfy/ldm/qwen_image/controlnet.py @@ -44,7 +44,7 @@ class QwenImageControlNetModel(QwenImageTransformer2DModel): txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size) // 2, ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size) // 2)) txt_ids = torch.arange(txt_start, txt_start + context.shape[1], device=x.device).reshape(1, -1, 1).repeat(x.shape[0], 1, 3) ids = torch.cat((txt_ids, img_ids), dim=1) - image_rotary_emb = self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype) + image_rotary_emb = self.pe_embedder(ids).to(x.dtype).contiguous() del ids, txt_ids, img_ids hidden_states = self.img_in(hidden_states) + self.controlnet_x_embedder(hint) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index b9f60c2b7..427ea19c1 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -10,6 +10,7 @@ from comfy.ldm.modules.attention import optimized_attention_masked from comfy.ldm.flux.layers import EmbedND import comfy.ldm.common_dit import comfy.patcher_extension +from comfy.ldm.flux.math import apply_rope1 class GELU(nn.Module): def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True, dtype=None, device=None, operations=None): @@ -134,33 +135,34 @@ class Attention(nn.Module): image_rotary_emb: Optional[torch.Tensor] = None, transformer_options={}, ) -> Tuple[torch.Tensor, torch.Tensor]: + batch_size = hidden_states.shape[0] + seq_img = hidden_states.shape[1] seq_txt = encoder_hidden_states.shape[1] - img_query = self.to_q(hidden_states).unflatten(-1, (self.heads, -1)) - img_key = self.to_k(hidden_states).unflatten(-1, (self.heads, -1)) - img_value = self.to_v(hidden_states).unflatten(-1, (self.heads, -1)) + # Project and reshape to BHND format (batch, heads, seq, dim) + img_query = self.to_q(hidden_states).view(batch_size, seq_img, self.heads, -1).transpose(1, 2).contiguous() + img_key = self.to_k(hidden_states).view(batch_size, seq_img, self.heads, -1).transpose(1, 2).contiguous() + img_value = self.to_v(hidden_states).view(batch_size, seq_img, self.heads, -1).transpose(1, 2) - txt_query = self.add_q_proj(encoder_hidden_states).unflatten(-1, (self.heads, -1)) - txt_key = self.add_k_proj(encoder_hidden_states).unflatten(-1, (self.heads, -1)) - txt_value = self.add_v_proj(encoder_hidden_states).unflatten(-1, (self.heads, -1)) + txt_query = self.add_q_proj(encoder_hidden_states).view(batch_size, seq_txt, self.heads, -1).transpose(1, 2).contiguous() + txt_key = self.add_k_proj(encoder_hidden_states).view(batch_size, seq_txt, self.heads, -1).transpose(1, 2).contiguous() + txt_value = self.add_v_proj(encoder_hidden_states).view(batch_size, seq_txt, self.heads, -1).transpose(1, 2) img_query = self.norm_q(img_query) img_key = self.norm_k(img_key) txt_query = self.norm_added_q(txt_query) txt_key = self.norm_added_k(txt_key) - joint_query = torch.cat([txt_query, img_query], dim=1) - joint_key = torch.cat([txt_key, img_key], dim=1) - joint_value = torch.cat([txt_value, img_value], dim=1) + joint_query = torch.cat([txt_query, img_query], dim=2) + joint_key = torch.cat([txt_key, img_key], dim=2) + joint_value = torch.cat([txt_value, img_value], dim=2) - joint_query = apply_rotary_emb(joint_query, image_rotary_emb) - joint_key = apply_rotary_emb(joint_key, image_rotary_emb) + joint_query = apply_rope1(joint_query, image_rotary_emb) + joint_key = apply_rope1(joint_key, image_rotary_emb) - joint_query = joint_query.flatten(start_dim=2) - joint_key = joint_key.flatten(start_dim=2) - joint_value = joint_value.flatten(start_dim=2) - - joint_hidden_states = optimized_attention_masked(joint_query, joint_key, joint_value, self.heads, attention_mask, transformer_options=transformer_options) + joint_hidden_states = optimized_attention_masked(joint_query, joint_key, joint_value, self.heads, + attention_mask, transformer_options=transformer_options, + skip_reshape=True) txt_attn_output = joint_hidden_states[:, :seq_txt, :] img_attn_output = joint_hidden_states[:, seq_txt:, :] @@ -234,10 +236,10 @@ class QwenImageTransformerBlock(nn.Module): img_mod1, img_mod2 = img_mod_params.chunk(2, dim=-1) txt_mod1, txt_mod2 = txt_mod_params.chunk(2, dim=-1) - img_normed = self.img_norm1(hidden_states) - img_modulated, img_gate1 = self._modulate(img_normed, img_mod1) - txt_normed = self.txt_norm1(encoder_hidden_states) - txt_modulated, txt_gate1 = self._modulate(txt_normed, txt_mod1) + img_modulated, img_gate1 = self._modulate(self.img_norm1(hidden_states), img_mod1) + del img_mod1 + txt_modulated, txt_gate1 = self._modulate(self.txt_norm1(encoder_hidden_states), txt_mod1) + del txt_mod1 img_attn_output, txt_attn_output = self.attn( hidden_states=img_modulated, @@ -246,16 +248,20 @@ class QwenImageTransformerBlock(nn.Module): image_rotary_emb=image_rotary_emb, transformer_options=transformer_options, ) + del img_modulated + del txt_modulated hidden_states = hidden_states + img_gate1 * img_attn_output encoder_hidden_states = encoder_hidden_states + txt_gate1 * txt_attn_output + del img_attn_output + del txt_attn_output + del img_gate1 + del txt_gate1 - img_normed2 = self.img_norm2(hidden_states) - img_modulated2, img_gate2 = self._modulate(img_normed2, img_mod2) + img_modulated2, img_gate2 = self._modulate(self.img_norm2(hidden_states), img_mod2) hidden_states = torch.addcmul(hidden_states, img_gate2, self.img_mlp(img_modulated2)) - txt_normed2 = self.txt_norm2(encoder_hidden_states) - txt_modulated2, txt_gate2 = self._modulate(txt_normed2, txt_mod2) + txt_modulated2, txt_gate2 = self._modulate(self.txt_norm2(encoder_hidden_states), txt_mod2) encoder_hidden_states = torch.addcmul(encoder_hidden_states, txt_gate2, self.txt_mlp(txt_modulated2)) return encoder_hidden_states, hidden_states @@ -413,7 +419,7 @@ class QwenImageTransformer2DModel(nn.Module): txt_start = round(max(((x.shape[-1] + (self.patch_size // 2)) // self.patch_size) // 2, ((x.shape[-2] + (self.patch_size // 2)) // self.patch_size) // 2)) txt_ids = torch.arange(txt_start, txt_start + context.shape[1], device=x.device).reshape(1, -1, 1).repeat(x.shape[0], 1, 3) ids = torch.cat((txt_ids, img_ids), dim=1) - image_rotary_emb = self.pe_embedder(ids).squeeze(1).unsqueeze(2).to(x.dtype) + image_rotary_emb = self.pe_embedder(ids).to(x.dtype).contiguous() del ids, txt_ids, img_ids hidden_states = self.img_in(hidden_states) diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py index 54616e6eb..a9d5e10d9 100644 --- a/comfy/ldm/wan/model.py +++ b/comfy/ldm/wan/model.py @@ -232,11 +232,13 @@ class WanAttentionBlock(nn.Module): # assert e[0].dtype == torch.float32 # self-attention + x = x.contiguous() # otherwise implicit in LayerNorm y = self.self_attn( torch.addcmul(repeat_e(e[0], x), self.norm1(x), 1 + repeat_e(e[1], x)), freqs, transformer_options=transformer_options) x = torch.addcmul(x, y, repeat_e(e[2], x)) + del y # cross-attention & ffn x = x + self.cross_attn(self.norm3(x), context, context_img_len=context_img_len, transformer_options=transformer_options) @@ -587,7 +589,7 @@ class WanModel(torch.nn.Module): x = self.unpatchify(x, grid_sizes) return x - def rope_encode(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, device=None, dtype=None): + def rope_encode(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, device=None, dtype=None, transformer_options={}): patch_size = self.patch_size t_len = ((t + (patch_size[0] // 2)) // patch_size[0]) h_len = ((h + (patch_size[1] // 2)) // patch_size[1]) @@ -600,10 +602,22 @@ class WanModel(torch.nn.Module): if steps_w is None: steps_w = w_len + h_start = 0 + w_start = 0 + rope_options = transformer_options.get("rope_options", None) + if rope_options is not None: + t_len = (t_len - 1.0) * rope_options.get("scale_t", 1.0) + 1.0 + h_len = (h_len - 1.0) * rope_options.get("scale_y", 1.0) + 1.0 + w_len = (w_len - 1.0) * rope_options.get("scale_x", 1.0) + 1.0 + + t_start += rope_options.get("shift_t", 0.0) + h_start += rope_options.get("shift_y", 0.0) + w_start += rope_options.get("shift_x", 0.0) + img_ids = torch.zeros((steps_t, steps_h, steps_w, 3), device=device, dtype=dtype) img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(t_start, t_start + (t_len - 1), steps=steps_t, device=device, dtype=dtype).reshape(-1, 1, 1) - img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(0, h_len - 1, steps=steps_h, device=device, dtype=dtype).reshape(1, -1, 1) - img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(0, w_len - 1, steps=steps_w, device=device, dtype=dtype).reshape(1, 1, -1) + img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(h_start, h_start + (h_len - 1), steps=steps_h, device=device, dtype=dtype).reshape(1, -1, 1) + img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(w_start, w_start + (w_len - 1), steps=steps_w, device=device, dtype=dtype).reshape(1, 1, -1) img_ids = img_ids.reshape(1, -1, img_ids.shape[-1]) freqs = self.rope_embedder(img_ids).movedim(1, 2) @@ -629,7 +643,7 @@ class WanModel(torch.nn.Module): if self.ref_conv is not None and "reference_latent" in kwargs: t_len += 1 - freqs = self.rope_encode(t_len, h, w, device=x.device, dtype=x.dtype) + freqs = self.rope_encode(t_len, h, w, device=x.device, dtype=x.dtype, transformer_options=transformer_options) return self.forward_orig(x, timestep, context, clip_fea=clip_fea, freqs=freqs, transformer_options=transformer_options, **kwargs)[:, :, :t, :h, :w] def unpatchify(self, x, grid_sizes): @@ -902,7 +916,7 @@ class MotionEncoder_tc(nn.Module): def __init__(self, in_dim: int, hidden_dim: int, - num_heads=int, + num_heads: int, need_global=True, dtype=None, device=None, diff --git a/comfy/ldm/wan/vae.py b/comfy/ldm/wan/vae.py index 791596938..ccbb25822 100644 --- a/comfy/ldm/wan/vae.py +++ b/comfy/ldm/wan/vae.py @@ -468,55 +468,46 @@ class WanVAE(nn.Module): attn_scales, self.temperal_upsample, dropout) def encode(self, x): - self.clear_cache() + conv_idx = [0] + feat_map = [None] * count_conv3d(self.decoder) ## cache t = x.shape[2] iter_ = 1 + (t - 1) // 4 ## 对encode输入的x,按时间拆分为1、4、4、4.... for i in range(iter_): - self._enc_conv_idx = [0] + conv_idx = [0] if i == 0: out = self.encoder( x[:, :, :1, :, :], - feat_cache=self._enc_feat_map, - feat_idx=self._enc_conv_idx) + feat_cache=feat_map, + feat_idx=conv_idx) else: out_ = self.encoder( x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :], - feat_cache=self._enc_feat_map, - feat_idx=self._enc_conv_idx) + feat_cache=feat_map, + feat_idx=conv_idx) out = torch.cat([out, out_], 2) mu, log_var = self.conv1(out).chunk(2, dim=1) - self.clear_cache() return mu def decode(self, z): - self.clear_cache() + conv_idx = [0] + feat_map = [None] * count_conv3d(self.decoder) # z: [b,c,t,h,w] iter_ = z.shape[2] x = self.conv2(z) for i in range(iter_): - self._conv_idx = [0] + conv_idx = [0] if i == 0: out = self.decoder( x[:, :, i:i + 1, :, :], - feat_cache=self._feat_map, - feat_idx=self._conv_idx) + feat_cache=feat_map, + feat_idx=conv_idx) else: out_ = self.decoder( x[:, :, i:i + 1, :, :], - feat_cache=self._feat_map, - feat_idx=self._conv_idx) + feat_cache=feat_map, + feat_idx=conv_idx) out = torch.cat([out, out_], 2) - self.clear_cache() return out - - def clear_cache(self): - self._conv_num = count_conv3d(self.decoder) - self._conv_idx = [0] - self._feat_map = [None] * self._conv_num - #cache encode - self._enc_conv_num = count_conv3d(self.encoder) - self._enc_conv_idx = [0] - self._enc_feat_map = [None] * self._enc_conv_num diff --git a/comfy/ldm/wan/vae2_2.py b/comfy/ldm/wan/vae2_2.py index 1f6d584a2..8e1593a54 100644 --- a/comfy/ldm/wan/vae2_2.py +++ b/comfy/ldm/wan/vae2_2.py @@ -657,51 +657,51 @@ class WanVAE(nn.Module): ) def encode(self, x): - self.clear_cache() + conv_idx = [0] + feat_map = [None] * count_conv3d(self.encoder) x = patchify(x, patch_size=2) t = x.shape[2] iter_ = 1 + (t - 1) // 4 for i in range(iter_): - self._enc_conv_idx = [0] + conv_idx = [0] if i == 0: out = self.encoder( x[:, :, :1, :, :], - feat_cache=self._enc_feat_map, - feat_idx=self._enc_conv_idx, + feat_cache=feat_map, + feat_idx=conv_idx, ) else: out_ = self.encoder( x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :], - feat_cache=self._enc_feat_map, - feat_idx=self._enc_conv_idx, + feat_cache=feat_map, + feat_idx=conv_idx, ) out = torch.cat([out, out_], 2) mu, log_var = self.conv1(out).chunk(2, dim=1) - self.clear_cache() return mu def decode(self, z): - self.clear_cache() + conv_idx = [0] + feat_map = [None] * count_conv3d(self.decoder) iter_ = z.shape[2] x = self.conv2(z) for i in range(iter_): - self._conv_idx = [0] + conv_idx = [0] if i == 0: out = self.decoder( x[:, :, i:i + 1, :, :], - feat_cache=self._feat_map, - feat_idx=self._conv_idx, + feat_cache=feat_map, + feat_idx=conv_idx, first_chunk=True, ) else: out_ = self.decoder( x[:, :, i:i + 1, :, :], - feat_cache=self._feat_map, - feat_idx=self._conv_idx, + feat_cache=feat_map, + feat_idx=conv_idx, ) out = torch.cat([out, out_], 2) out = unpatchify(out, patch_size=2) - self.clear_cache() return out def reparameterize(self, mu, log_var): @@ -715,12 +715,3 @@ class WanVAE(nn.Module): return mu std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0)) return mu + std * torch.randn_like(std) - - def clear_cache(self): - self._conv_num = count_conv3d(self.decoder) - self._conv_idx = [0] - self._feat_map = [None] * self._conv_num - # cache encode - self._enc_conv_num = count_conv3d(self.encoder) - self._enc_conv_idx = [0] - self._enc_feat_map = [None] * self._enc_conv_num diff --git a/comfy/model_base.py b/comfy/model_base.py index b0b9cde7d..7c788d085 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -134,10 +134,11 @@ class BaseModel(torch.nn.Module): if not unet_config.get("disable_unet_model_creation", False): if model_config.custom_operations is None: fp8 = model_config.optimizations.get("fp8", False) - operations = comfy.ops.pick_operations(unet_config.get("dtype", None), self.manual_cast_dtype, fp8_optimizations=fp8, scaled_fp8=model_config.scaled_fp8) + operations = comfy.ops.pick_operations(unet_config.get("dtype", None), self.manual_cast_dtype, fp8_optimizations=fp8, scaled_fp8=model_config.scaled_fp8, model_config=model_config) else: operations = model_config.custom_operations self.diffusion_model = unet_model(**unet_config, device=device, operations=operations) + self.diffusion_model.eval() if comfy.model_management.force_channels_last(): self.diffusion_model.to(memory_format=torch.channels_last) logging.debug("using channels last mode for diffusion model") @@ -196,8 +197,14 @@ class BaseModel(torch.nn.Module): extra_conds[o] = extra t = self.process_timestep(t, x=x, **extra_conds) - model_output = self.diffusion_model(xc, t, context=context, control=control, transformer_options=transformer_options, **extra_conds).float() - return self.model_sampling.calculate_denoised(sigma, model_output, x) + if "latent_shapes" in extra_conds: + xc = utils.unpack_latents(xc, extra_conds.pop("latent_shapes")) + + model_output = self.diffusion_model(xc, t, context=context, control=control, transformer_options=transformer_options, **extra_conds) + if len(model_output) > 1 and not torch.is_tensor(model_output): + model_output, _ = utils.pack_latents(model_output) + + return self.model_sampling.calculate_denoised(sigma, model_output.float(), x) def process_timestep(self, timestep, **kwargs): return timestep @@ -326,6 +333,14 @@ class BaseModel(torch.nn.Module): if self.model_config.scaled_fp8 is not None: unet_state_dict["scaled_fp8"] = torch.tensor([], dtype=self.model_config.scaled_fp8) + # Save mixed precision metadata + if hasattr(self.model_config, 'layer_quant_config') and self.model_config.layer_quant_config: + metadata = { + "format_version": "1.0", + "layers": self.model_config.layer_quant_config + } + unet_state_dict["_quantization_metadata"] = metadata + unet_state_dict = self.model_config.process_unet_state_dict_for_saving(unet_state_dict) if self.model_type == ModelType.V_PREDICTION: @@ -669,7 +684,6 @@ class Lotus(BaseModel): class StableCascade_C(BaseModel): def __init__(self, model_config, model_type=ModelType.STABLE_CASCADE, device=None): super().__init__(model_config, model_type, device=device, unet_model=StageC) - self.diffusion_model.eval().requires_grad_(False) def extra_conds(self, **kwargs): out = {} @@ -698,7 +712,6 @@ class StableCascade_C(BaseModel): class StableCascade_B(BaseModel): def __init__(self, model_config, model_type=ModelType.STABLE_CASCADE, device=None): super().__init__(model_config, model_type, device=device, unet_model=StageB) - self.diffusion_model.eval().requires_grad_(False) def extra_conds(self, **kwargs): out = {} diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 46415c17a..3142a7fc3 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -6,6 +6,20 @@ import math import logging import torch + +def detect_layer_quantization(metadata): + quant_key = "_quantization_metadata" + if metadata is not None and quant_key in metadata: + quant_metadata = metadata.pop(quant_key) + quant_metadata = json.loads(quant_metadata) + if isinstance(quant_metadata, dict) and "layers" in quant_metadata: + logging.info(f"Found quantization metadata (version {quant_metadata.get('format_version', 'unknown')})") + return quant_metadata["layers"] + else: + raise ValueError("Invalid quantization metadata format") + return None + + def count_blocks(state_dict_keys, prefix_string): count = 0 while True: @@ -213,7 +227,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["nerf_mlp_ratio"] = 4 dit_config["nerf_depth"] = 4 dit_config["nerf_max_freqs"] = 8 - dit_config["nerf_tile_size"] = 32 + dit_config["nerf_tile_size"] = 512 dit_config["nerf_final_head_type"] = "conv" if f"{key_prefix}nerf_final_layer_conv.norm.scale" in state_dict_keys else "linear" dit_config["nerf_embedder_dtype"] = torch.float32 else: @@ -365,8 +379,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["patch_size"] = 2 dit_config["in_channels"] = 16 dit_config["dim"] = 2304 - dit_config["cap_feat_dim"] = 2304 - dit_config["n_layers"] = 26 + dit_config["cap_feat_dim"] = state_dict['{}cap_embedder.1.weight'.format(key_prefix)].shape[1] + dit_config["n_layers"] = count_blocks(state_dict_keys, '{}layers.'.format(key_prefix) + '{}.') dit_config["n_heads"] = 24 dit_config["n_kv_heads"] = 8 dit_config["qk_norm"] = True @@ -701,6 +715,12 @@ def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=Fal else: model_config.optimizations["fp8"] = True + # Detect per-layer quantization (mixed precision) + layer_quant_config = detect_layer_quantization(metadata) + if layer_quant_config: + model_config.layer_quant_config = layer_quant_config + logging.info(f"Detected mixed precision quantization: {len(layer_quant_config)} layers quantized") + return model_config def unet_prefix_from_state_dict(state_dict): diff --git a/comfy/model_management.py b/comfy/model_management.py index 855761279..f0ccfc5dd 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -89,6 +89,7 @@ if args.deterministic: directml_enabled = False if args.directml is not None: + logging.warning("WARNING: torch-directml barely works, is very slow, has not been updated in over 1 year and might be removed soon, please don't use it, there are better options.") import torch_directml directml_enabled = True device_index = args.directml @@ -331,13 +332,21 @@ except: SUPPORT_FP8_OPS = args.supports_fp8_compute + +AMD_RDNA2_AND_OLDER_ARCH = ["gfx1030", "gfx1031", "gfx1010", "gfx1011", "gfx1012", "gfx906", "gfx900", "gfx803"] + try: if is_amd(): + arch = torch.cuda.get_device_properties(get_torch_device()).gcnArchName + if not (any((a in arch) for a in AMD_RDNA2_AND_OLDER_ARCH)): + torch.backends.cudnn.enabled = False # Seems to improve things a lot on AMD + logging.info("Set: torch.backends.cudnn.enabled = False for better AMD performance.") + try: rocm_version = tuple(map(int, str(torch.version.hip).split(".")[:2])) except: rocm_version = (6, -1) - arch = torch.cuda.get_device_properties(get_torch_device()).gcnArchName + logging.info("AMD arch: {}".format(arch)) logging.info("ROCm version: {}".format(rocm_version)) if args.use_split_cross_attention == False and args.use_quad_cross_attention == False: @@ -345,11 +354,11 @@ try: if torch_version_numeric >= (2, 7): # works on 2.6 but doesn't actually seem to improve much if any((a in arch) for a in ["gfx90a", "gfx942", "gfx1100", "gfx1101", "gfx1151"]): # TODO: more arches, TODO: gfx950 ENABLE_PYTORCH_ATTENTION = True -# if torch_version_numeric >= (2, 8): -# if any((a in arch) for a in ["gfx1201"]): -# ENABLE_PYTORCH_ATTENTION = True + if rocm_version >= (7, 0): + if any((a in arch) for a in ["gfx1201"]): + ENABLE_PYTORCH_ATTENTION = True if torch_version_numeric >= (2, 7) and rocm_version >= (6, 4): - if any((a in arch) for a in ["gfx1200", "gfx1201", "gfx942", "gfx950"]): # TODO: more arches + if any((a in arch) for a in ["gfx1200", "gfx1201", "gfx950"]): # TODO: more arches, "gfx942" gives error on pytorch nightly 2.10 1013 rocm7.0 SUPPORT_FP8_OPS = True except: @@ -371,6 +380,9 @@ try: except: pass +if torch.cuda.is_available() and torch.backends.cudnn.is_available() and PerformanceFeature.AutoTune in args.fast: + torch.backends.cudnn.benchmark = True + try: if torch_version_numeric >= (2, 5): torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True) @@ -493,6 +505,7 @@ class LoadedModel: if use_more_vram == 0: use_more_vram = 1e32 self.model_use_more_vram(use_more_vram, force_patch_weights=force_patch_weights) + real_model = self.model.model if is_intel_xpu() and not args.disable_ipex_optimize and 'ipex' in globals() and real_model is not None: @@ -678,7 +691,10 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu current_free_mem = get_free_memory(torch_dev) + loaded_memory lowvram_model_memory = max(128 * 1024 * 1024, (current_free_mem - minimum_memory_required), min(current_free_mem * MIN_WEIGHT_MEMORY_RATIO, current_free_mem - minimum_inference_memory())) - lowvram_model_memory = max(0.1, lowvram_model_memory - loaded_memory) + lowvram_model_memory = lowvram_model_memory - loaded_memory + + if lowvram_model_memory == 0: + lowvram_model_memory = 0.1 if vram_set_state == VRAMState.NO_VRAM: lowvram_model_memory = 0.1 @@ -926,11 +942,7 @@ def vae_dtype(device=None, allowed_dtypes=[]): if d == torch.float16 and should_use_fp16(device): return d - # NOTE: bfloat16 seems to work on AMD for the VAE but is extremely slow in some cases compared to fp32 - # slowness still a problem on pytorch nightly 2.9.0.dev20250720+rocm6.4 tested on RDNA3 - # also a problem on RDNA4 except fp32 is also slow there. - # This is due to large bf16 convolutions being extremely slow. - if d == torch.bfloat16 and ((not is_amd()) or amd_min_version(device, min_rdna_version=4)) and should_use_bf16(device): + if d == torch.bfloat16 and should_use_bf16(device): return d return torch.float32 @@ -992,12 +1004,6 @@ def device_supports_non_blocking(device): return False return True -def device_should_use_non_blocking(device): - if not device_supports_non_blocking(device): - return False - return False - # return True #TODO: figure out why this causes memory issues on Nvidia and possibly others - def force_channels_last(): if args.force_channels_last: return True @@ -1012,6 +1018,16 @@ if args.async_offload: NUM_STREAMS = 2 logging.info("Using async weight offloading with {} streams".format(NUM_STREAMS)) +def current_stream(device): + if device is None: + return None + if is_device_cuda(device): + return torch.cuda.current_stream() + elif is_device_xpu(device): + return torch.xpu.current_stream() + else: + return None + stream_counters = {} def get_offload_stream(device): stream_counter = stream_counters.get(device, 0) @@ -1020,21 +1036,17 @@ def get_offload_stream(device): if device in STREAMS: ss = STREAMS[device] - s = ss[stream_counter] + #Sync the oldest stream in the queue with the current + ss[stream_counter].wait_stream(current_stream(device)) stream_counter = (stream_counter + 1) % len(ss) - if is_device_cuda(device): - ss[stream_counter].wait_stream(torch.cuda.current_stream()) - elif is_device_xpu(device): - ss[stream_counter].wait_stream(torch.xpu.current_stream()) stream_counters[device] = stream_counter - return s + return ss[stream_counter] elif is_device_cuda(device): ss = [] for k in range(NUM_STREAMS): ss.append(torch.cuda.Stream(device=device, priority=0)) STREAMS[device] = ss s = ss[stream_counter] - stream_counter = (stream_counter + 1) % len(ss) stream_counters[device] = stream_counter return s elif is_device_xpu(device): @@ -1043,18 +1055,14 @@ def get_offload_stream(device): ss.append(torch.xpu.Stream(device=device, priority=0)) STREAMS[device] = ss s = ss[stream_counter] - stream_counter = (stream_counter + 1) % len(ss) stream_counters[device] = stream_counter return s return None def sync_stream(device, stream): - if stream is None: + if stream is None or current_stream(device) is None: return - if is_device_cuda(device): - torch.cuda.current_stream().wait_stream(stream) - elif is_device_xpu(device): - torch.xpu.current_stream().wait_stream(stream) + current_stream(device).wait_stream(stream) def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, stream=None): if device is None or weight.device == device: @@ -1079,6 +1087,79 @@ def cast_to_device(tensor, device, dtype, copy=False): non_blocking = device_supports_non_blocking(device) return cast_to(tensor, dtype=dtype, device=device, non_blocking=non_blocking, copy=copy) + +PINNED_MEMORY = {} +TOTAL_PINNED_MEMORY = 0 +MAX_PINNED_MEMORY = -1 +if not args.disable_pinned_memory: + if is_nvidia() or is_amd(): + if WINDOWS: + MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.45 # Windows limit is apparently 50% + else: + MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.95 + logging.info("Enabled pinned memory {}".format(MAX_PINNED_MEMORY // (1024 * 1024))) + + +def pin_memory(tensor): + global TOTAL_PINNED_MEMORY + if MAX_PINNED_MEMORY <= 0: + return False + + if type(tensor) is not torch.nn.parameter.Parameter: + return False + + if not is_device_cpu(tensor.device): + return False + + if tensor.is_pinned(): + #NOTE: Cuda does detect when a tensor is already pinned and would + #error below, but there are proven cases where this also queues an error + #on the GPU async. So dont trust the CUDA API and guard here + return False + + if not tensor.is_contiguous(): + return False + + size = tensor.numel() * tensor.element_size() + if (TOTAL_PINNED_MEMORY + size) > MAX_PINNED_MEMORY: + return False + + ptr = tensor.data_ptr() + if torch.cuda.cudart().cudaHostRegister(ptr, size, 1) == 0: + PINNED_MEMORY[ptr] = size + TOTAL_PINNED_MEMORY += size + return True + + return False + +def unpin_memory(tensor): + global TOTAL_PINNED_MEMORY + if MAX_PINNED_MEMORY <= 0: + return False + + if not is_device_cpu(tensor.device): + return False + + ptr = tensor.data_ptr() + size = tensor.numel() * tensor.element_size() + + size_stored = PINNED_MEMORY.get(ptr, None) + if size_stored is None: + logging.warning("Tried to unpin tensor not pinned by ComfyUI") + return False + + if size != size_stored: + logging.warning("Size of pinned tensor changed") + return False + + if torch.cuda.cudart().cudaHostUnregister(ptr) == 0: + TOTAL_PINNED_MEMORY -= PINNED_MEMORY.pop(ptr) + if len(PINNED_MEMORY) == 0: + TOTAL_PINNED_MEMORY = 0 + return True + + return False + def sage_attention_enabled(): return args.use_sage_attention @@ -1331,7 +1412,7 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma if is_amd(): arch = torch.cuda.get_device_properties(device).gcnArchName - if any((a in arch) for a in ["gfx1030", "gfx1031", "gfx1010", "gfx1011", "gfx1012", "gfx906", "gfx900", "gfx803"]): # RDNA2 and older don't support bf16 + if any((a in arch) for a in AMD_RDNA2_AND_OLDER_ARCH): # RDNA2 and older don't support bf16 if manual_cast: return True return False diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 1fd03d9d1..cf1b0d441 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -123,16 +123,30 @@ def move_weight_functions(m, device): return memory class LowVramPatch: - def __init__(self, key, patches): + def __init__(self, key, patches, convert_func=None, set_func=None): self.key = key self.patches = patches + self.convert_func = convert_func + self.set_func = set_func + def __call__(self, weight): intermediate_dtype = weight.dtype + if self.convert_func is not None: + weight = self.convert_func(weight.to(dtype=torch.float32, copy=True), inplace=True) + if intermediate_dtype not in [torch.float32, torch.float16, torch.bfloat16]: #intermediate_dtype has to be one that is supported in math ops intermediate_dtype = torch.float32 - return comfy.float.stochastic_rounding(comfy.lora.calculate_weight(self.patches[self.key], weight.to(intermediate_dtype), self.key, intermediate_dtype=intermediate_dtype), weight.dtype, seed=string_to_seed(self.key)) + out = comfy.lora.calculate_weight(self.patches[self.key], weight.to(intermediate_dtype), self.key, intermediate_dtype=intermediate_dtype) + if self.set_func is None: + return comfy.float.stochastic_rounding(out, weight.dtype, seed=string_to_seed(self.key)) + else: + return self.set_func(out, seed=string_to_seed(self.key), return_weight=True) - return comfy.lora.calculate_weight(self.patches[self.key], weight, self.key, intermediate_dtype=intermediate_dtype) + out = comfy.lora.calculate_weight(self.patches[self.key], weight, self.key, intermediate_dtype=intermediate_dtype) + if self.set_func is not None: + return self.set_func(out, seed=string_to_seed(self.key), return_weight=True).to(dtype=intermediate_dtype) + else: + return out def get_key_weight(model, key): set_func = None @@ -224,6 +238,7 @@ class ModelPatcher: self.force_cast_weights = False self.patches_uuid = uuid.uuid4() self.parent = None + self.pinned = set() self.attachments: dict[str] = {} self.additional_models: dict[str, list[ModelPatcher]] = {} @@ -261,6 +276,9 @@ class ModelPatcher: self.size = comfy.model_management.module_size(self.model) return self.size + def get_ram_usage(self): + return self.model_size() + def loaded_size(self): return self.model.model_loaded_weight_memory @@ -280,6 +298,7 @@ class ModelPatcher: n.backup = self.backup n.object_patches_backup = self.object_patches_backup n.parent = self + n.pinned = self.pinned n.force_cast_weights = self.force_cast_weights @@ -436,6 +455,19 @@ class ModelPatcher: def set_model_post_input_patch(self, patch): self.set_model_patch(patch, "post_input") + def set_model_rope_options(self, scale_x, shift_x, scale_y, shift_y, scale_t, shift_t, **kwargs): + rope_options = self.model_options["transformer_options"].get("rope_options", {}) + rope_options["scale_x"] = scale_x + rope_options["scale_y"] = scale_y + rope_options["scale_t"] = scale_t + + rope_options["shift_x"] = shift_x + rope_options["shift_y"] = shift_y + rope_options["shift_t"] = shift_t + + self.model_options["transformer_options"]["rope_options"] = rope_options + + def add_object_patch(self, name, obj): self.object_patches[name] = obj @@ -604,6 +636,21 @@ class ModelPatcher: else: set_func(out_weight, inplace_update=inplace_update, seed=string_to_seed(key)) + def pin_weight_to_device(self, key): + weight, set_func, convert_func = get_key_weight(self.model, key) + if comfy.model_management.pin_memory(weight): + self.pinned.add(key) + + def unpin_weight(self, key): + if key in self.pinned: + weight, set_func, convert_func = get_key_weight(self.model, key) + comfy.model_management.unpin_memory(weight) + self.pinned.remove(key) + + def unpin_all_weights(self): + for key in list(self.pinned): + self.unpin_weight(key) + def _load_list(self): loading = [] for n, m in self.model.named_modules(): @@ -625,9 +672,11 @@ class ModelPatcher: mem_counter = 0 patch_counter = 0 lowvram_counter = 0 + lowvram_mem_counter = 0 loading = self._load_list() load_completely = [] + offloaded = [] loading.sort(reverse=True) for x in loading: n = x[1] @@ -644,6 +693,7 @@ class ModelPatcher: if mem_counter + module_mem >= lowvram_model_memory: lowvram_weight = True lowvram_counter += 1 + lowvram_mem_counter += module_mem if hasattr(m, "prev_comfy_cast_weights"): #Already lowvramed continue @@ -657,16 +707,19 @@ class ModelPatcher: if force_patch_weights: self.patch_weight_to_device(weight_key) else: - m.weight_function = [LowVramPatch(weight_key, self.patches)] + _, set_func, convert_func = get_key_weight(self.model, weight_key) + m.weight_function = [LowVramPatch(weight_key, self.patches, convert_func, set_func)] patch_counter += 1 if bias_key in self.patches: if force_patch_weights: self.patch_weight_to_device(bias_key) else: - m.bias_function = [LowVramPatch(bias_key, self.patches)] + _, set_func, convert_func = get_key_weight(self.model, bias_key) + m.bias_function = [LowVramPatch(bias_key, self.patches, convert_func, set_func)] patch_counter += 1 cast_weight = True + offloaded.append((module_mem, n, m, params)) else: if hasattr(m, "comfy_cast_weights"): wipe_lowvram_weight(m) @@ -697,7 +750,9 @@ class ModelPatcher: continue for param in params: - self.patch_weight_to_device("{}.{}".format(n, param), device_to=device_to) + key = "{}.{}".format(n, param) + self.unpin_weight(key) + self.patch_weight_to_device(key, device_to=device_to) logging.debug("lowvram: loaded module regularly {} {}".format(n, m)) m.comfy_patched_weights = True @@ -705,11 +760,17 @@ class ModelPatcher: for x in load_completely: x[2].to(device_to) + for x in offloaded: + n = x[1] + params = x[3] + for param in params: + self.pin_weight_to_device("{}.{}".format(n, param)) + if lowvram_counter > 0: - logging.info("loaded partially {} {} {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), patch_counter)) + logging.info("loaded partially; {:.2f} MB usable, {:.2f} MB loaded, {:.2f} MB offloaded, lowvram patches: {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), lowvram_mem_counter / (1024 * 1024), patch_counter)) self.model.model_lowvram = True else: - logging.info("loaded completely {} {} {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), full_load)) + logging.info("loaded completely; {:.2f} MB usable, {:.2f} MB loaded, full load: {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), full_load)) self.model.model_lowvram = False if full_load: self.model.to(device_to) @@ -746,6 +807,7 @@ class ModelPatcher: self.eject_model() if unpatch_weights: self.unpatch_hooks() + self.unpin_all_weights() if self.model.model_lowvram: for m in self.model.modules(): move_weight_functions(m, device_to) @@ -781,7 +843,7 @@ class ModelPatcher: self.object_patches_backup.clear() - def partially_unload(self, device_to, memory_to_free=0): + def partially_unload(self, device_to, memory_to_free=0, force_patch_weights=False): with self.use_ejected(): hooks_unpatched = False memory_freed = 0 @@ -825,11 +887,19 @@ class ModelPatcher: module_mem += move_weight_functions(m, device_to) if lowvram_possible: if weight_key in self.patches: - m.weight_function.append(LowVramPatch(weight_key, self.patches)) - patch_counter += 1 + if force_patch_weights: + self.patch_weight_to_device(weight_key) + else: + _, set_func, convert_func = get_key_weight(self.model, weight_key) + m.weight_function.append(LowVramPatch(weight_key, self.patches, convert_func, set_func)) + patch_counter += 1 if bias_key in self.patches: - m.bias_function.append(LowVramPatch(bias_key, self.patches)) - patch_counter += 1 + if force_patch_weights: + self.patch_weight_to_device(bias_key) + else: + _, set_func, convert_func = get_key_weight(self.model, bias_key) + m.bias_function.append(LowVramPatch(bias_key, self.patches, convert_func, set_func)) + patch_counter += 1 cast_weight = True if cast_weight: @@ -839,9 +909,13 @@ class ModelPatcher: memory_freed += module_mem logging.debug("freed {}".format(n)) + for param in params: + self.pin_weight_to_device("{}.{}".format(n, param)) + self.model.model_lowvram = True self.model.lowvram_patch_counter += patch_counter self.model.model_loaded_weight_memory -= memory_freed + logging.info("loaded partially: {:.2f} MB loaded, lowvram patches: {}".format(self.model.model_loaded_weight_memory / (1024 * 1024), self.model.lowvram_patch_counter)) return memory_freed def partially_load(self, device_to, extra_memory=0, force_patch_weights=False): @@ -854,6 +928,9 @@ class ModelPatcher: extra_memory += (used - self.model.model_loaded_weight_memory) self.patch_model(load_weights=False) + if extra_memory < 0 and not unpatch_weights: + self.partially_unload(self.offload_device, -extra_memory, force_patch_weights=force_patch_weights) + return 0 full_load = False if self.model.model_lowvram == False and self.model.model_loaded_weight_memory > 0: self.apply_hooks(self.forced_hooks, force_apply=True) @@ -1241,5 +1318,6 @@ class ModelPatcher: self.clear_cached_hook_weights() def __del__(self): + self.unpin_all_weights() self.detach(unpatch_all=False) diff --git a/comfy/model_sampling.py b/comfy/model_sampling.py index b240b7f29..2a00ed819 100644 --- a/comfy/model_sampling.py +++ b/comfy/model_sampling.py @@ -21,17 +21,23 @@ def rescale_zero_terminal_snr_sigmas(sigmas): alphas_bar[-1] = 4.8973451890853435e-08 return ((1 - alphas_bar) / alphas_bar) ** 0.5 +def reshape_sigma(sigma, noise_dim): + if sigma.nelement() == 1: + return sigma.view(()) + else: + return sigma.view(sigma.shape[:1] + (1,) * (noise_dim - 1)) + class EPS: def calculate_input(self, sigma, noise): - sigma = sigma.view(sigma.shape[:1] + (1,) * (noise.ndim - 1)) + sigma = reshape_sigma(sigma, noise.ndim) return noise / (sigma ** 2 + self.sigma_data ** 2) ** 0.5 def calculate_denoised(self, sigma, model_output, model_input): - sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1)) + sigma = reshape_sigma(sigma, model_output.ndim) return model_input - model_output * sigma def noise_scaling(self, sigma, noise, latent_image, max_denoise=False): - sigma = sigma.view(sigma.shape[:1] + (1,) * (noise.ndim - 1)) + sigma = reshape_sigma(sigma, noise.ndim) if max_denoise: noise = noise * torch.sqrt(1.0 + sigma ** 2.0) else: @@ -45,12 +51,12 @@ class EPS: class V_PREDICTION(EPS): def calculate_denoised(self, sigma, model_output, model_input): - sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1)) + sigma = reshape_sigma(sigma, model_output.ndim) return model_input * self.sigma_data ** 2 / (sigma ** 2 + self.sigma_data ** 2) - model_output * sigma * self.sigma_data / (sigma ** 2 + self.sigma_data ** 2) ** 0.5 class EDM(V_PREDICTION): def calculate_denoised(self, sigma, model_output, model_input): - sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1)) + sigma = reshape_sigma(sigma, model_output.ndim) return model_input * self.sigma_data ** 2 / (sigma ** 2 + self.sigma_data ** 2) + model_output * sigma * self.sigma_data / (sigma ** 2 + self.sigma_data ** 2) ** 0.5 class CONST: @@ -58,15 +64,15 @@ class CONST: return noise def calculate_denoised(self, sigma, model_output, model_input): - sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1)) + sigma = reshape_sigma(sigma, model_output.ndim) return model_input - model_output * sigma def noise_scaling(self, sigma, noise, latent_image, max_denoise=False): - sigma = sigma.view(sigma.shape[:1] + (1,) * (noise.ndim - 1)) + sigma = reshape_sigma(sigma, noise.ndim) return sigma * noise + (1.0 - sigma) * latent_image def inverse_noise_scaling(self, sigma, latent): - sigma = sigma.view(sigma.shape[:1] + (1,) * (latent.ndim - 1)) + sigma = reshape_sigma(sigma, latent.ndim) return latent / (1.0 - sigma) class X0(EPS): @@ -80,16 +86,16 @@ class IMG_TO_IMG(X0): class COSMOS_RFLOW: def calculate_input(self, sigma, noise): sigma = (sigma / (sigma + 1)) - sigma = sigma.view(sigma.shape[:1] + (1,) * (noise.ndim - 1)) + sigma = reshape_sigma(sigma, noise.ndim) return noise * (1.0 - sigma) def calculate_denoised(self, sigma, model_output, model_input): sigma = (sigma / (sigma + 1)) - sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1)) + sigma = reshape_sigma(sigma, model_output.ndim) return model_input * (1.0 - sigma) - model_output * sigma def noise_scaling(self, sigma, noise, latent_image, max_denoise=False): - sigma = sigma.view(sigma.shape[:1] + (1,) * (noise.ndim - 1)) + sigma = reshape_sigma(sigma, noise.ndim) noise = noise * sigma noise += latent_image return noise diff --git a/comfy/nested_tensor.py b/comfy/nested_tensor.py new file mode 100644 index 000000000..b700816fa --- /dev/null +++ b/comfy/nested_tensor.py @@ -0,0 +1,91 @@ +import torch + +class NestedTensor: + def __init__(self, tensors): + self.tensors = list(tensors) + self.is_nested = True + + def _copy(self): + return NestedTensor(self.tensors) + + def apply_operation(self, other, operation): + o = self._copy() + if isinstance(other, NestedTensor): + for i, t in enumerate(o.tensors): + o.tensors[i] = operation(t, other.tensors[i]) + else: + for i, t in enumerate(o.tensors): + o.tensors[i] = operation(t, other) + return o + + def __add__(self, b): + return self.apply_operation(b, lambda x, y: x + y) + + def __sub__(self, b): + return self.apply_operation(b, lambda x, y: x - y) + + def __mul__(self, b): + return self.apply_operation(b, lambda x, y: x * y) + + # def __itruediv__(self, b): + # return self.apply_operation(b, lambda x, y: x / y) + + def __truediv__(self, b): + return self.apply_operation(b, lambda x, y: x / y) + + def __getitem__(self, *args, **kwargs): + return self.apply_operation(None, lambda x, y: x.__getitem__(*args, **kwargs)) + + def unbind(self): + return self.tensors + + def to(self, *args, **kwargs): + o = self._copy() + for i, t in enumerate(o.tensors): + o.tensors[i] = t.to(*args, **kwargs) + return o + + def new_ones(self, *args, **kwargs): + return self.tensors[0].new_ones(*args, **kwargs) + + def float(self): + return self.to(dtype=torch.float) + + def chunk(self, *args, **kwargs): + return self.apply_operation(None, lambda x, y: x.chunk(*args, **kwargs)) + + def size(self): + return self.tensors[0].size() + + @property + def shape(self): + return self.tensors[0].shape + + @property + def ndim(self): + dims = 0 + for t in self.tensors: + dims = max(t.ndim, dims) + return dims + + @property + def device(self): + return self.tensors[0].device + + @property + def dtype(self): + return self.tensors[0].dtype + + @property + def layout(self): + return self.tensors[0].layout + + +def cat_nested(tensors, *args, **kwargs): + cated_tensors = [] + for i in range(len(tensors[0].tensors)): + tens = [] + for j in range(len(tensors)): + tens.append(tensors[j].tensors[i]) + cated_tensors.append(torch.cat(tens, *args, **kwargs)) + return NestedTensor(cated_tensors) diff --git a/comfy/ops.py b/comfy/ops.py index 9d7dedd37..2a90a5ba2 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -24,13 +24,18 @@ import comfy.float import comfy.rmsnorm import contextlib +def run_every_op(): + if torch.compiler.is_compiling(): + return + + comfy.model_management.throw_exception_if_processing_interrupted() def scaled_dot_product_attention(q, k, v, *args, **kwargs): return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs) try: - if torch.cuda.is_available(): + if torch.cuda.is_available() and comfy.model_management.WINDOWS: from torch.nn.attention import SDPBackend, sdpa_kernel import inspect if "set_priority" in inspect.signature(sdpa_kernel).parameters: @@ -50,49 +55,89 @@ try: except (ModuleNotFoundError, TypeError): logging.warning("Could not set sdpa backend priority.") -cast_to = comfy.model_management.cast_to #TODO: remove once no more references +NVIDIA_MEMORY_CONV_BUG_WORKAROUND = False +try: + if comfy.model_management.is_nvidia(): + if torch.backends.cudnn.version() >= 91002 and comfy.model_management.torch_version_numeric >= (2, 9) and comfy.model_management.torch_version_numeric <= (2, 10): + #TODO: change upper bound version once it's fixed' + NVIDIA_MEMORY_CONV_BUG_WORKAROUND = True + logging.info("working around nvidia conv3d memory bug.") +except: + pass -if torch.cuda.is_available() and torch.backends.cudnn.is_available() and PerformanceFeature.AutoTune in args.fast: - torch.backends.cudnn.benchmark = True +cast_to = comfy.model_management.cast_to #TODO: remove once no more references def cast_to_input(weight, input, non_blocking=False, copy=True): return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy) -def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None): + +def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, offloadable=False): + # NOTE: offloadable=False is a a legacy and if you are a custom node author reading this please pass + # offloadable=True and call uncast_bias_weight() after your last usage of the weight/bias. This + # will add async-offload support to your cast and improve performance. if input is not None: if dtype is None: - dtype = input.dtype + if isinstance(input, QuantizedTensor): + dtype = input._layout_params["orig_dtype"] + else: + dtype = input.dtype if bias_dtype is None: bias_dtype = dtype if device is None: device = input.device - offload_stream = comfy.model_management.get_offload_stream(device) + if offloadable and (device != s.weight.device or + (s.bias is not None and device != s.bias.device)): + offload_stream = comfy.model_management.get_offload_stream(device) + else: + offload_stream = None + if offload_stream is not None: wf_context = offload_stream else: wf_context = contextlib.nullcontext() - bias = None non_blocking = comfy.model_management.device_supports_non_blocking(device) - if s.bias is not None: - has_function = len(s.bias_function) > 0 - bias = comfy.model_management.cast_to(s.bias, bias_dtype, device, non_blocking=non_blocking, copy=has_function, stream=offload_stream) - if has_function: + weight_has_function = len(s.weight_function) > 0 + bias_has_function = len(s.bias_function) > 0 + + weight = comfy.model_management.cast_to(s.weight, None, device, non_blocking=non_blocking, copy=weight_has_function, stream=offload_stream) + + bias = None + if s.bias is not None: + bias = comfy.model_management.cast_to(s.bias, bias_dtype, device, non_blocking=non_blocking, copy=bias_has_function, stream=offload_stream) + + if bias_has_function: with wf_context: for f in s.bias_function: bias = f(bias) - has_function = len(s.weight_function) > 0 - weight = comfy.model_management.cast_to(s.weight, dtype, device, non_blocking=non_blocking, copy=has_function, stream=offload_stream) - if has_function: + if weight_has_function or weight.dtype != dtype: with wf_context: + weight = weight.to(dtype=dtype) for f in s.weight_function: weight = f(weight) comfy.model_management.sync_stream(device, offload_stream) - return weight, bias + if offloadable: + return weight, bias, offload_stream + else: + #Legacy function signature + return weight, bias + + +def uncast_bias_weight(s, weight, bias, offload_stream): + if offload_stream is None: + return + if weight is not None: + device = weight.device + else: + if bias is None: + return + device = bias.device + offload_stream.wait_stream(comfy.model_management.current_stream(device)) + class CastWeightBiasOp: comfy_cast_weights = False @@ -105,10 +150,13 @@ class disable_weight_init: return None def forward_comfy_cast_weights(self, input): - weight, bias = cast_bias_weight(self, input) - return torch.nn.functional.linear(input, weight, bias) + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) + x = torch.nn.functional.linear(input, weight, bias) + uncast_bias_weight(self, weight, bias, offload_stream) + return x def forward(self, *args, **kwargs): + run_every_op() if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: return self.forward_comfy_cast_weights(*args, **kwargs) else: @@ -119,10 +167,13 @@ class disable_weight_init: return None def forward_comfy_cast_weights(self, input): - weight, bias = cast_bias_weight(self, input) - return self._conv_forward(input, weight, bias) + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) + x = self._conv_forward(input, weight, bias) + uncast_bias_weight(self, weight, bias, offload_stream) + return x def forward(self, *args, **kwargs): + run_every_op() if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: return self.forward_comfy_cast_weights(*args, **kwargs) else: @@ -133,10 +184,13 @@ class disable_weight_init: return None def forward_comfy_cast_weights(self, input): - weight, bias = cast_bias_weight(self, input) - return self._conv_forward(input, weight, bias) + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) + x = self._conv_forward(input, weight, bias) + uncast_bias_weight(self, weight, bias, offload_stream) + return x def forward(self, *args, **kwargs): + run_every_op() if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: return self.forward_comfy_cast_weights(*args, **kwargs) else: @@ -146,11 +200,23 @@ class disable_weight_init: def reset_parameters(self): return None + def _conv_forward(self, input, weight, bias, *args, **kwargs): + if NVIDIA_MEMORY_CONV_BUG_WORKAROUND and weight.dtype in (torch.float16, torch.bfloat16): + out = torch.cudnn_convolution(input, weight, self.padding, self.stride, self.dilation, self.groups, benchmark=False, deterministic=False, allow_tf32=True) + if bias is not None: + out += bias.reshape((1, -1) + (1,) * (out.ndim - 2)) + return out + else: + return super()._conv_forward(input, weight, bias, *args, **kwargs) + def forward_comfy_cast_weights(self, input): - weight, bias = cast_bias_weight(self, input) - return self._conv_forward(input, weight, bias) + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) + x = self._conv_forward(input, weight, bias) + uncast_bias_weight(self, weight, bias, offload_stream) + return x def forward(self, *args, **kwargs): + run_every_op() if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: return self.forward_comfy_cast_weights(*args, **kwargs) else: @@ -161,10 +227,13 @@ class disable_weight_init: return None def forward_comfy_cast_weights(self, input): - weight, bias = cast_bias_weight(self, input) - return torch.nn.functional.group_norm(input, self.num_groups, weight, bias, self.eps) + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) + x = torch.nn.functional.group_norm(input, self.num_groups, weight, bias, self.eps) + uncast_bias_weight(self, weight, bias, offload_stream) + return x def forward(self, *args, **kwargs): + run_every_op() if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: return self.forward_comfy_cast_weights(*args, **kwargs) else: @@ -176,13 +245,17 @@ class disable_weight_init: def forward_comfy_cast_weights(self, input): if self.weight is not None: - weight, bias = cast_bias_weight(self, input) + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) else: weight = None bias = None - return torch.nn.functional.layer_norm(input, self.normalized_shape, weight, bias, self.eps) + offload_stream = None + x = torch.nn.functional.layer_norm(input, self.normalized_shape, weight, bias, self.eps) + uncast_bias_weight(self, weight, bias, offload_stream) + return x def forward(self, *args, **kwargs): + run_every_op() if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: return self.forward_comfy_cast_weights(*args, **kwargs) else: @@ -195,13 +268,18 @@ class disable_weight_init: def forward_comfy_cast_weights(self, input): if self.weight is not None: - weight, bias = cast_bias_weight(self, input) + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) else: weight = None - return comfy.rmsnorm.rms_norm(input, weight, self.eps) # TODO: switch to commented out line when old torch is deprecated - # return torch.nn.functional.rms_norm(input, self.normalized_shape, weight, self.eps) + bias = None + offload_stream = None + x = comfy.rmsnorm.rms_norm(input, weight, self.eps) # TODO: switch to commented out line when old torch is deprecated + # x = torch.nn.functional.rms_norm(input, self.normalized_shape, weight, self.eps) + uncast_bias_weight(self, weight, bias, offload_stream) + return x def forward(self, *args, **kwargs): + run_every_op() if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: return self.forward_comfy_cast_weights(*args, **kwargs) else: @@ -217,12 +295,15 @@ class disable_weight_init: input, output_size, self.stride, self.padding, self.kernel_size, num_spatial_dims, self.dilation) - weight, bias = cast_bias_weight(self, input) - return torch.nn.functional.conv_transpose2d( + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) + x = torch.nn.functional.conv_transpose2d( input, weight, bias, self.stride, self.padding, output_padding, self.groups, self.dilation) + uncast_bias_weight(self, weight, bias, offload_stream) + return x def forward(self, *args, **kwargs): + run_every_op() if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: return self.forward_comfy_cast_weights(*args, **kwargs) else: @@ -238,12 +319,15 @@ class disable_weight_init: input, output_size, self.stride, self.padding, self.kernel_size, num_spatial_dims, self.dilation) - weight, bias = cast_bias_weight(self, input) - return torch.nn.functional.conv_transpose1d( + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) + x = torch.nn.functional.conv_transpose1d( input, weight, bias, self.stride, self.padding, output_padding, self.groups, self.dilation) + uncast_bias_weight(self, weight, bias, offload_stream) + return x def forward(self, *args, **kwargs): + run_every_op() if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: return self.forward_comfy_cast_weights(*args, **kwargs) else: @@ -258,10 +342,14 @@ class disable_weight_init: output_dtype = out_dtype if self.weight.dtype == torch.float16 or self.weight.dtype == torch.bfloat16: out_dtype = None - weight, bias = cast_bias_weight(self, device=input.device, dtype=out_dtype) - return torch.nn.functional.embedding(input, weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse).to(dtype=output_dtype) + weight, bias, offload_stream = cast_bias_weight(self, device=input.device, dtype=out_dtype, offloadable=True) + x = torch.nn.functional.embedding(input, weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse).to(dtype=output_dtype) + uncast_bias_weight(self, weight, bias, offload_stream) + return x + def forward(self, *args, **kwargs): + run_every_op() if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: return self.forward_comfy_cast_weights(*args, **kwargs) else: @@ -312,20 +400,18 @@ class manual_cast(disable_weight_init): def fp8_linear(self, input): + """ + Legacy FP8 linear function for backward compatibility. + Uses QuantizedTensor subclass for dispatch. + """ dtype = self.weight.dtype if dtype not in [torch.float8_e4m3fn]: return None - tensor_2d = False - if len(input.shape) == 2: - tensor_2d = True - input = input.unsqueeze(1) - - input_shape = input.shape input_dtype = input.dtype - if len(input.shape) == 3: - w, bias = cast_bias_weight(self, input, dtype=dtype, bias_dtype=input_dtype) - w = w.t() + + if input.ndim == 3 or input.ndim == 2: + w, bias, offload_stream = cast_bias_weight(self, input, dtype=dtype, bias_dtype=input_dtype, offloadable=True) scale_weight = self.scale_weight scale_input = self.scale_input @@ -337,23 +423,20 @@ def fp8_linear(self, input): if scale_input is None: scale_input = torch.ones((), device=input.device, dtype=torch.float32) input = torch.clamp(input, min=-448, max=448, out=input) - input = input.reshape(-1, input_shape[2]).to(dtype).contiguous() + layout_params_weight = {'scale': scale_input, 'orig_dtype': input_dtype} + quantized_input = QuantizedTensor(input.to(dtype).contiguous(), "TensorCoreFP8Layout", layout_params_weight) else: scale_input = scale_input.to(input.device) - input = (input * (1.0 / scale_input).to(input_dtype)).reshape(-1, input_shape[2]).to(dtype).contiguous() + quantized_input = QuantizedTensor.from_float(input, "TensorCoreFP8Layout", scale=scale_input, dtype=dtype) - if bias is not None: - o = torch._scaled_mm(input, w, out_dtype=input_dtype, bias=bias, scale_a=scale_input, scale_b=scale_weight) - else: - o = torch._scaled_mm(input, w, out_dtype=input_dtype, scale_a=scale_input, scale_b=scale_weight) + # Wrap weight in QuantizedTensor - this enables unified dispatch + # Call F.linear - __torch_dispatch__ routes to fp8_linear handler in quant_ops.py! + layout_params_weight = {'scale': scale_weight, 'orig_dtype': input_dtype} + quantized_weight = QuantizedTensor(w, "TensorCoreFP8Layout", layout_params_weight) + o = torch.nn.functional.linear(quantized_input, quantized_weight, bias) - if isinstance(o, tuple): - o = o[0] - - if tensor_2d: - return o.reshape(input_shape[0], -1) - - return o.reshape((-1, input_shape[1], self.weight.shape[0])) + uncast_bias_weight(self, w, bias, offload_stream) + return o return None @@ -373,8 +456,10 @@ class fp8_ops(manual_cast): except Exception as e: logging.info("Exception during fp8 op: {}".format(e)) - weight, bias = cast_bias_weight(self, input) - return torch.nn.functional.linear(input, weight, bias) + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) + x = torch.nn.functional.linear(input, weight, bias) + uncast_bias_weight(self, weight, bias, offload_stream) + return x def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None): logging.info("Using scaled fp8: fp8 matrix mult: {}, scale input: {}".format(fp8_matrix_mult, scale_input)) @@ -402,12 +487,14 @@ def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None if out is not None: return out - weight, bias = cast_bias_weight(self, input) + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) if weight.numel() < input.numel(): #TODO: optimize - return torch.nn.functional.linear(input, weight * self.scale_weight.to(device=weight.device, dtype=weight.dtype), bias) + x = torch.nn.functional.linear(input, weight * self.scale_weight.to(device=weight.device, dtype=weight.dtype), bias) else: - return torch.nn.functional.linear(input * self.scale_weight.to(device=weight.device, dtype=weight.dtype), weight, bias) + x = torch.nn.functional.linear(input * self.scale_weight.to(device=weight.device, dtype=weight.dtype), weight, bias) + uncast_bias_weight(self, weight, bias, offload_stream) + return x def convert_weight(self, weight, inplace=False, **kwargs): if inplace: @@ -416,8 +503,10 @@ def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None else: return weight * self.scale_weight.to(device=weight.device, dtype=weight.dtype) - def set_weight(self, weight, inplace_update=False, seed=None, **kwargs): + def set_weight(self, weight, inplace_update=False, seed=None, return_weight=False, **kwargs): weight = comfy.float.stochastic_rounding(weight / self.scale_weight.to(device=weight.device, dtype=weight.dtype), self.weight.dtype, seed=seed) + if return_weight: + return weight if inplace_update: self.weight.data.copy_(weight) else: @@ -444,7 +533,120 @@ if CUBLAS_IS_AVAILABLE: def forward(self, *args, **kwargs): return super().forward(*args, **kwargs) -def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, scaled_fp8=None): + +# ============================================================================== +# Mixed Precision Operations +# ============================================================================== +from .quant_ops import QuantizedTensor, QUANT_ALGOS + +class MixedPrecisionOps(disable_weight_init): + _layer_quant_config = {} + _compute_dtype = torch.bfloat16 + + class Linear(torch.nn.Module, CastWeightBiasOp): + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + device=None, + dtype=None, + ) -> None: + super().__init__() + + self.factory_kwargs = {"device": device, "dtype": MixedPrecisionOps._compute_dtype} + # self.factory_kwargs = {"device": device, "dtype": dtype} + + self.in_features = in_features + self.out_features = out_features + if bias: + self.bias = torch.nn.Parameter(torch.empty(out_features, **self.factory_kwargs)) + else: + self.register_parameter("bias", None) + + self.tensor_class = None + + def reset_parameters(self): + return None + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, + strict, missing_keys, unexpected_keys, error_msgs): + + device = self.factory_kwargs["device"] + layer_name = prefix.rstrip('.') + weight_key = f"{prefix}weight" + weight = state_dict.pop(weight_key, None) + if weight is None: + raise ValueError(f"Missing weight for layer {layer_name}") + + manually_loaded_keys = [weight_key] + + if layer_name not in MixedPrecisionOps._layer_quant_config: + self.weight = torch.nn.Parameter(weight.to(device=device, dtype=MixedPrecisionOps._compute_dtype), requires_grad=False) + else: + quant_format = MixedPrecisionOps._layer_quant_config[layer_name].get("format", None) + if quant_format is None: + raise ValueError(f"Unknown quantization format for layer {layer_name}") + + qconfig = QUANT_ALGOS[quant_format] + self.layout_type = qconfig["comfy_tensor_layout"] + + weight_scale_key = f"{prefix}weight_scale" + layout_params = { + 'scale': state_dict.pop(weight_scale_key, None), + 'orig_dtype': MixedPrecisionOps._compute_dtype, + 'block_size': qconfig.get("group_size", None), + } + if layout_params['scale'] is not None: + manually_loaded_keys.append(weight_scale_key) + + self.weight = torch.nn.Parameter( + QuantizedTensor(weight.to(device=device), self.layout_type, layout_params), + requires_grad=False + ) + + for param_name in qconfig["parameters"]: + param_key = f"{prefix}{param_name}" + _v = state_dict.pop(param_key, None) + if _v is None: + continue + setattr(self, param_name, torch.nn.Parameter(_v.to(device=device), requires_grad=False)) + manually_loaded_keys.append(param_key) + + super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) + + for key in manually_loaded_keys: + if key in missing_keys: + missing_keys.remove(key) + + def _forward(self, input, weight, bias): + return torch.nn.functional.linear(input, weight, bias) + + def forward_comfy_cast_weights(self, input): + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) + x = self._forward(input, weight, bias) + uncast_bias_weight(self, weight, bias, offload_stream) + return x + + def forward(self, input, *args, **kwargs): + run_every_op() + + if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0: + return self.forward_comfy_cast_weights(input, *args, **kwargs) + if (getattr(self, 'layout_type', None) is not None and + getattr(self, 'input_scale', None) is not None and + not isinstance(input, QuantizedTensor)): + input = QuantizedTensor.from_float(input, self.layout_type, scale=self.input_scale, dtype=self.weight.dtype) + return self._forward(input, self.weight, self.bias) + + +def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, scaled_fp8=None, model_config=None): + if model_config and hasattr(model_config, 'layer_quant_config') and model_config.layer_quant_config: + MixedPrecisionOps._layer_quant_config = model_config.layer_quant_config + MixedPrecisionOps._compute_dtype = compute_dtype + logging.info(f"Using mixed precision operations: {len(model_config.layer_quant_config)} quantized layers") + return MixedPrecisionOps + fp8_compute = comfy.model_management.supports_fp8_compute(load_device) if scaled_fp8 is not None: return scaled_fp8_ops(fp8_matrix_mult=fp8_compute and fp8_optimizations, scale_input=fp8_optimizations, override_dtype=scaled_fp8) diff --git a/comfy/patcher_extension.py b/comfy/patcher_extension.py index 46cc7b2a8..5ee4d5ee5 100644 --- a/comfy/patcher_extension.py +++ b/comfy/patcher_extension.py @@ -150,7 +150,7 @@ def merge_nested_dicts(dict1: dict, dict2: dict, copy_dict1=True): for key, value in dict2.items(): if isinstance(value, dict): curr_value = merged_dict.setdefault(key, {}) - merged_dict[key] = merge_nested_dicts(value, curr_value) + merged_dict[key] = merge_nested_dicts(curr_value, value) elif isinstance(value, list): merged_dict.setdefault(key, []).extend(value) else: diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py new file mode 100644 index 000000000..1d058bece --- /dev/null +++ b/comfy/quant_ops.py @@ -0,0 +1,545 @@ +import torch +import logging +from typing import Tuple, Dict + +_LAYOUT_REGISTRY = {} +_GENERIC_UTILS = {} + + +def register_layout_op(torch_op, layout_type): + """ + Decorator to register a layout-specific operation handler. + Args: + torch_op: PyTorch operation (e.g., torch.ops.aten.linear.default) + layout_type: Layout class (e.g., TensorCoreFP8Layout) + Example: + @register_layout_op(torch.ops.aten.linear.default, TensorCoreFP8Layout) + def fp8_linear(func, args, kwargs): + # FP8-specific linear implementation + ... + """ + def decorator(handler_func): + if torch_op not in _LAYOUT_REGISTRY: + _LAYOUT_REGISTRY[torch_op] = {} + _LAYOUT_REGISTRY[torch_op][layout_type] = handler_func + return handler_func + return decorator + + +def register_generic_util(torch_op): + """ + Decorator to register a generic utility that works for all layouts. + Args: + torch_op: PyTorch operation (e.g., torch.ops.aten.detach.default) + + Example: + @register_generic_util(torch.ops.aten.detach.default) + def generic_detach(func, args, kwargs): + # Works for any layout + ... + """ + def decorator(handler_func): + _GENERIC_UTILS[torch_op] = handler_func + return handler_func + return decorator + + +def _get_layout_from_args(args): + for arg in args: + if isinstance(arg, QuantizedTensor): + return arg._layout_type + elif isinstance(arg, (list, tuple)): + for item in arg: + if isinstance(item, QuantizedTensor): + return item._layout_type + return None + + +def _move_layout_params_to_device(params, device): + new_params = {} + for k, v in params.items(): + if isinstance(v, torch.Tensor): + new_params[k] = v.to(device=device) + else: + new_params[k] = v + return new_params + + +def _copy_layout_params(params): + new_params = {} + for k, v in params.items(): + if isinstance(v, torch.Tensor): + new_params[k] = v.clone() + else: + new_params[k] = v + return new_params + +def _copy_layout_params_inplace(src, dst, non_blocking=False): + for k, v in src.items(): + if isinstance(v, torch.Tensor): + dst[k].copy_(v, non_blocking=non_blocking) + else: + dst[k] = v + +class QuantizedLayout: + """ + Base class for quantization layouts. + + A layout encapsulates the format-specific logic for quantization/dequantization + and provides a uniform interface for extracting raw tensors needed for computation. + + New quantization formats should subclass this and implement the required methods. + """ + @classmethod + def quantize(cls, tensor, **kwargs) -> Tuple[torch.Tensor, Dict]: + raise NotImplementedError(f"{cls.__name__} must implement quantize()") + + @staticmethod + def dequantize(qdata, **layout_params) -> torch.Tensor: + raise NotImplementedError("TensorLayout must implement dequantize()") + + @classmethod + def get_plain_tensors(cls, qtensor) -> torch.Tensor: + raise NotImplementedError(f"{cls.__name__} must implement get_plain_tensors()") + + +class QuantizedTensor(torch.Tensor): + """ + Universal quantized tensor that works with any layout. + + This tensor subclass uses a pluggable layout system to support multiple + quantization formats (FP8, INT4, INT8, etc.) without code duplication. + + The layout_type determines format-specific behavior, while common operations + (detach, clone, to) are handled generically. + + Attributes: + _qdata: The quantized tensor data + _layout_type: Layout class (e.g., TensorCoreFP8Layout) + _layout_params: Dict with layout-specific params (scale, zero_point, etc.) + """ + + @staticmethod + def __new__(cls, qdata, layout_type, layout_params): + """ + Create a quantized tensor. + + Args: + qdata: The quantized data tensor + layout_type: Layout class (subclass of QuantizedLayout) + layout_params: Dict with layout-specific parameters + """ + return torch.Tensor._make_wrapper_subclass(cls, qdata.shape, device=qdata.device, dtype=qdata.dtype, requires_grad=False) + + def __init__(self, qdata, layout_type, layout_params): + self._qdata = qdata + self._layout_type = layout_type + self._layout_params = layout_params + + def __repr__(self): + layout_name = self._layout_type + param_str = ", ".join(f"{k}={v}" for k, v in list(self._layout_params.items())[:2]) + return f"QuantizedTensor(shape={self.shape}, layout={layout_name}, {param_str})" + + @property + def layout_type(self): + return self._layout_type + + def __tensor_flatten__(self): + """ + Tensor flattening protocol for proper device movement. + """ + inner_tensors = ["_qdata"] + ctx = { + "layout_type": self._layout_type, + } + + tensor_params = {} + non_tensor_params = {} + for k, v in self._layout_params.items(): + if isinstance(v, torch.Tensor): + tensor_params[k] = v + else: + non_tensor_params[k] = v + + ctx["tensor_param_keys"] = list(tensor_params.keys()) + ctx["non_tensor_params"] = non_tensor_params + + for k, v in tensor_params.items(): + attr_name = f"_layout_param_{k}" + object.__setattr__(self, attr_name, v) + inner_tensors.append(attr_name) + + return inner_tensors, ctx + + @staticmethod + def __tensor_unflatten__(inner_tensors, ctx, outer_size, outer_stride): + """ + Tensor unflattening protocol for proper device movement. + Reconstructs the QuantizedTensor after device movement. + """ + layout_type = ctx["layout_type"] + layout_params = dict(ctx["non_tensor_params"]) + + for key in ctx["tensor_param_keys"]: + attr_name = f"_layout_param_{key}" + layout_params[key] = inner_tensors[attr_name] + + return QuantizedTensor(inner_tensors["_qdata"], layout_type, layout_params) + + @classmethod + def from_float(cls, tensor, layout_type, **quantize_kwargs) -> 'QuantizedTensor': + qdata, layout_params = LAYOUTS[layout_type].quantize(tensor, **quantize_kwargs) + return cls(qdata, layout_type, layout_params) + + def dequantize(self) -> torch.Tensor: + return LAYOUTS[self._layout_type].dequantize(self._qdata, **self._layout_params) + + @classmethod + def __torch_dispatch__(cls, func, types, args=(), kwargs=None): + kwargs = kwargs or {} + + # Step 1: Check generic utilities first (detach, clone, to, etc.) + if func in _GENERIC_UTILS: + return _GENERIC_UTILS[func](func, args, kwargs) + + # Step 2: Check layout-specific handlers (linear, matmul, etc.) + layout_type = _get_layout_from_args(args) + if layout_type and func in _LAYOUT_REGISTRY: + handler = _LAYOUT_REGISTRY[func].get(layout_type) + if handler: + return handler(func, args, kwargs) + + # Step 3: Fallback to dequantization + if isinstance(args[0] if args else None, QuantizedTensor): + logging.info(f"QuantizedTensor: Unhandled operation {func}, falling back to dequantization. kwargs={kwargs}") + return cls._dequant_and_fallback(func, args, kwargs) + + @classmethod + def _dequant_and_fallback(cls, func, args, kwargs): + def dequant_arg(arg): + if isinstance(arg, QuantizedTensor): + return arg.dequantize() + elif isinstance(arg, (list, tuple)): + return type(arg)(dequant_arg(a) for a in arg) + return arg + + new_args = dequant_arg(args) + new_kwargs = dequant_arg(kwargs) + return func(*new_args, **new_kwargs) + + +# ============================================================================== +# Generic Utilities (Layout-Agnostic Operations) +# ============================================================================== + +def _create_transformed_qtensor(qt, transform_fn): + new_data = transform_fn(qt._qdata) + new_params = _copy_layout_params(qt._layout_params) + return QuantizedTensor(new_data, qt._layout_type, new_params) + + +def _handle_device_transfer(qt, target_device, target_dtype=None, target_layout=None, op_name="to"): + if target_dtype is not None and target_dtype != qt.dtype: + logging.warning( + f"QuantizedTensor: dtype conversion requested to {target_dtype}, " + f"but not supported for quantized tensors. Ignoring dtype." + ) + + if target_layout is not None and target_layout != torch.strided: + logging.warning( + f"QuantizedTensor: layout change requested to {target_layout}, " + f"but not supported. Ignoring layout." + ) + + # Handle device transfer + current_device = qt._qdata.device + if target_device is not None: + # Normalize device for comparison + if isinstance(target_device, str): + target_device = torch.device(target_device) + if isinstance(current_device, str): + current_device = torch.device(current_device) + + if target_device != current_device: + logging.debug(f"QuantizedTensor.{op_name}: Moving from {current_device} to {target_device}") + new_q_data = qt._qdata.to(device=target_device) + new_params = _move_layout_params_to_device(qt._layout_params, target_device) + new_qt = QuantizedTensor(new_q_data, qt._layout_type, new_params) + logging.debug(f"QuantizedTensor.{op_name}: Created new tensor on {target_device}") + return new_qt + + logging.debug(f"QuantizedTensor.{op_name}: No device change needed, returning original") + return qt + + +@register_generic_util(torch.ops.aten.detach.default) +def generic_detach(func, args, kwargs): + """Detach operation - creates a detached copy of the quantized tensor.""" + qt = args[0] + if isinstance(qt, QuantizedTensor): + return _create_transformed_qtensor(qt, lambda x: x.detach()) + return func(*args, **kwargs) + + +@register_generic_util(torch.ops.aten.clone.default) +def generic_clone(func, args, kwargs): + """Clone operation - creates a deep copy of the quantized tensor.""" + qt = args[0] + if isinstance(qt, QuantizedTensor): + return _create_transformed_qtensor(qt, lambda x: x.clone()) + return func(*args, **kwargs) + + +@register_generic_util(torch.ops.aten._to_copy.default) +def generic_to_copy(func, args, kwargs): + """Device/dtype transfer operation - handles .to(device) calls.""" + qt = args[0] + if isinstance(qt, QuantizedTensor): + return _handle_device_transfer( + qt, + target_device=kwargs.get('device', None), + target_dtype=kwargs.get('dtype', None), + op_name="_to_copy" + ) + return func(*args, **kwargs) + + +@register_generic_util(torch.ops.aten.to.dtype_layout) +def generic_to_dtype_layout(func, args, kwargs): + """Handle .to(device) calls using the dtype_layout variant.""" + qt = args[0] + if isinstance(qt, QuantizedTensor): + return _handle_device_transfer( + qt, + target_device=kwargs.get('device', None), + target_dtype=kwargs.get('dtype', None), + target_layout=kwargs.get('layout', None), + op_name="to" + ) + return func(*args, **kwargs) + + +@register_generic_util(torch.ops.aten.copy_.default) +def generic_copy_(func, args, kwargs): + qt_dest = args[0] + src = args[1] + non_blocking = args[2] if len(args) > 2 else False + if isinstance(qt_dest, QuantizedTensor): + if isinstance(src, QuantizedTensor): + # Copy from another quantized tensor + qt_dest._qdata.copy_(src._qdata, non_blocking=non_blocking) + qt_dest._layout_type = src._layout_type + _copy_layout_params_inplace(src._layout_params, qt_dest._layout_params, non_blocking=non_blocking) + else: + # Copy from regular tensor - just copy raw data + qt_dest._qdata.copy_(src) + return qt_dest + return func(*args, **kwargs) + + +@register_generic_util(torch.ops.aten._has_compatible_shallow_copy_type.default) +def generic_has_compatible_shallow_copy_type(func, args, kwargs): + return True + + +@register_generic_util(torch.ops.aten.empty_like.default) +def generic_empty_like(func, args, kwargs): + """Empty_like operation - creates an empty tensor with the same quantized structure.""" + qt = args[0] + if isinstance(qt, QuantizedTensor): + # Create empty tensor with same shape and dtype as the quantized data + hp_dtype = kwargs.pop('dtype', qt._layout_params["orig_dtype"]) + new_qdata = torch.empty_like(qt._qdata, **kwargs) + + # Handle device transfer for layout params + target_device = kwargs.get('device', new_qdata.device) + new_params = _move_layout_params_to_device(qt._layout_params, target_device) + + # Update orig_dtype if dtype is specified + new_params['orig_dtype'] = hp_dtype + + return QuantizedTensor(new_qdata, qt._layout_type, new_params) + return func(*args, **kwargs) + +# ============================================================================== +# FP8 Layout + Operation Handlers +# ============================================================================== +class TensorCoreFP8Layout(QuantizedLayout): + """ + Storage format: + - qdata: FP8 tensor (torch.float8_e4m3fn or torch.float8_e5m2) + - scale: Scalar tensor (float32) for dequantization + - orig_dtype: Original dtype before quantization (for casting back) + """ + @classmethod + def quantize(cls, tensor, scale=None, dtype=torch.float8_e4m3fn): + orig_dtype = tensor.dtype + + if scale is None: + scale = torch.amax(tensor.abs()) / torch.finfo(dtype).max + + if not isinstance(scale, torch.Tensor): + scale = torch.tensor(scale) + scale = scale.to(device=tensor.device, dtype=torch.float32) + + tensor_scaled = tensor * (1.0 / scale).to(tensor.dtype) + # TODO: uncomment this if it's actually needed because the clamp has a small performance penality' + # lp_amax = torch.finfo(dtype).max + # torch.clamp(tensor_scaled, min=-lp_amax, max=lp_amax, out=tensor_scaled) + qdata = tensor_scaled.to(dtype, memory_format=torch.contiguous_format) + + layout_params = { + 'scale': scale, + 'orig_dtype': orig_dtype + } + return qdata, layout_params + + @staticmethod + def dequantize(qdata, scale, orig_dtype, **kwargs): + plain_tensor = torch.ops.aten._to_copy.default(qdata, dtype=orig_dtype) + return plain_tensor * scale + + @classmethod + def get_plain_tensors(cls, qtensor): + return qtensor._qdata, qtensor._layout_params['scale'] + +QUANT_ALGOS = { + "float8_e4m3fn": { + "storage_t": torch.float8_e4m3fn, + "parameters": {"weight_scale", "input_scale"}, + "comfy_tensor_layout": "TensorCoreFP8Layout", + }, +} + +LAYOUTS = { + "TensorCoreFP8Layout": TensorCoreFP8Layout, +} + + +@register_layout_op(torch.ops.aten.linear.default, "TensorCoreFP8Layout") +def fp8_linear(func, args, kwargs): + input_tensor = args[0] + weight = args[1] + bias = args[2] if len(args) > 2 else None + + if isinstance(input_tensor, QuantizedTensor) and isinstance(weight, QuantizedTensor): + plain_input, scale_a = TensorCoreFP8Layout.get_plain_tensors(input_tensor) + plain_weight, scale_b = TensorCoreFP8Layout.get_plain_tensors(weight) + + out_dtype = kwargs.get("out_dtype") + if out_dtype is None: + out_dtype = input_tensor._layout_params['orig_dtype'] + + weight_t = plain_weight.t() + + tensor_2d = False + if len(plain_input.shape) == 2: + tensor_2d = True + plain_input = plain_input.unsqueeze(1) + + input_shape = plain_input.shape + if len(input_shape) != 3: + return None + + try: + output = torch._scaled_mm( + plain_input.reshape(-1, input_shape[2]).contiguous(), + weight_t, + bias=bias, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=out_dtype, + ) + + if isinstance(output, tuple): # TODO: remove when we drop support for torch 2.4 + output = output[0] + + if not tensor_2d: + output = output.reshape((-1, input_shape[1], weight.shape[0])) + + if output.dtype in [torch.float8_e4m3fn, torch.float8_e5m2]: + output_scale = scale_a * scale_b + output_params = { + 'scale': output_scale, + 'orig_dtype': input_tensor._layout_params['orig_dtype'] + } + return QuantizedTensor(output, "TensorCoreFP8Layout", output_params) + else: + return output + + except Exception as e: + raise RuntimeError(f"FP8 _scaled_mm failed, falling back to dequantization: {e}") + + # Case 2: DQ Fallback + if isinstance(weight, QuantizedTensor): + weight = weight.dequantize() + if isinstance(input_tensor, QuantizedTensor): + input_tensor = input_tensor.dequantize() + + return torch.nn.functional.linear(input_tensor, weight, bias) + +def fp8_mm_(input_tensor, weight, bias=None, out_dtype=None): + if out_dtype is None: + out_dtype = input_tensor._layout_params['orig_dtype'] + + plain_input, scale_a = TensorCoreFP8Layout.get_plain_tensors(input_tensor) + plain_weight, scale_b = TensorCoreFP8Layout.get_plain_tensors(weight) + + output = torch._scaled_mm( + plain_input.contiguous(), + plain_weight, + bias=bias, + scale_a=scale_a, + scale_b=scale_b, + out_dtype=out_dtype, + ) + + if isinstance(output, tuple): # TODO: remove when we drop support for torch 2.4 + output = output[0] + return output + +@register_layout_op(torch.ops.aten.addmm.default, "TensorCoreFP8Layout") +def fp8_addmm(func, args, kwargs): + input_tensor = args[1] + weight = args[2] + bias = args[0] + + if isinstance(input_tensor, QuantizedTensor) and isinstance(weight, QuantizedTensor): + return fp8_mm_(input_tensor, weight, bias=bias, out_dtype=kwargs.get("out_dtype", None)) + + a = list(args) + if isinstance(args[0], QuantizedTensor): + a[0] = args[0].dequantize() + if isinstance(args[1], QuantizedTensor): + a[1] = args[1].dequantize() + if isinstance(args[2], QuantizedTensor): + a[2] = args[2].dequantize() + + return func(*a, **kwargs) + +@register_layout_op(torch.ops.aten.mm.default, "TensorCoreFP8Layout") +def fp8_mm(func, args, kwargs): + input_tensor = args[0] + weight = args[1] + + if isinstance(input_tensor, QuantizedTensor) and isinstance(weight, QuantizedTensor): + return fp8_mm_(input_tensor, weight, bias=None, out_dtype=kwargs.get("out_dtype", None)) + + a = list(args) + if isinstance(args[0], QuantizedTensor): + a[0] = args[0].dequantize() + if isinstance(args[1], QuantizedTensor): + a[1] = args[1].dequantize() + return func(*a, **kwargs) + +@register_layout_op(torch.ops.aten.view.default, "TensorCoreFP8Layout") +@register_layout_op(torch.ops.aten.t.default, "TensorCoreFP8Layout") +def fp8_func(func, args, kwargs): + input_tensor = args[0] + if isinstance(input_tensor, QuantizedTensor): + plain_input, scale_a = TensorCoreFP8Layout.get_plain_tensors(input_tensor) + ar = list(args) + ar[0] = plain_input + return QuantizedTensor(func(*ar, **kwargs), "TensorCoreFP8Layout", input_tensor._layout_params) + return func(*args, **kwargs) diff --git a/comfy/sample.py b/comfy/sample.py index be5a7e246..2f8f3a51c 100644 --- a/comfy/sample.py +++ b/comfy/sample.py @@ -4,13 +4,9 @@ import comfy.samplers import comfy.utils import numpy as np import logging +import comfy.nested_tensor -def prepare_noise(latent_image, seed, noise_inds=None): - """ - creates random noise given a latent image and a seed. - optional arg skip can be used to skip and discard x number of noise generations for a given seed - """ - generator = torch.manual_seed(seed) +def prepare_noise_inner(latent_image, generator, noise_inds=None): if noise_inds is None: return torch.randn(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, generator=generator, device="cpu") @@ -21,10 +17,29 @@ def prepare_noise(latent_image, seed, noise_inds=None): if i in unique_inds: noises.append(noise) noises = [noises[i] for i in inverse] - noises = torch.cat(noises, axis=0) + return torch.cat(noises, axis=0) + +def prepare_noise(latent_image, seed, noise_inds=None): + """ + creates random noise given a latent image and a seed. + optional arg skip can be used to skip and discard x number of noise generations for a given seed + """ + generator = torch.manual_seed(seed) + + if latent_image.is_nested: + tensors = latent_image.unbind() + noises = [] + for t in tensors: + noises.append(prepare_noise_inner(t, generator, noise_inds)) + noises = comfy.nested_tensor.NestedTensor(noises) + else: + noises = prepare_noise_inner(latent_image, generator, noise_inds) + return noises def fix_empty_latent_channels(model, latent_image): + if latent_image.is_nested: + return latent_image latent_format = model.get_model_object("latent_format") #Resize the empty latent image so it has the right number of channels if latent_format.latent_channels != latent_image.shape[1] and torch.count_nonzero(latent_image) == 0: latent_image = comfy.utils.repeat_to_batch_size(latent_image, latent_format.latent_channels, dim=1) diff --git a/comfy/samplers.py b/comfy/samplers.py index c59e296a1..fa4640842 100755 --- a/comfy/samplers.py +++ b/comfy/samplers.py @@ -306,17 +306,10 @@ def _calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tens copy_dict1=False) if patches is not None: - # TODO: replace with merge_nested_dicts function - if "patches" in transformer_options: - cur_patches = transformer_options["patches"].copy() - for p in patches: - if p in cur_patches: - cur_patches[p] = cur_patches[p] + patches[p] - else: - cur_patches[p] = patches[p] - transformer_options["patches"] = cur_patches - else: - transformer_options["patches"] = patches + transformer_options["patches"] = comfy.patcher_extension.merge_nested_dicts( + transformer_options.get("patches", {}), + patches + ) transformer_options["cond_or_uncond"] = cond_or_uncond[:] transformer_options["uuids"] = uuids[:] @@ -789,7 +782,7 @@ def ksampler(sampler_name, extra_options={}, inpaint_options={}): return KSAMPLER(sampler_function, extra_options, inpaint_options) -def process_conds(model, noise, conds, device, latent_image=None, denoise_mask=None, seed=None): +def process_conds(model, noise, conds, device, latent_image=None, denoise_mask=None, seed=None, latent_shapes=None): for k in conds: conds[k] = conds[k][:] resolve_areas_and_cond_masks_multidim(conds[k], noise.shape[2:], device) @@ -799,7 +792,7 @@ def process_conds(model, noise, conds, device, latent_image=None, denoise_mask=N if hasattr(model, 'extra_conds'): for k in conds: - conds[k] = encode_model_conds(model.extra_conds, conds[k], noise, device, k, latent_image=latent_image, denoise_mask=denoise_mask, seed=seed) + conds[k] = encode_model_conds(model.extra_conds, conds[k], noise, device, k, latent_image=latent_image, denoise_mask=denoise_mask, seed=seed, latent_shapes=latent_shapes) #make sure each cond area has an opposite one with the same area for k in conds: @@ -969,11 +962,11 @@ class CFGGuider: def predict_noise(self, x, timestep, model_options={}, seed=None): return sampling_function(self.inner_model, x, timestep, self.conds.get("negative", None), self.conds.get("positive", None), self.cfg, model_options=model_options, seed=seed) - def inner_sample(self, noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed): + def inner_sample(self, noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=None): if latent_image is not None and torch.count_nonzero(latent_image) > 0: #Don't shift the empty latent image. latent_image = self.inner_model.process_latent_in(latent_image) - self.conds = process_conds(self.inner_model, noise, self.conds, device, latent_image, denoise_mask, seed) + self.conds = process_conds(self.inner_model, noise, self.conds, device, latent_image, denoise_mask, seed, latent_shapes=latent_shapes) extra_model_options = comfy.model_patcher.create_model_options_clone(self.model_options) extra_model_options.setdefault("transformer_options", {})["sample_sigmas"] = sigmas @@ -987,7 +980,7 @@ class CFGGuider: samples = executor.execute(self, sigmas, extra_args, callback, noise, latent_image, denoise_mask, disable_pbar) return self.inner_model.process_latent_out(samples.to(torch.float32)) - def outer_sample(self, noise, latent_image, sampler, sigmas, denoise_mask=None, callback=None, disable_pbar=False, seed=None): + def outer_sample(self, noise, latent_image, sampler, sigmas, denoise_mask=None, callback=None, disable_pbar=False, seed=None, latent_shapes=None): self.inner_model, self.conds, self.loaded_models = comfy.sampler_helpers.prepare_sampling(self.model_patcher, noise.shape, self.conds, self.model_options) device = self.model_patcher.load_device @@ -1001,7 +994,7 @@ class CFGGuider: try: self.model_patcher.pre_run() - output = self.inner_sample(noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed) + output = self.inner_sample(noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=latent_shapes) finally: self.model_patcher.cleanup() @@ -1014,6 +1007,12 @@ class CFGGuider: if sigmas.shape[-1] == 0: return latent_image + if latent_image.is_nested: + latent_image, latent_shapes = comfy.utils.pack_latents(latent_image.unbind()) + noise, _ = comfy.utils.pack_latents(noise.unbind()) + else: + latent_shapes = [latent_image.shape] + self.conds = {} for k in self.original_conds: self.conds[k] = list(map(lambda a: a.copy(), self.original_conds[k])) @@ -1033,7 +1032,7 @@ class CFGGuider: self, comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.OUTER_SAMPLE, self.model_options, is_model_options=True) ) - output = executor.execute(noise, latent_image, sampler, sigmas, denoise_mask, callback, disable_pbar, seed) + output = executor.execute(noise, latent_image, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=latent_shapes) finally: cast_to_load_options(self.model_options, device=self.model_patcher.offload_device) self.model_options = orig_model_options @@ -1041,6 +1040,9 @@ class CFGGuider: self.model_patcher.restore_hook_patches() del self.conds + + if len(latent_shapes) > 1: + output = comfy.nested_tensor.NestedTensor(comfy.utils.unpack_latents(output, latent_shapes)) return output diff --git a/comfy/sd.py b/comfy/sd.py index 2df340739..9e5ebbf15 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -18,6 +18,7 @@ import comfy.ldm.wan.vae2_2 import comfy.ldm.hunyuan3d.vae import comfy.ldm.ace.vae.music_dcae_pipeline import comfy.ldm.hunyuan_video.vae +import comfy.ldm.mmaudio.vae.autoencoder import comfy.pixel_space_convert import yaml import math @@ -142,6 +143,9 @@ class CLIP: n.apply_hooks_to_conds = self.apply_hooks_to_conds return n + def get_ram_usage(self): + return self.patcher.get_ram_usage() + def add_patches(self, patches, strength_patch=1.0, strength_model=1.0): return self.patcher.add_patches(patches, strength_patch, strength_model) @@ -275,8 +279,13 @@ class VAE: if 'decoder.up_blocks.0.resnets.0.norm1.weight' in sd.keys(): #diffusers format sd = diffusers_convert.convert_vae_state_dict(sd) - self.memory_used_encode = lambda shape, dtype: (1767 * shape[2] * shape[3]) * model_management.dtype_size(dtype) #These are for AutoencoderKL and need tweaking (should be lower) - self.memory_used_decode = lambda shape, dtype: (2178 * shape[2] * shape[3] * 64) * model_management.dtype_size(dtype) + if model_management.is_amd(): + VAE_KL_MEM_RATIO = 2.73 + else: + VAE_KL_MEM_RATIO = 1.0 + + self.memory_used_encode = lambda shape, dtype: (1767 * shape[2] * shape[3]) * model_management.dtype_size(dtype) * VAE_KL_MEM_RATIO #These are for AutoencoderKL and need tweaking (should be lower) + self.memory_used_decode = lambda shape, dtype: (2178 * shape[2] * shape[3] * 64) * model_management.dtype_size(dtype) * VAE_KL_MEM_RATIO self.downscale_ratio = 8 self.upscale_ratio = 8 self.latent_channels = 4 @@ -287,10 +296,12 @@ class VAE: self.working_dtypes = [torch.bfloat16, torch.float32] self.disable_offload = False self.not_video = False + self.size = None self.downscale_index_formula = None self.upscale_index_formula = None self.extra_1d_channel = None + self.crop_input = True if config is None: if "decoder.mid.block_1.mix_factor" in sd: @@ -332,35 +343,51 @@ class VAE: self.first_stage_model = StageC_coder() self.downscale_ratio = 32 self.latent_channels = 16 - elif "decoder.conv_in.weight" in sd and sd['decoder.conv_in.weight'].shape[1] == 64: - ddconfig = {"block_out_channels": [128, 256, 512, 512, 1024, 1024], "in_channels": 3, "out_channels": 3, "num_res_blocks": 2, "ffactor_spatial": 32, "downsample_match_channel": True, "upsample_match_channel": True} - self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1] - self.downscale_ratio = 32 - self.upscale_ratio = 32 - self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32] - self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer"}, - encoder_config={'target': "comfy.ldm.hunyuan_video.vae.Encoder", 'params': ddconfig}, - decoder_config={'target': "comfy.ldm.hunyuan_video.vae.Decoder", 'params': ddconfig}) - - self.memory_used_encode = lambda shape, dtype: (700 * shape[2] * shape[3]) * model_management.dtype_size(dtype) - self.memory_used_decode = lambda shape, dtype: (700 * shape[2] * shape[3] * 32 * 32) * model_management.dtype_size(dtype) - elif "decoder.conv_in.weight" in sd: - #default SD1.x/SD2.x VAE parameters - ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0} - - if 'encoder.down.2.downsample.conv.weight' not in sd and 'decoder.up.3.upsample.conv.weight' not in sd: #Stable diffusion x4 upscaler VAE - ddconfig['ch_mult'] = [1, 2, 4] - self.downscale_ratio = 4 - self.upscale_ratio = 4 - - self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1] - if 'post_quant_conv.weight' in sd: - self.first_stage_model = AutoencoderKL(ddconfig=ddconfig, embed_dim=sd['post_quant_conv.weight'].shape[1]) - else: + if sd['decoder.conv_in.weight'].shape[1] == 64: + ddconfig = {"block_out_channels": [128, 256, 512, 512, 1024, 1024], "in_channels": 3, "out_channels": 3, "num_res_blocks": 2, "ffactor_spatial": 32, "downsample_match_channel": True, "upsample_match_channel": True} + self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1] + self.downscale_ratio = 32 + self.upscale_ratio = 32 + self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32] self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer"}, - encoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Encoder", 'params': ddconfig}, - decoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Decoder", 'params': ddconfig}) + encoder_config={'target': "comfy.ldm.hunyuan_video.vae.Encoder", 'params': ddconfig}, + decoder_config={'target': "comfy.ldm.hunyuan_video.vae.Decoder", 'params': ddconfig}) + + self.memory_used_encode = lambda shape, dtype: (700 * shape[2] * shape[3]) * model_management.dtype_size(dtype) + self.memory_used_decode = lambda shape, dtype: (700 * shape[2] * shape[3] * 32 * 32) * model_management.dtype_size(dtype) + elif sd['decoder.conv_in.weight'].shape[1] == 32: + ddconfig = {"block_out_channels": [128, 256, 512, 1024, 1024], "in_channels": 3, "out_channels": 3, "num_res_blocks": 2, "ffactor_spatial": 16, "ffactor_temporal": 4, "downsample_match_channel": True, "upsample_match_channel": True, "refiner_vae": False} + self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1] + self.working_dtypes = [torch.float16, torch.bfloat16, torch.float32] + self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 16, 16) + self.upscale_index_formula = (4, 16, 16) + self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 16, 16) + self.downscale_index_formula = (4, 16, 16) + self.latent_dim = 3 + self.not_video = True + self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer"}, + encoder_config={'target': "comfy.ldm.hunyuan_video.vae_refiner.Encoder", 'params': ddconfig}, + decoder_config={'target': "comfy.ldm.hunyuan_video.vae_refiner.Decoder", 'params': ddconfig}) + + self.memory_used_encode = lambda shape, dtype: (2800 * shape[-2] * shape[-1]) * model_management.dtype_size(dtype) + self.memory_used_decode = lambda shape, dtype: (2800 * shape[-3] * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype) + else: + #default SD1.x/SD2.x VAE parameters + ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0} + + if 'encoder.down.2.downsample.conv.weight' not in sd and 'decoder.up.3.upsample.conv.weight' not in sd: #Stable diffusion x4 upscaler VAE + ddconfig['ch_mult'] = [1, 2, 4] + self.downscale_ratio = 4 + self.upscale_ratio = 4 + + self.latent_channels = ddconfig['z_channels'] = sd["decoder.conv_in.weight"].shape[1] + if 'post_quant_conv.weight' in sd: + self.first_stage_model = AutoencoderKL(ddconfig=ddconfig, embed_dim=sd['post_quant_conv.weight'].shape[1]) + else: + self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer"}, + encoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Encoder", 'params': ddconfig}, + decoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Decoder", 'params': ddconfig}) elif "decoder.layers.1.layers.0.beta" in sd: self.first_stage_model = AudioOobleckVAE() self.memory_used_encode = lambda shape, dtype: (1000 * shape[2]) * model_management.dtype_size(dtype) @@ -526,6 +553,25 @@ class VAE: self.latent_channels = 3 self.latent_dim = 2 self.output_channels = 3 + elif "vocoder.activation_post.downsample.lowpass.filter" in sd: #MMAudio VAE + sample_rate = 16000 + if sample_rate == 16000: + mode = '16k' + else: + mode = '44k' + + self.first_stage_model = comfy.ldm.mmaudio.vae.autoencoder.AudioAutoencoder(mode=mode) + self.memory_used_encode = lambda shape, dtype: (30 * shape[2]) * model_management.dtype_size(dtype) + self.memory_used_decode = lambda shape, dtype: (90 * shape[2] * 1411.2) * model_management.dtype_size(dtype) + self.latent_channels = 20 + self.output_channels = 2 + self.upscale_ratio = 512 * (44100 / sample_rate) + self.downscale_ratio = 512 * (44100 / sample_rate) + self.latent_dim = 1 + self.process_output = lambda audio: audio + self.process_input = lambda audio: audio + self.working_dtypes = [torch.float32] + self.crop_input = False else: logging.warning("WARNING: No VAE weights detected, VAE not initalized.") self.first_stage_model = None @@ -553,12 +599,25 @@ class VAE: self.patcher = comfy.model_patcher.ModelPatcher(self.first_stage_model, load_device=self.device, offload_device=offload_device) logging.info("VAE load device: {}, offload device: {}, dtype: {}".format(self.device, offload_device, self.vae_dtype)) + self.model_size() + + def model_size(self): + if self.size is not None: + return self.size + self.size = comfy.model_management.module_size(self.first_stage_model) + return self.size + + def get_ram_usage(self): + return self.model_size() def throw_exception_if_invalid(self): if self.first_stage_model is None: raise RuntimeError("ERROR: VAE is invalid: None\n\nIf the VAE is from a checkpoint loader node your checkpoint does not contain a valid VAE.") def vae_encode_crop_pixels(self, pixels): + if not self.crop_input: + return pixels + downscale_ratio = self.spacial_compression_encode() dims = pixels.shape[1:-1] @@ -636,6 +695,7 @@ class VAE: def decode(self, samples_in, vae_options={}): self.throw_exception_if_invalid() pixel_samples = None + do_tile = False try: memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype) model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload) @@ -651,6 +711,13 @@ class VAE: pixel_samples[x:x+batch_number] = out except model_management.OOM_EXCEPTION: logging.warning("Warning: Ran out of memory when regular VAE decoding, retrying with tiled VAE decoding.") + #NOTE: We don't know what tensors were allocated to stack variables at the time of the + #exception and the exception itself refs them all until we get out of this except block. + #So we just set a flag for tiler fallback so that tensor gc can happen once the + #exception is fully off the books. + do_tile = True + + if do_tile: dims = samples_in.ndim - 2 if dims == 1 or self.extra_1d_channel is not None: pixel_samples = self.decode_tiled_1d(samples_in) @@ -697,6 +764,7 @@ class VAE: self.throw_exception_if_invalid() pixel_samples = self.vae_encode_crop_pixels(pixel_samples) pixel_samples = pixel_samples.movedim(-1, 1) + do_tile = False if self.latent_dim == 3 and pixel_samples.ndim < 5: if not self.not_video: pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0) @@ -718,6 +786,13 @@ class VAE: except model_management.OOM_EXCEPTION: logging.warning("Warning: Ran out of memory when regular VAE encoding, retrying with tiled VAE encoding.") + #NOTE: We don't know what tensors were allocated to stack variables at the time of the + #exception and the exception itself refs them all until we get out of this except block. + #So we just set a flag for tiler fallback so that tensor gc can happen once the + #exception is fully off the books. + do_tile = True + + if do_tile: if self.latent_dim == 3: tile = 256 overlap = tile // 4 @@ -858,6 +933,7 @@ class TEModel(Enum): QWEN25_3B = 10 QWEN25_7B = 11 BYT5_SMALL_GLYPH = 12 + GEMMA_3_4B = 13 def detect_te_model(sd): if "text_model.encoder.layers.30.mlp.fc1.weight" in sd: @@ -880,6 +956,8 @@ def detect_te_model(sd): return TEModel.BYT5_SMALL_GLYPH return TEModel.T5_BASE if 'model.layers.0.post_feedforward_layernorm.weight' in sd: + if 'model.layers.0.self_attn.q_norm.weight' in sd: + return TEModel.GEMMA_3_4B return TEModel.GEMMA_2_2B if 'model.layers.0.self_attn.k_proj.bias' in sd: weight = sd['model.layers.0.self_attn.k_proj.bias'] @@ -984,6 +1062,10 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data)) clip_target.tokenizer = comfy.text_encoders.lumina2.LuminaTokenizer tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None) + elif te_model == TEModel.GEMMA_3_4B: + clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data), model_type="gemma3_4b") + clip_target.tokenizer = comfy.text_encoders.lumina2.NTokenizer + tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None) elif te_model == TEModel.LLAMA3_8: clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**llama_detect(clip_data), clip_l=False, clip_g=False, t5=False, llama=True, dtype_t5=None, t5xxl_scaled_fp8=None) @@ -1194,7 +1276,7 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c return (model_patcher, clip, vae, clipvision) -def load_diffusion_model_state_dict(sd, model_options={}): +def load_diffusion_model_state_dict(sd, model_options={}, metadata=None): """ Loads a UNet diffusion model from a state dictionary, supporting both diffusers and regular formats. @@ -1228,7 +1310,7 @@ def load_diffusion_model_state_dict(sd, model_options={}): weight_dtype = comfy.utils.weight_dtype(sd) load_device = model_management.get_torch_device() - model_config = model_detection.model_config_from_unet(sd, "") + model_config = model_detection.model_config_from_unet(sd, "", metadata=metadata) if model_config is not None: new_sd = sd @@ -1262,7 +1344,10 @@ def load_diffusion_model_state_dict(sd, model_options={}): else: unet_dtype = dtype - manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes) + if model_config.layer_quant_config is not None: + manual_cast_dtype = model_management.unet_manual_cast(None, load_device, model_config.supported_inference_dtypes) + else: + manual_cast_dtype = model_management.unet_manual_cast(unet_dtype, load_device, model_config.supported_inference_dtypes) model_config.set_inference_dtype(unet_dtype, manual_cast_dtype) model_config.custom_operations = model_options.get("custom_operations", model_config.custom_operations) if model_options.get("fp8_optimizations", False): @@ -1278,8 +1363,8 @@ def load_diffusion_model_state_dict(sd, model_options={}): def load_diffusion_model(unet_path, model_options={}): - sd = comfy.utils.load_torch_file(unet_path) - model = load_diffusion_model_state_dict(sd, model_options=model_options) + sd, metadata = comfy.utils.load_torch_file(unet_path, return_metadata=True) + model = load_diffusion_model_state_dict(sd, model_options=model_options, metadata=metadata) if model is None: logging.error("ERROR UNSUPPORTED DIFFUSION MODEL {}".format(unet_path)) raise RuntimeError("ERROR: Could not detect model type of: {}\n{}".format(unet_path, model_detection_error_hint(unet_path, sd))) diff --git a/comfy/sd1_clip.py b/comfy/sd1_clip.py index f8a7c2a1b..3066de2d7 100644 --- a/comfy/sd1_clip.py +++ b/comfy/sd1_clip.py @@ -460,7 +460,7 @@ def load_embed(embedding_name, embedding_directory, embedding_size, embed_key=No return embed_out class SDTokenizer: - def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, tokenizer_data={}, tokenizer_args={}): + def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, pad_left=False, tokenizer_data={}, tokenizer_args={}): if tokenizer_path is None: tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_tokenizer") self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path, **tokenizer_args) @@ -468,6 +468,7 @@ class SDTokenizer: self.min_length = tokenizer_data.get("{}_min_length".format(embedding_key), min_length) self.end_token = None self.min_padding = min_padding + self.pad_left = pad_left empty = self.tokenizer('')["input_ids"] self.tokenizer_adds_end_token = has_end_token @@ -522,6 +523,12 @@ class SDTokenizer: return (embed, "{} {}".format(embedding_name[len(stripped):], leftover)) return (embed, leftover) + def pad_tokens(self, tokens, amount): + if self.pad_left: + for i in range(amount): + tokens.insert(0, (self.pad_token, 1.0, 0)) + else: + tokens.extend([(self.pad_token, 1.0, 0)] * amount) def tokenize_with_weights(self, text:str, return_word_ids=False, tokenizer_options={}, **kwargs): ''' @@ -600,7 +607,7 @@ class SDTokenizer: if self.end_token is not None: batch.append((self.end_token, 1.0, 0)) if self.pad_to_max_length: - batch.extend([(self.pad_token, 1.0, 0)] * (remaining_length)) + self.pad_tokens(batch, remaining_length) #start new batch batch = [] if self.start_token is not None: @@ -614,11 +621,11 @@ class SDTokenizer: if self.end_token is not None: batch.append((self.end_token, 1.0, 0)) if min_padding is not None: - batch.extend([(self.pad_token, 1.0, 0)] * min_padding) + self.pad_tokens(batch, min_padding) if self.pad_to_max_length and len(batch) < self.max_length: - batch.extend([(self.pad_token, 1.0, 0)] * (self.max_length - len(batch))) + self.pad_tokens(batch, self.max_length - len(batch)) if min_length is not None and len(batch) < min_length: - batch.extend([(self.pad_token, 1.0, 0)] * (min_length - len(batch))) + self.pad_tokens(batch, min_length - len(batch)) if not return_word_ids: batched_tokens = [[(t, w) for t, w,_ in x] for x in batched_tokens] diff --git a/comfy/supported_models_base.py b/comfy/supported_models_base.py index 54573abb1..e4bd74514 100644 --- a/comfy/supported_models_base.py +++ b/comfy/supported_models_base.py @@ -50,6 +50,7 @@ class BASE: manual_cast_dtype = None custom_operations = None scaled_fp8 = None + layer_quant_config = None # Per-layer quantization configuration for mixed precision optimizations = {"fp8": False} @classmethod diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py index c5a48ba9f..c050759fe 100644 --- a/comfy/text_encoders/llama.py +++ b/comfy/text_encoders/llama.py @@ -3,6 +3,7 @@ import torch.nn as nn from dataclasses import dataclass from typing import Optional, Any import math +import logging from comfy.ldm.modules.attention import optimized_attention_for_device import comfy.model_management @@ -28,6 +29,9 @@ class Llama2Config: mlp_activation = "silu" qkv_bias = False rope_dims = None + q_norm = None + k_norm = None + rope_scale = None @dataclass class Qwen25_3BConfig: @@ -46,6 +50,9 @@ class Qwen25_3BConfig: mlp_activation = "silu" qkv_bias = True rope_dims = None + q_norm = None + k_norm = None + rope_scale = None @dataclass class Qwen25_7BVLI_Config: @@ -64,6 +71,9 @@ class Qwen25_7BVLI_Config: mlp_activation = "silu" qkv_bias = True rope_dims = [16, 24, 24] + q_norm = None + k_norm = None + rope_scale = None @dataclass class Gemma2_2B_Config: @@ -82,6 +92,32 @@ class Gemma2_2B_Config: mlp_activation = "gelu_pytorch_tanh" qkv_bias = False rope_dims = None + q_norm = None + k_norm = None + sliding_attention = None + rope_scale = None + +@dataclass +class Gemma3_4B_Config: + vocab_size: int = 262208 + hidden_size: int = 2560 + intermediate_size: int = 10240 + num_hidden_layers: int = 34 + num_attention_heads: int = 8 + num_key_value_heads: int = 4 + max_position_embeddings: int = 131072 + rms_norm_eps: float = 1e-6 + rope_theta = [10000.0, 1000000.0] + transformer_type: str = "gemma3" + head_dim = 256 + rms_norm_add = True + mlp_activation = "gelu_pytorch_tanh" + qkv_bias = False + rope_dims = None + q_norm = "gemma3" + k_norm = "gemma3" + sliding_attention = [False, False, False, False, False, 1024] + rope_scale = [1.0, 8.0] class RMSNorm(nn.Module): def __init__(self, dim: int, eps: float = 1e-5, add=False, device=None, dtype=None): @@ -106,25 +142,40 @@ def rotate_half(x): return torch.cat((-x2, x1), dim=-1) -def precompute_freqs_cis(head_dim, position_ids, theta, rope_dims=None, device=None): - theta_numerator = torch.arange(0, head_dim, 2, device=device).float() - inv_freq = 1.0 / (theta ** (theta_numerator / head_dim)) +def precompute_freqs_cis(head_dim, position_ids, theta, rope_scale=None, rope_dims=None, device=None): + if not isinstance(theta, list): + theta = [theta] - inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) - position_ids_expanded = position_ids[:, None, :].float() - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() - sin = emb.sin() - if rope_dims is not None and position_ids.shape[0] > 1: - mrope_section = rope_dims * 2 - cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0) - sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0) - else: - cos = cos.unsqueeze(1) - sin = sin.unsqueeze(1) + out = [] + for index, t in enumerate(theta): + theta_numerator = torch.arange(0, head_dim, 2, device=device).float() + inv_freq = 1.0 / (t ** (theta_numerator / head_dim)) - return (cos, sin) + if rope_scale is not None: + if isinstance(rope_scale, list): + inv_freq /= rope_scale[index] + else: + inv_freq /= rope_scale + + inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + if rope_dims is not None and position_ids.shape[0] > 1: + mrope_section = rope_dims * 2 + cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0) + sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0) + else: + cos = cos.unsqueeze(1) + sin = sin.unsqueeze(1) + out.append((cos, sin)) + + if len(out) == 1: + return out[0] + + return out def apply_rope(xq, xk, freqs_cis): @@ -152,6 +203,14 @@ class Attention(nn.Module): self.v_proj = ops.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=config.qkv_bias, device=device, dtype=dtype) self.o_proj = ops.Linear(self.inner_size, config.hidden_size, bias=False, device=device, dtype=dtype) + self.q_norm = None + self.k_norm = None + + if config.q_norm == "gemma3": + self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype) + if config.k_norm == "gemma3": + self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype) + def forward( self, hidden_states: torch.Tensor, @@ -168,6 +227,11 @@ class Attention(nn.Module): xk = xk.view(batch_size, seq_length, self.num_kv_heads, self.head_dim).transpose(1, 2) xv = xv.view(batch_size, seq_length, self.num_kv_heads, self.head_dim).transpose(1, 2) + if self.q_norm is not None: + xq = self.q_norm(xq) + if self.k_norm is not None: + xk = self.k_norm(xk) + xq, xk = apply_rope(xq, xk, freqs_cis=freqs_cis) xk = xk.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1) @@ -192,7 +256,7 @@ class MLP(nn.Module): return self.down_proj(self.activation(self.gate_proj(x)) * self.up_proj(x)) class TransformerBlock(nn.Module): - def __init__(self, config: Llama2Config, device=None, dtype=None, ops: Any = None): + def __init__(self, config: Llama2Config, index, device=None, dtype=None, ops: Any = None): super().__init__() self.self_attn = Attention(config, device=device, dtype=dtype, ops=ops) self.mlp = MLP(config, device=device, dtype=dtype, ops=ops) @@ -226,7 +290,7 @@ class TransformerBlock(nn.Module): return x class TransformerBlockGemma2(nn.Module): - def __init__(self, config: Llama2Config, device=None, dtype=None, ops: Any = None): + def __init__(self, config: Llama2Config, index, device=None, dtype=None, ops: Any = None): super().__init__() self.self_attn = Attention(config, device=device, dtype=dtype, ops=ops) self.mlp = MLP(config, device=device, dtype=dtype, ops=ops) @@ -235,6 +299,13 @@ class TransformerBlockGemma2(nn.Module): self.pre_feedforward_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype) self.post_feedforward_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype) + if config.sliding_attention is not None: # TODO: implement. (Not that necessary since models are trained on less than 1024 tokens) + self.sliding_attention = config.sliding_attention[index % len(config.sliding_attention)] + else: + self.sliding_attention = False + + self.transformer_type = config.transformer_type + def forward( self, x: torch.Tensor, @@ -242,6 +313,14 @@ class TransformerBlockGemma2(nn.Module): freqs_cis: Optional[torch.Tensor] = None, optimized_attention=None, ): + if self.transformer_type == 'gemma3': + if self.sliding_attention: + if x.shape[1] > self.sliding_attention: + logging.warning("Warning: sliding attention not implemented, results may be incorrect") + freqs_cis = freqs_cis[1] + else: + freqs_cis = freqs_cis[0] + # Self Attention residual = x x = self.input_layernorm(x) @@ -276,7 +355,7 @@ class Llama2_(nn.Module): device=device, dtype=dtype ) - if self.config.transformer_type == "gemma2": + if self.config.transformer_type == "gemma2" or self.config.transformer_type == "gemma3": transformer = TransformerBlockGemma2 self.normalize_in = True else: @@ -284,8 +363,8 @@ class Llama2_(nn.Module): self.normalize_in = False self.layers = nn.ModuleList([ - transformer(config, device=device, dtype=dtype, ops=ops) - for _ in range(config.num_hidden_layers) + transformer(config, index=i, device=device, dtype=dtype, ops=ops) + for i in range(config.num_hidden_layers) ]) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype) # self.lm_head = ops.Linear(config.hidden_size, config.vocab_size, bias=False, device=device, dtype=dtype) @@ -305,6 +384,7 @@ class Llama2_(nn.Module): freqs_cis = precompute_freqs_cis(self.config.head_dim, position_ids, self.config.rope_theta, + self.config.rope_scale, self.config.rope_dims, device=x.device) @@ -433,3 +513,12 @@ class Gemma2_2B(BaseLlama, torch.nn.Module): self.model = Llama2_(config, device=device, dtype=dtype, ops=operations) self.dtype = dtype + +class Gemma3_4B(BaseLlama, torch.nn.Module): + def __init__(self, config_dict, dtype, device, operations): + super().__init__() + config = Gemma3_4B_Config(**config_dict) + self.num_layers = config.num_hidden_layers + + self.model = Llama2_(config, device=device, dtype=dtype, ops=operations) + self.dtype = dtype diff --git a/comfy/text_encoders/lumina2.py b/comfy/text_encoders/lumina2.py index 674461b75..fd986e2c1 100644 --- a/comfy/text_encoders/lumina2.py +++ b/comfy/text_encoders/lumina2.py @@ -11,23 +11,41 @@ class Gemma2BTokenizer(sd1_clip.SDTokenizer): def state_dict(self): return {"spiece_model": self.tokenizer.serialize_model()} +class Gemma3_4BTokenizer(sd1_clip.SDTokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + tokenizer = tokenizer_data.get("spiece_model", None) + super().__init__(tokenizer, pad_with_end=False, embedding_size=2560, embedding_key='gemma3_4b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data) + + def state_dict(self): + return {"spiece_model": self.tokenizer.serialize_model()} class LuminaTokenizer(sd1_clip.SD1Tokenizer): def __init__(self, embedding_directory=None, tokenizer_data={}): super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="gemma2_2b", tokenizer=Gemma2BTokenizer) +class NTokenizer(sd1_clip.SD1Tokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="gemma3_4b", tokenizer=Gemma3_4BTokenizer) class Gemma2_2BModel(sd1_clip.SDClipModel): def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options={}): super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"start": 2, "pad": 0}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Gemma2_2B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options) +class Gemma3_4BModel(sd1_clip.SDClipModel): + def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options={}): + super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"start": 2, "pad": 0}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Gemma3_4B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options) class LuminaModel(sd1_clip.SD1ClipModel): - def __init__(self, device="cpu", dtype=None, model_options={}): - super().__init__(device=device, dtype=dtype, name="gemma2_2b", clip_model=Gemma2_2BModel, model_options=model_options) + def __init__(self, device="cpu", dtype=None, model_options={}, name="gemma2_2b", clip_model=Gemma2_2BModel): + super().__init__(device=device, dtype=dtype, name=name, clip_model=clip_model, model_options=model_options) -def te(dtype_llama=None, llama_scaled_fp8=None): +def te(dtype_llama=None, llama_scaled_fp8=None, model_type="gemma2_2b"): + if model_type == "gemma2_2b": + model = Gemma2_2BModel + elif model_type == "gemma3_4b": + model = Gemma3_4BModel + class LuminaTEModel_(LuminaModel): def __init__(self, device="cpu", dtype=None, model_options={}): if llama_scaled_fp8 is not None and "scaled_fp8" not in model_options: @@ -35,5 +53,5 @@ def te(dtype_llama=None, llama_scaled_fp8=None): model_options["scaled_fp8"] = llama_scaled_fp8 if dtype_llama is not None: dtype = dtype_llama - super().__init__(device=device, dtype=dtype, model_options=model_options) + super().__init__(device=device, dtype=dtype, name=model_type, model_options=model_options, clip_model=model) return LuminaTEModel_ diff --git a/comfy/utils.py b/comfy/utils.py index fab28cf08..4bd281057 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -39,7 +39,11 @@ if hasattr(torch.serialization, "add_safe_globals"): # TODO: this was added in pass ModelCheckpoint.__module__ = "pytorch_lightning.callbacks.model_checkpoint" - from numpy.core.multiarray import scalar + def scalar(*args, **kwargs): + from numpy.core.multiarray import scalar as sc + return sc(*args, **kwargs) + scalar.__module__ = "numpy.core.multiarray" + from numpy import dtype from numpy.dtypes import Float64DType from _codecs import encode @@ -1102,3 +1106,25 @@ def upscale_dit_mask(mask: torch.Tensor, img_size_in, img_size_out): dim=1 ) return out + +def pack_latents(latents): + latent_shapes = [] + tensors = [] + for tensor in latents: + latent_shapes.append(tensor.shape) + tensors.append(tensor.reshape(tensor.shape[0], 1, -1)) + + latent = torch.cat(tensors, dim=-1) + return latent, latent_shapes + +def unpack_latents(combined_latent, latent_shapes): + if len(latent_shapes) > 1: + output_tensors = [] + for shape in latent_shapes: + cut = math.prod(shape[1:]) + tens = combined_latent[:, :, :cut] + combined_latent = combined_latent[:, :, cut:] + output_tensors.append(tens.reshape([tens.shape[0]] + list(shape)[1:])) + else: + output_tensors = combined_latent + return output_tensors diff --git a/comfy_api/latest/__init__.py b/comfy_api/latest/__init__.py index 2cee65aa9..b7a3fa9c1 100644 --- a/comfy_api/latest/__init__.py +++ b/comfy_api/latest/__init__.py @@ -8,8 +8,8 @@ from comfy_api.internal.async_to_sync import create_sync_class from comfy_api.latest._input import ImageInput, AudioInput, MaskInput, LatentInput, VideoInput from comfy_api.latest._input_impl import VideoFromFile, VideoFromComponents from comfy_api.latest._util import VideoCodec, VideoContainer, VideoComponents -from comfy_api.latest._io import _IO as io #noqa: F401 -from comfy_api.latest._ui import _UI as ui #noqa: F401 +from . import _io as io +from . import _ui as ui # from comfy_api.latest._resources import _RESOURCES as resources #noqa: F401 from comfy_execution.utils import get_executing_context from comfy_execution.progress import get_progress_state, PreviewImageTuple @@ -114,6 +114,10 @@ if TYPE_CHECKING: ComfyAPISync: Type[comfy_api.latest.generated.ComfyAPISyncStub.ComfyAPISyncStub] ComfyAPISync = create_sync_class(ComfyAPI_latest) +# create new aliases for io and ui +IO = io +UI = ui + __all__ = [ "ComfyAPI", "ComfyAPISync", @@ -121,4 +125,8 @@ __all__ = [ "InputImpl", "Types", "ComfyExtension", + "io", + "IO", + "ui", + "UI", ] diff --git a/comfy_api/latest/_input/video_types.py b/comfy_api/latest/_input/video_types.py index 5d95dc507..a335df4d0 100644 --- a/comfy_api/latest/_input/video_types.py +++ b/comfy_api/latest/_input/video_types.py @@ -1,6 +1,6 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import Optional, Union +from typing import Optional, Union, IO import io import av from comfy_api.util import VideoContainer, VideoCodec, VideoComponents @@ -23,7 +23,7 @@ class VideoInput(ABC): @abstractmethod def save_to( self, - path: str, + path: Union[str, IO[bytes]], format: VideoContainer = VideoContainer.AUTO, codec: VideoCodec = VideoCodec.AUTO, metadata: Optional[dict] = None diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index 4826818df..0b701260f 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -336,11 +336,25 @@ class Combo(ComfyTypeIO): class Input(WidgetInput): """Combo input (dropdown).""" Type = str - def __init__(self, id: str, options: list[str]=None, display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None, - default: str=None, control_after_generate: bool=None, - upload: UploadType=None, image_folder: FolderType=None, - remote: RemoteOptions=None, - socketless: bool=None): + def __init__( + self, + id: str, + options: list[str] | list[int] | type[Enum] = None, + display_name: str=None, + optional=False, + tooltip: str=None, + lazy: bool=None, + default: str | int | Enum = None, + control_after_generate: bool=None, + upload: UploadType=None, + image_folder: FolderType=None, + remote: RemoteOptions=None, + socketless: bool=None, + ): + if isinstance(options, type) and issubclass(options, Enum): + options = [v.value for v in options] + if isinstance(default, Enum): + default = default.value super().__init__(id, display_name, optional, tooltip, lazy, default, socketless) self.multiselect = False self.options = options @@ -1568,77 +1582,78 @@ class _UIOutput(ABC): ... -class _IO: - FolderType = FolderType - UploadType = UploadType - RemoteOptions = RemoteOptions - NumberDisplay = NumberDisplay +__all__ = [ + "FolderType", + "UploadType", + "RemoteOptions", + "NumberDisplay", - comfytype = staticmethod(comfytype) - Custom = staticmethod(Custom) - Input = Input - WidgetInput = WidgetInput - Output = Output - ComfyTypeI = ComfyTypeI - ComfyTypeIO = ComfyTypeIO - #--------------------------------- + "comfytype", + "Custom", + "Input", + "WidgetInput", + "Output", + "ComfyTypeI", + "ComfyTypeIO", # Supported Types - Boolean = Boolean - Int = Int - Float = Float - String = String - Combo = Combo - MultiCombo = MultiCombo - Image = Image - WanCameraEmbedding = WanCameraEmbedding - Webcam = Webcam - Mask = Mask - Latent = Latent - Conditioning = Conditioning - Sampler = Sampler - Sigmas = Sigmas - Noise = Noise - Guider = Guider - Clip = Clip - ControlNet = ControlNet - Vae = Vae - Model = Model - ClipVision = ClipVision - ClipVisionOutput = ClipVisionOutput - AudioEncoderOutput = AudioEncoderOutput - StyleModel = StyleModel - Gligen = Gligen - UpscaleModel = UpscaleModel - Audio = Audio - Video = Video - SVG = SVG - LoraModel = LoraModel - LossMap = LossMap - Voxel = Voxel - Mesh = Mesh - Hooks = Hooks - HookKeyframes = HookKeyframes - TimestepsRange = TimestepsRange - LatentOperation = LatentOperation - FlowControl = FlowControl - Accumulation = Accumulation - Load3DCamera = Load3DCamera - Load3D = Load3D - Load3DAnimation = Load3DAnimation - Photomaker = Photomaker - Point = Point - FaceAnalysis = FaceAnalysis - BBOX = BBOX - SEGS = SEGS - AnyType = AnyType - MultiType = MultiType - #--------------------------------- - HiddenHolder = HiddenHolder - Hidden = Hidden - NodeInfoV1 = NodeInfoV1 - NodeInfoV3 = NodeInfoV3 - Schema = Schema - ComfyNode = ComfyNode - NodeOutput = NodeOutput - add_to_dict_v1 = staticmethod(add_to_dict_v1) - add_to_dict_v3 = staticmethod(add_to_dict_v3) + "Boolean", + "Int", + "Float", + "String", + "Combo", + "MultiCombo", + "Image", + "WanCameraEmbedding", + "Webcam", + "Mask", + "Latent", + "Conditioning", + "Sampler", + "Sigmas", + "Noise", + "Guider", + "Clip", + "ControlNet", + "Vae", + "Model", + "ClipVision", + "ClipVisionOutput", + "AudioEncoder", + "AudioEncoderOutput", + "StyleModel", + "Gligen", + "UpscaleModel", + "Audio", + "Video", + "SVG", + "LoraModel", + "LossMap", + "Voxel", + "Mesh", + "Hooks", + "HookKeyframes", + "TimestepsRange", + "LatentOperation", + "FlowControl", + "Accumulation", + "Load3DCamera", + "Load3D", + "Load3DAnimation", + "Photomaker", + "Point", + "FaceAnalysis", + "BBOX", + "SEGS", + "AnyType", + "MultiType", + # Other classes + "HiddenHolder", + "Hidden", + "NodeInfoV1", + "NodeInfoV3", + "Schema", + "ComfyNode", + "NodeOutput", + "add_to_dict_v1", + "add_to_dict_v3", +] diff --git a/comfy_api/latest/_ui.py b/comfy_api/latest/_ui.py index 26a55615f..b0bbabe2a 100644 --- a/comfy_api/latest/_ui.py +++ b/comfy_api/latest/_ui.py @@ -449,15 +449,16 @@ class PreviewText(_UIOutput): return {"text": (self.value,)} -class _UI: - SavedResult = SavedResult - SavedImages = SavedImages - SavedAudios = SavedAudios - ImageSaveHelper = ImageSaveHelper - AudioSaveHelper = AudioSaveHelper - PreviewImage = PreviewImage - PreviewMask = PreviewMask - PreviewAudio = PreviewAudio - PreviewVideo = PreviewVideo - PreviewUI3D = PreviewUI3D - PreviewText = PreviewText +__all__ = [ + "SavedResult", + "SavedImages", + "SavedAudios", + "ImageSaveHelper", + "AudioSaveHelper", + "PreviewImage", + "PreviewMask", + "PreviewAudio", + "PreviewVideo", + "PreviewUI3D", + "PreviewText", +] diff --git a/comfy_api_nodes/apinode_utils.py b/comfy_api_nodes/apinode_utils.py deleted file mode 100644 index 37438f835..000000000 --- a/comfy_api_nodes/apinode_utils.py +++ /dev/null @@ -1,691 +0,0 @@ -from __future__ import annotations -import aiohttp -import io -import logging -import mimetypes -from typing import Optional, Union -from comfy.utils import common_upscale -from comfy_api.input_impl import VideoFromFile -from comfy_api.util import VideoContainer, VideoCodec -from comfy_api.input.video_types import VideoInput -from comfy_api.input.basic_types import AudioInput -from comfy_api_nodes.apis.client import ( - ApiClient, - ApiEndpoint, - HttpMethod, - SynchronousOperation, - UploadRequest, - UploadResponse, -) -from server import PromptServer - - -import numpy as np -from PIL import Image -import torch -import math -import base64 -import uuid -from io import BytesIO -import av - - -async def download_url_to_video_output(video_url: str, timeout: int = None) -> VideoFromFile: - """Downloads a video from a URL and returns a `VIDEO` output. - - Args: - video_url: The URL of the video to download. - - Returns: - A Comfy node `VIDEO` output. - """ - video_io = await download_url_to_bytesio(video_url, timeout) - if video_io is None: - error_msg = f"Failed to download video from {video_url}" - logging.error(error_msg) - raise ValueError(error_msg) - return VideoFromFile(video_io) - - -def downscale_image_tensor(image, total_pixels=1536 * 1024) -> torch.Tensor: - """Downscale input image tensor to roughly the specified total pixels.""" - samples = image.movedim(-1, 1) - total = int(total_pixels) - scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2])) - if scale_by >= 1: - return image - width = round(samples.shape[3] * scale_by) - height = round(samples.shape[2] * scale_by) - - s = common_upscale(samples, width, height, "lanczos", "disabled") - s = s.movedim(1, -1) - return s - - -async def validate_and_cast_response( - response, timeout: int = None, node_id: Union[str, None] = None -) -> torch.Tensor: - """Validates and casts a response to a torch.Tensor. - - Args: - response: The response to validate and cast. - timeout: Request timeout in seconds. Defaults to None (no timeout). - - Returns: - A torch.Tensor representing the image (1, H, W, C). - - Raises: - ValueError: If the response is not valid. - """ - # validate raw JSON response - data = response.data - if not data or len(data) == 0: - raise ValueError("No images returned from API endpoint") - - # Initialize list to store image tensors - image_tensors: list[torch.Tensor] = [] - - # Process each image in the data array - async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=timeout)) as session: - for img_data in data: - img_bytes: bytes - if img_data.b64_json: - img_bytes = base64.b64decode(img_data.b64_json) - elif img_data.url: - if node_id: - PromptServer.instance.send_progress_text(f"Result URL: {img_data.url}", node_id) - async with session.get(img_data.url) as resp: - if resp.status != 200: - raise ValueError("Failed to download generated image") - img_bytes = await resp.read() - else: - raise ValueError("Invalid image payload – neither URL nor base64 data present.") - - pil_img = Image.open(BytesIO(img_bytes)).convert("RGBA") - arr = np.asarray(pil_img).astype(np.float32) / 255.0 - image_tensors.append(torch.from_numpy(arr)) - - return torch.stack(image_tensors, dim=0) - - -def validate_aspect_ratio( - aspect_ratio: str, - minimum_ratio: float, - maximum_ratio: float, - minimum_ratio_str: str, - maximum_ratio_str: str, -) -> float: - """Validates and casts an aspect ratio string to a float. - - Args: - aspect_ratio: The aspect ratio string to validate. - minimum_ratio: The minimum aspect ratio. - maximum_ratio: The maximum aspect ratio. - minimum_ratio_str: The minimum aspect ratio string. - maximum_ratio_str: The maximum aspect ratio string. - - Returns: - The validated and cast aspect ratio. - - Raises: - Exception: If the aspect ratio is not valid. - """ - # get ratio values - numbers = aspect_ratio.split(":") - if len(numbers) != 2: - raise TypeError( - f"Aspect ratio must be in the format X:Y, such as 16:9, but was {aspect_ratio}." - ) - try: - numerator = int(numbers[0]) - denominator = int(numbers[1]) - except ValueError as exc: - raise TypeError( - f"Aspect ratio must contain numbers separated by ':', such as 16:9, but was {aspect_ratio}." - ) from exc - calculated_ratio = numerator / denominator - # if not close to minimum and maximum, check bounds - if not math.isclose(calculated_ratio, minimum_ratio) or not math.isclose( - calculated_ratio, maximum_ratio - ): - if calculated_ratio < minimum_ratio: - raise TypeError( - f"Aspect ratio cannot reduce to any less than {minimum_ratio_str} ({minimum_ratio}), but was {aspect_ratio} ({calculated_ratio})." - ) - elif calculated_ratio > maximum_ratio: - raise TypeError( - f"Aspect ratio cannot reduce to any greater than {maximum_ratio_str} ({maximum_ratio}), but was {aspect_ratio} ({calculated_ratio})." - ) - return aspect_ratio - - -def mimetype_to_extension(mime_type: str) -> str: - """Converts a MIME type to a file extension.""" - return mime_type.split("/")[-1].lower() - - -async def download_url_to_bytesio(url: str, timeout: int = None) -> BytesIO: - """Downloads content from a URL using requests and returns it as BytesIO. - - Args: - url: The URL to download. - timeout: Request timeout in seconds. Defaults to None (no timeout). - - Returns: - BytesIO object containing the downloaded content. - """ - timeout_cfg = aiohttp.ClientTimeout(total=timeout) if timeout else None - async with aiohttp.ClientSession(timeout=timeout_cfg) as session: - async with session.get(url) as resp: - resp.raise_for_status() # Raises HTTPError for bad responses (4XX or 5XX) - return BytesIO(await resp.read()) - - -def bytesio_to_image_tensor(image_bytesio: BytesIO, mode: str = "RGBA") -> torch.Tensor: - """Converts image data from BytesIO to a torch.Tensor. - - Args: - image_bytesio: BytesIO object containing the image data. - mode: The PIL mode to convert the image to (e.g., "RGB", "RGBA"). - - Returns: - A torch.Tensor representing the image (1, H, W, C). - - Raises: - PIL.UnidentifiedImageError: If the image data cannot be identified. - ValueError: If the specified mode is invalid. - """ - image = Image.open(image_bytesio) - image = image.convert(mode) - image_array = np.array(image).astype(np.float32) / 255.0 - return torch.from_numpy(image_array).unsqueeze(0) - - -async def download_url_to_image_tensor(url: str, timeout: int = None) -> torch.Tensor: - """Downloads an image from a URL and returns a [B, H, W, C] tensor.""" - image_bytesio = await download_url_to_bytesio(url, timeout) - return bytesio_to_image_tensor(image_bytesio) - - -def process_image_response(response_content: bytes | str) -> torch.Tensor: - """Uses content from a Response object and converts it to a torch.Tensor""" - return bytesio_to_image_tensor(BytesIO(response_content)) - - -def _tensor_to_pil(image: torch.Tensor, total_pixels: int = 2048 * 2048) -> Image.Image: - """Converts a single torch.Tensor image [H, W, C] to a PIL Image, optionally downscaling.""" - if len(image.shape) > 3: - image = image[0] - # TODO: remove alpha if not allowed and present - input_tensor = image.cpu() - input_tensor = downscale_image_tensor( - input_tensor.unsqueeze(0), total_pixels=total_pixels - ).squeeze() - image_np = (input_tensor.numpy() * 255).astype(np.uint8) - img = Image.fromarray(image_np) - return img - - -def _pil_to_bytesio(img: Image.Image, mime_type: str = "image/png") -> BytesIO: - """Converts a PIL Image to a BytesIO object.""" - if not mime_type: - mime_type = "image/png" - - img_byte_arr = io.BytesIO() - # Derive PIL format from MIME type (e.g., 'image/png' -> 'PNG') - pil_format = mime_type.split("/")[-1].upper() - if pil_format == "JPG": - pil_format = "JPEG" - img.save(img_byte_arr, format=pil_format) - img_byte_arr.seek(0) - return img_byte_arr - - -def tensor_to_bytesio( - image: torch.Tensor, - name: Optional[str] = None, - total_pixels: int = 2048 * 2048, - mime_type: str = "image/png", -) -> BytesIO: - """Converts a torch.Tensor image to a named BytesIO object. - - Args: - image: Input torch.Tensor image. - name: Optional filename for the BytesIO object. - total_pixels: Maximum total pixels for potential downscaling. - mime_type: Target image MIME type (e.g., 'image/png', 'image/jpeg', 'image/webp', 'video/mp4'). - - Returns: - Named BytesIO object containing the image data. - """ - if not mime_type: - mime_type = "image/png" - - pil_image = _tensor_to_pil(image, total_pixels=total_pixels) - img_binary = _pil_to_bytesio(pil_image, mime_type=mime_type) - img_binary.name = ( - f"{name if name else uuid.uuid4()}.{mimetype_to_extension(mime_type)}" - ) - return img_binary - - -def tensor_to_base64_string( - image_tensor: torch.Tensor, - total_pixels: int = 2048 * 2048, - mime_type: str = "image/png", -) -> str: - """Convert [B, H, W, C] or [H, W, C] tensor to a base64 string. - - Args: - image_tensor: Input torch.Tensor image. - total_pixels: Maximum total pixels for potential downscaling. - mime_type: Target image MIME type (e.g., 'image/png', 'image/jpeg', 'image/webp', 'video/mp4'). - - Returns: - Base64 encoded string of the image. - """ - pil_image = _tensor_to_pil(image_tensor, total_pixels=total_pixels) - img_byte_arr = _pil_to_bytesio(pil_image, mime_type=mime_type) - img_bytes = img_byte_arr.getvalue() - # Encode bytes to base64 string - base64_encoded_string = base64.b64encode(img_bytes).decode("utf-8") - return base64_encoded_string - - -def tensor_to_data_uri( - image_tensor: torch.Tensor, - total_pixels: int = 2048 * 2048, - mime_type: str = "image/png", -) -> str: - """Converts a tensor image to a Data URI string. - - Args: - image_tensor: Input torch.Tensor image. - total_pixels: Maximum total pixels for potential downscaling. - mime_type: Target image MIME type (e.g., 'image/png', 'image/jpeg', 'image/webp'). - - Returns: - Data URI string (e.g., 'data:image/png;base64,...'). - """ - base64_string = tensor_to_base64_string(image_tensor, total_pixels, mime_type) - return f"data:{mime_type};base64,{base64_string}" - - -def text_filepath_to_base64_string(filepath: str) -> str: - """Converts a text file to a base64 string.""" - with open(filepath, "rb") as f: - file_content = f.read() - return base64.b64encode(file_content).decode("utf-8") - - -def text_filepath_to_data_uri(filepath: str) -> str: - """Converts a text file to a data URI.""" - base64_string = text_filepath_to_base64_string(filepath) - mime_type, _ = mimetypes.guess_type(filepath) - if mime_type is None: - mime_type = "application/octet-stream" - return f"data:{mime_type};base64,{base64_string}" - - -async def upload_file_to_comfyapi( - file_bytes_io: BytesIO, - filename: str, - upload_mime_type: Optional[str], - auth_kwargs: Optional[dict[str, str]] = None, -) -> str: - """ - Uploads a single file to ComfyUI API and returns its download URL. - - Args: - file_bytes_io: BytesIO object containing the file data. - filename: The filename of the file. - upload_mime_type: MIME type of the file. - auth_kwargs: Optional authentication token(s). - - Returns: - The download URL for the uploaded file. - """ - if upload_mime_type is None: - request_object = UploadRequest(file_name=filename) - else: - request_object = UploadRequest(file_name=filename, content_type=upload_mime_type) - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/customers/storage", - method=HttpMethod.POST, - request_model=UploadRequest, - response_model=UploadResponse, - ), - request=request_object, - auth_kwargs=auth_kwargs, - ) - - response: UploadResponse = await operation.execute() - await ApiClient.upload_file(response.upload_url, file_bytes_io, content_type=upload_mime_type) - return response.download_url - - -def video_to_base64_string( - video: VideoInput, - container_format: VideoContainer = None, - codec: VideoCodec = None -) -> str: - """ - Converts a video input to a base64 string. - - Args: - video: The video input to convert - container_format: Optional container format to use (defaults to video.container if available) - codec: Optional codec to use (defaults to video.codec if available) - """ - video_bytes_io = io.BytesIO() - - # Use provided format/codec if specified, otherwise use video's own if available - format_to_use = container_format if container_format is not None else getattr(video, 'container', VideoContainer.MP4) - codec_to_use = codec if codec is not None else getattr(video, 'codec', VideoCodec.H264) - - video.save_to(video_bytes_io, format=format_to_use, codec=codec_to_use) - video_bytes_io.seek(0) - return base64.b64encode(video_bytes_io.getvalue()).decode("utf-8") - - -async def upload_video_to_comfyapi( - video: VideoInput, - auth_kwargs: Optional[dict[str, str]] = None, - container: VideoContainer = VideoContainer.MP4, - codec: VideoCodec = VideoCodec.H264, - max_duration: Optional[int] = None, -) -> str: - """ - Uploads a single video to ComfyUI API and returns its download URL. - Uses the specified container and codec for saving the video before upload. - - Args: - video: VideoInput object (Comfy VIDEO type). - auth_kwargs: Optional authentication token(s). - container: The video container format to use (default: MP4). - codec: The video codec to use (default: H264). - max_duration: Optional maximum duration of the video in seconds. If the video is longer than this, an error will be raised. - - Returns: - The download URL for the uploaded video file. - """ - if max_duration is not None: - try: - actual_duration = video.duration_seconds - if actual_duration is not None and actual_duration > max_duration: - raise ValueError( - f"Video duration ({actual_duration:.2f}s) exceeds the maximum allowed ({max_duration}s)." - ) - except Exception as e: - logging.error(f"Error getting video duration: {e}") - raise ValueError(f"Could not verify video duration from source: {e}") from e - - upload_mime_type = f"video/{container.value.lower()}" - filename = f"uploaded_video.{container.value.lower()}" - - # Convert VideoInput to BytesIO using specified container/codec - video_bytes_io = io.BytesIO() - video.save_to(video_bytes_io, format=container, codec=codec) - video_bytes_io.seek(0) - - return await upload_file_to_comfyapi(video_bytes_io, filename, upload_mime_type, auth_kwargs) - - -def audio_tensor_to_contiguous_ndarray(waveform: torch.Tensor) -> np.ndarray: - """ - Prepares audio waveform for av library by converting to a contiguous numpy array. - - Args: - waveform: a tensor of shape (1, channels, samples) derived from a Comfy `AUDIO` type. - - Returns: - Contiguous numpy array of the audio waveform. If the audio was batched, - the first item is taken. - """ - if waveform.ndim != 3 or waveform.shape[0] != 1: - raise ValueError("Expected waveform tensor shape (1, channels, samples)") - - # If batch is > 1, take first item - if waveform.shape[0] > 1: - waveform = waveform[0] - - # Prepare for av: remove batch dim, move to CPU, make contiguous, convert to numpy array - audio_data_np = waveform.squeeze(0).cpu().contiguous().numpy() - if audio_data_np.dtype != np.float32: - audio_data_np = audio_data_np.astype(np.float32) - - return audio_data_np - - -def audio_ndarray_to_bytesio( - audio_data_np: np.ndarray, - sample_rate: int, - container_format: str = "mp4", - codec_name: str = "aac", -) -> BytesIO: - """ - Encodes a numpy array of audio data into a BytesIO object. - """ - audio_bytes_io = io.BytesIO() - with av.open(audio_bytes_io, mode="w", format=container_format) as output_container: - audio_stream = output_container.add_stream(codec_name, rate=sample_rate) - frame = av.AudioFrame.from_ndarray( - audio_data_np, - format="fltp", - layout="stereo" if audio_data_np.shape[0] > 1 else "mono", - ) - frame.sample_rate = sample_rate - frame.pts = 0 - - for packet in audio_stream.encode(frame): - output_container.mux(packet) - - # Flush stream - for packet in audio_stream.encode(None): - output_container.mux(packet) - - audio_bytes_io.seek(0) - return audio_bytes_io - - -async def upload_audio_to_comfyapi( - audio: AudioInput, - auth_kwargs: Optional[dict[str, str]] = None, - container_format: str = "mp4", - codec_name: str = "aac", - mime_type: str = "audio/mp4", - filename: str = "uploaded_audio.mp4", -) -> str: - """ - Uploads a single audio input to ComfyUI API and returns its download URL. - Encodes the raw waveform into the specified format before uploading. - - Args: - audio: a Comfy `AUDIO` type (contains waveform tensor and sample_rate) - auth_kwargs: Optional authentication token(s). - - Returns: - The download URL for the uploaded audio file. - """ - sample_rate: int = audio["sample_rate"] - waveform: torch.Tensor = audio["waveform"] - audio_data_np = audio_tensor_to_contiguous_ndarray(waveform) - audio_bytes_io = audio_ndarray_to_bytesio( - audio_data_np, sample_rate, container_format, codec_name - ) - - return await upload_file_to_comfyapi(audio_bytes_io, filename, mime_type, auth_kwargs) - - -def f32_pcm(wav: torch.Tensor) -> torch.Tensor: - """Convert audio to float 32 bits PCM format. Copy-paste from nodes_audio.py file.""" - if wav.dtype.is_floating_point: - return wav - elif wav.dtype == torch.int16: - return wav.float() / (2 ** 15) - elif wav.dtype == torch.int32: - return wav.float() / (2 ** 31) - raise ValueError(f"Unsupported wav dtype: {wav.dtype}") - - -def audio_bytes_to_audio_input(audio_bytes: bytes,) -> dict: - """ - Decode any common audio container from bytes using PyAV and return - a Comfy AUDIO dict: {"waveform": [1, C, T] float32, "sample_rate": int}. - """ - with av.open(io.BytesIO(audio_bytes)) as af: - if not af.streams.audio: - raise ValueError("No audio stream found in response.") - stream = af.streams.audio[0] - - in_sr = int(stream.codec_context.sample_rate) - out_sr = in_sr - - frames: list[torch.Tensor] = [] - n_channels = stream.channels or 1 - - for frame in af.decode(streams=stream.index): - arr = frame.to_ndarray() # shape can be [C, T] or [T, C] or [T] - buf = torch.from_numpy(arr) - if buf.ndim == 1: - buf = buf.unsqueeze(0) # [T] -> [1, T] - elif buf.shape[0] != n_channels and buf.shape[-1] == n_channels: - buf = buf.transpose(0, 1).contiguous() # [T, C] -> [C, T] - elif buf.shape[0] != n_channels: - buf = buf.reshape(-1, n_channels).t().contiguous() # fallback to [C, T] - frames.append(buf) - - if not frames: - raise ValueError("Decoded zero audio frames.") - - wav = torch.cat(frames, dim=1) # [C, T] - wav = f32_pcm(wav) - return {"waveform": wav.unsqueeze(0).contiguous(), "sample_rate": out_sr} - - -def audio_input_to_mp3(audio: AudioInput) -> io.BytesIO: - waveform = audio["waveform"].cpu() - - output_buffer = io.BytesIO() - output_container = av.open(output_buffer, mode='w', format="mp3") - - out_stream = output_container.add_stream("libmp3lame", rate=audio["sample_rate"]) - out_stream.bit_rate = 320000 - - frame = av.AudioFrame.from_ndarray(waveform.movedim(0, 1).reshape(1, -1).float().numpy(), format='flt', layout='mono' if waveform.shape[0] == 1 else 'stereo') - frame.sample_rate = audio["sample_rate"] - frame.pts = 0 - output_container.mux(out_stream.encode(frame)) - output_container.mux(out_stream.encode(None)) - output_container.close() - output_buffer.seek(0) - return output_buffer - - -def audio_to_base64_string( - audio: AudioInput, container_format: str = "mp4", codec_name: str = "aac" -) -> str: - """Converts an audio input to a base64 string.""" - sample_rate: int = audio["sample_rate"] - waveform: torch.Tensor = audio["waveform"] - audio_data_np = audio_tensor_to_contiguous_ndarray(waveform) - audio_bytes_io = audio_ndarray_to_bytesio( - audio_data_np, sample_rate, container_format, codec_name - ) - audio_bytes = audio_bytes_io.getvalue() - return base64.b64encode(audio_bytes).decode("utf-8") - - -async def upload_images_to_comfyapi( - image: torch.Tensor, - max_images=8, - auth_kwargs: Optional[dict[str, str]] = None, - mime_type: Optional[str] = None, -) -> list[str]: - """ - Uploads images to ComfyUI API and returns download URLs. - To upload multiple images, stack them in the batch dimension first. - - Args: - image: Input torch.Tensor image. - max_images: Maximum number of images to upload. - auth_kwargs: Optional authentication token(s). - mime_type: Optional MIME type for the image. - """ - # if batch, try to upload each file if max_images is greater than 0 - download_urls: list[str] = [] - is_batch = len(image.shape) > 3 - batch_len = image.shape[0] if is_batch else 1 - - for idx in range(min(batch_len, max_images)): - tensor = image[idx] if is_batch else image - img_io = tensor_to_bytesio(tensor, mime_type=mime_type) - url = await upload_file_to_comfyapi(img_io, img_io.name, mime_type, auth_kwargs) - download_urls.append(url) - return download_urls - - -def resize_mask_to_image( - mask: torch.Tensor, - image: torch.Tensor, - upscale_method="nearest-exact", - crop="disabled", - allow_gradient=True, - add_channel_dim=False, -): - """ - Resize mask to be the same dimensions as an image, while maintaining proper format for API calls. - """ - _, H, W, _ = image.shape - mask = mask.unsqueeze(-1) - mask = mask.movedim(-1, 1) - mask = common_upscale( - mask, width=W, height=H, upscale_method=upscale_method, crop=crop - ) - mask = mask.movedim(1, -1) - if not add_channel_dim: - mask = mask.squeeze(-1) - if not allow_gradient: - mask = (mask > 0.5).float() - return mask - - -def validate_string( - string: str, - strip_whitespace=True, - field_name="prompt", - min_length=None, - max_length=None, -): - if string is None: - raise Exception(f"Field '{field_name}' cannot be empty.") - if strip_whitespace: - string = string.strip() - if min_length and len(string) < min_length: - raise Exception( - f"Field '{field_name}' cannot be shorter than {min_length} characters; was {len(string)} characters long." - ) - if max_length and len(string) > max_length: - raise Exception( - f" Field '{field_name} cannot be longer than {max_length} characters; was {len(string)} characters long." - ) - - -def image_tensor_pair_to_batch( - image1: torch.Tensor, image2: torch.Tensor -) -> torch.Tensor: - """ - Converts a pair of image tensors to a batch tensor. - If the images are not the same size, the smaller image is resized to - match the larger image. - """ - if image1.shape[1:] != image2.shape[1:]: - image2 = common_upscale( - image2.movedim(-1, 1), - image1.shape[2], - image1.shape[1], - "bilinear", - "center", - ).movedim(1, -1) - return torch.cat((image1, image2), dim=0) diff --git a/comfy_api_nodes/apis/PixverseController.py b/comfy_api_nodes/apis/PixverseController.py deleted file mode 100644 index 310c0f546..000000000 --- a/comfy_api_nodes/apis/PixverseController.py +++ /dev/null @@ -1,17 +0,0 @@ -# generated by datamodel-codegen: -# filename: filtered-openapi.yaml -# timestamp: 2025-04-29T23:44:54+00:00 - -from __future__ import annotations - -from typing import Optional - -from pydantic import BaseModel - -from . import PixverseDto - - -class ResponseData(BaseModel): - ErrCode: Optional[int] = None - ErrMsg: Optional[str] = None - Resp: Optional[PixverseDto.V2OpenAPII2VResp] = None diff --git a/comfy_api_nodes/apis/PixverseDto.py b/comfy_api_nodes/apis/PixverseDto.py deleted file mode 100644 index 323c38e96..000000000 --- a/comfy_api_nodes/apis/PixverseDto.py +++ /dev/null @@ -1,57 +0,0 @@ -# generated by datamodel-codegen: -# filename: filtered-openapi.yaml -# timestamp: 2025-04-29T23:44:54+00:00 - -from __future__ import annotations - -from typing import Optional - -from pydantic import BaseModel, Field - - -class V2OpenAPII2VResp(BaseModel): - video_id: Optional[int] = Field(None, description='Video_id') - - -class V2OpenAPIT2VReq(BaseModel): - aspect_ratio: str = Field( - ..., description='Aspect ratio (16:9, 4:3, 1:1, 3:4, 9:16)', examples=['16:9'] - ) - duration: int = Field( - ..., - description='Video duration (5, 8 seconds, --model=v3.5 only allows 5,8; --quality=1080p does not support 8s)', - examples=[5], - ) - model: str = Field( - ..., description='Model version (only supports v3.5)', examples=['v3.5'] - ) - motion_mode: Optional[str] = Field( - 'normal', - description='Motion mode (normal, fast, --fast only available when duration=5; --quality=1080p does not support fast)', - examples=['normal'], - ) - negative_prompt: Optional[str] = Field( - None, description='Negative prompt\n', max_length=2048 - ) - prompt: str = Field(..., description='Prompt', max_length=2048) - quality: str = Field( - ..., - description='Video quality ("360p"(Turbo model), "540p", "720p", "1080p")', - examples=['540p'], - ) - seed: Optional[int] = Field(None, description='Random seed, range: 0 - 2147483647') - style: Optional[str] = Field( - None, - description='Style (effective when model=v3.5, "anime", "3d_animation", "clay", "comic", "cyberpunk") Do not include style parameter unless needed', - examples=['anime'], - ) - template_id: Optional[int] = Field( - None, - description='Template ID (template_id must be activated before use)', - examples=[302325299692608], - ) - water_mark: Optional[bool] = Field( - False, - description='Watermark (true: add watermark, false: no watermark)', - examples=[False], - ) diff --git a/comfy_api_nodes/apis/__init__.py b/comfy_api_nodes/apis/__init__.py index 78a23db30..ee2aa1ce6 100644 --- a/comfy_api_nodes/apis/__init__.py +++ b/comfy_api_nodes/apis/__init__.py @@ -2,6 +2,7 @@ # filename: filtered-openapi.yaml # timestamp: 2025-07-30T08:54:00+00:00 +# pylint: disable from __future__ import annotations from datetime import date, datetime @@ -1320,6 +1321,7 @@ class KlingTextToVideoModelName(str, Enum): kling_v1 = 'kling-v1' kling_v1_6 = 'kling-v1-6' kling_v2_1_master = 'kling-v2-1-master' + kling_v2_5_turbo = 'kling-v2-5-turbo' class KlingVideoGenAspectRatio(str, Enum): @@ -1354,6 +1356,7 @@ class KlingVideoGenModelName(str, Enum): kling_v2_master = 'kling-v2-master' kling_v2_1 = 'kling-v2-1' kling_v2_1_master = 'kling-v2-1-master' + kling_v2_5_turbo = 'kling-v2-5-turbo' class KlingVideoResult(BaseModel): diff --git a/comfy_api_nodes/apis/bfl_api.py b/comfy_api_nodes/apis/bfl_api.py index 0e90aef7c..0fc8c0607 100644 --- a/comfy_api_nodes/apis/bfl_api.py +++ b/comfy_api_nodes/apis/bfl_api.py @@ -50,44 +50,6 @@ class BFLFluxFillImageRequest(BaseModel): mask: str = Field(None, description='A Base64-encoded string representing the mask of the areas you with to modify.') -class BFLFluxCannyImageRequest(BaseModel): - prompt: str = Field(..., description='Text prompt for image generation') - prompt_upsampling: Optional[bool] = Field( - None, description='Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation.' - ) - canny_low_threshold: Optional[int] = Field(None, description='Low threshold for Canny edge detection') - canny_high_threshold: Optional[int] = Field(None, description='High threshold for Canny edge detection') - seed: Optional[int] = Field(None, description='The seed value for reproducibility.') - steps: conint(ge=15, le=50) = Field(..., description='Number of steps for the image generation process') - guidance: confloat(ge=1, le=100) = Field(..., description='Guidance strength for the image generation process') - safety_tolerance: Optional[conint(ge=0, le=6)] = Field( - 6, description='Tolerance level for input and output moderation. Between 0 and 6, 0 being most strict, 6 being least strict. Defaults to 2.' - ) - output_format: Optional[BFLOutputFormat] = Field( - BFLOutputFormat.png, description="Output format for the generated image. Can be 'jpeg' or 'png'.", examples=['png'] - ) - control_image: Optional[str] = Field(None, description='Base64 encoded image to use as control input if no preprocessed image is provided') - preprocessed_image: Optional[str] = Field(None, description='Optional pre-processed image that will bypass the control preprocessing step') - - -class BFLFluxDepthImageRequest(BaseModel): - prompt: str = Field(..., description='Text prompt for image generation') - prompt_upsampling: Optional[bool] = Field( - None, description='Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation.' - ) - seed: Optional[int] = Field(None, description='The seed value for reproducibility.') - steps: conint(ge=15, le=50) = Field(..., description='Number of steps for the image generation process') - guidance: confloat(ge=1, le=100) = Field(..., description='Guidance strength for the image generation process') - safety_tolerance: Optional[conint(ge=0, le=6)] = Field( - 6, description='Tolerance level for input and output moderation. Between 0 and 6, 0 being most strict, 6 being least strict. Defaults to 2.' - ) - output_format: Optional[BFLOutputFormat] = Field( - BFLOutputFormat.png, description="Output format for the generated image. Can be 'jpeg' or 'png'.", examples=['png'] - ) - control_image: Optional[str] = Field(None, description='Base64 encoded image to use as control input if no preprocessed image is provided') - preprocessed_image: Optional[str] = Field(None, description='Optional pre-processed image that will bypass the control preprocessing step') - - class BFLFluxProGenerateRequest(BaseModel): prompt: str = Field(..., description='The text prompt for image generation.') prompt_upsampling: Optional[bool] = Field( @@ -160,15 +122,8 @@ class BFLStatus(str, Enum): error = "Error" -class BFLFluxProStatusResponse(BaseModel): +class BFLFluxStatusResponse(BaseModel): id: str = Field(..., description="The unique identifier for the generation task.") status: BFLStatus = Field(..., description="The status of the task.") - result: Optional[Dict[str, Any]] = Field( - None, description="The result of the task (null if not completed)." - ) - progress: confloat(ge=0.0, le=1.0) = Field( - ..., description="The progress of the task (0.0 to 1.0)." - ) - details: Optional[Dict[str, Any]] = Field( - None, description="Additional details about the task (null if not available)." - ) + result: Optional[Dict[str, Any]] = Field(None, description="The result of the task (null if not completed).") + progress: Optional[float] = Field(None, description="The progress of the task (0.0 to 1.0).", ge=0.0, le=1.0) diff --git a/comfy_api_nodes/apis/client.py b/comfy_api_nodes/apis/client.py deleted file mode 100644 index 0aed906fb..000000000 --- a/comfy_api_nodes/apis/client.py +++ /dev/null @@ -1,957 +0,0 @@ -""" -API Client Framework for api.comfy.org. - -This module provides a flexible framework for making API requests from ComfyUI nodes. -It supports both synchronous and asynchronous API operations with proper type validation. - -Key Components: --------------- -1. ApiClient - Handles HTTP requests with authentication and error handling -2. ApiEndpoint - Defines a single HTTP endpoint with its request/response models -3. ApiOperation - Executes a single synchronous API operation - -Usage Examples: --------------- - -# Example 1: Synchronous API Operation -# ------------------------------------ -# For a simple API call that returns the result immediately: - -# 1. Create the API client -api_client = ApiClient( - base_url="https://api.example.com", - auth_token="your_auth_token_here", - comfy_api_key="your_comfy_api_key_here", - timeout=30.0, - verify_ssl=True -) - -# 2. Define the endpoint -user_info_endpoint = ApiEndpoint( - path="/v1/users/me", - method=HttpMethod.GET, - request_model=EmptyRequest, # No request body needed - response_model=UserProfile, # Pydantic model for the response - query_params=None -) - -# 3. Create the request object -request = EmptyRequest() - -# 4. Create and execute the operation -operation = ApiOperation( - endpoint=user_info_endpoint, - request=request -) -user_profile = await operation.execute(client=api_client) # Returns immediately with the result - - -# Example 2: Asynchronous API Operation with Polling -# ------------------------------------------------- -# For an API that starts a task and requires polling for completion: - -# 1. Define the endpoints (initial request and polling) -generate_image_endpoint = ApiEndpoint( - path="/v1/images/generate", - method=HttpMethod.POST, - request_model=ImageGenerationRequest, - response_model=TaskCreatedResponse, - query_params=None -) - -check_task_endpoint = ApiEndpoint( - path="/v1/tasks/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=ImageGenerationResult, - query_params=None -) - -# 2. Create the request object -request = ImageGenerationRequest( - prompt="a beautiful sunset over mountains", - width=1024, - height=1024, - num_images=1 -) - -# 3. Create and execute the polling operation -operation = PollingOperation( - initial_endpoint=generate_image_endpoint, - initial_request=request, - poll_endpoint=check_task_endpoint, - task_id_field="task_id", - status_field="status", - completed_statuses=["completed"], - failed_statuses=["failed", "error"] -) - -# This will make the initial request and then poll until completion -result = await operation.execute(client=api_client) # Returns the final ImageGenerationResult when done -""" - -from __future__ import annotations -import aiohttp -import asyncio -import logging -import io -import socket -from aiohttp.client_exceptions import ClientError, ClientResponseError -from typing import Dict, Type, Optional, Any, TypeVar, Generic, Callable, Tuple -from enum import Enum -import json -from urllib.parse import urljoin, urlparse -from pydantic import BaseModel, Field -import uuid # For generating unique operation IDs - -from server import PromptServer -from comfy.cli_args import args -from comfy import utils -from . import request_logger - -T = TypeVar("T", bound=BaseModel) -R = TypeVar("R", bound=BaseModel) -P = TypeVar("P", bound=BaseModel) # For poll response - -PROGRESS_BAR_MAX = 100 - - -class NetworkError(Exception): - """Base exception for network-related errors with diagnostic information.""" - pass - - -class LocalNetworkError(NetworkError): - """Exception raised when local network connectivity issues are detected.""" - pass - - -class ApiServerError(NetworkError): - """Exception raised when the API server is unreachable but internet is working.""" - pass - - -class EmptyRequest(BaseModel): - """Base class for empty request bodies. - For GET requests, fields will be sent as query parameters.""" - - pass - - -class UploadRequest(BaseModel): - file_name: str = Field(..., description="Filename to upload") - content_type: Optional[str] = Field( - None, - description="Mime type of the file. For example: image/png, image/jpeg, video/mp4, etc.", - ) - - -class UploadResponse(BaseModel): - download_url: str = Field(..., description="URL to GET uploaded file") - upload_url: str = Field(..., description="URL to PUT file to upload") - - -class HttpMethod(str, Enum): - GET = "GET" - POST = "POST" - PUT = "PUT" - DELETE = "DELETE" - PATCH = "PATCH" - - -class ApiClient: - """ - Client for making HTTP requests to an API with authentication, error handling, and retry logic. - """ - - def __init__( - self, - base_url: str, - auth_token: Optional[str] = None, - comfy_api_key: Optional[str] = None, - timeout: float = 3600.0, - verify_ssl: bool = True, - max_retries: int = 3, - retry_delay: float = 1.0, - retry_backoff_factor: float = 2.0, - retry_status_codes: Optional[Tuple[int, ...]] = None, - session: Optional[aiohttp.ClientSession] = None, - ): - self.base_url = base_url - self.auth_token = auth_token - self.comfy_api_key = comfy_api_key - self.timeout = timeout - self.verify_ssl = verify_ssl - self.max_retries = max_retries - self.retry_delay = retry_delay - self.retry_backoff_factor = retry_backoff_factor - # Default retry status codes: 408 (Request Timeout), 429 (Too Many Requests), - # 500, 502, 503, 504 (Server Errors) - self.retry_status_codes = retry_status_codes or (408, 429, 500, 502, 503, 504) - self._session: Optional[aiohttp.ClientSession] = session - self._owns_session = session is None # Track if we have to close it - - @staticmethod - def _generate_operation_id(path: str) -> str: - """Generates a unique operation ID for logging.""" - return f"{path.strip('/').replace('/', '_')}_{uuid.uuid4().hex[:8]}" - - @staticmethod - def _create_json_payload_args( - data: Optional[Dict[str, Any]] = None, - headers: Optional[Dict[str, str]] = None, - ) -> Dict[str, Any]: - return { - "json": data, - "headers": headers, - } - - def _create_form_data_args( - self, - data: Dict[str, Any] | None, - files: Dict[str, Any] | None, - headers: Optional[Dict[str, str]] = None, - multipart_parser: Callable | None = None, - ) -> Dict[str, Any]: - if headers and "Content-Type" in headers: - del headers["Content-Type"] - - if multipart_parser and data: - data = multipart_parser(data) - - form = aiohttp.FormData(default_to_multipart=True) - if data: # regular text fields - for k, v in data.items(): - if v is None: - continue # aiohttp fails to serialize "None" values - # aiohttp expects strings or bytes; convert enums etc. - form.add_field(k, str(v) if not isinstance(v, (bytes, bytearray)) else v) - - if files: - file_iter = files if isinstance(files, list) else files.items() - for field_name, file_obj in file_iter: - if file_obj is None: - continue # aiohttp fails to serialize "None" values - # file_obj can be (filename, bytes/io.BytesIO, content_type) tuple - if isinstance(file_obj, tuple): - filename, file_value, content_type = self._unpack_tuple(file_obj) - else: - file_value = file_obj - filename = getattr(file_obj, "name", field_name) - content_type = "application/octet-stream" - - form.add_field( - name=field_name, - value=file_value, - filename=filename, - content_type=content_type, - ) - return {"data": form, "headers": headers or {}} - - @staticmethod - def _create_urlencoded_form_data_args( - data: Dict[str, Any], - headers: Optional[Dict[str, str]] = None, - ) -> Dict[str, Any]: - headers = headers or {} - headers["Content-Type"] = "application/x-www-form-urlencoded" - return { - "data": data, - "headers": headers, - } - - def get_headers(self) -> Dict[str, str]: - """Get headers for API requests, including authentication if available""" - headers = {"Content-Type": "application/json", "Accept": "application/json"} - - if self.auth_token: - headers["Authorization"] = f"Bearer {self.auth_token}" - elif self.comfy_api_key: - headers["X-API-KEY"] = self.comfy_api_key - - return headers - - async def _check_connectivity(self, target_url: str) -> Dict[str, bool]: - """ - Check connectivity to determine if network issues are local or server-related. - - Args: - target_url: URL to check connectivity to - - Returns: - Dictionary with connectivity status details - """ - results = { - "internet_accessible": False, - "api_accessible": False, - "is_local_issue": False, - "is_api_issue": False, - } - timeout = aiohttp.ClientTimeout(total=5.0) - async with aiohttp.ClientSession(timeout=timeout) as session: - try: - async with session.get("https://www.google.com", ssl=self.verify_ssl) as resp: - results["internet_accessible"] = resp.status < 500 - except (ClientError, asyncio.TimeoutError, socket.gaierror): - results["is_local_issue"] = True - return results # cannot reach the internet – early exit - - # Now check API health endpoint - parsed = urlparse(target_url) - health_url = f"{parsed.scheme}://{parsed.netloc}/health" - try: - async with session.get(health_url, ssl=self.verify_ssl) as resp: - results["api_accessible"] = resp.status < 500 - except ClientError: - pass # leave as False - - results["is_api_issue"] = results["internet_accessible"] and not results["api_accessible"] - return results - - async def request( - self, - method: str, - path: str, - params: Optional[Dict[str, Any]] = None, - data: Optional[Dict[str, Any]] = None, - files: Optional[Dict[str, Any] | list[tuple[str, Any]]] = None, - headers: Optional[Dict[str, str]] = None, - content_type: str = "application/json", - multipart_parser: Callable | None = None, - retry_count: int = 0, # Used internally for tracking retries - ) -> Dict[str, Any]: - """ - Make an HTTP request to the API with automatic retries for transient errors. - - Args: - method: HTTP method (GET, POST, etc.) - path: API endpoint path (will be joined with base_url) - params: Query parameters - data: body data - files: Files to upload - headers: Additional headers - content_type: Content type of the request. Defaults to application/json. - retry_count: Internal parameter for tracking retries, do not set manually - - Returns: - Parsed JSON response - - Raises: - LocalNetworkError: If local network connectivity issues are detected - ApiServerError: If the API server is unreachable but internet is working - Exception: For other request failures - """ - - # Build full URL and merge headers - relative_path = path.lstrip("/") - url = urljoin(self.base_url, relative_path) - self._check_auth(self.auth_token, self.comfy_api_key) - - request_headers = self.get_headers() - if headers: - request_headers.update(headers) - if files: - request_headers.pop("Content-Type", None) - if params: - params = {k: v for k, v in params.items() if v is not None} # aiohttp fails to serialize None values - - logging.debug(f"[DEBUG] Request Headers: {request_headers}") - logging.debug(f"[DEBUG] Files: {files}") - logging.debug(f"[DEBUG] Params: {params}") - logging.debug(f"[DEBUG] Data: {data}") - - if content_type == "application/x-www-form-urlencoded": - payload_args = self._create_urlencoded_form_data_args(data or {}, request_headers) - elif content_type == "multipart/form-data": - payload_args = self._create_form_data_args(data, files, request_headers, multipart_parser) - else: - payload_args = self._create_json_payload_args(data, request_headers) - - operation_id = self._generate_operation_id(path) - request_logger.log_request_response( - operation_id=operation_id, - request_method=method, - request_url=url, - request_headers=request_headers, - request_params=params, - request_data=data if content_type == "application/json" else "[form-data or other]", - ) - - session = await self._get_session() - try: - async with session.request( - method, - url, - params=params, - ssl=self.verify_ssl, - **payload_args, - ) as resp: - if resp.status >= 400: - try: - error_data = await resp.json() - except (aiohttp.ContentTypeError, json.JSONDecodeError): - error_data = await resp.text() - - return await self._handle_http_error( - ClientResponseError(resp.request_info, resp.history, status=resp.status, message=error_data), - operation_id, - method, - url, - params, - data, - files, - headers, - content_type, - multipart_parser, - retry_count=retry_count, - response_content=error_data, - ) - - # Success – parse JSON (safely) and log - try: - payload = await resp.json() - response_content_to_log = payload - except (aiohttp.ContentTypeError, json.JSONDecodeError): - payload = {} - response_content_to_log = await resp.text() - - request_logger.log_request_response( - operation_id=operation_id, - request_method=method, - request_url=url, - response_status_code=resp.status, - response_headers=dict(resp.headers), - response_content=response_content_to_log, - ) - return payload - - except (ClientError, asyncio.TimeoutError, socket.gaierror) as e: - # Treat as *connection* problem – optionally retry, else escalate - if retry_count < self.max_retries: - delay = self.retry_delay * (self.retry_backoff_factor ** retry_count) - logging.warning("Connection error. Retrying in %.2fs (%s/%s): %s", delay, retry_count + 1, - self.max_retries, str(e)) - await asyncio.sleep(delay) - return await self.request( - method, - path, - params=params, - data=data, - files=files, - headers=headers, - content_type=content_type, - multipart_parser=multipart_parser, - retry_count=retry_count + 1, - ) - # One final connectivity check for diagnostics - connectivity = await self._check_connectivity(self.base_url) - if connectivity["is_local_issue"]: - raise LocalNetworkError( - "Unable to connect to the API server due to local network issues. " - "Please check your internet connection and try again." - ) from e - raise ApiServerError( - f"The API server at {self.base_url} is currently unreachable. " - f"The service may be experiencing issues. Please try again later." - ) from e - - @staticmethod - def _check_auth(auth_token, comfy_api_key): - """Verify that an auth token is present or comfy_api_key is present""" - if auth_token is None and comfy_api_key is None: - raise Exception("Unauthorized: Please login first to use this node.") - return auth_token or comfy_api_key - - @staticmethod - async def upload_file( - upload_url: str, - file: io.BytesIO | str, - content_type: str | None = None, - max_retries: int = 3, - retry_delay: float = 1.0, - retry_backoff_factor: float = 2.0, - ) -> aiohttp.ClientResponse: - """Upload a file to the API with retry logic. - - Args: - upload_url: The URL to upload to - file: Either a file path string, BytesIO object, or tuple of (file_path, filename) - content_type: Optional mime type to set for the upload - max_retries: Maximum number of retry attempts - retry_delay: Initial delay between retries in seconds - retry_backoff_factor: Multiplier for the delay after each retry - """ - headers: Dict[str, str] = {} - skip_auto_headers: set[str] = set() - if content_type: - headers["Content-Type"] = content_type - else: - # tell aiohttp not to add Content-Type that will break the request signature and result in a 403 status. - skip_auto_headers.add("Content-Type") - - # Extract file bytes - if isinstance(file, io.BytesIO): - file.seek(0) - data = file.read() - elif isinstance(file, str): - with open(file, "rb") as f: - data = f.read() - else: - raise ValueError("File must be BytesIO or str path") - - operation_id = f"upload_{upload_url.split('/')[-1]}_{uuid.uuid4().hex[:8]}" - request_logger.log_request_response( - operation_id=operation_id, - request_method="PUT", - request_url=upload_url, - request_headers=headers, - request_data=f"[File data {len(data)} bytes]", - ) - - delay = retry_delay - for attempt in range(max_retries + 1): - try: - timeout = aiohttp.ClientTimeout(total=None) # honour server side timeouts - async with aiohttp.ClientSession(timeout=timeout) as session: - async with session.put( - upload_url, data=data, headers=headers, skip_auto_headers=skip_auto_headers, - ) as resp: - resp.raise_for_status() - request_logger.log_request_response( - operation_id=operation_id, - request_method="PUT", - request_url=upload_url, - response_status_code=resp.status, - response_headers=dict(resp.headers), - response_content="File uploaded successfully.", - ) - return resp - except (ClientError, asyncio.TimeoutError) as e: - request_logger.log_request_response( - operation_id=operation_id, - request_method="PUT", - request_url=upload_url, - response_status_code=e.status if hasattr(e, "status") else None, - response_headers=dict(e.headers) if getattr(e, "headers") else None, - response_content=None, - error_message=f"{type(e).__name__}: {str(e)}", - ) - if attempt < max_retries: - logging.warning( - "Upload failed (%s/%s). Retrying in %.2fs. %s", attempt + 1, max_retries, delay, str(e) - ) - await asyncio.sleep(delay) - delay *= retry_backoff_factor - else: - raise NetworkError(f"Failed to upload file after {max_retries + 1} attempts: {e}") from e - - async def _handle_http_error( - self, - exc: ClientResponseError, - operation_id: str, - *req_meta, - retry_count: int, - response_content: dict | str = "", - ) -> Dict[str, Any]: - status_code = exc.status - if status_code == 401: - user_friendly = "Unauthorized: Please login first to use this node." - elif status_code == 402: - user_friendly = "Payment Required: Please add credits to your account to use this node." - elif status_code == 409: - user_friendly = "There is a problem with your account. Please contact support@comfy.org." - elif status_code == 429: - user_friendly = "Rate Limit Exceeded: Please try again later." - else: - if isinstance(response_content, dict): - if "error" in response_content and "message" in response_content["error"]: - user_friendly = f"API Error: {response_content['error']['message']}" - if "type" in response_content["error"]: - user_friendly += f" (Type: {response_content['error']['type']})" - else: # Handle cases where error is just a JSON dict with unknown format - user_friendly = f"API Error: {json.dumps(response_content)}" - else: - if len(response_content) < 200: # Arbitrary limit for display - user_friendly = f"API Error (raw): {response_content}" - else: - user_friendly = f"API Error (raw, status {response_content})" - - request_logger.log_request_response( - operation_id=operation_id, - request_method=req_meta[0], - request_url=req_meta[1], - response_status_code=exc.status, - response_headers=dict(req_meta[5]) if req_meta[5] else None, - response_content=response_content, - error_message=f"HTTP Error {exc.status}", - ) - - logging.debug(f"[DEBUG] API Error: {user_friendly} (Status: {status_code})") - if response_content: - logging.debug(f"[DEBUG] Response content: {response_content}") - - # Retry if eligible - if status_code in self.retry_status_codes and retry_count < self.max_retries: - delay = self.retry_delay * (self.retry_backoff_factor ** retry_count) - logging.warning( - "HTTP error %s. Retrying in %.2fs (%s/%s)", - status_code, - delay, - retry_count + 1, - self.max_retries, - ) - await asyncio.sleep(delay) - return await self.request( - req_meta[0], # method - req_meta[1].replace(self.base_url, ""), # path - params=req_meta[2], - data=req_meta[3], - files=req_meta[4], - headers=req_meta[5], - content_type=req_meta[6], - multipart_parser=req_meta[7], - retry_count=retry_count + 1, - ) - - raise Exception(user_friendly) from exc - - @staticmethod - def _unpack_tuple(t): - """Helper to normalise (filename, file, content_type) tuples.""" - if len(t) == 3: - return t - elif len(t) == 2: - return t[0], t[1], "application/octet-stream" - else: - raise ValueError("files tuple must be (filename, file[, content_type])") - - async def _get_session(self) -> aiohttp.ClientSession: - if self._session is None or self._session.closed: - timeout = aiohttp.ClientTimeout(total=self.timeout) - self._session = aiohttp.ClientSession(timeout=timeout) - self._owns_session = True - return self._session - - async def close(self) -> None: - if self._owns_session and self._session and not self._session.closed: - await self._session.close() - - async def __aenter__(self) -> "ApiClient": - """Allow usage as async‑context‑manager – ensures clean teardown""" - return self - - async def __aexit__(self, exc_type, exc, tb): - await self.close() - - -class ApiEndpoint(Generic[T, R]): - """Defines an API endpoint with its request and response types""" - - def __init__( - self, - path: str, - method: HttpMethod, - request_model: Type[T], - response_model: Type[R], - query_params: Optional[Dict[str, Any]] = None, - ): - """Initialize an API endpoint definition. - - Args: - path: The URL path for this endpoint, can include placeholders like {id} - method: The HTTP method to use (GET, POST, etc.) - request_model: Pydantic model class that defines the structure and validation rules for API requests to this endpoint - response_model: Pydantic model class that defines the structure and validation rules for API responses from this endpoint - query_params: Optional dictionary of query parameters to include in the request - """ - self.path = path - self.method = method - self.request_model = request_model - self.response_model = response_model - self.query_params = query_params or {} - - -class SynchronousOperation(Generic[T, R]): - """Represents a single synchronous API operation.""" - - def __init__( - self, - endpoint: ApiEndpoint[T, R], - request: T, - files: Optional[Dict[str, Any] | list[tuple[str, Any]]] = None, - api_base: str | None = None, - auth_token: Optional[str] = None, - comfy_api_key: Optional[str] = None, - auth_kwargs: Optional[Dict[str, str]] = None, - timeout: float = 7200.0, - verify_ssl: bool = True, - content_type: str = "application/json", - multipart_parser: Callable | None = None, - max_retries: int = 3, - retry_delay: float = 1.0, - retry_backoff_factor: float = 2.0, - ) -> None: - self.endpoint = endpoint - self.request = request - self.files = files - self.api_base: str = api_base or args.comfy_api_base - self.auth_token = auth_token - self.comfy_api_key = comfy_api_key - if auth_kwargs is not None: - self.auth_token = auth_kwargs.get("auth_token", self.auth_token) - self.comfy_api_key = auth_kwargs.get("comfy_api_key", self.comfy_api_key) - self.timeout = timeout - self.verify_ssl = verify_ssl - self.content_type = content_type - self.multipart_parser = multipart_parser - self.max_retries = max_retries - self.retry_delay = retry_delay - self.retry_backoff_factor = retry_backoff_factor - - async def execute(self, client: Optional[ApiClient] = None) -> R: - owns_client = client is None - if owns_client: - client = ApiClient( - base_url=self.api_base, - auth_token=self.auth_token, - comfy_api_key=self.comfy_api_key, - timeout=self.timeout, - verify_ssl=self.verify_ssl, - max_retries=self.max_retries, - retry_delay=self.retry_delay, - retry_backoff_factor=self.retry_backoff_factor, - ) - - try: - request_dict: Optional[Dict[str, Any]] - if isinstance(self.request, EmptyRequest): - request_dict = None - else: - request_dict = self.request.model_dump(exclude_none=True) - for k, v in list(request_dict.items()): - if isinstance(v, Enum): - request_dict[k] = v.value - - logging.debug( - f"[DEBUG] API Request: {self.endpoint.method.value} {self.endpoint.path}" - ) - logging.debug(f"[DEBUG] Request Data: {json.dumps(request_dict, indent=2)}") - logging.debug(f"[DEBUG] Query Params: {self.endpoint.query_params}") - - response_json = await client.request( - self.endpoint.method.value, - self.endpoint.path, - params=self.endpoint.query_params, - data=request_dict, - files=self.files, - content_type=self.content_type, - multipart_parser=self.multipart_parser, - ) - - logging.debug("=" * 50) - logging.debug("[DEBUG] RESPONSE DETAILS:") - logging.debug("[DEBUG] Status Code: 200 (Success)") - logging.debug(f"[DEBUG] Response Body: {json.dumps(response_json, indent=2)}") - logging.debug("=" * 50) - - parsed_response = self.endpoint.response_model.model_validate(response_json) - logging.debug(f"[DEBUG] Parsed Response: {parsed_response}") - return parsed_response - finally: - if owns_client: - await client.close() - - -class TaskStatus(str, Enum): - """Enum for task status values""" - - COMPLETED = "completed" - FAILED = "failed" - PENDING = "pending" - - -class PollingOperation(Generic[T, R]): - """Represents an asynchronous API operation that requires polling for completion.""" - - def __init__( - self, - poll_endpoint: ApiEndpoint[EmptyRequest, R], - completed_statuses: list[str], - failed_statuses: list[str], - status_extractor: Callable[[R], str], - progress_extractor: Callable[[R], float] | None = None, - result_url_extractor: Callable[[R], str] | None = None, - request: Optional[T] = None, - api_base: str | None = None, - auth_token: Optional[str] = None, - comfy_api_key: Optional[str] = None, - auth_kwargs: Optional[Dict[str, str]] = None, - poll_interval: float = 5.0, - max_poll_attempts: int = 120, # Default max polling attempts (10 minutes with 5s interval) - max_retries: int = 3, # Max retries per individual API call - retry_delay: float = 1.0, - retry_backoff_factor: float = 2.0, - estimated_duration: Optional[float] = None, - node_id: Optional[str] = None, - ) -> None: - self.poll_endpoint = poll_endpoint - self.request = request - self.api_base: str = api_base or args.comfy_api_base - self.auth_token = auth_token - self.comfy_api_key = comfy_api_key - if auth_kwargs is not None: - self.auth_token = auth_kwargs.get("auth_token", self.auth_token) - self.comfy_api_key = auth_kwargs.get("comfy_api_key", self.comfy_api_key) - self.poll_interval = poll_interval - self.max_poll_attempts = max_poll_attempts - self.max_retries = max_retries - self.retry_delay = retry_delay - self.retry_backoff_factor = retry_backoff_factor - self.estimated_duration = estimated_duration - self.status_extractor = status_extractor or (lambda x: getattr(x, "status", None)) - self.progress_extractor = progress_extractor - self.result_url_extractor = result_url_extractor - self.node_id = node_id - self.completed_statuses = completed_statuses - self.failed_statuses = failed_statuses - self.final_response: Optional[R] = None - - async def execute(self, client: Optional[ApiClient] = None) -> R: - owns_client = client is None - if owns_client: - client = ApiClient( - base_url=self.api_base, - auth_token=self.auth_token, - comfy_api_key=self.comfy_api_key, - max_retries=self.max_retries, - retry_delay=self.retry_delay, - retry_backoff_factor=self.retry_backoff_factor, - ) - try: - return await self._poll_until_complete(client) - finally: - if owns_client: - await client.close() - - def _display_text_on_node(self, text: str): - if not self.node_id: - return - PromptServer.instance.send_progress_text(text, self.node_id) - - def _display_time_progress_on_node(self, time_completed: int | float): - if not self.node_id: - return - if self.estimated_duration is not None: - remaining = max(0, int(self.estimated_duration) - time_completed) - message = f"Task in progress: {time_completed}s (~{remaining}s remaining)" - else: - message = f"Task in progress: {time_completed}s" - self._display_text_on_node(message) - - def _check_task_status(self, response: R) -> TaskStatus: - try: - status = self.status_extractor(response) - if status in self.completed_statuses: - return TaskStatus.COMPLETED - if status in self.failed_statuses: - return TaskStatus.FAILED - return TaskStatus.PENDING - except Exception as e: - logging.error("Error extracting status: %s", e) - return TaskStatus.PENDING - - async def _poll_until_complete(self, client: ApiClient) -> R: - """Poll until the task is complete""" - consecutive_errors = 0 - max_consecutive_errors = min(5, self.max_retries * 2) # Limit consecutive errors - - if self.progress_extractor: - progress = utils.ProgressBar(PROGRESS_BAR_MAX) - - status = TaskStatus.PENDING - for poll_count in range(1, self.max_poll_attempts + 1): - try: - logging.debug(f"[DEBUG] Polling attempt #{poll_count}") - - request_dict = ( - None if self.request is None else self.request.model_dump(exclude_none=True) - ) - - if poll_count == 1: - logging.debug( - f"[DEBUG] Poll Request: {self.poll_endpoint.method.value} {self.poll_endpoint.path}" - ) - logging.debug( - f"[DEBUG] Poll Request Data: {json.dumps(request_dict, indent=2) if request_dict else 'None'}" - ) - - # Query task status - resp = await client.request( - self.poll_endpoint.method.value, - self.poll_endpoint.path, - params=self.poll_endpoint.query_params, - data=request_dict, - ) - consecutive_errors = 0 # reset on success - response_obj: R = self.poll_endpoint.response_model.model_validate(resp) - - # Check if task is complete - status = self._check_task_status(response_obj) - logging.debug(f"[DEBUG] Task Status: {status}") - - # If progress extractor is provided, extract progress - if self.progress_extractor: - new_progress = self.progress_extractor(response_obj) - if new_progress is not None: - progress.update_absolute(new_progress, total=PROGRESS_BAR_MAX) - - if status == TaskStatus.COMPLETED: - message = "Task completed successfully" - if self.result_url_extractor: - result_url = self.result_url_extractor(response_obj) - if result_url: - message = f"Result URL: {result_url}" - logging.debug(f"[DEBUG] {message}") - self._display_text_on_node(message) - self.final_response = response_obj - if self.progress_extractor: - progress.update(100) - return self.final_response - if status == TaskStatus.FAILED: - message = f"Task failed: {json.dumps(resp)}" - logging.error(f"[DEBUG] {message}") - raise Exception(message) - logging.debug("[DEBUG] Task still pending, continuing to poll...") - # Task pending – wait - for i in range(int(self.poll_interval)): - self._display_time_progress_on_node((poll_count - 1) * self.poll_interval + i) - await asyncio.sleep(1) - - except (LocalNetworkError, ApiServerError, NetworkError) as e: - consecutive_errors += 1 - if consecutive_errors >= max_consecutive_errors: - raise Exception( - f"Polling aborted after {consecutive_errors} network errors: {str(e)}" - ) from e - logging.warning("Network error (%s/%s): %s", consecutive_errors, max_consecutive_errors, str(e)) - await asyncio.sleep(self.poll_interval) - except Exception as e: - # For other errors, increment count and potentially abort - consecutive_errors += 1 - if consecutive_errors >= max_consecutive_errors or status == TaskStatus.FAILED: - raise Exception( - f"Polling aborted after {consecutive_errors} consecutive errors: {str(e)}" - ) from e - - logging.error(f"[DEBUG] Polling error: {str(e)}") - logging.warning( - f"Error during polling (attempt {poll_count}/{self.max_poll_attempts}): {str(e)}. " - f"Will retry in {self.poll_interval} seconds." - ) - await asyncio.sleep(self.poll_interval) - - # If we've exhausted all polling attempts - raise Exception( - f"Polling timed out after {self.max_poll_attempts} attempts (" f"{self.max_poll_attempts * self.poll_interval} seconds). " - "The operation may still be running on the server but is taking longer than expected." - ) diff --git a/comfy_api_nodes/apis/gemini_api.py b/comfy_api_nodes/apis/gemini_api.py index 138bf035d..2bf28bf93 100644 --- a/comfy_api_nodes/apis/gemini_api.py +++ b/comfy_api_nodes/apis/gemini_api.py @@ -1,19 +1,22 @@ -from __future__ import annotations - -from typing import List, Optional +from typing import Optional from comfy_api_nodes.apis import GeminiGenerationConfig, GeminiContent, GeminiSafetySetting, GeminiSystemInstructionContent, GeminiTool, GeminiVideoMetadata from pydantic import BaseModel +class GeminiImageConfig(BaseModel): + aspectRatio: Optional[str] = None + + class GeminiImageGenerationConfig(GeminiGenerationConfig): - responseModalities: Optional[List[str]] = None + responseModalities: Optional[list[str]] = None + imageConfig: Optional[GeminiImageConfig] = None class GeminiImageGenerateContentRequest(BaseModel): - contents: List[GeminiContent] + contents: list[GeminiContent] generationConfig: Optional[GeminiImageGenerationConfig] = None - safetySettings: Optional[List[GeminiSafetySetting]] = None + safetySettings: Optional[list[GeminiSafetySetting]] = None systemInstruction: Optional[GeminiSystemInstructionContent] = None - tools: Optional[List[GeminiTool]] = None + tools: Optional[list[GeminiTool]] = None videoMetadata: Optional[GeminiVideoMetadata] = None diff --git a/comfy_api_nodes/apis/minimax_api.py b/comfy_api_nodes/apis/minimax_api.py new file mode 100644 index 000000000..d747e177a --- /dev/null +++ b/comfy_api_nodes/apis/minimax_api.py @@ -0,0 +1,120 @@ +from enum import Enum +from typing import Optional + +from pydantic import BaseModel, Field + + +class MinimaxBaseResponse(BaseModel): + status_code: int = Field( + ..., + description='Status code. 0 indicates success, other values indicate errors.', + ) + status_msg: str = Field( + ..., description='Specific error details or success message.' + ) + + +class File(BaseModel): + bytes: Optional[int] = Field(None, description='File size in bytes') + created_at: Optional[int] = Field( + None, description='Unix timestamp when the file was created, in seconds' + ) + download_url: Optional[str] = Field( + None, description='The URL to download the video' + ) + backup_download_url: Optional[str] = Field( + None, description='The backup URL to download the video' + ) + + file_id: Optional[int] = Field(None, description='Unique identifier for the file') + filename: Optional[str] = Field(None, description='The name of the file') + purpose: Optional[str] = Field(None, description='The purpose of using the file') + + +class MinimaxFileRetrieveResponse(BaseModel): + base_resp: MinimaxBaseResponse + file: File + + +class MiniMaxModel(str, Enum): + T2V_01_Director = 'T2V-01-Director' + I2V_01_Director = 'I2V-01-Director' + S2V_01 = 'S2V-01' + I2V_01 = 'I2V-01' + I2V_01_live = 'I2V-01-live' + T2V_01 = 'T2V-01' + Hailuo_02 = 'MiniMax-Hailuo-02' + + +class Status6(str, Enum): + Queueing = 'Queueing' + Preparing = 'Preparing' + Processing = 'Processing' + Success = 'Success' + Fail = 'Fail' + + +class MinimaxTaskResultResponse(BaseModel): + base_resp: MinimaxBaseResponse + file_id: Optional[str] = Field( + None, + description='After the task status changes to Success, this field returns the file ID corresponding to the generated video.', + ) + status: Status6 = Field( + ..., + description="Task status: 'Queueing' (in queue), 'Preparing' (task is preparing), 'Processing' (generating), 'Success' (task completed successfully), or 'Fail' (task failed).", + ) + task_id: str = Field(..., description='The task ID being queried.') + + +class SubjectReferenceItem(BaseModel): + image: Optional[str] = Field( + None, description='URL or base64 encoding of the subject reference image.' + ) + mask: Optional[str] = Field( + None, + description='URL or base64 encoding of the mask for the subject reference image.', + ) + + +class MinimaxVideoGenerationRequest(BaseModel): + callback_url: Optional[str] = Field( + None, + description='Optional. URL to receive real-time status updates about the video generation task.', + ) + first_frame_image: Optional[str] = Field( + None, + description='URL or base64 encoding of the first frame image. Required when model is I2V-01, I2V-01-Director, or I2V-01-live.', + ) + model: MiniMaxModel = Field( + ..., + description='Required. ID of model. Options: T2V-01-Director, I2V-01-Director, S2V-01, I2V-01, I2V-01-live, T2V-01', + ) + prompt: Optional[str] = Field( + None, + description='Description of the video. Should be less than 2000 characters. Supports camera movement instructions in [brackets].', + max_length=2000, + ) + prompt_optimizer: Optional[bool] = Field( + True, + description='If true (default), the model will automatically optimize the prompt. Set to false for more precise control.', + ) + subject_reference: Optional[list[SubjectReferenceItem]] = Field( + None, + description='Only available when model is S2V-01. The model will generate a video based on the subject uploaded through this parameter.', + ) + duration: Optional[int] = Field( + None, + description="The length of the output video in seconds." + ) + resolution: Optional[str] = Field( + None, + description="The dimensions of the video display. 1080p corresponds to 1920 x 1080 pixels, 768p corresponds to 1366 x 768 pixels." + ) + + +class MinimaxVideoGenerationResponse(BaseModel): + base_resp: MinimaxBaseResponse + task_id: str = Field( + ..., description='The task ID for the asynchronous video generation task.' + ) diff --git a/comfy_api_nodes/apis/pika_api.py b/comfy_api_nodes/apis/pika_api.py new file mode 100644 index 000000000..232558cd7 --- /dev/null +++ b/comfy_api_nodes/apis/pika_api.py @@ -0,0 +1,100 @@ +from typing import Optional +from enum import Enum +from pydantic import BaseModel, Field + + +class Pikaffect(str, Enum): + Cake_ify = "Cake-ify" + Crumble = "Crumble" + Crush = "Crush" + Decapitate = "Decapitate" + Deflate = "Deflate" + Dissolve = "Dissolve" + Explode = "Explode" + Eye_pop = "Eye-pop" + Inflate = "Inflate" + Levitate = "Levitate" + Melt = "Melt" + Peel = "Peel" + Poke = "Poke" + Squish = "Squish" + Ta_da = "Ta-da" + Tear = "Tear" + + +class PikaBodyGenerate22C2vGenerate22PikascenesPost(BaseModel): + aspectRatio: Optional[float] = Field(None, description='Aspect ratio (width / height)') + duration: Optional[int] = Field(5) + ingredientsMode: str = Field(...) + negativePrompt: Optional[str] = Field(None) + promptText: Optional[str] = Field(None) + resolution: Optional[str] = Field('1080p') + seed: Optional[int] = Field(None) + + +class PikaGenerateResponse(BaseModel): + video_id: str = Field(...) + + +class PikaBodyGenerate22I2vGenerate22I2vPost(BaseModel): + duration: Optional[int] = 5 + negativePrompt: Optional[str] = Field(None) + promptText: Optional[str] = Field(None) + resolution: Optional[str] = '1080p' + seed: Optional[int] = Field(None) + + +class PikaBodyGenerate22KeyframeGenerate22PikaframesPost(BaseModel): + duration: Optional[int] = Field(None, ge=5, le=10) + negativePrompt: Optional[str] = Field(None) + promptText: str = Field(...) + resolution: Optional[str] = '1080p' + seed: Optional[int] = Field(None) + + +class PikaBodyGenerate22T2vGenerate22T2vPost(BaseModel): + aspectRatio: Optional[float] = Field( + 1.7777777777777777, + description='Aspect ratio (width / height)', + ge=0.4, + le=2.5, + ) + duration: Optional[int] = 5 + negativePrompt: Optional[str] = Field(None) + promptText: str = Field(...) + resolution: Optional[str] = '1080p' + seed: Optional[int] = Field(None) + + +class PikaBodyGeneratePikadditionsGeneratePikadditionsPost(BaseModel): + negativePrompt: Optional[str] = Field(None) + promptText: Optional[str] = Field(None) + seed: Optional[int] = Field(None) + + +class PikaBodyGeneratePikaffectsGeneratePikaffectsPost(BaseModel): + negativePrompt: Optional[str] = Field(None) + pikaffect: Optional[str] = None + promptText: Optional[str] = Field(None) + seed: Optional[int] = Field(None) + + +class PikaBodyGeneratePikaswapsGeneratePikaswapsPost(BaseModel): + negativePrompt: Optional[str] = Field(None) + promptText: Optional[str] = Field(None) + seed: Optional[int] = Field(None) + modifyRegionRoi: Optional[str] = Field(None) + + +class PikaStatusEnum(str, Enum): + queued = "queued" + started = "started" + finished = "finished" + failed = "failed" + + +class PikaVideoResponse(BaseModel): + id: str = Field(...) + progress: Optional[int] = Field(None) + status: PikaStatusEnum + url: Optional[str] = Field(None) diff --git a/comfy_api_nodes/apis/rodin_api.py b/comfy_api_nodes/apis/rodin_api.py index 02cf42c29..fc26a6e73 100644 --- a/comfy_api_nodes/apis/rodin_api.py +++ b/comfy_api_nodes/apis/rodin_api.py @@ -52,7 +52,3 @@ class RodinResourceItem(BaseModel): class Rodin3DDownloadResponse(BaseModel): list: List[RodinResourceItem] = Field(..., description="Source List") - - - - diff --git a/comfy_api_nodes/apis/tripo_api.py b/comfy_api_nodes/apis/tripo_api.py index 9f43d4d09..713260e2a 100644 --- a/comfy_api_nodes/apis/tripo_api.py +++ b/comfy_api_nodes/apis/tripo_api.py @@ -1,13 +1,20 @@ from __future__ import annotations -from comfy_api_nodes.apis import ( - TripoModelVersion, - TripoTextureQuality, -) from enum import Enum from typing import Optional, List, Dict, Any, Union from pydantic import BaseModel, Field, RootModel +class TripoModelVersion(str, Enum): + v2_5_20250123 = 'v2.5-20250123' + v2_0_20240919 = 'v2.0-20240919' + v1_4_20240625 = 'v1.4-20240625' + + +class TripoTextureQuality(str, Enum): + standard = 'standard' + detailed = 'detailed' + + class TripoStyle(str, Enum): PERSON_TO_CARTOON = "person:person2cartoon" ANIMAL_VENOM = "animal:venom" diff --git a/comfy_api_nodes/apis/veo_api.py b/comfy_api_nodes/apis/veo_api.py new file mode 100644 index 000000000..a55137afb --- /dev/null +++ b/comfy_api_nodes/apis/veo_api.py @@ -0,0 +1,111 @@ +from typing import Optional, Union +from enum import Enum + +from pydantic import BaseModel, Field + + +class Image2(BaseModel): + bytesBase64Encoded: str + gcsUri: Optional[str] = None + mimeType: Optional[str] = None + + +class Image3(BaseModel): + bytesBase64Encoded: Optional[str] = None + gcsUri: str + mimeType: Optional[str] = None + + +class Instance1(BaseModel): + image: Optional[Union[Image2, Image3]] = Field( + None, description='Optional image to guide video generation' + ) + prompt: str = Field(..., description='Text description of the video') + + +class PersonGeneration1(str, Enum): + ALLOW = 'ALLOW' + BLOCK = 'BLOCK' + + +class Parameters1(BaseModel): + aspectRatio: Optional[str] = Field(None, examples=['16:9']) + durationSeconds: Optional[int] = None + enhancePrompt: Optional[bool] = None + generateAudio: Optional[bool] = Field( + None, + description='Generate audio for the video. Only supported by veo 3 models.', + ) + negativePrompt: Optional[str] = None + personGeneration: Optional[PersonGeneration1] = None + sampleCount: Optional[int] = None + seed: Optional[int] = None + storageUri: Optional[str] = Field( + None, description='Optional Cloud Storage URI to upload the video' + ) + + +class VeoGenVidRequest(BaseModel): + instances: Optional[list[Instance1]] = None + parameters: Optional[Parameters1] = None + + +class VeoGenVidResponse(BaseModel): + name: str = Field( + ..., + description='Operation resource name', + examples=[ + 'projects/PROJECT_ID/locations/us-central1/publishers/google/models/MODEL_ID/operations/a1b07c8e-7b5a-4aba-bb34-3e1ccb8afcc8' + ], + ) + + +class VeoGenVidPollRequest(BaseModel): + operationName: str = Field( + ..., + description='Full operation name (from predict response)', + examples=[ + 'projects/PROJECT_ID/locations/us-central1/publishers/google/models/MODEL_ID/operations/OPERATION_ID' + ], + ) + + +class Video(BaseModel): + bytesBase64Encoded: Optional[str] = Field( + None, description='Base64-encoded video content' + ) + gcsUri: Optional[str] = Field(None, description='Cloud Storage URI of the video') + mimeType: Optional[str] = Field(None, description='Video MIME type') + + +class Error1(BaseModel): + code: Optional[int] = Field(None, description='Error code') + message: Optional[str] = Field(None, description='Error message') + + +class Response1(BaseModel): + field_type: Optional[str] = Field( + None, + alias='@type', + examples=[ + 'type.googleapis.com/cloud.ai.large_models.vision.GenerateVideoResponse' + ], + ) + raiMediaFilteredCount: Optional[int] = Field( + None, description='Count of media filtered by responsible AI policies' + ) + raiMediaFilteredReasons: Optional[list[str]] = Field( + None, description='Reasons why media was filtered by responsible AI policies' + ) + videos: Optional[list[Video]] = None + + +class VeoGenVidPollResponse(BaseModel): + done: Optional[bool] = None + error: Optional[Error1] = Field( + None, description='Error details if operation failed' + ) + name: Optional[str] = None + response: Optional[Response1] = Field( + None, description='The actual prediction response if done is true' + ) diff --git a/comfy_api_nodes/nodes_bfl.py b/comfy_api_nodes/nodes_bfl.py index c09be8d5b..1740fb377 100644 --- a/comfy_api_nodes/nodes_bfl.py +++ b/comfy_api_nodes/nodes_bfl.py @@ -1,406 +1,272 @@ -import asyncio -import io from inspect import cleandoc -from typing import Union, Optional -from comfy.comfy_types.node_typing import IO, ComfyNodeABC +from typing import Optional + +import torch +from typing_extensions import override + +from comfy_api.latest import IO, ComfyExtension from comfy_api_nodes.apis.bfl_api import ( - BFLStatus, BFLFluxExpandImageRequest, BFLFluxFillImageRequest, - BFLFluxCannyImageRequest, - BFLFluxDepthImageRequest, - BFLFluxProGenerateRequest, BFLFluxKontextProGenerateRequest, - BFLFluxProUltraGenerateRequest, + BFLFluxProGenerateRequest, BFLFluxProGenerateResponse, + BFLFluxProUltraGenerateRequest, + BFLFluxStatusResponse, + BFLStatus, ) -from comfy_api_nodes.apis.client import ( +from comfy_api_nodes.util import ( ApiEndpoint, - HttpMethod, - SynchronousOperation, -) -from comfy_api_nodes.apinode_utils import ( - downscale_image_tensor, - validate_aspect_ratio, - process_image_response, + download_url_to_image_tensor, + poll_op, resize_mask_to_image, + sync_op, + tensor_to_base64_string, + validate_aspect_ratio_string, validate_string, ) -import numpy as np -from PIL import Image -import aiohttp -import torch -import base64 -import time -from server import PromptServer - def convert_mask_to_image(mask: torch.Tensor): """ Make mask have the expected amount of dims (4) and channels (3) to be recognized as an image. """ mask = mask.unsqueeze(-1) - mask = torch.cat([mask]*3, dim=-1) + mask = torch.cat([mask] * 3, dim=-1) return mask -async def handle_bfl_synchronous_operation( - operation: SynchronousOperation, - timeout_bfl_calls=360, - node_id: Union[str, None] = None, -): - response_api: BFLFluxProGenerateResponse = await operation.execute() - return await _poll_until_generated( - response_api.polling_url, timeout=timeout_bfl_calls, node_id=node_id - ) - - -async def _poll_until_generated( - polling_url: str, timeout=360, node_id: Union[str, None] = None -): - # used bfl-comfy-nodes to verify code implementation: - # https://github.com/black-forest-labs/bfl-comfy-nodes/tree/main - start_time = time.time() - retries_404 = 0 - max_retries_404 = 5 - retry_404_seconds = 2 - retry_202_seconds = 2 - retry_pending_seconds = 1 - - async with aiohttp.ClientSession() as session: - # NOTE: should True loop be replaced with checking if workflow has been interrupted? - while True: - if node_id: - time_elapsed = time.time() - start_time - PromptServer.instance.send_progress_text( - f"Generating ({time_elapsed:.0f}s)", node_id - ) - - async with session.get(polling_url) as response: - if response.status == 200: - result = await response.json() - if result["status"] == BFLStatus.ready: - img_url = result["result"]["sample"] - if node_id: - PromptServer.instance.send_progress_text( - f"Result URL: {img_url}", node_id - ) - async with session.get(img_url) as img_resp: - return process_image_response(await img_resp.content.read()) - elif result["status"] in [ - BFLStatus.request_moderated, - BFLStatus.content_moderated, - ]: - status = result["status"] - raise Exception( - f"BFL API did not return an image due to: {status}." - ) - elif result["status"] == BFLStatus.error: - raise Exception(f"BFL API encountered an error: {result}.") - elif result["status"] == BFLStatus.pending: - await asyncio.sleep(retry_pending_seconds) - continue - elif response.status == 404: - if retries_404 < max_retries_404: - retries_404 += 1 - await asyncio.sleep(retry_404_seconds) - continue - raise Exception( - f"BFL API could not find task after {max_retries_404} tries." - ) - elif response.status == 202: - await asyncio.sleep(retry_202_seconds) - elif time.time() - start_time > timeout: - raise Exception( - f"BFL API experienced a timeout; could not return request under {timeout} seconds." - ) - else: - raise Exception(f"BFL API encountered an error: {response.json()}") - -def convert_image_to_base64(image: torch.Tensor): - scaled_image = downscale_image_tensor(image, total_pixels=2048 * 2048) - # remove batch dimension if present - if len(scaled_image.shape) > 3: - scaled_image = scaled_image[0] - image_np = (scaled_image.numpy() * 255).astype(np.uint8) - img = Image.fromarray(image_np) - img_byte_arr = io.BytesIO() - img.save(img_byte_arr, format="PNG") - return base64.b64encode(img_byte_arr.getvalue()).decode() - - -class FluxProUltraImageNode(ComfyNodeABC): +class FluxProUltraImageNode(IO.ComfyNode): """ Generates images using Flux Pro 1.1 Ultra via api based on prompt and resolution. """ - MINIMUM_RATIO = 1 / 4 - MAXIMUM_RATIO = 4 / 1 - MINIMUM_RATIO_STR = "1:4" - MAXIMUM_RATIO_STR = "4:1" + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="FluxProUltraImageNode", + display_name="Flux 1.1 [pro] Ultra Image", + category="api node/image/BFL", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the image generation", + ), + IO.Boolean.Input( + "prompt_upsampling", + default=False, + tooltip="Whether to perform upsampling on the prompt. " + "If active, automatically modifies the prompt for more creative generation, " + "but results are nondeterministic (same seed will not produce exactly the same result).", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", + ), + IO.String.Input( + "aspect_ratio", + default="16:9", + tooltip="Aspect ratio of image; must be between 1:4 and 4:1.", + ), + IO.Boolean.Input( + "raw", + default=False, + tooltip="When True, generate less processed, more natural-looking images.", + ), + IO.Image.Input( + "image_prompt", + optional=True, + ), + IO.Float.Input( + "image_prompt_strength", + default=0.1, + min=0.0, + max=1.0, + step=0.01, + tooltip="Blend between the prompt and the image prompt.", + optional=True, + ), + ], + outputs=[IO.Image.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation", - }, - ), - "prompt_upsampling": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", - }, - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, - ), - "aspect_ratio": ( - IO.STRING, - { - "default": "16:9", - "tooltip": "Aspect ratio of image; must be between 1:4 and 4:1.", - }, - ), - "raw": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "When True, generate less processed, more natural-looking images.", - }, - ), - }, - "optional": { - "image_prompt": (IO.IMAGE,), - "image_prompt_strength": ( - IO.FLOAT, - { - "default": 0.1, - "min": 0.0, - "max": 1.0, - "step": 0.01, - "tooltip": "Blend between the prompt and the image prompt.", - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - @classmethod - def VALIDATE_INPUTS(cls, aspect_ratio: str): - try: - validate_aspect_ratio( - aspect_ratio, - minimum_ratio=cls.MINIMUM_RATIO, - maximum_ratio=cls.MAXIMUM_RATIO, - minimum_ratio_str=cls.MINIMUM_RATIO_STR, - maximum_ratio_str=cls.MAXIMUM_RATIO_STR, - ) - except Exception as e: - return str(e) + def validate_inputs(cls, aspect_ratio: str): + validate_aspect_ratio_string(aspect_ratio, (1, 4), (4, 1)) return True - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/BFL" - - async def api_call( - self, + @classmethod + async def execute( + cls, prompt: str, aspect_ratio: str, - prompt_upsampling=False, - raw=False, - seed=0, - image_prompt=None, - image_prompt_strength=0.1, - unique_id: Union[str, None] = None, - **kwargs, - ): + prompt_upsampling: bool = False, + raw: bool = False, + seed: int = 0, + image_prompt: Optional[torch.Tensor] = None, + image_prompt_strength: float = 0.1, + ) -> IO.NodeOutput: if image_prompt is None: validate_string(prompt, strip_whitespace=False) - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/bfl/flux-pro-1.1-ultra/generate", - method=HttpMethod.POST, - request_model=BFLFluxProUltraGenerateRequest, - response_model=BFLFluxProGenerateResponse, - ), - request=BFLFluxProUltraGenerateRequest( + initial_response = await sync_op( + cls, + ApiEndpoint(path="/proxy/bfl/flux-pro-1.1-ultra/generate", method="POST"), + response_model=BFLFluxProGenerateResponse, + data=BFLFluxProUltraGenerateRequest( prompt=prompt, prompt_upsampling=prompt_upsampling, seed=seed, - aspect_ratio=validate_aspect_ratio( - aspect_ratio, - minimum_ratio=self.MINIMUM_RATIO, - maximum_ratio=self.MAXIMUM_RATIO, - minimum_ratio_str=self.MINIMUM_RATIO_STR, - maximum_ratio_str=self.MAXIMUM_RATIO_STR, - ), + aspect_ratio=aspect_ratio, raw=raw, - image_prompt=( - image_prompt - if image_prompt is None - else convert_image_to_base64(image_prompt) - ), - image_prompt_strength=( - None if image_prompt is None else round(image_prompt_strength, 2) - ), + image_prompt=(image_prompt if image_prompt is None else tensor_to_base64_string(image_prompt)), + image_prompt_strength=(None if image_prompt is None else round(image_prompt_strength, 2)), ), - auth_kwargs=kwargs, ) - output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) - return (output_image,) + response = await poll_op( + cls, + ApiEndpoint(initial_response.polling_url), + response_model=BFLFluxStatusResponse, + status_extractor=lambda r: r.status, + progress_extractor=lambda r: r.progress, + completed_statuses=[BFLStatus.ready], + failed_statuses=[ + BFLStatus.request_moderated, + BFLStatus.content_moderated, + BFLStatus.error, + BFLStatus.task_not_found, + ], + queued_statuses=[], + ) + return IO.NodeOutput(await download_url_to_image_tensor(response.result["sample"])) -class FluxKontextProImageNode(ComfyNodeABC): +class FluxKontextProImageNode(IO.ComfyNode): """ Edits images using Flux.1 Kontext [pro] via api based on prompt and aspect ratio. """ - MINIMUM_RATIO = 1 / 4 - MAXIMUM_RATIO = 4 / 1 - MINIMUM_RATIO_STR = "1:4" - MAXIMUM_RATIO_STR = "4:1" - @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation - specify what and how to edit.", - }, + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id=cls.NODE_ID, + display_name=cls.DISPLAY_NAME, + category="api node/image/BFL", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the image generation - specify what and how to edit.", ), - "aspect_ratio": ( - IO.STRING, - { - "default": "16:9", - "tooltip": "Aspect ratio of image; must be between 1:4 and 4:1.", - }, + IO.String.Input( + "aspect_ratio", + default="16:9", + tooltip="Aspect ratio of image; must be between 1:4 and 4:1.", ), - "guidance": ( - IO.FLOAT, - { - "default": 3.0, - "min": 0.1, - "max": 99.0, - "step": 0.1, - "tooltip": "Guidance strength for the image generation process" - }, + IO.Float.Input( + "guidance", + default=3.0, + min=0.1, + max=99.0, + step=0.1, + tooltip="Guidance strength for the image generation process", ), - "steps": ( - IO.INT, - { - "default": 50, - "min": 1, - "max": 150, - "tooltip": "Number of steps for the image generation process" - }, + IO.Int.Input( + "steps", + default=50, + min=1, + max=150, + tooltip="Number of steps for the image generation process", ), - "seed": ( - IO.INT, - { - "default": 1234, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, + IO.Int.Input( + "seed", + default=1234, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", ), - "prompt_upsampling": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", - }, + IO.Boolean.Input( + "prompt_upsampling", + default=False, + tooltip="Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", ), - }, - "optional": { - "input_image": (IO.IMAGE,), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/BFL" + IO.Image.Input( + "input_image", + optional=True, + ), + ], + outputs=[IO.Image.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) BFL_PATH = "/proxy/bfl/flux-kontext-pro/generate" + NODE_ID = "FluxKontextProImageNode" + DISPLAY_NAME = "Flux.1 Kontext [pro] Image" - async def api_call( - self, + @classmethod + async def execute( + cls, prompt: str, aspect_ratio: str, guidance: float, steps: int, - input_image: Optional[torch.Tensor]=None, + input_image: Optional[torch.Tensor] = None, seed=0, prompt_upsampling=False, - unique_id: Union[str, None] = None, - **kwargs, - ): - aspect_ratio = validate_aspect_ratio( - aspect_ratio, - minimum_ratio=self.MINIMUM_RATIO, - maximum_ratio=self.MAXIMUM_RATIO, - minimum_ratio_str=self.MINIMUM_RATIO_STR, - maximum_ratio_str=self.MAXIMUM_RATIO_STR, - ) + ) -> IO.NodeOutput: + validate_aspect_ratio_string(aspect_ratio, (1, 4), (4, 1)) if input_image is None: validate_string(prompt, strip_whitespace=False) - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=self.BFL_PATH, - method=HttpMethod.POST, - request_model=BFLFluxKontextProGenerateRequest, - response_model=BFLFluxProGenerateResponse, - ), - request=BFLFluxKontextProGenerateRequest( + initial_response = await sync_op( + cls, + ApiEndpoint(path=cls.BFL_PATH, method="POST"), + response_model=BFLFluxProGenerateResponse, + data=BFLFluxKontextProGenerateRequest( prompt=prompt, prompt_upsampling=prompt_upsampling, guidance=round(guidance, 1), steps=steps, seed=seed, aspect_ratio=aspect_ratio, - input_image=( - input_image - if input_image is None - else convert_image_to_base64(input_image) - ) + input_image=(input_image if input_image is None else tensor_to_base64_string(input_image)), ), - auth_kwargs=kwargs, ) - output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) - return (output_image,) + response = await poll_op( + cls, + ApiEndpoint(initial_response.polling_url), + response_model=BFLFluxStatusResponse, + status_extractor=lambda r: r.status, + progress_extractor=lambda r: r.progress, + completed_statuses=[BFLStatus.ready], + failed_statuses=[ + BFLStatus.request_moderated, + BFLStatus.content_moderated, + BFLStatus.error, + BFLStatus.task_not_found, + ], + queued_statuses=[], + ) + return IO.NodeOutput(await download_url_to_image_tensor(response.result["sample"])) class FluxKontextMaxImageNode(FluxKontextProImageNode): @@ -410,63 +276,62 @@ class FluxKontextMaxImageNode(FluxKontextProImageNode): DESCRIPTION = cleandoc(__doc__ or "") BFL_PATH = "/proxy/bfl/flux-kontext-max/generate" + NODE_ID = "FluxKontextMaxImageNode" + DISPLAY_NAME = "Flux.1 Kontext [max] Image" -class FluxProImageNode(ComfyNodeABC): +class FluxProImageNode(IO.ComfyNode): """ Generates images synchronously based on prompt and resolution. """ @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation", - }, + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="FluxProImageNode", + display_name="Flux 1.1 [pro] Image", + category="api node/image/BFL", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the image generation", ), - "prompt_upsampling": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", - }, + IO.Boolean.Input( + "prompt_upsampling", + default=False, + tooltip="Whether to perform upsampling on the prompt. " + "If active, automatically modifies the prompt for more creative generation, " + "but results are nondeterministic (same seed will not produce exactly the same result).", ), - "width": ( - IO.INT, - { - "default": 1024, - "min": 256, - "max": 1440, - "step": 32, - }, + IO.Int.Input( + "width", + default=1024, + min=256, + max=1440, + step=32, ), - "height": ( - IO.INT, - { - "default": 768, - "min": 256, - "max": 1440, - "step": 32, - }, + IO.Int.Input( + "height", + default=768, + min=256, + max=1440, + step=32, ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, + IO.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", + ), + IO.Image.Input( + "image_prompt", + optional=True, ), - }, - "optional": { - "image_prompt": (IO.IMAGE,), # "image_prompt_strength": ( # IO.FLOAT, # { @@ -477,22 +342,19 @@ class FluxProImageNode(ComfyNodeABC): # "tooltip": "Blend between the prompt and the image prompt.", # }, # ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[IO.Image.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/BFL" - - async def api_call( - self, + @classmethod + async def execute( + cls, prompt: str, prompt_upsampling, width: int, @@ -500,23 +362,16 @@ class FluxProImageNode(ComfyNodeABC): seed=0, image_prompt=None, # image_prompt_strength=0.1, - unique_id: Union[str, None] = None, - **kwargs, - ): - image_prompt = ( - image_prompt - if image_prompt is None - else convert_image_to_base64(image_prompt) - ) - - operation = SynchronousOperation( - endpoint=ApiEndpoint( + ) -> IO.NodeOutput: + image_prompt = image_prompt if image_prompt is None else tensor_to_base64_string(image_prompt) + initial_response = await sync_op( + cls, + ApiEndpoint( path="/proxy/bfl/flux-pro-1.1/generate", - method=HttpMethod.POST, - request_model=BFLFluxProGenerateRequest, - response_model=BFLFluxProGenerateResponse, + method="POST", ), - request=BFLFluxProGenerateRequest( + response_model=BFLFluxProGenerateResponse, + data=BFLFluxProGenerateRequest( prompt=prompt, prompt_upsampling=prompt_upsampling, width=width, @@ -524,118 +379,115 @@ class FluxProImageNode(ComfyNodeABC): seed=seed, image_prompt=image_prompt, ), - auth_kwargs=kwargs, ) - output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) - return (output_image,) + response = await poll_op( + cls, + ApiEndpoint(initial_response.polling_url), + response_model=BFLFluxStatusResponse, + status_extractor=lambda r: r.status, + progress_extractor=lambda r: r.progress, + completed_statuses=[BFLStatus.ready], + failed_statuses=[ + BFLStatus.request_moderated, + BFLStatus.content_moderated, + BFLStatus.error, + BFLStatus.task_not_found, + ], + queued_statuses=[], + ) + return IO.NodeOutput(await download_url_to_image_tensor(response.result["sample"])) -class FluxProExpandNode(ComfyNodeABC): +class FluxProExpandNode(IO.ComfyNode): """ Outpaints image based on prompt. """ @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE,), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation", - }, + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="FluxProExpandNode", + display_name="Flux.1 Expand Image", + category="api node/image/BFL", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + IO.Image.Input("image"), + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the image generation", ), - "prompt_upsampling": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", - }, + IO.Boolean.Input( + "prompt_upsampling", + default=False, + tooltip="Whether to perform upsampling on the prompt. " + "If active, automatically modifies the prompt for more creative generation, " + "but results are nondeterministic (same seed will not produce exactly the same result).", ), - "top": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2048, - "tooltip": "Number of pixels to expand at the top of the image" - }, + IO.Int.Input( + "top", + default=0, + min=0, + max=2048, + tooltip="Number of pixels to expand at the top of the image", ), - "bottom": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2048, - "tooltip": "Number of pixels to expand at the bottom of the image" - }, + IO.Int.Input( + "bottom", + default=0, + min=0, + max=2048, + tooltip="Number of pixels to expand at the bottom of the image", ), - "left": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2048, - "tooltip": "Number of pixels to expand at the left side of the image" - }, + IO.Int.Input( + "left", + default=0, + min=0, + max=2048, + tooltip="Number of pixels to expand at the left of the image", ), - "right": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2048, - "tooltip": "Number of pixels to expand at the right side of the image" - }, + IO.Int.Input( + "right", + default=0, + min=0, + max=2048, + tooltip="Number of pixels to expand at the right of the image", ), - "guidance": ( - IO.FLOAT, - { - "default": 60, - "min": 1.5, - "max": 100, - "tooltip": "Guidance strength for the image generation process" - }, + IO.Float.Input( + "guidance", + default=60, + min=1.5, + max=100, + tooltip="Guidance strength for the image generation process", ), - "steps": ( - IO.INT, - { - "default": 50, - "min": 15, - "max": 50, - "tooltip": "Number of steps for the image generation process" - }, + IO.Int.Input( + "steps", + default=50, + min=15, + max=50, + tooltip="Number of steps for the image generation process", ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, + IO.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", ), - }, - "optional": {}, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[IO.Image.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/BFL" - - async def api_call( - self, + @classmethod + async def execute( + cls, image: torch.Tensor, prompt: str, prompt_upsampling: bool, @@ -646,19 +498,12 @@ class FluxProExpandNode(ComfyNodeABC): steps: int, guidance: float, seed=0, - unique_id: Union[str, None] = None, - **kwargs, - ): - image = convert_image_to_base64(image) - - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/bfl/flux-pro-1.0-expand/generate", - method=HttpMethod.POST, - request_model=BFLFluxExpandImageRequest, - response_model=BFLFluxProGenerateResponse, - ), - request=BFLFluxExpandImageRequest( + ) -> IO.NodeOutput: + initial_response = await sync_op( + cls, + ApiEndpoint(path="/proxy/bfl/flux-pro-1.0-expand/generate", method="POST"), + response_model=BFLFluxProGenerateResponse, + data=BFLFluxExpandImageRequest( prompt=prompt, prompt_upsampling=prompt_upsampling, top=top, @@ -668,86 +513,90 @@ class FluxProExpandNode(ComfyNodeABC): steps=steps, guidance=guidance, seed=seed, - image=image, + image=tensor_to_base64_string(image), ), - auth_kwargs=kwargs, ) - output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) - return (output_image,) + response = await poll_op( + cls, + ApiEndpoint(initial_response.polling_url), + response_model=BFLFluxStatusResponse, + status_extractor=lambda r: r.status, + progress_extractor=lambda r: r.progress, + completed_statuses=[BFLStatus.ready], + failed_statuses=[ + BFLStatus.request_moderated, + BFLStatus.content_moderated, + BFLStatus.error, + BFLStatus.task_not_found, + ], + queued_statuses=[], + ) + return IO.NodeOutput(await download_url_to_image_tensor(response.result["sample"])) - -class FluxProFillNode(ComfyNodeABC): +class FluxProFillNode(IO.ComfyNode): """ Inpaints image based on mask and prompt. """ @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE,), - "mask": (IO.MASK,), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation", - }, + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="FluxProFillNode", + display_name="Flux.1 Fill Image", + category="api node/image/BFL", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + IO.Image.Input("image"), + IO.Mask.Input("mask"), + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the image generation", ), - "prompt_upsampling": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", - }, + IO.Boolean.Input( + "prompt_upsampling", + default=False, + tooltip="Whether to perform upsampling on the prompt. " + "If active, automatically modifies the prompt for more creative generation, " + "but results are nondeterministic (same seed will not produce exactly the same result).", ), - "guidance": ( - IO.FLOAT, - { - "default": 60, - "min": 1.5, - "max": 100, - "tooltip": "Guidance strength for the image generation process" - }, + IO.Float.Input( + "guidance", + default=60, + min=1.5, + max=100, + tooltip="Guidance strength for the image generation process", ), - "steps": ( - IO.INT, - { - "default": 50, - "min": 15, - "max": 50, - "tooltip": "Number of steps for the image generation process" - }, + IO.Int.Input( + "steps", + default=50, + min=15, + max=50, + tooltip="Number of steps for the image generation process", ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, + IO.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="The random seed used for creating the noise.", ), - }, - "optional": {}, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[IO.Image.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/BFL" - - async def api_call( - self, + @classmethod + async def execute( + cls, image: torch.Tensor, mask: torch.Tensor, prompt: str, @@ -755,323 +604,54 @@ class FluxProFillNode(ComfyNodeABC): steps: int, guidance: float, seed=0, - unique_id: Union[str, None] = None, - **kwargs, - ): + ) -> IO.NodeOutput: # prepare mask mask = resize_mask_to_image(mask, image) - mask = convert_image_to_base64(convert_mask_to_image(mask)) - # make sure image will have alpha channel removed - image = convert_image_to_base64(image[:, :, :, :3]) - - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/bfl/flux-pro-1.0-fill/generate", - method=HttpMethod.POST, - request_model=BFLFluxFillImageRequest, - response_model=BFLFluxProGenerateResponse, - ), - request=BFLFluxFillImageRequest( + mask = tensor_to_base64_string(convert_mask_to_image(mask)) + initial_response = await sync_op( + cls, + ApiEndpoint(path="/proxy/bfl/flux-pro-1.0-fill/generate", method="POST"), + response_model=BFLFluxProGenerateResponse, + data=BFLFluxFillImageRequest( prompt=prompt, prompt_upsampling=prompt_upsampling, steps=steps, guidance=guidance, seed=seed, - image=image, + image=tensor_to_base64_string(image[:, :, :, :3]), # make sure image will have alpha channel removed mask=mask, ), - auth_kwargs=kwargs, ) - output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) - return (output_image,) - - -class FluxProCannyNode(ComfyNodeABC): - """ - Generate image using a control image (canny). - """ - - @classmethod - def INPUT_TYPES(s): - return { - "required": { - "control_image": (IO.IMAGE,), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation", - }, - ), - "prompt_upsampling": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", - }, - ), - "canny_low_threshold": ( - IO.FLOAT, - { - "default": 0.1, - "min": 0.01, - "max": 0.99, - "step": 0.01, - "tooltip": "Low threshold for Canny edge detection; ignored if skip_processing is True" - }, - ), - "canny_high_threshold": ( - IO.FLOAT, - { - "default": 0.4, - "min": 0.01, - "max": 0.99, - "step": 0.01, - "tooltip": "High threshold for Canny edge detection; ignored if skip_processing is True" - }, - ), - "skip_preprocessing": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Whether to skip preprocessing; set to True if control_image already is canny-fied, False if it is a raw image.", - }, - ), - "guidance": ( - IO.FLOAT, - { - "default": 30, - "min": 1, - "max": 100, - "tooltip": "Guidance strength for the image generation process" - }, - ), - "steps": ( - IO.INT, - { - "default": 50, - "min": 15, - "max": 50, - "tooltip": "Number of steps for the image generation process" - }, - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, - ), - }, - "optional": {}, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/BFL" - - async def api_call( - self, - control_image: torch.Tensor, - prompt: str, - prompt_upsampling: bool, - canny_low_threshold: float, - canny_high_threshold: float, - skip_preprocessing: bool, - steps: int, - guidance: float, - seed=0, - unique_id: Union[str, None] = None, - **kwargs, - ): - control_image = convert_image_to_base64(control_image[:, :, :, :3]) - preprocessed_image = None - - # scale canny threshold between 0-500, to match BFL's API - def scale_value(value: float, min_val=0, max_val=500): - return min_val + value * (max_val - min_val) - canny_low_threshold = int(round(scale_value(canny_low_threshold))) - canny_high_threshold = int(round(scale_value(canny_high_threshold))) - - - if skip_preprocessing: - preprocessed_image = control_image - control_image = None - canny_low_threshold = None - canny_high_threshold = None - - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/bfl/flux-pro-1.0-canny/generate", - method=HttpMethod.POST, - request_model=BFLFluxCannyImageRequest, - response_model=BFLFluxProGenerateResponse, - ), - request=BFLFluxCannyImageRequest( - prompt=prompt, - prompt_upsampling=prompt_upsampling, - steps=steps, - guidance=guidance, - seed=seed, - control_image=control_image, - canny_low_threshold=canny_low_threshold, - canny_high_threshold=canny_high_threshold, - preprocessed_image=preprocessed_image, - ), - auth_kwargs=kwargs, + response = await poll_op( + cls, + ApiEndpoint(initial_response.polling_url), + response_model=BFLFluxStatusResponse, + status_extractor=lambda r: r.status, + progress_extractor=lambda r: r.progress, + completed_statuses=[BFLStatus.ready], + failed_statuses=[ + BFLStatus.request_moderated, + BFLStatus.content_moderated, + BFLStatus.error, + BFLStatus.task_not_found, + ], + queued_statuses=[], ) - output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) - return (output_image,) + return IO.NodeOutput(await download_url_to_image_tensor(response.result["sample"])) -class FluxProDepthNode(ComfyNodeABC): - """ - Generate image using a control image (depth). - """ - - @classmethod - def INPUT_TYPES(s): - return { - "required": { - "control_image": (IO.IMAGE,), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation", - }, - ), - "prompt_upsampling": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Whether to perform upsampling on the prompt. If active, automatically modifies the prompt for more creative generation, but results are nondeterministic (same seed will not produce exactly the same result).", - }, - ), - "skip_preprocessing": ( - IO.BOOLEAN, - { - "default": False, - "tooltip": "Whether to skip preprocessing; set to True if control_image already is depth-ified, False if it is a raw image.", - }, - ), - "guidance": ( - IO.FLOAT, - { - "default": 15, - "min": 1, - "max": 100, - "tooltip": "Guidance strength for the image generation process" - }, - ), - "steps": ( - IO.INT, - { - "default": 50, - "min": 15, - "max": 50, - "tooltip": "Number of steps for the image generation process" - }, - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "The random seed used for creating the noise.", - }, - ), - }, - "optional": {}, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/BFL" - - async def api_call( - self, - control_image: torch.Tensor, - prompt: str, - prompt_upsampling: bool, - skip_preprocessing: bool, - steps: int, - guidance: float, - seed=0, - unique_id: Union[str, None] = None, - **kwargs, - ): - control_image = convert_image_to_base64(control_image[:,:,:,:3]) - preprocessed_image = None - - if skip_preprocessing: - preprocessed_image = control_image - control_image = None - - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/bfl/flux-pro-1.0-depth/generate", - method=HttpMethod.POST, - request_model=BFLFluxDepthImageRequest, - response_model=BFLFluxProGenerateResponse, - ), - request=BFLFluxDepthImageRequest( - prompt=prompt, - prompt_upsampling=prompt_upsampling, - steps=steps, - guidance=guidance, - seed=seed, - control_image=control_image, - preprocessed_image=preprocessed_image, - ), - auth_kwargs=kwargs, - ) - output_image = await handle_bfl_synchronous_operation(operation, node_id=unique_id) - return (output_image,) +class BFLExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + FluxProUltraImageNode, + # FluxProImageNode, + FluxKontextProImageNode, + FluxKontextMaxImageNode, + FluxProExpandNode, + FluxProFillNode, + ] -# A dictionary that contains all nodes you want to export with their names -# NOTE: names should be globally unique -NODE_CLASS_MAPPINGS = { - "FluxProUltraImageNode": FluxProUltraImageNode, - # "FluxProImageNode": FluxProImageNode, - "FluxKontextProImageNode": FluxKontextProImageNode, - "FluxKontextMaxImageNode": FluxKontextMaxImageNode, - "FluxProExpandNode": FluxProExpandNode, - "FluxProFillNode": FluxProFillNode, - "FluxProCannyNode": FluxProCannyNode, - "FluxProDepthNode": FluxProDepthNode, -} - -# A dictionary that contains the friendly/humanly readable titles for the nodes -NODE_DISPLAY_NAME_MAPPINGS = { - "FluxProUltraImageNode": "Flux 1.1 [pro] Ultra Image", - # "FluxProImageNode": "Flux 1.1 [pro] Image", - "FluxKontextProImageNode": "Flux.1 Kontext [pro] Image", - "FluxKontextMaxImageNode": "Flux.1 Kontext [max] Image", - "FluxProExpandNode": "Flux.1 Expand Image", - "FluxProFillNode": "Flux.1 Fill Image", - "FluxProCannyNode": "Flux.1 Canny Control Image", - "FluxProDepthNode": "Flux.1 Depth Control Image", -} +async def comfy_entrypoint() -> BFLExtension: + return BFLExtension() diff --git a/comfy_api_nodes/nodes_bytedance.py b/comfy_api_nodes/nodes_bytedance.py index a7eeaf15a..caced471e 100644 --- a/comfy_api_nodes/nodes_bytedance.py +++ b/comfy_api_nodes/nodes_bytedance.py @@ -1,35 +1,27 @@ import logging import math from enum import Enum -from typing import Literal, Optional, Type, Union -from typing_extensions import override +from typing import Literal, Optional, Union import torch from pydantic import BaseModel, Field +from typing_extensions import override -from comfy_api.latest import ComfyExtension, io as comfy_io -from comfy_api_nodes.util.validation_utils import ( - validate_image_aspect_ratio_range, - get_number_of_images, - validate_image_dimensions, -) -from comfy_api_nodes.apis.client import ( +from comfy_api.latest import IO, ComfyExtension +from comfy_api_nodes.util import ( ApiEndpoint, - EmptyRequest, - HttpMethod, - SynchronousOperation, - PollingOperation, - T, -) -from comfy_api_nodes.apinode_utils import ( download_url_to_image_tensor, download_url_to_video_output, - upload_images_to_comfyapi, - validate_string, + get_number_of_images, image_tensor_pair_to_batch, + poll_op, + sync_op, + upload_images_to_comfyapi, + validate_image_aspect_ratio, + validate_image_dimensions, + validate_string, ) - BYTEPLUS_IMAGE_ENDPOINT = "/proxy/byteplus/api/v3/images/generations" # Long-running tasks endpoints(e.g., video) @@ -46,13 +38,14 @@ class Image2ImageModelName(str, Enum): class Text2VideoModelName(str, Enum): - seedance_1_pro = "seedance-1-0-pro-250528" + seedance_1_pro = "seedance-1-0-pro-250528" seedance_1_lite = "seedance-1-0-lite-t2v-250428" class Image2VideoModelName(str, Enum): """note(August 31): Pro model only supports FirstFrame: https://docs.byteplus.com/en/docs/ModelArk/1520757""" - seedance_1_pro = "seedance-1-0-pro-250528" + + seedance_1_pro = "seedance-1-0-pro-250528" seedance_1_lite = "seedance-1-0-lite-i2v-250428" @@ -208,62 +201,33 @@ def get_video_url_from_task_status(response: TaskStatusResponse) -> Union[str, N return None -async def poll_until_finished( - auth_kwargs: dict[str, str], - task_id: str, - estimated_duration: Optional[int] = None, - node_id: Optional[str] = None, -) -> TaskStatusResponse: - """Polls the ByteDance API endpoint until the task reaches a terminal state, then returns the response.""" - return await PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"{BYTEPLUS_TASK_STATUS_ENDPOINT}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=TaskStatusResponse, - ), - completed_statuses=[ - "succeeded", - ], - failed_statuses=[ - "cancelled", - "failed", - ], - status_extractor=lambda response: response.status, - auth_kwargs=auth_kwargs, - result_url_extractor=get_video_url_from_task_status, - estimated_duration=estimated_duration, - node_id=node_id, - ).execute() - - -class ByteDanceImageNode(comfy_io.ComfyNode): +class ByteDanceImageNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="ByteDanceImageNode", display_name="ByteDance Image", category="api node/image/ByteDance", description="Generate images using ByteDance models via api based on prompt", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", - options=[model.value for model in Text2ImageModelName], - default=Text2ImageModelName.seedream_3.value, + options=Text2ImageModelName, + default=Text2ImageModelName.seedream_3, tooltip="Model name", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, tooltip="The text prompt used to generate the image", ), - comfy_io.Combo.Input( + IO.Combo.Input( "size_preset", options=[label for label, _, _ in RECOMMENDED_PRESETS], tooltip="Pick a recommended size. Select Custom to use the width and height below", ), - comfy_io.Int.Input( + IO.Int.Input( "width", default=1024, min=512, @@ -271,7 +235,7 @@ class ByteDanceImageNode(comfy_io.ComfyNode): step=64, tooltip="Custom width for image. Value is working only if `size_preset` is set to `Custom`", ), - comfy_io.Int.Input( + IO.Int.Input( "height", default=1024, min=512, @@ -279,41 +243,41 @@ class ByteDanceImageNode(comfy_io.ComfyNode): step=64, tooltip="Custom height for image. Value is working only if `size_preset` is set to `Custom`", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed to use for generation", optional=True, ), - comfy_io.Float.Input( + IO.Float.Input( "guidance_scale", default=2.5, min=1.0, max=10.0, step=0.01, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Higher value makes the image follow the prompt more closely", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "watermark", default=True, - tooltip="Whether to add an \"AI generated\" watermark to the image", + tooltip='Whether to add an "AI generated" watermark to the image', optional=True, ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -329,7 +293,7 @@ class ByteDanceImageNode(comfy_io.ComfyNode): seed: int, guidance_scale: float, watermark: bool, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) w = h = None for label, tw, th in RECOMMENDED_PRESETS: @@ -341,8 +305,7 @@ class ByteDanceImageNode(comfy_io.ComfyNode): w, h = width, height if not (512 <= w <= 2048) or not (512 <= h <= 2048): raise ValueError( - f"Custom size out of range: {w}x{h}. " - "Both width and height must be between 512 and 2048 pixels." + f"Custom size out of range: {w}x{h}. " "Both width and height must be between 512 and 2048 pixels." ) payload = Text2ImageTaskCreationRequest( @@ -353,84 +316,76 @@ class ByteDanceImageNode(comfy_io.ComfyNode): guidance_scale=guidance_scale, watermark=watermark, ) - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path=BYTEPLUS_IMAGE_ENDPOINT, - method=HttpMethod.POST, - request_model=Text2ImageTaskCreationRequest, - response_model=ImageTaskCreationResponse, - ), - request=payload, - auth_kwargs=auth_kwargs, - ).execute() - return comfy_io.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) + response = await sync_op( + cls, + ApiEndpoint(path=BYTEPLUS_IMAGE_ENDPOINT, method="POST"), + data=payload, + response_model=ImageTaskCreationResponse, + ) + return IO.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) -class ByteDanceImageEditNode(comfy_io.ComfyNode): +class ByteDanceImageEditNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="ByteDanceImageEditNode", display_name="ByteDance Image Edit", category="api node/image/ByteDance", description="Edit images using ByteDance models via api based on prompt", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", - options=[model.value for model in Image2ImageModelName], - default=Image2ImageModelName.seededit_3.value, + options=Image2ImageModelName, + default=Image2ImageModelName.seededit_3, tooltip="Model name", ), - comfy_io.Image.Input( + IO.Image.Input( "image", tooltip="The base image to edit", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Instruction to edit image", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed to use for generation", optional=True, ), - comfy_io.Float.Input( + IO.Float.Input( "guidance_scale", default=5.5, min=1.0, max=10.0, step=0.01, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Higher value makes the image follow the prompt more closely", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "watermark", default=True, - tooltip="Whether to add an \"AI generated\" watermark to the image", + tooltip='Whether to add an "AI generated" watermark to the image', optional=True, ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -444,21 +399,12 @@ class ByteDanceImageEditNode(comfy_io.ComfyNode): seed: int, guidance_scale: float, watermark: bool, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) if get_number_of_images(image) != 1: raise ValueError("Exactly one input image is required.") - validate_image_aspect_ratio_range(image, (1, 3), (3, 1)) - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - source_url = (await upload_images_to_comfyapi( - image, - max_images=1, - mime_type="image/png", - auth_kwargs=auth_kwargs, - ))[0] + validate_image_aspect_ratio(image, (1, 3), (3, 1)) + source_url = (await upload_images_to_comfyapi(cls, image, max_images=1, mime_type="image/png"))[0] payload = Image2ImageTaskCreationRequest( model=model, prompt=prompt, @@ -467,52 +413,48 @@ class ByteDanceImageEditNode(comfy_io.ComfyNode): guidance_scale=guidance_scale, watermark=watermark, ) - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path=BYTEPLUS_IMAGE_ENDPOINT, - method=HttpMethod.POST, - request_model=Image2ImageTaskCreationRequest, - response_model=ImageTaskCreationResponse, - ), - request=payload, - auth_kwargs=auth_kwargs, - ).execute() - return comfy_io.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) + response = await sync_op( + cls, + ApiEndpoint(path=BYTEPLUS_IMAGE_ENDPOINT, method="POST"), + data=payload, + response_model=ImageTaskCreationResponse, + ) + return IO.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) -class ByteDanceSeedreamNode(comfy_io.ComfyNode): +class ByteDanceSeedreamNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="ByteDanceSeedreamNode", display_name="ByteDance Seedream 4", category="api node/image/ByteDance", description="Unified text-to-image generation and precise single-sentence editing at up to 4K resolution.", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["seedream-4-0-250828"], tooltip="Model name", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Text prompt for creating or editing an image.", ), - comfy_io.Image.Input( + IO.Image.Input( "image", tooltip="Input image(s) for image-to-image generation. " - "List of 1-10 images for single or multi-reference generation.", + "List of 1-10 images for single or multi-reference generation.", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "size_preset", options=[label for label, _, _ in RECOMMENDED_PRESETS_SEEDREAM_4], tooltip="Pick a recommended size. Select Custom to use the width and height below.", ), - comfy_io.Int.Input( + IO.Int.Input( "width", default=2048, min=1024, @@ -521,7 +463,7 @@ class ByteDanceSeedreamNode(comfy_io.ComfyNode): tooltip="Custom width for image. Value is working only if `size_preset` is set to `Custom`", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "height", default=2048, min=1024, @@ -530,44 +472,44 @@ class ByteDanceSeedreamNode(comfy_io.ComfyNode): tooltip="Custom height for image. Value is working only if `size_preset` is set to `Custom`", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "sequential_image_generation", options=["disabled", "auto"], tooltip="Group image generation mode. " - "'disabled' generates a single image. " - "'auto' lets the model decide whether to generate multiple related images " - "(e.g., story scenes, character variations).", + "'disabled' generates a single image. " + "'auto' lets the model decide whether to generate multiple related images " + "(e.g., story scenes, character variations).", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "max_images", default=1, min=1, max=15, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Maximum number of images to generate when sequential_image_generation='auto'. " - "Total images (input + generated) cannot exceed 15.", + "Total images (input + generated) cannot exceed 15.", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed to use for generation.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "watermark", default=True, - tooltip="Whether to add an \"AI generated\" watermark to the image.", + tooltip='Whether to add an "AI generated" watermark to the image.', optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "fail_on_partial", default=True, tooltip="If enabled, abort execution if any requested images are missing or return an error.", @@ -575,12 +517,12 @@ class ByteDanceSeedreamNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -599,7 +541,7 @@ class ByteDanceSeedreamNode(comfy_io.ComfyNode): seed: int = 0, watermark: bool = True, fail_on_partial: bool = True, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) w = h = None for label, tw, th in RECOMMENDED_PRESETS_SEEDREAM_4: @@ -611,8 +553,7 @@ class ByteDanceSeedreamNode(comfy_io.ComfyNode): w, h = width, height if not (1024 <= w <= 4096) or not (1024 <= h <= 4096): raise ValueError( - f"Custom size out of range: {w}x{h}. " - "Both width and height must be between 1024 and 4096 pixels." + f"Custom size out of range: {w}x{h}. " "Both width and height must be between 1024 and 4096 pixels." ) n_input_images = get_number_of_images(image) if image is not None else 0 if n_input_images > 10: @@ -621,121 +562,111 @@ class ByteDanceSeedreamNode(comfy_io.ComfyNode): raise ValueError( "The maximum number of generated images plus the number of reference images cannot exceed 15." ) - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } reference_images_urls = [] if n_input_images: for i in image: - validate_image_aspect_ratio_range(i, (1, 3), (3, 1)) - reference_images_urls = (await upload_images_to_comfyapi( + validate_image_aspect_ratio(i, (1, 3), (3, 1)) + reference_images_urls = await upload_images_to_comfyapi( + cls, image, max_images=n_input_images, mime_type="image/png", - auth_kwargs=auth_kwargs, - )) - payload = Seedream4TaskCreationRequest( - model=model, - prompt=prompt, - image=reference_images_urls, - size=f"{w}x{h}", - seed=seed, - sequential_image_generation=sequential_image_generation, - sequential_image_generation_options=Seedream4Options(max_images=max_images), - watermark=watermark, - ) - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path=BYTEPLUS_IMAGE_ENDPOINT, - method=HttpMethod.POST, - request_model=Seedream4TaskCreationRequest, - response_model=ImageTaskCreationResponse, + ) + response = await sync_op( + cls, + ApiEndpoint(path=BYTEPLUS_IMAGE_ENDPOINT, method="POST"), + response_model=ImageTaskCreationResponse, + data=Seedream4TaskCreationRequest( + model=model, + prompt=prompt, + image=reference_images_urls, + size=f"{w}x{h}", + seed=seed, + sequential_image_generation=sequential_image_generation, + sequential_image_generation_options=Seedream4Options(max_images=max_images), + watermark=watermark, ), - request=payload, - auth_kwargs=auth_kwargs, - ).execute() - + ) if len(response.data) == 1: - return comfy_io.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) + return IO.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response))) urls = [str(d["url"]) for d in response.data if isinstance(d, dict) and "url" in d] if fail_on_partial and len(urls) < len(response.data): raise RuntimeError(f"Only {len(urls)} of {len(response.data)} images were generated before error.") - return comfy_io.NodeOutput(torch.cat([await download_url_to_image_tensor(i) for i in urls])) + return IO.NodeOutput(torch.cat([await download_url_to_image_tensor(i) for i in urls])) -class ByteDanceTextToVideoNode(comfy_io.ComfyNode): +class ByteDanceTextToVideoNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="ByteDanceTextToVideoNode", display_name="ByteDance Text to Video", category="api node/video/ByteDance", description="Generate video using ByteDance models via api based on prompt", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", - options=[model.value for model in Text2VideoModelName], - default=Text2VideoModelName.seedance_1_pro.value, + options=Text2VideoModelName, + default=Text2VideoModelName.seedance_1_pro, tooltip="Model name", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, tooltip="The text prompt used to generate the video.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=["480p", "720p", "1080p"], tooltip="The resolution of the output video.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=["16:9", "4:3", "1:1", "3:4", "9:16", "21:9"], tooltip="The aspect ratio of the output video.", ), - comfy_io.Int.Input( + IO.Int.Input( "duration", default=5, min=3, max=12, step=1, tooltip="The duration of the output video in seconds.", - display_mode=comfy_io.NumberDisplay.slider, + display_mode=IO.NumberDisplay.slider, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed to use for generation.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "camera_fixed", default=False, tooltip="Specifies whether to fix the camera. The platform appends an instruction " - "to fix the camera to your prompt, but does not guarantee the actual effect.", + "to fix the camera to your prompt, but does not guarantee the actual effect.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "watermark", default=True, - tooltip="Whether to add an \"AI generated\" watermark to the video.", + tooltip='Whether to add an "AI generated" watermark to the video.', optional=True, ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -751,7 +682,7 @@ class ByteDanceTextToVideoNode(comfy_io.ComfyNode): seed: int, camera_fixed: bool, watermark: bool, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) raise_if_text_params(prompt, ["resolution", "ratio", "duration", "seed", "camerafixed", "watermark"]) @@ -764,99 +695,89 @@ class ByteDanceTextToVideoNode(comfy_io.ComfyNode): f"--camerafixed {str(camera_fixed).lower()} " f"--watermark {str(watermark).lower()}" ) - - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } return await process_video_task( - request_model=Text2VideoTaskCreationRequest, - payload=Text2VideoTaskCreationRequest( - model=model, - content=[TaskTextContent(text=prompt)], - ), - auth_kwargs=auth_kwargs, - node_id=cls.hidden.unique_id, + cls, + payload=Text2VideoTaskCreationRequest(model=model, content=[TaskTextContent(text=prompt)]), estimated_duration=max(1, math.ceil(VIDEO_TASKS_EXECUTION_TIME[model][resolution] * (duration / 10.0))), ) -class ByteDanceImageToVideoNode(comfy_io.ComfyNode): +class ByteDanceImageToVideoNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="ByteDanceImageToVideoNode", display_name="ByteDance Image to Video", category="api node/video/ByteDance", description="Generate video using ByteDance models via api based on image and prompt", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", - options=[model.value for model in Image2VideoModelName], - default=Image2VideoModelName.seedance_1_pro.value, + options=Image2VideoModelName, + default=Image2VideoModelName.seedance_1_pro, tooltip="Model name", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, tooltip="The text prompt used to generate the video.", ), - comfy_io.Image.Input( + IO.Image.Input( "image", tooltip="First frame to be used for the video.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=["480p", "720p", "1080p"], tooltip="The resolution of the output video.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=["adaptive", "16:9", "4:3", "1:1", "3:4", "9:16", "21:9"], tooltip="The aspect ratio of the output video.", ), - comfy_io.Int.Input( + IO.Int.Input( "duration", default=5, min=3, max=12, step=1, tooltip="The duration of the output video in seconds.", - display_mode=comfy_io.NumberDisplay.slider, + display_mode=IO.NumberDisplay.slider, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed to use for generation.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "camera_fixed", default=False, tooltip="Specifies whether to fix the camera. The platform appends an instruction " - "to fix the camera to your prompt, but does not guarantee the actual effect.", + "to fix the camera to your prompt, but does not guarantee the actual effect.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "watermark", default=True, - tooltip="Whether to add an \"AI generated\" watermark to the video.", + tooltip='Whether to add an "AI generated" watermark to the video.', optional=True, ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -873,19 +794,13 @@ class ByteDanceImageToVideoNode(comfy_io.ComfyNode): seed: int, camera_fixed: bool, watermark: bool, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) raise_if_text_params(prompt, ["resolution", "ratio", "duration", "seed", "camerafixed", "watermark"]) validate_image_dimensions(image, min_width=300, min_height=300, max_width=6000, max_height=6000) - validate_image_aspect_ratio_range(image, (2, 5), (5, 2), strict=False) # 0.4 to 2.5 - - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - - image_url = (await upload_images_to_comfyapi(image, max_images=1, auth_kwargs=auth_kwargs))[0] + validate_image_aspect_ratio(image, (2, 5), (5, 2), strict=False) # 0.4 to 2.5 + image_url = (await upload_images_to_comfyapi(cls, image, max_images=1))[0] prompt = ( f"{prompt} " f"--resolution {resolution} " @@ -897,97 +812,95 @@ class ByteDanceImageToVideoNode(comfy_io.ComfyNode): ) return await process_video_task( - request_model=Image2VideoTaskCreationRequest, + cls, payload=Image2VideoTaskCreationRequest( model=model, content=[TaskTextContent(text=prompt), TaskImageContent(image_url=TaskImageContentUrl(url=image_url))], ), - auth_kwargs=auth_kwargs, - node_id=cls.hidden.unique_id, estimated_duration=max(1, math.ceil(VIDEO_TASKS_EXECUTION_TIME[model][resolution] * (duration / 10.0))), ) -class ByteDanceFirstLastFrameNode(comfy_io.ComfyNode): +class ByteDanceFirstLastFrameNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="ByteDanceFirstLastFrameNode", display_name="ByteDance First-Last-Frame to Video", category="api node/video/ByteDance", description="Generate video using prompt and first and last frames.", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", - options=[Image2VideoModelName.seedance_1_lite.value], + options=[model.value for model in Image2VideoModelName], default=Image2VideoModelName.seedance_1_lite.value, tooltip="Model name", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, tooltip="The text prompt used to generate the video.", ), - comfy_io.Image.Input( + IO.Image.Input( "first_frame", tooltip="First frame to be used for the video.", ), - comfy_io.Image.Input( + IO.Image.Input( "last_frame", tooltip="Last frame to be used for the video.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=["480p", "720p", "1080p"], tooltip="The resolution of the output video.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=["adaptive", "16:9", "4:3", "1:1", "3:4", "9:16", "21:9"], tooltip="The aspect ratio of the output video.", ), - comfy_io.Int.Input( + IO.Int.Input( "duration", default=5, min=3, max=12, step=1, tooltip="The duration of the output video in seconds.", - display_mode=comfy_io.NumberDisplay.slider, + display_mode=IO.NumberDisplay.slider, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed to use for generation.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "camera_fixed", default=False, tooltip="Specifies whether to fix the camera. The platform appends an instruction " - "to fix the camera to your prompt, but does not guarantee the actual effect.", + "to fix the camera to your prompt, but does not guarantee the actual effect.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "watermark", default=True, - tooltip="Whether to add an \"AI generated\" watermark to the video.", + tooltip='Whether to add an "AI generated" watermark to the video.', optional=True, ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -1005,23 +918,18 @@ class ByteDanceFirstLastFrameNode(comfy_io.ComfyNode): seed: int, camera_fixed: bool, watermark: bool, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) raise_if_text_params(prompt, ["resolution", "ratio", "duration", "seed", "camerafixed", "watermark"]) for i in (first_frame, last_frame): validate_image_dimensions(i, min_width=300, min_height=300, max_width=6000, max_height=6000) - validate_image_aspect_ratio_range(i, (2, 5), (5, 2), strict=False) # 0.4 to 2.5 - - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } + validate_image_aspect_ratio(i, (2, 5), (5, 2), strict=False) # 0.4 to 2.5 download_urls = await upload_images_to_comfyapi( + cls, image_tensor_pair_to_batch(first_frame, last_frame), max_images=2, mime_type="image/png", - auth_kwargs=auth_kwargs, ) prompt = ( @@ -1035,7 +943,7 @@ class ByteDanceFirstLastFrameNode(comfy_io.ComfyNode): ) return await process_video_task( - request_model=Image2VideoTaskCreationRequest, + cls, payload=Image2VideoTaskCreationRequest( model=model, content=[ @@ -1044,81 +952,79 @@ class ByteDanceFirstLastFrameNode(comfy_io.ComfyNode): TaskImageContent(image_url=TaskImageContentUrl(url=str(download_urls[1])), role="last_frame"), ], ), - auth_kwargs=auth_kwargs, - node_id=cls.hidden.unique_id, estimated_duration=max(1, math.ceil(VIDEO_TASKS_EXECUTION_TIME[model][resolution] * (duration / 10.0))), ) -class ByteDanceImageReferenceNode(comfy_io.ComfyNode): +class ByteDanceImageReferenceNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="ByteDanceImageReferenceNode", display_name="ByteDance Reference Images to Video", category="api node/video/ByteDance", description="Generate video using prompt and reference images.", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=[Image2VideoModelName.seedance_1_lite.value], default=Image2VideoModelName.seedance_1_lite.value, tooltip="Model name", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, tooltip="The text prompt used to generate the video.", ), - comfy_io.Image.Input( + IO.Image.Input( "images", tooltip="One to four images.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=["480p", "720p"], tooltip="The resolution of the output video.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=["adaptive", "16:9", "4:3", "1:1", "3:4", "9:16", "21:9"], tooltip="The aspect ratio of the output video.", ), - comfy_io.Int.Input( + IO.Int.Input( "duration", default=5, min=3, max=12, step=1, tooltip="The duration of the output video in seconds.", - display_mode=comfy_io.NumberDisplay.slider, + display_mode=IO.NumberDisplay.slider, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed to use for generation.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "watermark", default=True, - tooltip="Whether to add an \"AI generated\" watermark to the video.", + tooltip='Whether to add an "AI generated" watermark to the video.', optional=True, ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -1134,22 +1040,14 @@ class ByteDanceImageReferenceNode(comfy_io.ComfyNode): duration: int, seed: int, watermark: bool, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) raise_if_text_params(prompt, ["resolution", "ratio", "duration", "seed", "watermark"]) for image in images: validate_image_dimensions(image, min_width=300, min_height=300, max_width=6000, max_height=6000) - validate_image_aspect_ratio_range(image, (2, 5), (5, 2), strict=False) # 0.4 to 2.5 - - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - - image_urls = await upload_images_to_comfyapi( - images, max_images=4, mime_type="image/png", auth_kwargs=auth_kwargs - ) + validate_image_aspect_ratio(image, (2, 5), (5, 2), strict=False) # 0.4 to 2.5 + image_urls = await upload_images_to_comfyapi(cls, images, max_images=4, mime_type="image/png") prompt = ( f"{prompt} " f"--resolution {resolution} " @@ -1160,44 +1058,34 @@ class ByteDanceImageReferenceNode(comfy_io.ComfyNode): ) x = [ TaskTextContent(text=prompt), - *[TaskImageContent(image_url=TaskImageContentUrl(url=str(i)), role="reference_image") for i in image_urls] + *[TaskImageContent(image_url=TaskImageContentUrl(url=str(i)), role="reference_image") for i in image_urls], ] return await process_video_task( - request_model=Image2VideoTaskCreationRequest, - payload=Image2VideoTaskCreationRequest( - model=model, - content=x, - ), - auth_kwargs=auth_kwargs, - node_id=cls.hidden.unique_id, + cls, + payload=Image2VideoTaskCreationRequest(model=model, content=x), estimated_duration=max(1, math.ceil(VIDEO_TASKS_EXECUTION_TIME[model][resolution] * (duration / 10.0))), ) async def process_video_task( - request_model: Type[T], + cls: type[IO.ComfyNode], payload: Union[Text2VideoTaskCreationRequest, Image2VideoTaskCreationRequest], - auth_kwargs: dict, - node_id: str, estimated_duration: Optional[int], -) -> comfy_io.NodeOutput: - initial_response = await SynchronousOperation( - endpoint=ApiEndpoint( - path=BYTEPLUS_TASK_ENDPOINT, - method=HttpMethod.POST, - request_model=request_model, - response_model=TaskCreationResponse, - ), - request=payload, - auth_kwargs=auth_kwargs, - ).execute() - response = await poll_until_finished( - auth_kwargs, - initial_response.id, - estimated_duration=estimated_duration, - node_id=node_id, +) -> IO.NodeOutput: + initial_response = await sync_op( + cls, + ApiEndpoint(path=BYTEPLUS_TASK_ENDPOINT, method="POST"), + data=payload, + response_model=TaskCreationResponse, ) - return comfy_io.NodeOutput(await download_url_to_video_output(get_video_url_from_task_status(response))) + response = await poll_op( + cls, + ApiEndpoint(path=f"{BYTEPLUS_TASK_STATUS_ENDPOINT}/{initial_response.id}"), + status_extractor=lambda r: r.status, + estimated_duration=estimated_duration, + response_model=TaskStatusResponse, + ) + return IO.NodeOutput(await download_url_to_video_output(get_video_url_from_task_status(response))) def raise_if_text_params(prompt: str, text_params: list[str]) -> None: @@ -1210,7 +1098,7 @@ def raise_if_text_params(prompt: str, text_params: list[str]) -> None: class ByteDanceExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ ByteDanceImageNode, ByteDanceImageEditNode, @@ -1221,5 +1109,6 @@ class ByteDanceExtension(ComfyExtension): ByteDanceImageReferenceNode, ] + async def comfy_entrypoint() -> ByteDanceExtension: return ByteDanceExtension() diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index baa379b75..67f2469ad 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -2,44 +2,47 @@ API Nodes for Gemini Multimodal LLM Usage via Remote API See: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference """ + from __future__ import annotations -import json -import time -import os -import uuid import base64 -from io import BytesIO +import json +import os +import time +import uuid from enum import Enum -from typing import Optional, Literal +from io import BytesIO +from typing import Literal, Optional import torch +from typing_extensions import override import folder_paths -from comfy.comfy_types.node_typing import IO, ComfyNodeABC, InputTypeDict -from server import PromptServer +from comfy_api.latest import IO, ComfyExtension, Input +from comfy_api.util import VideoCodec, VideoContainer from comfy_api_nodes.apis import ( GeminiContent, GeminiGenerateContentRequest, GeminiGenerateContentResponse, GeminiInlineData, - GeminiPart, GeminiMimeType, + GeminiPart, ) -from comfy_api_nodes.apis.gemini_api import GeminiImageGenerationConfig, GeminiImageGenerateContentRequest -from comfy_api_nodes.apis.client import ( +from comfy_api_nodes.apis.gemini_api import ( + GeminiImageConfig, + GeminiImageGenerateContentRequest, + GeminiImageGenerationConfig, +) +from comfy_api_nodes.util import ( ApiEndpoint, - HttpMethod, - SynchronousOperation, -) -from comfy_api_nodes.apinode_utils import ( - validate_string, audio_to_base64_string, - video_to_base64_string, - tensor_to_base64_string, bytesio_to_image_tensor, + sync_op, + tensor_to_base64_string, + validate_string, + video_to_base64_string, ) - +from server import PromptServer GEMINI_BASE_ENDPOINT = "/proxy/vertexai/gemini" GEMINI_MAX_INPUT_FILE_SIZE = 20 * 1024 * 1024 # 20 MB @@ -62,50 +65,7 @@ class GeminiImageModel(str, Enum): """ gemini_2_5_flash_image_preview = "gemini-2.5-flash-image-preview" - - -def get_gemini_endpoint( - model: GeminiModel, -) -> ApiEndpoint[GeminiGenerateContentRequest, GeminiGenerateContentResponse]: - """ - Get the API endpoint for a given Gemini model. - - Args: - model: The Gemini model to use, either as enum or string value. - - Returns: - ApiEndpoint configured for the specific Gemini model. - """ - if isinstance(model, str): - model = GeminiModel(model) - return ApiEndpoint( - path=f"{GEMINI_BASE_ENDPOINT}/{model.value}", - method=HttpMethod.POST, - request_model=GeminiGenerateContentRequest, - response_model=GeminiGenerateContentResponse, - ) - - -def get_gemini_image_endpoint( - model: GeminiImageModel, -) -> ApiEndpoint[GeminiGenerateContentRequest, GeminiGenerateContentResponse]: - """ - Get the API endpoint for a given Gemini model. - - Args: - model: The Gemini model to use, either as enum or string value. - - Returns: - ApiEndpoint configured for the specific Gemini model. - """ - if isinstance(model, str): - model = GeminiImageModel(model) - return ApiEndpoint( - path=f"{GEMINI_BASE_ENDPOINT}/{model.value}", - method=HttpMethod.POST, - request_model=GeminiImageGenerateContentRequest, - response_model=GeminiGenerateContentResponse, - ) + gemini_2_5_flash_image = "gemini-2.5-flash-image" def create_image_parts(image_input: torch.Tensor) -> list[GeminiPart]: @@ -120,9 +80,7 @@ def create_image_parts(image_input: torch.Tensor) -> list[GeminiPart]: """ image_parts: list[GeminiPart] = [] for image_index in range(image_input.shape[0]): - image_as_b64 = tensor_to_base64_string( - image_input[image_index].unsqueeze(0) - ) + image_as_b64 = tensor_to_base64_string(image_input[image_index].unsqueeze(0)) image_parts.append( GeminiPart( inlineData=GeminiInlineData( @@ -134,37 +92,7 @@ def create_image_parts(image_input: torch.Tensor) -> list[GeminiPart]: return image_parts -def create_text_part(text: str) -> GeminiPart: - """ - Create a text part for the Gemini API request. - - Args: - text: The text content to include in the request. - - Returns: - A GeminiPart object with the text content. - """ - return GeminiPart(text=text) - - -def get_parts_from_response( - response: GeminiGenerateContentResponse -) -> list[GeminiPart]: - """ - Extract all parts from the Gemini API response. - - Args: - response: The API response from Gemini. - - Returns: - List of response parts from the first candidate. - """ - return response.candidates[0].content.parts - - -def get_parts_by_type( - response: GeminiGenerateContentResponse, part_type: Literal["text"] | str -) -> list[GeminiPart]: +def get_parts_by_type(response: GeminiGenerateContentResponse, part_type: Literal["text"] | str) -> list[GeminiPart]: """ Filter response parts by their type. @@ -176,14 +104,10 @@ def get_parts_by_type( List of response parts matching the requested type. """ parts = [] - for part in get_parts_from_response(response): + for part in response.candidates[0].content.parts: if part_type == "text" and hasattr(part, "text") and part.text: parts.append(part) - elif ( - hasattr(part, "inlineData") - and part.inlineData - and part.inlineData.mimeType == part_type - ): + elif hasattr(part, "inlineData") and part.inlineData and part.inlineData.mimeType == part_type: parts.append(part) # Skip parts that don't match the requested type return parts @@ -211,11 +135,11 @@ def get_image_from_response(response: GeminiGenerateContentResponse) -> torch.Te returned_image = bytesio_to_image_tensor(BytesIO(image_data)) image_tensors.append(returned_image) if len(image_tensors) == 0: - return torch.zeros((1,1024,1024,4)) + return torch.zeros((1, 1024, 1024, 4)) return torch.cat(image_tensors, dim=0) -class GeminiNode(ComfyNodeABC): +class GeminiNode(IO.ComfyNode): """ Node to generate text responses from a Gemini model. @@ -226,96 +150,79 @@ class GeminiNode(ComfyNodeABC): """ @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Text inputs to the model, used to generate a response. You can include detailed instructions, questions, or context for the model.", - }, + def define_schema(cls): + return IO.Schema( + node_id="GeminiNode", + display_name="Google Gemini", + category="api node/text/Gemini", + description="Generate text responses with Google's Gemini AI model. " + "You can provide multiple types of inputs (text, images, audio, video) " + "as context for generating more relevant and meaningful responses.", + inputs=[ + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Text inputs to the model, used to generate a response. " + "You can include detailed instructions, questions, or context for the model.", ), - "model": ( - IO.COMBO, - { - "tooltip": "The Gemini model to use for generating responses.", - "options": [model.value for model in GeminiModel], - "default": GeminiModel.gemini_2_5_pro.value, - }, + IO.Combo.Input( + "model", + options=GeminiModel, + default=GeminiModel.gemini_2_5_pro, + tooltip="The Gemini model to use for generating responses.", ), - "seed": ( - IO.INT, - { - "default": 42, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "When seed is fixed to a specific value, the model makes a best effort to provide the same response for repeated requests. Deterministic output isn't guaranteed. Also, changing the model or parameter settings, such as the temperature, can cause variations in the response even when you use the same seed value. By default, a random seed value is used.", - }, + IO.Int.Input( + "seed", + default=42, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="When seed is fixed to a specific value, the model makes a best effort to provide " + "the same response for repeated requests. Deterministic output isn't guaranteed. " + "Also, changing the model or parameter settings, such as the temperature, " + "can cause variations in the response even when you use the same seed value. " + "By default, a random seed value is used.", ), - }, - "optional": { - "images": ( - IO.IMAGE, - { - "default": None, - "tooltip": "Optional image(s) to use as context for the model. To include multiple images, you can use the Batch Images node.", - }, + IO.Image.Input( + "images", + optional=True, + tooltip="Optional image(s) to use as context for the model. " + "To include multiple images, you can use the Batch Images node.", ), - "audio": ( - IO.AUDIO, - { - "tooltip": "Optional audio to use as context for the model.", - "default": None, - }, + IO.Audio.Input( + "audio", + optional=True, + tooltip="Optional audio to use as context for the model.", ), - "video": ( - IO.VIDEO, - { - "tooltip": "Optional video to use as context for the model.", - "default": None, - }, + IO.Video.Input( + "video", + optional=True, + tooltip="Optional video to use as context for the model.", ), - "files": ( - "GEMINI_INPUT_FILES", - { - "default": None, - "tooltip": "Optional file(s) to use as context for the model. Accepts inputs from the Gemini Generate Content Input Files node.", - }, + IO.Custom("GEMINI_INPUT_FILES").Input( + "files", + optional=True, + tooltip="Optional file(s) to use as context for the model. " + "Accepts inputs from the Gemini Generate Content Input Files node.", ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - DESCRIPTION = "Generate text responses with Google's Gemini AI model. You can provide multiple types of inputs (text, images, audio, video) as context for generating more relevant and meaningful responses." - RETURN_TYPES = ("STRING",) - FUNCTION = "api_call" - CATEGORY = "api node/text/Gemini" - API_NODE = True - - def create_video_parts(self, video_input: IO.VIDEO, **kwargs) -> list[GeminiPart]: - """ - Convert video input to Gemini API compatible parts. - - Args: - video_input: Video tensor from ComfyUI. - **kwargs: Additional arguments to pass to the conversion function. - - Returns: - List of GeminiPart objects containing the encoded video. - """ - from comfy_api.util import VideoContainer, VideoCodec - base_64_string = video_to_base64_string( - video_input, - container_format=VideoContainer.MP4, - codec=VideoCodec.H264 + ], + outputs=[ + IO.String.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, ) + + @classmethod + def create_video_parts(cls, video_input: Input.Video) -> list[GeminiPart]: + """Convert video input to Gemini API compatible parts.""" + + base_64_string = video_to_base64_string(video_input, container_format=VideoContainer.MP4, codec=VideoCodec.H264) return [ GeminiPart( inlineData=GeminiInlineData( @@ -325,7 +232,8 @@ class GeminiNode(ComfyNodeABC): ) ] - def create_audio_parts(self, audio_input: IO.AUDIO) -> list[GeminiPart]: + @classmethod + def create_audio_parts(cls, audio_input: Input.Audio) -> list[GeminiPart]: """ Convert audio input to Gemini API compatible parts. @@ -338,10 +246,10 @@ class GeminiNode(ComfyNodeABC): audio_parts: list[GeminiPart] = [] for batch_index in range(audio_input["waveform"].shape[0]): # Recreate an IO.AUDIO object for the given batch dimension index - audio_at_index = { - "waveform": audio_input["waveform"][batch_index].unsqueeze(0), - "sample_rate": audio_input["sample_rate"], - } + audio_at_index = Input.Audio( + waveform=audio_input["waveform"][batch_index].unsqueeze(0), + sample_rate=audio_input["sample_rate"], + ) # Convert to MP3 format for compatibility with Gemini API audio_bytes = audio_to_base64_string( audio_at_index, @@ -358,38 +266,38 @@ class GeminiNode(ComfyNodeABC): ) return audio_parts - async def api_call( - self, + @classmethod + async def execute( + cls, prompt: str, - model: GeminiModel, - images: Optional[IO.IMAGE] = None, - audio: Optional[IO.AUDIO] = None, - video: Optional[IO.VIDEO] = None, + model: str, + seed: int, + images: Optional[torch.Tensor] = None, + audio: Optional[Input.Audio] = None, + video: Optional[Input.Video] = None, files: Optional[list[GeminiPart]] = None, - unique_id: Optional[str] = None, - **kwargs, - ) -> tuple[str]: - # Validate inputs + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) # Create parts list with text prompt as the first part - parts: list[GeminiPart] = [create_text_part(prompt)] + parts: list[GeminiPart] = [GeminiPart(text=prompt)] # Add other modal parts if images is not None: image_parts = create_image_parts(images) parts.extend(image_parts) if audio is not None: - parts.extend(self.create_audio_parts(audio)) + parts.extend(cls.create_audio_parts(audio)) if video is not None: - parts.extend(self.create_video_parts(video)) + parts.extend(cls.create_video_parts(video)) if files is not None: parts.extend(files) # Create response - response = await SynchronousOperation( - endpoint=get_gemini_endpoint(model), - request=GeminiGenerateContentRequest( + response = await sync_op( + cls, + endpoint=ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"), + data=GeminiGenerateContentRequest( contents=[ GeminiContent( role="user", @@ -397,15 +305,15 @@ class GeminiNode(ComfyNodeABC): ) ] ), - auth_kwargs=kwargs, - ).execute() + response_model=GeminiGenerateContentResponse, + ) # Get result output output_text = get_text_from_response(response) - if unique_id and output_text: + if output_text: # Not a true chat history like the OpenAI Chat node. It is emulated so the frontend can show a copy button. render_spec = { - "node_id": unique_id, + "node_id": cls.hidden.unique_id, "component": "ChatHistoryWidget", "props": { "history": json.dumps( @@ -425,10 +333,10 @@ class GeminiNode(ComfyNodeABC): render_spec, ) - return (output_text or "Empty response from Gemini model...",) + return IO.NodeOutput(output_text or "Empty response from Gemini model...") -class GeminiInputFiles(ComfyNodeABC): +class GeminiInputFiles(IO.ComfyNode): """ Loads and formats input files for use with the Gemini API. @@ -439,7 +347,7 @@ class GeminiInputFiles(ComfyNodeABC): """ @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: + def define_schema(cls): """ For details about the supported file input types, see: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference @@ -454,43 +362,40 @@ class GeminiInputFiles(ComfyNodeABC): ] input_files = sorted(input_files, key=lambda x: x.name) input_files = [f.name for f in input_files] - return { - "required": { - "file": ( - IO.COMBO, - { - "tooltip": "Input files to include as context for the model. Only accepts text (.txt) and PDF (.pdf) files for now.", - "options": input_files, - "default": input_files[0] if input_files else None, - }, + return IO.Schema( + node_id="GeminiInputFiles", + display_name="Gemini Input Files", + category="api node/text/Gemini", + description="Loads and prepares input files to include as inputs for Gemini LLM nodes. " + "The files will be read by the Gemini model when generating a response. " + "The contents of the text file count toward the token limit. " + "🛈 TIP: Can be chained together with other Gemini Input File nodes.", + inputs=[ + IO.Combo.Input( + "file", + options=input_files, + default=input_files[0] if input_files else None, + tooltip="Input files to include as context for the model. " + "Only accepts text (.txt) and PDF (.pdf) files for now.", ), - }, - "optional": { - "GEMINI_INPUT_FILES": ( + IO.Custom("GEMINI_INPUT_FILES").Input( "GEMINI_INPUT_FILES", - { - "tooltip": "An optional additional file(s) to batch together with the file loaded from this node. Allows chaining of input files so that a single message can include multiple input files.", - "default": None, - }, + optional=True, + tooltip="An optional additional file(s) to batch together with the file loaded from this node. " + "Allows chaining of input files so that a single message can include multiple input files.", ), - }, - } - - DESCRIPTION = "Loads and prepares input files to include as inputs for Gemini LLM nodes. The files will be read by the Gemini model when generating a response. The contents of the text file count toward the token limit. 🛈 TIP: Can be chained together with other Gemini Input File nodes." - RETURN_TYPES = ("GEMINI_INPUT_FILES",) - FUNCTION = "prepare_files" - CATEGORY = "api node/text/Gemini" - - def create_file_part(self, file_path: str) -> GeminiPart: - mime_type = ( - GeminiMimeType.application_pdf - if file_path.endswith(".pdf") - else GeminiMimeType.text_plain + ], + outputs=[ + IO.Custom("GEMINI_INPUT_FILES").Output(), + ], ) + + @classmethod + def create_file_part(cls, file_path: str) -> GeminiPart: + mime_type = GeminiMimeType.application_pdf if file_path.endswith(".pdf") else GeminiMimeType.text_plain # Use base64 string directly, not the data URI with open(file_path, "rb") as f: file_content = f.read() - import base64 base64_str = base64.b64encode(file_content).decode("utf-8") return GeminiPart( @@ -500,143 +405,127 @@ class GeminiInputFiles(ComfyNodeABC): ) ) - def prepare_files( - self, file: str, GEMINI_INPUT_FILES: list[GeminiPart] = [] - ) -> tuple[list[GeminiPart]]: - """ - Loads and formats input files for Gemini API. - """ - file_path = folder_paths.get_annotated_filepath(file) - input_file_content = self.create_file_part(file_path) - files = [input_file_content] + GEMINI_INPUT_FILES - return (files,) - - -class GeminiImage(ComfyNodeABC): - """ - Node to generate text and image responses from a Gemini model. - - This node allows users to interact with Google's Gemini AI models, providing - multimodal inputs (text, images, files) to generate coherent - text and image responses. The node works with the latest Gemini models, handling the - API communication and response parsing. - """ @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Text prompt for generation", - }, - ), - "model": ( - IO.COMBO, - { - "tooltip": "The Gemini model to use for generating responses.", - "options": [model.value for model in GeminiImageModel], - "default": GeminiImageModel.gemini_2_5_flash_image_preview.value, - }, - ), - "seed": ( - IO.INT, - { - "default": 42, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "When seed is fixed to a specific value, the model makes a best effort to provide the same response for repeated requests. Deterministic output isn't guaranteed. Also, changing the model or parameter settings, such as the temperature, can cause variations in the response even when you use the same seed value. By default, a random seed value is used.", - }, - ), - }, - "optional": { - "images": ( - IO.IMAGE, - { - "default": None, - "tooltip": "Optional image(s) to use as context for the model. To include multiple images, you can use the Batch Images node.", - }, - ), - "files": ( - "GEMINI_INPUT_FILES", - { - "default": None, - "tooltip": "Optional file(s) to use as context for the model. Accepts inputs from the Gemini Generate Content Input Files node.", - }, - ), - # TODO: later we can add this parameter later - # "n": ( - # IO.INT, - # { - # "default": 1, - # "min": 1, - # "max": 8, - # "step": 1, - # "display": "number", - # "tooltip": "How many images to generate", - # }, - # ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + def execute(cls, file: str, GEMINI_INPUT_FILES: Optional[list[GeminiPart]] = None) -> IO.NodeOutput: + """Loads and formats input files for Gemini API.""" + if GEMINI_INPUT_FILES is None: + GEMINI_INPUT_FILES = [] + file_path = folder_paths.get_annotated_filepath(file) + input_file_content = cls.create_file_part(file_path) + return IO.NodeOutput([input_file_content] + GEMINI_INPUT_FILES) - RETURN_TYPES = (IO.IMAGE, IO.STRING) - FUNCTION = "api_call" - CATEGORY = "api node/image/Gemini" - DESCRIPTION = "Edit images synchronously via Google API." - API_NODE = True - async def api_call( - self, +class GeminiImage(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="GeminiImageNode", + display_name="Google Gemini Image", + category="api node/image/Gemini", + description="Edit images synchronously via Google API.", + inputs=[ + IO.String.Input( + "prompt", + multiline=True, + tooltip="Text prompt for generation", + default="", + ), + IO.Combo.Input( + "model", + options=GeminiImageModel, + default=GeminiImageModel.gemini_2_5_flash_image, + tooltip="The Gemini model to use for generating responses.", + ), + IO.Int.Input( + "seed", + default=42, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="When seed is fixed to a specific value, the model makes a best effort to provide " + "the same response for repeated requests. Deterministic output isn't guaranteed. " + "Also, changing the model or parameter settings, such as the temperature, " + "can cause variations in the response even when you use the same seed value. " + "By default, a random seed value is used.", + ), + IO.Image.Input( + "images", + optional=True, + tooltip="Optional image(s) to use as context for the model. " + "To include multiple images, you can use the Batch Images node.", + ), + IO.Custom("GEMINI_INPUT_FILES").Input( + "files", + optional=True, + tooltip="Optional file(s) to use as context for the model. " + "Accepts inputs from the Gemini Generate Content Input Files node.", + ), + IO.Combo.Input( + "aspect_ratio", + options=["auto", "1:1", "2:3", "3:2", "3:4", "4:3", "4:5", "5:4", "9:16", "16:9", "21:9"], + default="auto", + tooltip="Defaults to matching the output image size to that of your input image, " + "or otherwise generates 1:1 squares.", + optional=True, + ), + ], + outputs=[ + IO.Image.Output(), + IO.String.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, prompt: str, - model: GeminiImageModel, - images: Optional[IO.IMAGE] = None, + model: str, + seed: int, + images: Optional[torch.Tensor] = None, files: Optional[list[GeminiPart]] = None, - n=1, - unique_id: Optional[str] = None, - **kwargs, - ): - # Validate inputs + aspect_ratio: str = "auto", + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=1) - # Create parts list with text prompt as the first part - parts: list[GeminiPart] = [create_text_part(prompt)] + parts: list[GeminiPart] = [GeminiPart(text=prompt)] + + if not aspect_ratio: + aspect_ratio = "auto" # for backward compatability with old workflows; to-do remove this in December + image_config = GeminiImageConfig(aspectRatio=aspect_ratio) - # Add other modal parts if images is not None: image_parts = create_image_parts(images) parts.extend(image_parts) if files is not None: parts.extend(files) - response = await SynchronousOperation( - endpoint=get_gemini_image_endpoint(model), - request=GeminiImageGenerateContentRequest( + response = await sync_op( + cls, + endpoint=ApiEndpoint(path=f"{GEMINI_BASE_ENDPOINT}/{model}", method="POST"), + data=GeminiImageGenerateContentRequest( contents=[ - GeminiContent( - role="user", - parts=parts, - ), + GeminiContent(role="user", parts=parts), ], generationConfig=GeminiImageGenerationConfig( - responseModalities=["TEXT","IMAGE"] - ) + responseModalities=["TEXT", "IMAGE"], + imageConfig=None if aspect_ratio == "auto" else image_config, + ), ), - auth_kwargs=kwargs, - ).execute() + response_model=GeminiGenerateContentResponse, + ) output_image = get_image_from_response(response) output_text = get_text_from_response(response) - if unique_id and output_text: + if output_text: # Not a true chat history like the OpenAI Chat node. It is emulated so the frontend can show a copy button. render_spec = { - "node_id": unique_id, + "node_id": cls.hidden.unique_id, "component": "ChatHistoryWidget", "props": { "history": json.dumps( @@ -657,17 +546,18 @@ class GeminiImage(ComfyNodeABC): ) output_text = output_text or "Empty response from Gemini model..." - return (output_image, output_text,) + return IO.NodeOutput(output_image, output_text) -NODE_CLASS_MAPPINGS = { - "GeminiNode": GeminiNode, - "GeminiImageNode": GeminiImage, - "GeminiInputFiles": GeminiInputFiles, -} +class GeminiExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + GeminiNode, + GeminiImage, + GeminiInputFiles, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "GeminiNode": "Google Gemini", - "GeminiImageNode": "Google Gemini Image", - "GeminiInputFiles": "Gemini Input Files", -} + +async def comfy_entrypoint() -> GeminiExtension: + return GeminiExtension() diff --git a/comfy_api_nodes/nodes_ideogram.py b/comfy_api_nodes/nodes_ideogram.py index 2d1c32e4f..48f94e612 100644 --- a/comfy_api_nodes/nodes_ideogram.py +++ b/comfy_api_nodes/nodes_ideogram.py @@ -1,6 +1,6 @@ from io import BytesIO from typing_extensions import override -from comfy_api.latest import ComfyExtension, io as comfy_io +from comfy_api.latest import IO, ComfyExtension from PIL import Image import numpy as np import torch @@ -11,19 +11,13 @@ from comfy_api_nodes.apis import ( IdeogramV3Request, IdeogramV3EditRequest, ) - -from comfy_api_nodes.apis.client import ( +from comfy_api_nodes.util import ( ApiEndpoint, - HttpMethod, - SynchronousOperation, -) - -from comfy_api_nodes.apinode_utils import ( - download_url_to_bytesio, bytesio_to_image_tensor, + download_url_as_bytesio, resize_mask_to_image, + sync_op, ) -from server import PromptServer V1_V1_RES_MAP = { "Auto":"AUTO", @@ -220,7 +214,7 @@ async def download_and_process_images(image_urls): for image_url in image_urls: # Using functions from apinode_utils.py to handle downloading and processing - image_bytesio = await download_url_to_bytesio(image_url) # Download image content to BytesIO + image_bytesio = await download_url_as_bytesio(image_url) # Download image content to BytesIO img_tensor = bytesio_to_image_tensor(image_bytesio, mode="RGB") # Convert to torch.Tensor with RGB mode image_tensors.append(img_tensor) @@ -233,89 +227,76 @@ async def download_and_process_images(image_urls): return stacked_tensors -def display_image_urls_on_node(image_urls, node_id): - if node_id and image_urls: - if len(image_urls) == 1: - PromptServer.instance.send_progress_text( - f"Generated Image URL:\n{image_urls[0]}", node_id - ) - else: - urls_text = "Generated Image URLs:\n" + "\n".join( - f"{i+1}. {url}" for i, url in enumerate(image_urls) - ) - PromptServer.instance.send_progress_text(urls_text, node_id) - - -class IdeogramV1(comfy_io.ComfyNode): +class IdeogramV1(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="IdeogramV1", display_name="Ideogram V1", category="api node/image/Ideogram", description="Generates images using the Ideogram V1 model.", is_api_node=True, inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt for the image generation", ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "turbo", default=False, tooltip="Whether to use turbo mode (faster generation, potentially lower quality)", ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=list(V1_V2_RATIO_MAP.keys()), default="1:1", tooltip="The aspect ratio for image generation.", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "magic_prompt_option", options=["AUTO", "ON", "OFF"], default="AUTO", tooltip="Determine if MagicPrompt should be used in generation", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, control_after_generate=True, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, optional=True, ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", multiline=True, default="", tooltip="Description of what to exclude from the image", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "num_images", default=1, min=1, max=8, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, optional=True, ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], ) @@ -334,77 +315,63 @@ class IdeogramV1(comfy_io.ComfyNode): aspect_ratio = V1_V2_RATIO_MAP.get(aspect_ratio, None) model = "V_1_TURBO" if turbo else "V_1" - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/ideogram/generate", - method=HttpMethod.POST, - request_model=IdeogramGenerateRequest, - response_model=IdeogramGenerateResponse, - ), - request=IdeogramGenerateRequest( + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/ideogram/generate", method="POST"), + response_model=IdeogramGenerateResponse, + data=IdeogramGenerateRequest( image_request=ImageRequest( prompt=prompt, model=model, num_images=num_images, seed=seed, aspect_ratio=aspect_ratio if aspect_ratio != "ASPECT_1_1" else None, - magic_prompt_option=( - magic_prompt_option if magic_prompt_option != "AUTO" else None - ), + magic_prompt_option=(magic_prompt_option if magic_prompt_option != "AUTO" else None), negative_prompt=negative_prompt if negative_prompt else None, ) ), - auth_kwargs=auth, + max_retries=1, ) - response = await operation.execute() - if not response.data or len(response.data) == 0: raise Exception("No images were generated in the response") image_urls = [image_data.url for image_data in response.data if image_data.url] - if not image_urls: raise Exception("No image URLs were generated in the response") - - display_image_urls_on_node(image_urls, cls.hidden.unique_id) - return comfy_io.NodeOutput(await download_and_process_images(image_urls)) + return IO.NodeOutput(await download_and_process_images(image_urls)) -class IdeogramV2(comfy_io.ComfyNode): +class IdeogramV2(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="IdeogramV2", display_name="Ideogram V2", category="api node/image/Ideogram", description="Generates images using the Ideogram V2 model.", is_api_node=True, inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt for the image generation", ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "turbo", default=False, tooltip="Whether to use turbo mode (faster generation, potentially lower quality)", ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=list(V1_V2_RATIO_MAP.keys()), default="1:1", tooltip="The aspect ratio for image generation. Ignored if resolution is not set to AUTO.", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=list(V1_V1_RES_MAP.keys()), default="Auto", @@ -412,44 +379,44 @@ class IdeogramV2(comfy_io.ComfyNode): "If not set to AUTO, this overrides the aspect_ratio setting.", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "magic_prompt_option", options=["AUTO", "ON", "OFF"], default="AUTO", tooltip="Determine if MagicPrompt should be used in generation", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, control_after_generate=True, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "style_type", options=["AUTO", "GENERAL", "REALISTIC", "DESIGN", "RENDER_3D", "ANIME"], default="NONE", tooltip="Style type for generation (V2 only)", optional=True, ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", multiline=True, default="", tooltip="Description of what to exclude from the image", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "num_images", default=1, min=1, max=8, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, optional=True, ), #"color_palette": ( @@ -462,12 +429,12 @@ class IdeogramV2(comfy_io.ComfyNode): #), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], ) @@ -500,18 +467,11 @@ class IdeogramV2(comfy_io.ComfyNode): else: final_aspect_ratio = aspect_ratio if aspect_ratio != "ASPECT_1_1" else None - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/ideogram/generate", - method=HttpMethod.POST, - request_model=IdeogramGenerateRequest, - response_model=IdeogramGenerateResponse, - ), - request=IdeogramGenerateRequest( + response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/ideogram/generate", method="POST"), + response_model=IdeogramGenerateResponse, + data=IdeogramGenerateRequest( image_request=ImageRequest( prompt=prompt, model=model, @@ -519,36 +479,28 @@ class IdeogramV2(comfy_io.ComfyNode): seed=seed, aspect_ratio=final_aspect_ratio, resolution=final_resolution, - magic_prompt_option=( - magic_prompt_option if magic_prompt_option != "AUTO" else None - ), + magic_prompt_option=(magic_prompt_option if magic_prompt_option != "AUTO" else None), style_type=style_type if style_type != "NONE" else None, negative_prompt=negative_prompt if negative_prompt else None, color_palette=color_palette if color_palette else None, ) ), - auth_kwargs=auth, + max_retries=1, ) - - response = await operation.execute() - if not response.data or len(response.data) == 0: raise Exception("No images were generated in the response") image_urls = [image_data.url for image_data in response.data if image_data.url] - if not image_urls: raise Exception("No image URLs were generated in the response") - - display_image_urls_on_node(image_urls, cls.hidden.unique_id) - return comfy_io.NodeOutput(await download_and_process_images(image_urls)) + return IO.NodeOutput(await download_and_process_images(image_urls)) -class IdeogramV3(comfy_io.ComfyNode): +class IdeogramV3(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="IdeogramV3", display_name="Ideogram V3", category="api node/image/Ideogram", @@ -556,30 +508,30 @@ class IdeogramV3(comfy_io.ComfyNode): "Supports both regular image generation from text prompts and image editing with mask.", is_api_node=True, inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt for the image generation or editing", ), - comfy_io.Image.Input( + IO.Image.Input( "image", tooltip="Optional reference image for image editing.", optional=True, ), - comfy_io.Mask.Input( + IO.Mask.Input( "mask", tooltip="Optional mask for inpainting (white areas will be replaced)", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=list(V3_RATIO_MAP.keys()), default="1:1", tooltip="The aspect ratio for image generation. Ignored if resolution is not set to Auto.", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=V3_RESOLUTIONS, default="Auto", @@ -587,57 +539,57 @@ class IdeogramV3(comfy_io.ComfyNode): "If not set to Auto, this overrides the aspect_ratio setting.", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "magic_prompt_option", options=["AUTO", "ON", "OFF"], default="AUTO", tooltip="Determine if MagicPrompt should be used in generation", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, control_after_generate=True, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "num_images", default=1, min=1, max=8, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "rendering_speed", options=["DEFAULT", "TURBO", "QUALITY"], default="DEFAULT", tooltip="Controls the trade-off between generation speed and quality", optional=True, ), - comfy_io.Image.Input( + IO.Image.Input( "character_image", tooltip="Image to use as character reference.", optional=True, ), - comfy_io.Mask.Input( + IO.Mask.Input( "character_mask", tooltip="Optional mask for character reference image.", optional=True, ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], ) @@ -656,10 +608,6 @@ class IdeogramV3(comfy_io.ComfyNode): character_image=None, character_mask=None, ): - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } if rendering_speed == "BALANCED": # for backward compatibility rendering_speed = "DEFAULT" @@ -694,9 +642,6 @@ class IdeogramV3(comfy_io.ComfyNode): # Check if both image and mask are provided for editing mode if image is not None and mask is not None: - # Edit mode - path = "/proxy/ideogram/ideogram-v3/edit" - # Process image and mask input_tensor = image.squeeze().cpu() # Resize mask to match image dimension @@ -749,27 +694,20 @@ class IdeogramV3(comfy_io.ComfyNode): if character_mask_binary: files["character_mask_binary"] = character_mask_binary - # Execute the operation for edit mode - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=path, - method=HttpMethod.POST, - request_model=IdeogramV3EditRequest, - response_model=IdeogramGenerateResponse, - ), - request=edit_request, + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/ideogram/ideogram-v3/edit", method="POST"), + response_model=IdeogramGenerateResponse, + data=edit_request, files=files, content_type="multipart/form-data", - auth_kwargs=auth, + max_retries=1, ) elif image is not None or mask is not None: # If only one of image or mask is provided, raise an error raise Exception("Ideogram V3 image editing requires both an image AND a mask") else: - # Generation mode - path = "/proxy/ideogram/ideogram-v3/generate" - # Create generation request gen_request = IdeogramV3Request( prompt=prompt, @@ -800,43 +738,34 @@ class IdeogramV3(comfy_io.ComfyNode): if files: gen_request.style_type = "AUTO" - # Execute the operation for generation mode - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=path, - method=HttpMethod.POST, - request_model=IdeogramV3Request, - response_model=IdeogramGenerateResponse, - ), - request=gen_request, + response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/ideogram/ideogram-v3/generate", method="POST"), + response_model=IdeogramGenerateResponse, + data=gen_request, files=files if files else None, content_type="multipart/form-data", - auth_kwargs=auth, + max_retries=1, ) - # Execute the operation and process response - response = await operation.execute() - if not response.data or len(response.data) == 0: raise Exception("No images were generated in the response") image_urls = [image_data.url for image_data in response.data if image_data.url] - if not image_urls: raise Exception("No image URLs were generated in the response") - - display_image_urls_on_node(image_urls, cls.hidden.unique_id) - return comfy_io.NodeOutput(await download_and_process_images(image_urls)) + return IO.NodeOutput(await download_and_process_images(image_urls)) class IdeogramExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ IdeogramV1, IdeogramV2, IdeogramV3, ] + async def comfy_entrypoint() -> IdeogramExtension: return IdeogramExtension() diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 5f55b2cc9..7b23e9cf9 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -5,15 +5,15 @@ For source of truth on the allowed permutations of request fields, please refere """ from __future__ import annotations -from typing import Optional, TypeVar, Any -from collections.abc import Callable +from typing import Optional, TypeVar import math import logging +from typing_extensions import override + import torch from comfy_api_nodes.apis import ( - KlingTaskStatus, KlingCameraControl, KlingCameraConfig, KlingCameraControlType, @@ -50,31 +50,25 @@ from comfy_api_nodes.apis import ( KlingCharacterEffectModelName, KlingSingleImageEffectModelName, ) -from comfy_api_nodes.apis.client import ( - ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, - EmptyRequest, -) -from comfy_api_nodes.apinode_utils import ( - tensor_to_base64_string, - download_url_to_video_output, - upload_video_to_comfyapi, - upload_audio_to_comfyapi, - download_url_to_image_tensor, -) -from comfy_api_nodes.mapper_utils import model_field_to_node_input -from comfy_api_nodes.util.validation_utils import ( +from comfy_api_nodes.util import ( validate_image_dimensions, validate_image_aspect_ratio, validate_video_dimensions, validate_video_duration, + tensor_to_base64_string, + validate_string, + upload_audio_to_comfyapi, + download_url_to_image_tensor, + upload_video_to_comfyapi, + download_url_to_video_output, + sync_op, + ApiEndpoint, + poll_op, ) +from comfy_api.input_impl import VideoFromFile from comfy_api.input.basic_types import AudioInput from comfy_api.input.video_types import VideoInput -from comfy_api.input_impl import VideoFromFile -from comfy.comfy_types.node_typing import IO, InputTypeOptions, ComfyNodeABC +from comfy_api.latest import ComfyExtension, IO KLING_API_VERSION = "v1" PATH_TEXT_TO_VIDEO = f"/proxy/kling/{KLING_API_VERSION}/videos/text2video" @@ -103,38 +97,113 @@ AVERAGE_DURATION_VIDEO_EXTEND = 320 R = TypeVar("R") -class KlingApiError(Exception): - """Base exception for Kling API errors.""" +MODE_TEXT2VIDEO = { + "standard mode / 5s duration / kling-v1": ("std", "5", "kling-v1"), + "standard mode / 10s duration / kling-v1": ("std", "10", "kling-v1"), + "pro mode / 5s duration / kling-v1": ("pro", "5", "kling-v1"), + "pro mode / 10s duration / kling-v1": ("pro", "10", "kling-v1"), + "standard mode / 5s duration / kling-v1-6": ("std", "5", "kling-v1-6"), + "standard mode / 10s duration / kling-v1-6": ("std", "10", "kling-v1-6"), + "pro mode / 5s duration / kling-v2-master": ("pro", "5", "kling-v2-master"), + "pro mode / 10s duration / kling-v2-master": ("pro", "10", "kling-v2-master"), + "standard mode / 5s duration / kling-v2-master": ("std", "5", "kling-v2-master"), + "standard mode / 10s duration / kling-v2-master": ("std", "10", "kling-v2-master"), + "pro mode / 5s duration / kling-v2-1-master": ("pro", "5", "kling-v2-1-master"), + "pro mode / 10s duration / kling-v2-1-master": ("pro", "10", "kling-v2-1-master"), + "pro mode / 5s duration / kling-v2-5-turbo": ("pro", "5", "kling-v2-5-turbo"), + "pro mode / 10s duration / kling-v2-5-turbo": ("pro", "10", "kling-v2-5-turbo"), +} +""" +Mapping of mode strings to their corresponding (mode, duration, model_name) tuples. +Only includes config combos that support the `image_tail` request field. - pass +See: [Kling API Docs Capability Map](https://app.klingai.com/global/dev/document-api/apiReference/model/skillsMap) +""" -async def poll_until_finished( - auth_kwargs: dict[str, str], - api_endpoint: ApiEndpoint[Any, R], - result_url_extractor: Optional[Callable[[R], str]] = None, - estimated_duration: Optional[int] = None, - node_id: Optional[str] = None, -) -> R: - """Polls the Kling API endpoint until the task reaches a terminal state, then returns the response.""" - return await PollingOperation( - poll_endpoint=api_endpoint, - completed_statuses=[ - KlingTaskStatus.succeed.value, - ], - failed_statuses=[KlingTaskStatus.failed.value], - status_extractor=lambda response: ( - response.data.task_status.value - if response.data and response.data.task_status - else None - ), - auth_kwargs=auth_kwargs, - result_url_extractor=result_url_extractor, - estimated_duration=estimated_duration, - node_id=node_id, - poll_interval=16.0, - max_poll_attempts=256, - ).execute() +MODE_START_END_FRAME = { + "standard mode / 5s duration / kling-v1": ("std", "5", "kling-v1"), + "pro mode / 5s duration / kling-v1": ("pro", "5", "kling-v1"), + "pro mode / 5s duration / kling-v1-5": ("pro", "5", "kling-v1-5"), + "pro mode / 10s duration / kling-v1-5": ("pro", "10", "kling-v1-5"), + "pro mode / 5s duration / kling-v1-6": ("pro", "5", "kling-v1-6"), + "pro mode / 10s duration / kling-v1-6": ("pro", "10", "kling-v1-6"), + "pro mode / 5s duration / kling-v2-1": ("pro", "5", "kling-v2-1"), + "pro mode / 10s duration / kling-v2-1": ("pro", "10", "kling-v2-1"), +} +""" +Returns a mapping of mode strings to their corresponding (mode, duration, model_name) tuples. +Only includes config combos that support the `image_tail` request field. + +See: [Kling API Docs Capability Map](https://app.klingai.com/global/dev/document-api/apiReference/model/skillsMap) +""" + + +VOICES_CONFIG = { + # English voices + "Melody": ("girlfriend_4_speech02", "en"), + "Sunny": ("genshin_vindi2", "en"), + "Sage": ("zhinen_xuesheng", "en"), + "Ace": ("AOT", "en"), + "Blossom": ("ai_shatang", "en"), + "Peppy": ("genshin_klee2", "en"), + "Dove": ("genshin_kirara", "en"), + "Shine": ("ai_kaiya", "en"), + "Anchor": ("oversea_male1", "en"), + "Lyric": ("ai_chenjiahao_712", "en"), + "Tender": ("chat1_female_new-3", "en"), + "Siren": ("chat_0407_5-1", "en"), + "Zippy": ("cartoon-boy-07", "en"), + "Bud": ("uk_boy1", "en"), + "Sprite": ("cartoon-girl-01", "en"), + "Candy": ("PeppaPig_platform", "en"), + "Beacon": ("ai_huangzhong_712", "en"), + "Rock": ("ai_huangyaoshi_712", "en"), + "Titan": ("ai_laoguowang_712", "en"), + "Grace": ("chengshu_jiejie", "en"), + "Helen": ("you_pingjing", "en"), + "Lore": ("calm_story1", "en"), + "Crag": ("uk_man2", "en"), + "Prattle": ("laopopo_speech02", "en"), + "Hearth": ("heainainai_speech02", "en"), + "The Reader": ("reader_en_m-v1", "en"), + "Commercial Lady": ("commercial_lady_en_f-v1", "en"), + # Chinese voices + "阳光少年": ("genshin_vindi2", "zh"), + "懂事小弟": ("zhinen_xuesheng", "zh"), + "运动少年": ("tiyuxi_xuedi", "zh"), + "青春少女": ("ai_shatang", "zh"), + "温柔小妹": ("genshin_klee2", "zh"), + "元气少女": ("genshin_kirara", "zh"), + "阳光男生": ("ai_kaiya", "zh"), + "幽默小哥": ("tiexin_nanyou", "zh"), + "文艺小哥": ("ai_chenjiahao_712", "zh"), + "甜美邻家": ("girlfriend_1_speech02", "zh"), + "温柔姐姐": ("chat1_female_new-3", "zh"), + "职场女青": ("girlfriend_2_speech02", "zh"), + "活泼男童": ("cartoon-boy-07", "zh"), + "俏皮女童": ("cartoon-girl-01", "zh"), + "稳重老爸": ("ai_huangyaoshi_712", "zh"), + "温柔妈妈": ("you_pingjing", "zh"), + "严肃上司": ("ai_laoguowang_712", "zh"), + "优雅贵妇": ("chengshu_jiejie", "zh"), + "慈祥爷爷": ("zhuxi_speech02", "zh"), + "唠叨爷爷": ("uk_oldman3", "zh"), + "唠叨奶奶": ("laopopo_speech02", "zh"), + "和蔼奶奶": ("heainainai_speech02", "zh"), + "东北老铁": ("dongbeilaotie_speech02", "zh"), + "重庆小伙": ("chongqingxiaohuo_speech02", "zh"), + "四川妹子": ("chuanmeizi_speech02", "zh"), + "潮汕大叔": ("chaoshandashu_speech02", "zh"), + "台湾男生": ("ai_taiwan_man2_speech02", "zh"), + "西安掌柜": ("xianzhanggui_speech02", "zh"), + "天津姐姐": ("tianjinjiejie_speech02", "zh"), + "新闻播报男": ("diyinnansang_DB_CN_M_04-v2", "zh"), + "译制片男": ("yizhipiannan-v1", "zh"), + "撒娇女友": ("tianmeixuemei-v1", "zh"), + "刀片烟嗓": ("daopianyansang-v1", "zh"), + "乖巧正太": ("mengwa-v1", "zh"), +} def is_valid_camera_control_configs(configs: list[float]) -> bool: @@ -142,11 +211,6 @@ def is_valid_camera_control_configs(configs: list[float]) -> bool: return any(not math.isclose(value, 0.0) for value in configs) -def is_valid_prompt(prompt: str) -> bool: - """Verifies that the prompt is not empty.""" - return bool(prompt) - - def is_valid_task_creation_response(response: KlingText2VideoResponse) -> bool: """Verifies that the initial response contains a task ID.""" return bool(response.data.task_id) @@ -190,23 +254,23 @@ def validate_task_creation_response(response) -> None: if not is_valid_task_creation_response(response): error_msg = f"Kling initial request failed. Code: {response.code}, Message: {response.message}, Data: {response.data}" logging.error(error_msg) - raise KlingApiError(error_msg) + raise Exception(error_msg) def validate_video_result_response(response) -> None: """Validates that the Kling task result contains a video.""" if not is_valid_video_response(response): error_msg = f"Kling task {response.data.task_id} succeeded but no video data found in response." - logging.error(f"Error: {error_msg}.\nResponse: {response}") - raise KlingApiError(error_msg) + logging.error("Error: %s.\nResponse: %s", error_msg, response) + raise Exception(error_msg) def validate_image_result_response(response) -> None: """Validates that the Kling task result contains an image.""" if not is_valid_image_response(response): error_msg = f"Kling task {response.data.task_id} succeeded but no image data found in response." - logging.error(f"Error: {error_msg}.\nResponse: {response}") - raise KlingApiError(error_msg) + logging.error("Error: %s.\nResponse: %s", error_msg, response) + raise Exception(error_msg) def validate_input_image(image: torch.Tensor) -> None: @@ -218,22 +282,7 @@ def validate_input_image(image: torch.Tensor) -> None: See: https://app.klingai.com/global/dev/document-api/apiReference/model/imageToVideo """ validate_image_dimensions(image, min_width=300, min_height=300) - validate_image_aspect_ratio(image, min_aspect_ratio=1 / 2.5, max_aspect_ratio=2.5) - - -def get_camera_control_input_config( - tooltip: str, default: float = 0.0 -) -> tuple[IO, InputTypeOptions]: - """Returns common InputTypeOptions for Kling camera control configurations.""" - input_config = { - "default": default, - "min": -10.0, - "max": 10.0, - "step": 0.25, - "display": "slider", - "tooltip": tooltip, - } - return IO.FLOAT, input_config + validate_image_aspect_ratio(image, (1, 2.5), (2.5, 1)) def get_video_from_response(response) -> KlingVideoResult: @@ -278,17 +327,6 @@ def get_images_urls_from_response(response) -> Optional[str]: return None -async def video_result_to_node_output( - video: KlingVideoResult, -) -> tuple[VideoFromFile, str, str]: - """Converts a KlingVideoResult to a tuple of (VideoFromFile, str, str) to be used as a ComfyUI node output.""" - return ( - await download_url_to_video_output(str(video.url)), - str(video.id), - str(video.duration), - ) - - async def image_result_to_node_output( images: list[KlingImageResult], ) -> torch.Tensor: @@ -302,57 +340,295 @@ async def image_result_to_node_output( return torch.cat([await download_url_to_image_tensor(str(image.url)) for image in images]) -class KlingNodeBase(ComfyNodeABC): - """Base class for Kling nodes.""" +async def execute_text2video( + cls: type[IO.ComfyNode], + prompt: str, + negative_prompt: str, + cfg_scale: float, + model_name: str, + model_mode: str, + duration: str, + aspect_ratio: str, + camera_control: Optional[KlingCameraControl] = None, +) -> IO.NodeOutput: + validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_T2V) + task_creation_response = await sync_op( + cls, + ApiEndpoint(path=PATH_TEXT_TO_VIDEO, method="POST"), + response_model=KlingText2VideoResponse, + data=KlingText2VideoRequest( + prompt=prompt if prompt else None, + negative_prompt=negative_prompt if negative_prompt else None, + duration=KlingVideoGenDuration(duration), + mode=KlingVideoGenMode(model_mode), + model_name=KlingVideoGenModelName(model_name), + cfg_scale=cfg_scale, + aspect_ratio=KlingVideoGenAspectRatio(aspect_ratio), + camera_control=camera_control, + ), + ) - FUNCTION = "api_call" - CATEGORY = "api node/video/Kling" - API_NODE = True + validate_task_creation_response(task_creation_response) + + task_id = task_creation_response.data.task_id + final_response = await poll_op( + cls, + ApiEndpoint(path=f"{PATH_TEXT_TO_VIDEO}/{task_id}"), + response_model=KlingText2VideoResponse, + estimated_duration=AVERAGE_DURATION_T2V, + status_extractor=lambda r: (r.data.task_status.value if r.data and r.data.task_status else None), + ) + validate_video_result_response(final_response) + + video = get_video_from_response(final_response) + return IO.NodeOutput(await download_url_to_video_output(str(video.url)), str(video.id), str(video.duration)) -class KlingCameraControls(KlingNodeBase): +async def execute_image2video( + cls: type[IO.ComfyNode], + start_frame: torch.Tensor, + prompt: str, + negative_prompt: str, + model_name: str, + cfg_scale: float, + model_mode: str, + aspect_ratio: str, + duration: str, + camera_control: Optional[KlingCameraControl] = None, + end_frame: Optional[torch.Tensor] = None, +) -> IO.NodeOutput: + validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_I2V) + validate_input_image(start_frame) + + if camera_control is not None: + # Camera control type for image 2 video is always `simple` + camera_control.type = KlingCameraControlType.simple + + if model_mode == "std" and model_name == KlingVideoGenModelName.kling_v2_5_turbo.value: + model_mode = "pro" # October 5: currently "std" mode is not supported for this model + + task_creation_response = await sync_op( + cls, + ApiEndpoint(path=PATH_IMAGE_TO_VIDEO, method="POST"), + response_model=KlingImage2VideoResponse, + data=KlingImage2VideoRequest( + model_name=KlingVideoGenModelName(model_name), + image=tensor_to_base64_string(start_frame), + image_tail=( + tensor_to_base64_string(end_frame) + if end_frame is not None + else None + ), + prompt=prompt, + negative_prompt=negative_prompt if negative_prompt else None, + cfg_scale=cfg_scale, + mode=KlingVideoGenMode(model_mode), + duration=KlingVideoGenDuration(duration), + camera_control=camera_control, + ), + ) + + validate_task_creation_response(task_creation_response) + task_id = task_creation_response.data.task_id + + final_response = await poll_op( + cls, + ApiEndpoint(path=f"{PATH_IMAGE_TO_VIDEO}/{task_id}"), + response_model=KlingImage2VideoResponse, + estimated_duration=AVERAGE_DURATION_I2V, + status_extractor=lambda r: (r.data.task_status.value if r.data and r.data.task_status else None), + ) + validate_video_result_response(final_response) + + video = get_video_from_response(final_response) + return IO.NodeOutput(await download_url_to_video_output(str(video.url)), str(video.id), str(video.duration)) + + +async def execute_video_effect( + cls: type[IO.ComfyNode], + dual_character: bool, + effect_scene: KlingDualCharacterEffectsScene | KlingSingleImageEffectsScene, + model_name: str, + duration: KlingVideoGenDuration, + image_1: torch.Tensor, + image_2: Optional[torch.Tensor] = None, + model_mode: Optional[KlingVideoGenMode] = None, +) -> tuple[VideoFromFile, str, str]: + if dual_character: + request_input_field = KlingDualCharacterEffectInput( + model_name=model_name, + mode=model_mode, + images=[ + tensor_to_base64_string(image_1), + tensor_to_base64_string(image_2), + ], + duration=duration, + ) + else: + request_input_field = KlingSingleImageEffectInput( + model_name=model_name, + image=tensor_to_base64_string(image_1), + duration=duration, + ) + + task_creation_response = await sync_op( + cls, + endpoint=ApiEndpoint(path=PATH_VIDEO_EFFECTS, method="POST"), + response_model=KlingVideoEffectsResponse, + data=KlingVideoEffectsRequest( + effect_scene=effect_scene, + input=request_input_field, + ), + ) + + validate_task_creation_response(task_creation_response) + task_id = task_creation_response.data.task_id + + final_response = await poll_op( + cls, + ApiEndpoint(path=f"{PATH_VIDEO_EFFECTS}/{task_id}"), + response_model=KlingVideoEffectsResponse, + estimated_duration=AVERAGE_DURATION_VIDEO_EFFECTS, + status_extractor=lambda r: (r.data.task_status.value if r.data and r.data.task_status else None), + ) + validate_video_result_response(final_response) + + video = get_video_from_response(final_response) + return await download_url_to_video_output(str(video.url)), str(video.id), str(video.duration) + + +async def execute_lipsync( + cls: type[IO.ComfyNode], + video: VideoInput, + audio: Optional[AudioInput] = None, + voice_language: Optional[str] = None, + model_mode: Optional[str] = None, + text: Optional[str] = None, + voice_speed: Optional[float] = None, + voice_id: Optional[str] = None, +) -> IO.NodeOutput: + if text: + validate_string(text, field_name="Text", max_length=MAX_PROMPT_LENGTH_LIP_SYNC) + validate_video_dimensions(video, 720, 1920) + validate_video_duration(video, 2, 10) + + # Upload video to Comfy API and get download URL + video_url = await upload_video_to_comfyapi(cls, video) + logging.info("Uploaded video to Comfy API. URL: %s", video_url) + + # Upload the audio file to Comfy API and get download URL + if audio: + audio_url = await upload_audio_to_comfyapi(cls, audio) + logging.info("Uploaded audio to Comfy API. URL: %s", audio_url) + else: + audio_url = None + + task_creation_response = await sync_op( + cls, + ApiEndpoint(PATH_LIP_SYNC, "POST"), + response_model=KlingLipSyncResponse, + data=KlingLipSyncRequest( + input=KlingLipSyncInputObject( + video_url=video_url, + mode=model_mode, + text=text, + voice_language=voice_language, + voice_speed=voice_speed, + audio_type="url", + audio_url=audio_url, + voice_id=voice_id, + ), + ), + ) + + validate_task_creation_response(task_creation_response) + task_id = task_creation_response.data.task_id + + final_response = await poll_op( + cls, + ApiEndpoint(path=f"{PATH_LIP_SYNC}/{task_id}"), + response_model=KlingLipSyncResponse, + estimated_duration=AVERAGE_DURATION_LIP_SYNC, + status_extractor=lambda r: (r.data.task_status.value if r.data and r.data.task_status else None), + ) + validate_video_result_response(final_response) + + video = get_video_from_response(final_response) + return IO.NodeOutput(await download_url_to_video_output(str(video.url)), str(video.id), str(video.duration)) + + +class KlingCameraControls(IO.ComfyNode): """Kling Camera Controls Node""" @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "camera_control_type": model_field_to_node_input( - IO.COMBO, - KlingCameraControl, - "type", - enum_type=KlingCameraControlType, + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="KlingCameraControls", + display_name="Kling Camera Controls", + category="api node/video/Kling", + description="Allows specifying configuration options for Kling Camera Controls and motion control effects.", + inputs=[ + IO.Combo.Input("camera_control_type", options=KlingCameraControlType), + IO.Float.Input( + "horizontal_movement", + default=0.0, + min=-10.0, + max=10.0, + step=0.25, + display_mode=IO.NumberDisplay.slider, + tooltip="Controls camera's movement along horizontal axis (x-axis). Negative indicates left, positive indicates right", ), - "horizontal_movement": get_camera_control_input_config( - "Controls camera's movement along horizontal axis (x-axis). Negative indicates left, positive indicates right" + IO.Float.Input( + "vertical_movement", + default=0.0, + min=-10.0, + max=10.0, + step=0.25, + display_mode=IO.NumberDisplay.slider, + tooltip="Controls camera's movement along vertical axis (y-axis). Negative indicates downward, positive indicates upward.", ), - "vertical_movement": get_camera_control_input_config( - "Controls camera's movement along vertical axis (y-axis). Negative indicates downward, positive indicates upward." - ), - "pan": get_camera_control_input_config( - "Controls camera's rotation in vertical plane (x-axis). Negative indicates downward rotation, positive indicates upward rotation.", + IO.Float.Input( + "pan", default=0.5, + min=-10.0, + max=10.0, + step=0.25, + display_mode=IO.NumberDisplay.slider, + tooltip="Controls camera's rotation in vertical plane (x-axis). Negative indicates downward rotation, positive indicates upward rotation.", ), - "tilt": get_camera_control_input_config( - "Controls camera's rotation in horizontal plane (y-axis). Negative indicates left rotation, positive indicates right rotation.", + IO.Float.Input( + "tilt", + default=0.0, + min=-10.0, + max=10.0, + step=0.25, + display_mode=IO.NumberDisplay.slider, + tooltip="Controls camera's rotation in horizontal plane (y-axis). Negative indicates left rotation, positive indicates right rotation.", ), - "roll": get_camera_control_input_config( - "Controls camera's rolling amount (z-axis). Negative indicates counterclockwise, positive indicates clockwise.", + IO.Float.Input( + "roll", + default=0.0, + min=-10.0, + max=10.0, + step=0.25, + display_mode=IO.NumberDisplay.slider, + tooltip="Controls camera's rolling amount (z-axis). Negative indicates counterclockwise, positive indicates clockwise.", ), - "zoom": get_camera_control_input_config( - "Controls change in camera's focal length. Negative indicates narrower field of view, positive indicates wider field of view.", + IO.Float.Input( + "zoom", + default=0.0, + min=-10.0, + max=10.0, + step=0.25, + display_mode=IO.NumberDisplay.slider, + tooltip="Controls change in camera's focal length. Negative indicates narrower field of view, positive indicates wider field of view.", ), - } - } - - DESCRIPTION = "Allows specifying configuration options for Kling Camera Controls and motion control effects." - RETURN_TYPES = ("CAMERA_CONTROL",) - RETURN_NAMES = ("camera_control",) - FUNCTION = "main" - API_NODE = False # This is just a helper node, it doesn't make an API call + ], + outputs=[IO.Custom("CAMERA_CONTROL").Output(display_name="camera_control")], + ) @classmethod - def VALIDATE_INPUTS( + def validate_inputs( cls, horizontal_movement: float, vertical_movement: float, @@ -374,8 +650,9 @@ class KlingCameraControls(KlingNodeBase): return "Invalid camera control configs: at least one of the values must be non-zero" return True - def main( - self, + @classmethod + def execute( + cls, camera_control_type: str, horizontal_movement: float, vertical_movement: float, @@ -383,8 +660,8 @@ class KlingCameraControls(KlingNodeBase): tilt: float, roll: float, zoom: float, - ) -> tuple[KlingCameraControl]: - return ( + ) -> IO.NodeOutput: + return IO.NodeOutput( KlingCameraControl( type=KlingCameraControlType(camera_control_type), config=KlingCameraConfig( @@ -395,301 +672,178 @@ class KlingCameraControls(KlingNodeBase): tilt=tilt, zoom=zoom, ), - ), + ) ) -class KlingTextToVideoNode(KlingNodeBase): +class KlingTextToVideoNode(IO.ComfyNode): """Kling Text to Video Node""" - @staticmethod - def get_mode_string_mapping() -> dict[str, tuple[str, str, str]]: - """ - Returns a mapping of mode strings to their corresponding (mode, duration, model_name) tuples. - Only includes config combos that support the `image_tail` request field. - - See: [Kling API Docs Capability Map](https://app.klingai.com/global/dev/document-api/apiReference/model/skillsMap) - """ - return { - "standard mode / 5s duration / kling-v1": ("std", "5", "kling-v1"), - "standard mode / 10s duration / kling-v1": ("std", "10", "kling-v1"), - "pro mode / 5s duration / kling-v1": ("pro", "5", "kling-v1"), - "pro mode / 10s duration / kling-v1": ("pro", "10", "kling-v1"), - "standard mode / 5s duration / kling-v1-6": ("std", "5", "kling-v1-6"), - "standard mode / 10s duration / kling-v1-6": ("std", "10", "kling-v1-6"), - "pro mode / 5s duration / kling-v2-master": ("pro", "5", "kling-v2-master"), - "pro mode / 10s duration / kling-v2-master": ("pro", "10", "kling-v2-master"), - "standard mode / 5s duration / kling-v2-master": ("std", "5", "kling-v2-master"), - "standard mode / 10s duration / kling-v2-master": ("std", "10", "kling-v2-master"), - "pro mode / 5s duration / kling-v2-1-master": ("pro", "5", "kling-v2-1-master"), - "pro mode / 10s duration / kling-v2-1-master": ("pro", "10", "kling-v2-1-master"), - } - @classmethod - def INPUT_TYPES(s): - modes = list(KlingTextToVideoNode.get_mode_string_mapping().keys()) - return { - "required": { - "prompt": model_field_to_node_input( - IO.STRING, KlingText2VideoRequest, "prompt", multiline=True - ), - "negative_prompt": model_field_to_node_input( - IO.STRING, KlingText2VideoRequest, "negative_prompt", multiline=True - ), - "cfg_scale": model_field_to_node_input( - IO.FLOAT, - KlingText2VideoRequest, - "cfg_scale", - default=1.0, - min=0.0, - max=1.0, - ), - "aspect_ratio": model_field_to_node_input( - IO.COMBO, - KlingText2VideoRequest, + def define_schema(cls) -> IO.Schema: + modes = list(MODE_TEXT2VIDEO.keys()) + return IO.Schema( + node_id="KlingTextToVideoNode", + display_name="Kling Text to Video", + category="api node/video/Kling", + description="Kling Text to Video Node", + inputs=[ + IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), + IO.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), + IO.Float.Input("cfg_scale", default=1.0, min=0.0, max=1.0), + IO.Combo.Input( "aspect_ratio", - enum_type=KlingVideoGenAspectRatio, + options=KlingVideoGenAspectRatio, + default="16:9", ), - "mode": ( - modes, - { - "default": modes[4], - "tooltip": "The configuration to use for the video generation following the format: mode / duration / model_name.", - }, + IO.Combo.Input( + "mode", + options=modes, + default=modes[4], + tooltip="The configuration to use for the video generation following the format: mode / duration / model_name.", ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - RETURN_TYPES = ("VIDEO", "STRING", "STRING") - RETURN_NAMES = ("VIDEO", "video_id", "duration") - DESCRIPTION = "Kling Text to Video Node" - - async def get_response( - self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None - ) -> KlingText2VideoResponse: - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_TEXT_TO_VIDEO}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=KlingText2VideoResponse, - ), - result_url_extractor=get_video_url_from_response, - estimated_duration=AVERAGE_DURATION_T2V, - node_id=node_id, + ], + outputs=[ + IO.Video.Output(), + IO.String.Output(display_name="video_id"), + IO.String.Output(display_name="duration"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, ) - async def api_call( - self, + @classmethod + async def execute( + cls, prompt: str, negative_prompt: str, cfg_scale: float, mode: str, aspect_ratio: str, - camera_control: Optional[KlingCameraControl] = None, - model_name: Optional[str] = None, - duration: Optional[str] = None, - unique_id: Optional[str] = None, - **kwargs, - ) -> tuple[VideoFromFile, str, str]: - validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_T2V) - if model_name is None: - mode, duration, model_name = self.get_mode_string_mapping()[mode] - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_TEXT_TO_VIDEO, - method=HttpMethod.POST, - request_model=KlingText2VideoRequest, - response_model=KlingText2VideoResponse, - ), - request=KlingText2VideoRequest( - prompt=prompt if prompt else None, - negative_prompt=negative_prompt if negative_prompt else None, - duration=KlingVideoGenDuration(duration), - mode=KlingVideoGenMode(mode), - model_name=KlingVideoGenModelName(model_name), - cfg_scale=cfg_scale, - aspect_ratio=KlingVideoGenAspectRatio(aspect_ratio), - camera_control=camera_control, - ), - auth_kwargs=kwargs, + ) -> IO.NodeOutput: + model_mode, duration, model_name = MODE_TEXT2VIDEO[mode] + return await execute_text2video( + cls, + prompt=prompt, + negative_prompt=negative_prompt, + cfg_scale=cfg_scale, + model_mode=model_mode, + aspect_ratio=aspect_ratio, + model_name=model_name, + duration=duration, ) - task_creation_response = await initial_operation.execute() - validate_task_creation_response(task_creation_response) - task_id = task_creation_response.data.task_id - final_response = await self.get_response( - task_id, auth_kwargs=kwargs, node_id=unique_id - ) - validate_video_result_response(final_response) - - video = get_video_from_response(final_response) - return await video_result_to_node_output(video) - - -class KlingCameraControlT2VNode(KlingTextToVideoNode): +class KlingCameraControlT2VNode(IO.ComfyNode): """ Kling Text to Video Camera Control Node. This node is a text to video node, but it supports controlling the camera. Duration, mode, and model_name request fields are hard-coded because camera control is only supported in pro mode with the kling-v1-5 model at 5s duration as of 2025-05-02. """ @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": model_field_to_node_input( - IO.STRING, KlingText2VideoRequest, "prompt", multiline=True - ), - "negative_prompt": model_field_to_node_input( - IO.STRING, - KlingText2VideoRequest, - "negative_prompt", - multiline=True, - ), - "cfg_scale": model_field_to_node_input( - IO.FLOAT, - KlingText2VideoRequest, - "cfg_scale", - default=0.75, - min=0.0, - max=1.0, - ), - "aspect_ratio": model_field_to_node_input( - IO.COMBO, - KlingText2VideoRequest, + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="KlingCameraControlT2VNode", + display_name="Kling Text to Video (Camera Control)", + category="api node/video/Kling", + description="Transform text into cinematic videos with professional camera movements that simulate real-world cinematography. Control virtual camera actions including zoom, rotation, pan, tilt, and first-person view, while maintaining focus on your original text.", + inputs=[ + IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), + IO.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), + IO.Float.Input("cfg_scale", default=0.75, min=0.0, max=1.0), + IO.Combo.Input( "aspect_ratio", - enum_type=KlingVideoGenAspectRatio, + options=KlingVideoGenAspectRatio, + default="16:9", ), - "camera_control": ( - "CAMERA_CONTROL", - { - "tooltip": "Can be created using the Kling Camera Controls node. Controls the camera movement and motion during the video generation.", - }, + IO.Custom("CAMERA_CONTROL").Input( + "camera_control", + tooltip="Can be created using the Kling Camera Controls node. Controls the camera movement and motion during the video generation.", ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[ + IO.Video.Output(), + IO.String.Output(display_name="video_id"), + IO.String.Output(display_name="duration"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Transform text into cinematic videos with professional camera movements that simulate real-world cinematography. Control virtual camera actions including zoom, rotation, pan, tilt, and first-person view, while maintaining focus on your original text." - - async def api_call( - self, + @classmethod + async def execute( + cls, prompt: str, negative_prompt: str, cfg_scale: float, aspect_ratio: str, camera_control: Optional[KlingCameraControl] = None, - unique_id: Optional[str] = None, - **kwargs, - ): - return await super().api_call( + ) -> IO.NodeOutput: + return await execute_text2video( + cls, model_name=KlingVideoGenModelName.kling_v1, cfg_scale=cfg_scale, - mode=KlingVideoGenMode.std, + model_mode=KlingVideoGenMode.std, aspect_ratio=KlingVideoGenAspectRatio(aspect_ratio), duration=KlingVideoGenDuration.field_5, prompt=prompt, negative_prompt=negative_prompt, camera_control=camera_control, - **kwargs, ) -class KlingImage2VideoNode(KlingNodeBase): +class KlingImage2VideoNode(IO.ComfyNode): """Kling Image to Video Node""" @classmethod - def INPUT_TYPES(s): - return { - "required": { - "start_frame": model_field_to_node_input( - IO.IMAGE, - KlingImage2VideoRequest, - "image", - tooltip="The reference image used to generate the video.", - ), - "prompt": model_field_to_node_input( - IO.STRING, KlingImage2VideoRequest, "prompt", multiline=True - ), - "negative_prompt": model_field_to_node_input( - IO.STRING, - KlingImage2VideoRequest, - "negative_prompt", - multiline=True, - ), - "model_name": model_field_to_node_input( - IO.COMBO, - KlingImage2VideoRequest, + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="KlingImage2VideoNode", + display_name="Kling Image to Video", + category="api node/video/Kling", + description="Kling Image to Video Node", + inputs=[ + IO.Image.Input("start_frame", tooltip="The reference image used to generate the video."), + IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), + IO.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), + IO.Combo.Input( "model_name", - enum_type=KlingVideoGenModelName, + options=KlingVideoGenModelName, + default="kling-v2-master", ), - "cfg_scale": model_field_to_node_input( - IO.FLOAT, - KlingImage2VideoRequest, - "cfg_scale", - default=0.8, - min=0.0, - max=1.0, - ), - "mode": model_field_to_node_input( - IO.COMBO, - KlingImage2VideoRequest, - "mode", - enum_type=KlingVideoGenMode, - ), - "aspect_ratio": model_field_to_node_input( - IO.COMBO, - KlingImage2VideoRequest, + IO.Float.Input("cfg_scale", default=0.8, min=0.0, max=1.0), + IO.Combo.Input("mode", options=KlingVideoGenMode, default=KlingVideoGenMode.std), + IO.Combo.Input( "aspect_ratio", - enum_type=KlingVideoGenAspectRatio, + options=KlingVideoGenAspectRatio, + default=KlingVideoGenAspectRatio.field_16_9, ), - "duration": model_field_to_node_input( - IO.COMBO, - KlingImage2VideoRequest, - "duration", - enum_type=KlingVideoGenDuration, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - RETURN_TYPES = ("VIDEO", "STRING", "STRING") - RETURN_NAMES = ("VIDEO", "video_id", "duration") - DESCRIPTION = "Kling Image to Video Node" - - async def get_response( - self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None - ) -> KlingImage2VideoResponse: - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_IMAGE_TO_VIDEO}/{task_id}", - method=HttpMethod.GET, - request_model=KlingImage2VideoRequest, - response_model=KlingImage2VideoResponse, - ), - result_url_extractor=get_video_url_from_response, - estimated_duration=AVERAGE_DURATION_I2V, - node_id=node_id, + IO.Combo.Input("duration", options=KlingVideoGenDuration, default=KlingVideoGenDuration.field_5), + ], + outputs=[ + IO.Video.Output(), + IO.String.Output(display_name="video_id"), + IO.String.Output(display_name="duration"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, ) - async def api_call( - self, + @classmethod + async def execute( + cls, start_frame: torch.Tensor, prompt: str, negative_prompt: str, @@ -700,209 +854,143 @@ class KlingImage2VideoNode(KlingNodeBase): duration: str, camera_control: Optional[KlingCameraControl] = None, end_frame: Optional[torch.Tensor] = None, - unique_id: Optional[str] = None, - **kwargs, - ) -> tuple[VideoFromFile]: - validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_I2V) - validate_input_image(start_frame) - - if camera_control is not None: - # Camera control type for image 2 video is always `simple` - camera_control.type = KlingCameraControlType.simple - - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_IMAGE_TO_VIDEO, - method=HttpMethod.POST, - request_model=KlingImage2VideoRequest, - response_model=KlingImage2VideoResponse, - ), - request=KlingImage2VideoRequest( - model_name=KlingVideoGenModelName(model_name), - image=tensor_to_base64_string(start_frame), - image_tail=( - tensor_to_base64_string(end_frame) - if end_frame is not None - else None - ), - prompt=prompt, - negative_prompt=negative_prompt if negative_prompt else None, - cfg_scale=cfg_scale, - mode=KlingVideoGenMode(mode), - duration=KlingVideoGenDuration(duration), - camera_control=camera_control, - ), - auth_kwargs=kwargs, + ) -> IO.NodeOutput: + return await execute_image2video( + cls, + start_frame=start_frame, + prompt=prompt, + negative_prompt=negative_prompt, + cfg_scale=cfg_scale, + model_name=model_name, + aspect_ratio=aspect_ratio, + model_mode=mode, + duration=duration, + camera_control=camera_control, + end_frame=end_frame, ) - task_creation_response = await initial_operation.execute() - validate_task_creation_response(task_creation_response) - task_id = task_creation_response.data.task_id - final_response = await self.get_response( - task_id, auth_kwargs=kwargs, node_id=unique_id - ) - validate_video_result_response(final_response) - - video = get_video_from_response(final_response) - return await video_result_to_node_output(video) - - -class KlingCameraControlI2VNode(KlingImage2VideoNode): +class KlingCameraControlI2VNode(IO.ComfyNode): """ Kling Image to Video Camera Control Node. This node is a image to video node, but it supports controlling the camera. Duration, mode, and model_name request fields are hard-coded because camera control is only supported in pro mode with the kling-v1-5 model at 5s duration as of 2025-05-02. """ @classmethod - def INPUT_TYPES(s): - return { - "required": { - "start_frame": model_field_to_node_input( - IO.IMAGE, KlingImage2VideoRequest, "image" + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="KlingCameraControlI2VNode", + display_name="Kling Image to Video (Camera Control)", + category="api node/video/Kling", + description="Transform still images into cinematic videos with professional camera movements that simulate real-world cinematography. Control virtual camera actions including zoom, rotation, pan, tilt, and first-person view, while maintaining focus on your original image.", + inputs=[ + IO.Image.Input( + "start_frame", + tooltip="Reference Image - URL or Base64 encoded string, cannot exceed 10MB, resolution not less than 300*300px, aspect ratio between 1:2.5 ~ 2.5:1. Base64 should not include data:image prefix.", ), - "prompt": model_field_to_node_input( - IO.STRING, KlingImage2VideoRequest, "prompt", multiline=True - ), - "negative_prompt": model_field_to_node_input( - IO.STRING, - KlingImage2VideoRequest, - "negative_prompt", - multiline=True, - ), - "cfg_scale": model_field_to_node_input( - IO.FLOAT, - KlingImage2VideoRequest, - "cfg_scale", - default=0.75, - min=0.0, - max=1.0, - ), - "aspect_ratio": model_field_to_node_input( - IO.COMBO, - KlingImage2VideoRequest, + IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), + IO.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), + IO.Float.Input("cfg_scale", default=0.75, min=0.0, max=1.0), + IO.Combo.Input( "aspect_ratio", - enum_type=KlingVideoGenAspectRatio, + options=KlingVideoGenAspectRatio, + default=KlingVideoGenAspectRatio.field_16_9, ), - "camera_control": ( - "CAMERA_CONTROL", - { - "tooltip": "Can be created using the Kling Camera Controls node. Controls the camera movement and motion during the video generation.", - }, + IO.Custom("CAMERA_CONTROL").Input( + "camera_control", + tooltip="Can be created using the Kling Camera Controls node. Controls the camera movement and motion during the video generation.", ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[ + IO.Video.Output(), + IO.String.Output(display_name="video_id"), + IO.String.Output(display_name="duration"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Transform still images into cinematic videos with professional camera movements that simulate real-world cinematography. Control virtual camera actions including zoom, rotation, pan, tilt, and first-person view, while maintaining focus on your original image." - - async def api_call( - self, + @classmethod + async def execute( + cls, start_frame: torch.Tensor, prompt: str, negative_prompt: str, cfg_scale: float, aspect_ratio: str, camera_control: KlingCameraControl, - unique_id: Optional[str] = None, - **kwargs, - ): - return await super().api_call( + ) -> IO.NodeOutput: + return await execute_image2video( + cls, model_name=KlingVideoGenModelName.kling_v1_5, start_frame=start_frame, cfg_scale=cfg_scale, - mode=KlingVideoGenMode.pro, + model_mode=KlingVideoGenMode.pro, aspect_ratio=KlingVideoGenAspectRatio(aspect_ratio), duration=KlingVideoGenDuration.field_5, prompt=prompt, negative_prompt=negative_prompt, camera_control=camera_control, - unique_id=unique_id, - **kwargs, ) -class KlingStartEndFrameNode(KlingImage2VideoNode): +class KlingStartEndFrameNode(IO.ComfyNode): """ Kling First Last Frame Node. This node allows creation of a video from a first and last frame. It calls the normal image to video endpoint, but only allows the subset of input options that support the `image_tail` request field. """ - @staticmethod - def get_mode_string_mapping() -> dict[str, tuple[str, str, str]]: - """ - Returns a mapping of mode strings to their corresponding (mode, duration, model_name) tuples. - Only includes config combos that support the `image_tail` request field. - - See: [Kling API Docs Capability Map](https://app.klingai.com/global/dev/document-api/apiReference/model/skillsMap) - """ - return { - "standard mode / 5s duration / kling-v1": ("std", "5", "kling-v1"), - "pro mode / 5s duration / kling-v1": ("pro", "5", "kling-v1"), - "pro mode / 5s duration / kling-v1-5": ("pro", "5", "kling-v1-5"), - "pro mode / 10s duration / kling-v1-5": ("pro", "10", "kling-v1-5"), - "pro mode / 5s duration / kling-v1-6": ("pro", "5", "kling-v1-6"), - "pro mode / 10s duration / kling-v1-6": ("pro", "10", "kling-v1-6"), - "pro mode / 5s duration / kling-v2-1": ("pro", "5", "kling-v2-1"), - "pro mode / 10s duration / kling-v2-1": ("pro", "10", "kling-v2-1"), - } + @classmethod + def define_schema(cls) -> IO.Schema: + modes = list(MODE_START_END_FRAME.keys()) + return IO.Schema( + node_id="KlingStartEndFrameNode", + display_name="Kling Start-End Frame to Video", + category="api node/video/Kling", + description="Generate a video sequence that transitions between your provided start and end images. The node creates all frames in between, producing a smooth transformation from the first frame to the last.", + inputs=[ + IO.Image.Input( + "start_frame", + tooltip="Reference Image - URL or Base64 encoded string, cannot exceed 10MB, resolution not less than 300*300px, aspect ratio between 1:2.5 ~ 2.5:1. Base64 should not include data:image prefix.", + ), + IO.Image.Input( + "end_frame", + tooltip="Reference Image - End frame control. URL or Base64 encoded string, cannot exceed 10MB, resolution not less than 300*300px. Base64 should not include data:image prefix.", + ), + IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), + IO.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), + IO.Float.Input("cfg_scale", default=0.5, min=0.0, max=1.0), + IO.Combo.Input( + "aspect_ratio", + options=[i.value for i in KlingVideoGenAspectRatio], + default="16:9", + ), + IO.Combo.Input( + "mode", + options=modes, + default=modes[2], + tooltip="The configuration to use for the video generation following the format: mode / duration / model_name.", + ), + ], + outputs=[ + IO.Video.Output(), + IO.String.Output(display_name="video_id"), + IO.String.Output(display_name="duration"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - modes = list(KlingStartEndFrameNode.get_mode_string_mapping().keys()) - return { - "required": { - "start_frame": model_field_to_node_input( - IO.IMAGE, KlingImage2VideoRequest, "image" - ), - "end_frame": model_field_to_node_input( - IO.IMAGE, KlingImage2VideoRequest, "image_tail" - ), - "prompt": model_field_to_node_input( - IO.STRING, KlingImage2VideoRequest, "prompt", multiline=True - ), - "negative_prompt": model_field_to_node_input( - IO.STRING, - KlingImage2VideoRequest, - "negative_prompt", - multiline=True, - ), - "cfg_scale": model_field_to_node_input( - IO.FLOAT, - KlingImage2VideoRequest, - "cfg_scale", - default=0.5, - min=0.0, - max=1.0, - ), - "aspect_ratio": model_field_to_node_input( - IO.COMBO, - KlingImage2VideoRequest, - "aspect_ratio", - enum_type=KlingVideoGenAspectRatio, - ), - "mode": ( - modes, - { - "default": modes[2], - "tooltip": "The configuration to use for the video generation following the format: mode / duration / model_name.", - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - DESCRIPTION = "Generate a video sequence that transitions between your provided start and end images. The node creates all frames in between, producing a smooth transformation from the first frame to the last." - - async def api_call( - self, + async def execute( + cls, start_frame: torch.Tensor, end_frame: torch.Tensor, prompt: str, @@ -910,776 +998,469 @@ class KlingStartEndFrameNode(KlingImage2VideoNode): cfg_scale: float, aspect_ratio: str, mode: str, - unique_id: Optional[str] = None, - **kwargs, - ): - mode, duration, model_name = KlingStartEndFrameNode.get_mode_string_mapping()[ - mode - ] - return await super().api_call( + ) -> IO.NodeOutput: + mode, duration, model_name = MODE_START_END_FRAME[mode] + return await execute_image2video( + cls, prompt=prompt, negative_prompt=negative_prompt, model_name=model_name, start_frame=start_frame, cfg_scale=cfg_scale, - mode=mode, + model_mode=mode, aspect_ratio=aspect_ratio, duration=duration, end_frame=end_frame, - unique_id=unique_id, - **kwargs, ) -class KlingVideoExtendNode(KlingNodeBase): +class KlingVideoExtendNode(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": model_field_to_node_input( - IO.STRING, KlingVideoExtendRequest, "prompt", multiline=True + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="KlingVideoExtendNode", + display_name="Kling Video Extend", + category="api node/video/Kling", + description="Kling Video Extend Node. Extend videos made by other Kling nodes. The video_id is created by using other Kling Nodes.", + inputs=[ + IO.String.Input( + "prompt", + multiline=True, + tooltip="Positive text prompt for guiding the video extension", ), - "negative_prompt": model_field_to_node_input( - IO.STRING, - KlingVideoExtendRequest, + IO.String.Input( "negative_prompt", multiline=True, + tooltip="Negative text prompt for elements to avoid in the extended video", ), - "cfg_scale": model_field_to_node_input( - IO.FLOAT, - KlingVideoExtendRequest, - "cfg_scale", - default=0.5, - min=0.0, - max=1.0, + IO.Float.Input("cfg_scale", default=0.5, min=0.0, max=1.0), + IO.String.Input( + "video_id", + force_input=True, + tooltip="The ID of the video to be extended. Supports videos generated by text-to-video, image-to-video, and previous video extension operations. Cannot exceed 3 minutes total duration after extension.", ), - "video_id": model_field_to_node_input( - IO.STRING, KlingVideoExtendRequest, "video_id", forceInput=True - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - RETURN_TYPES = ("VIDEO", "STRING", "STRING") - RETURN_NAMES = ("VIDEO", "video_id", "duration") - DESCRIPTION = "Kling Video Extend Node. Extend videos made by other Kling nodes. The video_id is created by using other Kling Nodes." - - async def get_response( - self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None - ) -> KlingVideoExtendResponse: - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_VIDEO_EXTEND}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=KlingVideoExtendResponse, - ), - result_url_extractor=get_video_url_from_response, - estimated_duration=AVERAGE_DURATION_VIDEO_EXTEND, - node_id=node_id, + ], + outputs=[ + IO.Video.Output(), + IO.String.Output(display_name="video_id"), + IO.String.Output(display_name="duration"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, ) - async def api_call( - self, + @classmethod + async def execute( + cls, prompt: str, negative_prompt: str, cfg_scale: float, video_id: str, - unique_id: Optional[str] = None, - **kwargs, - ) -> tuple[VideoFromFile, str, str]: + ) -> IO.NodeOutput: validate_prompts(prompt, negative_prompt, MAX_PROMPT_LENGTH_T2V) - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_VIDEO_EXTEND, - method=HttpMethod.POST, - request_model=KlingVideoExtendRequest, - response_model=KlingVideoExtendResponse, - ), - request=KlingVideoExtendRequest( + task_creation_response = await sync_op( + cls, + ApiEndpoint(path=PATH_VIDEO_EXTEND, method="POST"), + response_model=KlingVideoExtendResponse, + data=KlingVideoExtendRequest( prompt=prompt if prompt else None, negative_prompt=negative_prompt if negative_prompt else None, cfg_scale=cfg_scale, video_id=video_id, ), - auth_kwargs=kwargs, ) - task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = await self.get_response( - task_id, auth_kwargs=kwargs, node_id=unique_id + final_response = await poll_op( + cls, + ApiEndpoint(path=f"{PATH_VIDEO_EXTEND}/{task_id}"), + response_model=KlingVideoExtendResponse, + estimated_duration=AVERAGE_DURATION_VIDEO_EXTEND, + status_extractor=lambda r: (r.data.task_status.value if r.data and r.data.task_status else None), ) validate_video_result_response(final_response) video = get_video_from_response(final_response) - return await video_result_to_node_output(video) + return IO.NodeOutput(await download_url_to_video_output(str(video.url)), str(video.id), str(video.duration)) -class KlingVideoEffectsBase(KlingNodeBase): - """Kling Video Effects Base""" - - RETURN_TYPES = ("VIDEO", "STRING", "STRING") - RETURN_NAMES = ("VIDEO", "video_id", "duration") - - async def get_response( - self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None - ) -> KlingVideoEffectsResponse: - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_VIDEO_EFFECTS}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=KlingVideoEffectsResponse, - ), - result_url_extractor=get_video_url_from_response, - estimated_duration=AVERAGE_DURATION_VIDEO_EFFECTS, - node_id=node_id, - ) - - async def api_call( - self, - dual_character: bool, - effect_scene: KlingDualCharacterEffectsScene | KlingSingleImageEffectsScene, - model_name: str, - duration: KlingVideoGenDuration, - image_1: torch.Tensor, - image_2: Optional[torch.Tensor] = None, - mode: Optional[KlingVideoGenMode] = None, - unique_id: Optional[str] = None, - **kwargs, - ): - if dual_character: - request_input_field = KlingDualCharacterEffectInput( - model_name=model_name, - mode=mode, - images=[ - tensor_to_base64_string(image_1), - tensor_to_base64_string(image_2), - ], - duration=duration, - ) - else: - request_input_field = KlingSingleImageEffectInput( - model_name=model_name, - image=tensor_to_base64_string(image_1), - duration=duration, - ) - - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_VIDEO_EFFECTS, - method=HttpMethod.POST, - request_model=KlingVideoEffectsRequest, - response_model=KlingVideoEffectsResponse, - ), - request=KlingVideoEffectsRequest( - effect_scene=effect_scene, - input=request_input_field, - ), - auth_kwargs=kwargs, - ) - - task_creation_response = await initial_operation.execute() - validate_task_creation_response(task_creation_response) - task_id = task_creation_response.data.task_id - - final_response = await self.get_response( - task_id, auth_kwargs=kwargs, node_id=unique_id - ) - validate_video_result_response(final_response) - - video = get_video_from_response(final_response) - return await video_result_to_node_output(video) - - -class KlingDualCharacterVideoEffectNode(KlingVideoEffectsBase): +class KlingDualCharacterVideoEffectNode(IO.ComfyNode): """Kling Dual Character Video Effect Node""" @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image_left": (IO.IMAGE, {"tooltip": "Left side image"}), - "image_right": (IO.IMAGE, {"tooltip": "Right side image"}), - "effect_scene": model_field_to_node_input( - IO.COMBO, - KlingVideoEffectsRequest, + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="KlingDualCharacterVideoEffectNode", + display_name="Kling Dual Character Video Effects", + category="api node/video/Kling", + description="Achieve different special effects when generating a video based on the effect_scene. First image will be positioned on left side, second on right side of the composite.", + inputs=[ + IO.Image.Input("image_left", tooltip="Left side image"), + IO.Image.Input("image_right", tooltip="Right side image"), + IO.Combo.Input( "effect_scene", - enum_type=KlingDualCharacterEffectsScene, + options=[i.value for i in KlingDualCharacterEffectsScene], ), - "model_name": model_field_to_node_input( - IO.COMBO, - KlingDualCharacterEffectInput, + IO.Combo.Input( "model_name", - enum_type=KlingCharacterEffectModelName, + options=[i.value for i in KlingCharacterEffectModelName], + default="kling-v1", ), - "mode": model_field_to_node_input( - IO.COMBO, - KlingDualCharacterEffectInput, + IO.Combo.Input( "mode", - enum_type=KlingVideoGenMode, + options=[i.value for i in KlingVideoGenMode], + default="std", ), - "duration": model_field_to_node_input( - IO.COMBO, - KlingDualCharacterEffectInput, + IO.Combo.Input( "duration", - enum_type=KlingVideoGenDuration, + options=[i.value for i in KlingVideoGenDuration], ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[ + IO.Video.Output(), + IO.String.Output(display_name="duration"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Achieve different special effects when generating a video based on the effect_scene. First image will be positioned on left side, second on right side of the composite." - RETURN_TYPES = ("VIDEO", "STRING") - RETURN_NAMES = ("VIDEO", "duration") - - async def api_call( - self, + @classmethod + async def execute( + cls, image_left: torch.Tensor, image_right: torch.Tensor, effect_scene: KlingDualCharacterEffectsScene, model_name: KlingCharacterEffectModelName, mode: KlingVideoGenMode, duration: KlingVideoGenDuration, - unique_id: Optional[str] = None, - **kwargs, - ): - video, _, duration = await super().api_call( + ) -> IO.NodeOutput: + video, _, duration = await execute_video_effect( + cls, dual_character=True, effect_scene=effect_scene, model_name=model_name, - mode=mode, + model_mode=mode, duration=duration, image_1=image_left, image_2=image_right, - unique_id=unique_id, - **kwargs, ) - return video, duration + return IO.NodeOutput(video, duration) -class KlingSingleImageVideoEffectNode(KlingVideoEffectsBase): +class KlingSingleImageVideoEffectNode(IO.ComfyNode): """Kling Single Image Video Effect Node""" @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": ( - IO.IMAGE, - { - "tooltip": " Reference Image. URL or Base64 encoded string (without data:image prefix). File size cannot exceed 10MB, resolution not less than 300*300px, aspect ratio between 1:2.5 ~ 2.5:1" - }, - ), - "effect_scene": model_field_to_node_input( - IO.COMBO, - KlingVideoEffectsRequest, + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="KlingSingleImageVideoEffectNode", + display_name="Kling Video Effects", + category="api node/video/Kling", + description="Achieve different special effects when generating a video based on the effect_scene.", + inputs=[ + IO.Image.Input("image", tooltip=" Reference Image. URL or Base64 encoded string (without data:image prefix). File size cannot exceed 10MB, resolution not less than 300*300px, aspect ratio between 1:2.5 ~ 2.5:1"), + IO.Combo.Input( "effect_scene", - enum_type=KlingSingleImageEffectsScene, + options=[i.value for i in KlingSingleImageEffectsScene], ), - "model_name": model_field_to_node_input( - IO.COMBO, - KlingSingleImageEffectInput, + IO.Combo.Input( "model_name", - enum_type=KlingSingleImageEffectModelName, + options=[i.value for i in KlingSingleImageEffectModelName], ), - "duration": model_field_to_node_input( - IO.COMBO, - KlingSingleImageEffectInput, + IO.Combo.Input( "duration", - enum_type=KlingVideoGenDuration, + options=[i.value for i in KlingVideoGenDuration], ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[ + IO.Video.Output(), + IO.String.Output(display_name="video_id"), + IO.String.Output(display_name="duration"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Achieve different special effects when generating a video based on the effect_scene." - - async def api_call( - self, + @classmethod + async def execute( + cls, image: torch.Tensor, effect_scene: KlingSingleImageEffectsScene, model_name: KlingSingleImageEffectModelName, duration: KlingVideoGenDuration, - unique_id: Optional[str] = None, - **kwargs, - ): - return await super().api_call( - dual_character=False, - effect_scene=effect_scene, - model_name=model_name, - duration=duration, - image_1=image, - unique_id=unique_id, - **kwargs, - ) - - -class KlingLipSyncBase(KlingNodeBase): - """Kling Lip Sync Base""" - - RETURN_TYPES = ("VIDEO", "STRING", "STRING") - RETURN_NAMES = ("VIDEO", "video_id", "duration") - - def validate_lip_sync_video(self, video: VideoInput): - """ - Validates the input video adheres to the expectations of the Kling Lip Sync API: - - Video length does not exceed 10s and is not shorter than 2s - - Length and width dimensions should both be between 720px and 1920px - - See: https://app.klingai.com/global/dev/document-api/apiReference/model/videoTolip - """ - validate_video_dimensions(video, 720, 1920) - validate_video_duration(video, 2, 10) - - def validate_text(self, text: str): - if not text: - raise ValueError("Text is required") - if len(text) > MAX_PROMPT_LENGTH_LIP_SYNC: - raise ValueError( - f"Text is too long. Maximum length is {MAX_PROMPT_LENGTH_LIP_SYNC} characters." + ) -> IO.NodeOutput: + return IO.NodeOutput( + *( + await execute_video_effect( + cls, + dual_character=False, + effect_scene=effect_scene, + model_name=model_name, + duration=duration, + image_1=image, + ) ) - - async def get_response( - self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None - ) -> KlingLipSyncResponse: - """Polls the Kling API endpoint until the task reaches a terminal state.""" - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_LIP_SYNC}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=KlingLipSyncResponse, - ), - result_url_extractor=get_video_url_from_response, - estimated_duration=AVERAGE_DURATION_LIP_SYNC, - node_id=node_id, ) - async def api_call( - self, - video: VideoInput, - audio: Optional[AudioInput] = None, - voice_language: Optional[str] = None, - mode: Optional[str] = None, - text: Optional[str] = None, - voice_speed: Optional[float] = None, - voice_id: Optional[str] = None, - unique_id: Optional[str] = None, - **kwargs, - ) -> tuple[VideoFromFile, str, str]: - if text: - self.validate_text(text) - self.validate_lip_sync_video(video) - # Upload video to Comfy API and get download URL - video_url = await upload_video_to_comfyapi(video, auth_kwargs=kwargs) - logging.info("Uploaded video to Comfy API. URL: %s", video_url) - - # Upload the audio file to Comfy API and get download URL - if audio: - audio_url = await upload_audio_to_comfyapi(audio, auth_kwargs=kwargs) - logging.info("Uploaded audio to Comfy API. URL: %s", audio_url) - else: - audio_url = None - - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_LIP_SYNC, - method=HttpMethod.POST, - request_model=KlingLipSyncRequest, - response_model=KlingLipSyncResponse, - ), - request=KlingLipSyncRequest( - input=KlingLipSyncInputObject( - video_url=video_url, - mode=mode, - text=text, - voice_language=voice_language, - voice_speed=voice_speed, - audio_type="url", - audio_url=audio_url, - voice_id=voice_id, - ), - ), - auth_kwargs=kwargs, - ) - - task_creation_response = await initial_operation.execute() - validate_task_creation_response(task_creation_response) - task_id = task_creation_response.data.task_id - - final_response = await self.get_response( - task_id, auth_kwargs=kwargs, node_id=unique_id - ) - validate_video_result_response(final_response) - - video = get_video_from_response(final_response) - return await video_result_to_node_output(video) - - -class KlingLipSyncAudioToVideoNode(KlingLipSyncBase): +class KlingLipSyncAudioToVideoNode(IO.ComfyNode): """Kling Lip Sync Audio to Video Node. Syncs mouth movements in a video file to the audio content of an audio file.""" @classmethod - def INPUT_TYPES(s): - return { - "required": { - "video": (IO.VIDEO, {}), - "audio": (IO.AUDIO, {}), - "voice_language": model_field_to_node_input( - IO.COMBO, - KlingLipSyncInputObject, + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="KlingLipSyncAudioToVideoNode", + display_name="Kling Lip Sync Video with Audio", + category="api node/video/Kling", + description="Kling Lip Sync Audio to Video Node. Syncs mouth movements in a video file to the audio content of an audio file. When using, ensure that the audio contains clearly distinguishable vocals and that the video contains a distinct face. The audio file should not be larger than 5MB. The video file should not be larger than 100MB, should have height/width between 720px and 1920px, and should be between 2s and 10s in length.", + inputs=[ + IO.Video.Input("video"), + IO.Audio.Input("audio"), + IO.Combo.Input( "voice_language", - enum_type=KlingLipSyncVoiceLanguage, + options=[i.value for i in KlingLipSyncVoiceLanguage], + default="en", ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[ + IO.Video.Output(), + IO.String.Output(display_name="video_id"), + IO.String.Output(display_name="duration"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Kling Lip Sync Audio to Video Node. Syncs mouth movements in a video file to the audio content of an audio file. When using, ensure that the audio contains clearly distinguishable vocals and that the video contains a distinct face. The audio file should not be larger than 5MB. The video file should not be larger than 100MB, should have height/width between 720px and 1920px, and should be between 2s and 10s in length." - - async def api_call( - self, + @classmethod + async def execute( + cls, video: VideoInput, audio: AudioInput, voice_language: str, - unique_id: Optional[str] = None, - **kwargs, - ): - return await super().api_call( + ) -> IO.NodeOutput: + return await execute_lipsync( + cls, video=video, audio=audio, voice_language=voice_language, - mode="audio2video", - unique_id=unique_id, - **kwargs, + model_mode="audio2video", ) -class KlingLipSyncTextToVideoNode(KlingLipSyncBase): +class KlingLipSyncTextToVideoNode(IO.ComfyNode): """Kling Lip Sync Text to Video Node. Syncs mouth movements in a video file to a text prompt.""" - @staticmethod - def get_voice_config() -> dict[str, tuple[str, str]]: - return { - # English voices - "Melody": ("girlfriend_4_speech02", "en"), - "Sunny": ("genshin_vindi2", "en"), - "Sage": ("zhinen_xuesheng", "en"), - "Ace": ("AOT", "en"), - "Blossom": ("ai_shatang", "en"), - "Peppy": ("genshin_klee2", "en"), - "Dove": ("genshin_kirara", "en"), - "Shine": ("ai_kaiya", "en"), - "Anchor": ("oversea_male1", "en"), - "Lyric": ("ai_chenjiahao_712", "en"), - "Tender": ("chat1_female_new-3", "en"), - "Siren": ("chat_0407_5-1", "en"), - "Zippy": ("cartoon-boy-07", "en"), - "Bud": ("uk_boy1", "en"), - "Sprite": ("cartoon-girl-01", "en"), - "Candy": ("PeppaPig_platform", "en"), - "Beacon": ("ai_huangzhong_712", "en"), - "Rock": ("ai_huangyaoshi_712", "en"), - "Titan": ("ai_laoguowang_712", "en"), - "Grace": ("chengshu_jiejie", "en"), - "Helen": ("you_pingjing", "en"), - "Lore": ("calm_story1", "en"), - "Crag": ("uk_man2", "en"), - "Prattle": ("laopopo_speech02", "en"), - "Hearth": ("heainainai_speech02", "en"), - "The Reader": ("reader_en_m-v1", "en"), - "Commercial Lady": ("commercial_lady_en_f-v1", "en"), - # Chinese voices - "阳光少年": ("genshin_vindi2", "zh"), - "懂事小弟": ("zhinen_xuesheng", "zh"), - "运动少年": ("tiyuxi_xuedi", "zh"), - "青春少女": ("ai_shatang", "zh"), - "温柔小妹": ("genshin_klee2", "zh"), - "元气少女": ("genshin_kirara", "zh"), - "阳光男生": ("ai_kaiya", "zh"), - "幽默小哥": ("tiexin_nanyou", "zh"), - "文艺小哥": ("ai_chenjiahao_712", "zh"), - "甜美邻家": ("girlfriend_1_speech02", "zh"), - "温柔姐姐": ("chat1_female_new-3", "zh"), - "职场女青": ("girlfriend_2_speech02", "zh"), - "活泼男童": ("cartoon-boy-07", "zh"), - "俏皮女童": ("cartoon-girl-01", "zh"), - "稳重老爸": ("ai_huangyaoshi_712", "zh"), - "温柔妈妈": ("you_pingjing", "zh"), - "严肃上司": ("ai_laoguowang_712", "zh"), - "优雅贵妇": ("chengshu_jiejie", "zh"), - "慈祥爷爷": ("zhuxi_speech02", "zh"), - "唠叨爷爷": ("uk_oldman3", "zh"), - "唠叨奶奶": ("laopopo_speech02", "zh"), - "和蔼奶奶": ("heainainai_speech02", "zh"), - "东北老铁": ("dongbeilaotie_speech02", "zh"), - "重庆小伙": ("chongqingxiaohuo_speech02", "zh"), - "四川妹子": ("chuanmeizi_speech02", "zh"), - "潮汕大叔": ("chaoshandashu_speech02", "zh"), - "台湾男生": ("ai_taiwan_man2_speech02", "zh"), - "西安掌柜": ("xianzhanggui_speech02", "zh"), - "天津姐姐": ("tianjinjiejie_speech02", "zh"), - "新闻播报男": ("diyinnansang_DB_CN_M_04-v2", "zh"), - "译制片男": ("yizhipiannan-v1", "zh"), - "撒娇女友": ("tianmeixuemei-v1", "zh"), - "刀片烟嗓": ("daopianyansang-v1", "zh"), - "乖巧正太": ("mengwa-v1", "zh"), - } + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="KlingLipSyncTextToVideoNode", + display_name="Kling Lip Sync Video with Text", + category="api node/video/Kling", + description="Kling Lip Sync Text to Video Node. Syncs mouth movements in a video file to a text prompt. The video file should not be larger than 100MB, should have height/width between 720px and 1920px, and should be between 2s and 10s in length.", + inputs=[ + IO.Video.Input("video"), + IO.String.Input( + "text", + multiline=True, + tooltip="Text Content for Lip-Sync Video Generation. Required when mode is text2video. Maximum length is 120 characters.", + ), + IO.Combo.Input( + "voice", + options=list(VOICES_CONFIG.keys()), + default="Melody", + ), + IO.Float.Input( + "voice_speed", + default=1, + min=0.8, + max=2.0, + display_mode=IO.NumberDisplay.slider, + tooltip="Speech Rate. Valid range: 0.8~2.0, accurate to one decimal place.", + ), + ], + outputs=[ + IO.Video.Output(), + IO.String.Output(display_name="video_id"), + IO.String.Output(display_name="duration"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - voice_options = list(s.get_voice_config().keys()) - return { - "required": { - "video": (IO.VIDEO, {}), - "text": model_field_to_node_input( - IO.STRING, KlingLipSyncInputObject, "text", multiline=True - ), - "voice": (voice_options, {"default": voice_options[0]}), - "voice_speed": model_field_to_node_input( - IO.FLOAT, KlingLipSyncInputObject, "voice_speed", slider=True - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - DESCRIPTION = "Kling Lip Sync Text to Video Node. Syncs mouth movements in a video file to a text prompt. The video file should not be larger than 100MB, should have height/width between 720px and 1920px, and should be between 2s and 10s in length." - - async def api_call( - self, + async def execute( + cls, video: VideoInput, text: str, voice: str, voice_speed: float, - unique_id: Optional[str] = None, - **kwargs, - ): - voice_id, voice_language = KlingLipSyncTextToVideoNode.get_voice_config()[voice] - return await super().api_call( + ) -> IO.NodeOutput: + voice_id, voice_language = VOICES_CONFIG[voice] + return await execute_lipsync( + cls, video=video, text=text, voice_language=voice_language, voice_id=voice_id, voice_speed=voice_speed, - mode="text2video", - unique_id=unique_id, - **kwargs, + model_mode="text2video", ) -class KlingImageGenerationBase(KlingNodeBase): - """Kling Image Generation Base Node.""" - - RETURN_TYPES = ("IMAGE",) - CATEGORY = "api node/image/Kling" - - def validate_prompt(self, prompt: str, negative_prompt: Optional[str] = None): - if not prompt or len(prompt) > MAX_PROMPT_LENGTH_IMAGE_GEN: - raise ValueError( - f"Prompt must be less than {MAX_PROMPT_LENGTH_IMAGE_GEN} characters" - ) - if negative_prompt and len(negative_prompt) > MAX_PROMPT_LENGTH_IMAGE_GEN: - raise ValueError( - f"Negative prompt must be less than {MAX_PROMPT_LENGTH_IMAGE_GEN} characters" - ) - - -class KlingVirtualTryOnNode(KlingImageGenerationBase): +class KlingVirtualTryOnNode(IO.ComfyNode): """Kling Virtual Try On Node.""" @classmethod - def INPUT_TYPES(s): - return { - "required": { - "human_image": (IO.IMAGE, {}), - "cloth_image": (IO.IMAGE, {}), - "model_name": model_field_to_node_input( - IO.COMBO, - KlingVirtualTryOnRequest, + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="KlingVirtualTryOnNode", + display_name="Kling Virtual Try On", + category="api node/image/Kling", + description="Kling Virtual Try On Node. Input a human image and a cloth image to try on the cloth on the human. You can merge multiple clothing item pictures into one image with a white background.", + inputs=[ + IO.Image.Input("human_image"), + IO.Image.Input("cloth_image"), + IO.Combo.Input( "model_name", - enum_type=KlingVirtualTryOnModelName, + options=[i.value for i in KlingVirtualTryOnModelName], + default="kolors-virtual-try-on-v1", ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - DESCRIPTION = "Kling Virtual Try On Node. Input a human image and a cloth image to try on the cloth on the human. You can merge multiple clothing item pictures into one image with a white background." - - async def get_response( - self, task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None - ) -> KlingVirtualTryOnResponse: - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_VIRTUAL_TRY_ON}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=KlingVirtualTryOnResponse, - ), - result_url_extractor=get_images_urls_from_response, - estimated_duration=AVERAGE_DURATION_VIRTUAL_TRY_ON, - node_id=node_id, + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, ) - async def api_call( - self, + @classmethod + async def execute( + cls, human_image: torch.Tensor, cloth_image: torch.Tensor, model_name: KlingVirtualTryOnModelName, - unique_id: Optional[str] = None, - **kwargs, - ): - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_VIRTUAL_TRY_ON, - method=HttpMethod.POST, - request_model=KlingVirtualTryOnRequest, - response_model=KlingVirtualTryOnResponse, - ), - request=KlingVirtualTryOnRequest( + ) -> IO.NodeOutput: + task_creation_response = await sync_op( + cls, + ApiEndpoint(path=PATH_VIRTUAL_TRY_ON, method="POST"), + response_model=KlingVirtualTryOnResponse, + data=KlingVirtualTryOnRequest( human_image=tensor_to_base64_string(human_image), cloth_image=tensor_to_base64_string(cloth_image), model_name=model_name, ), - auth_kwargs=kwargs, ) - task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = await self.get_response( - task_id, auth_kwargs=kwargs, node_id=unique_id + final_response = await poll_op( + cls, + ApiEndpoint(path=f"{PATH_VIRTUAL_TRY_ON}/{task_id}"), + response_model=KlingVirtualTryOnResponse, + estimated_duration=AVERAGE_DURATION_VIRTUAL_TRY_ON, + status_extractor=lambda r: (r.data.task_status.value if r.data and r.data.task_status else None), ) validate_image_result_response(final_response) images = get_images_from_response(final_response) - return (await image_result_to_node_output(images),) + return IO.NodeOutput(await image_result_to_node_output(images)) -class KlingImageGenerationNode(KlingImageGenerationBase): +class KlingImageGenerationNode(IO.ComfyNode): """Kling Image Generation Node. Generate an image from a text prompt with an optional reference image.""" @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": model_field_to_node_input( - IO.STRING, - KlingImageGenerationsRequest, - "prompt", - multiline=True, - max_length=MAX_PROMPT_LENGTH_IMAGE_GEN, + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="KlingImageGenerationNode", + display_name="Kling Image Generation", + category="api node/image/Kling", + description="Kling Image Generation Node. Generate an image from a text prompt with an optional reference image.", + inputs=[ + IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), + IO.String.Input("negative_prompt", multiline=True, tooltip="Negative text prompt"), + IO.Combo.Input( + "image_type", + options=[i.value for i in KlingImageGenImageReferenceType], ), - "negative_prompt": model_field_to_node_input( - IO.STRING, - KlingImageGenerationsRequest, - "negative_prompt", - multiline=True, - ), - "image_type": model_field_to_node_input( - IO.COMBO, - KlingImageGenerationsRequest, - "image_reference", - enum_type=KlingImageGenImageReferenceType, - ), - "image_fidelity": model_field_to_node_input( - IO.FLOAT, - KlingImageGenerationsRequest, + IO.Float.Input( "image_fidelity", - slider=True, + default=0.5, + min=0.0, + max=1.0, step=0.01, + display_mode=IO.NumberDisplay.slider, + tooltip="Reference intensity for user-uploaded images", ), - "human_fidelity": model_field_to_node_input( - IO.FLOAT, - KlingImageGenerationsRequest, + IO.Float.Input( "human_fidelity", - slider=True, + default=0.45, + min=0.0, + max=1.0, step=0.01, + display_mode=IO.NumberDisplay.slider, + tooltip="Subject reference similarity", ), - "model_name": model_field_to_node_input( - IO.COMBO, - KlingImageGenerationsRequest, + IO.Combo.Input( "model_name", - enum_type=KlingImageGenModelName, + options=[i.value for i in KlingImageGenModelName], + default="kling-v1", ), - "aspect_ratio": model_field_to_node_input( - IO.COMBO, - KlingImageGenerationsRequest, + IO.Combo.Input( "aspect_ratio", - enum_type=KlingImageGenAspectRatio, + options=[i.value for i in KlingImageGenAspectRatio], + default="16:9", ), - "n": model_field_to_node_input( - IO.INT, - KlingImageGenerationsRequest, + IO.Int.Input( "n", + default=1, + min=1, + max=9, + tooltip="Number of generated images", ), - }, - "optional": { - "image": (IO.IMAGE, {}), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - DESCRIPTION = "Kling Image Generation Node. Generate an image from a text prompt with an optional reference image." - - async def get_response( - self, - task_id: str, - auth_kwargs: Optional[dict[str, str]], - node_id: Optional[str] = None, - ) -> KlingImageGenerationsResponse: - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_IMAGE_GENERATIONS}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=KlingImageGenerationsResponse, - ), - result_url_extractor=get_images_urls_from_response, - estimated_duration=AVERAGE_DURATION_IMAGE_GEN, - node_id=node_id, + IO.Image.Input("image", optional=True), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, ) - async def api_call( - self, + @classmethod + async def execute( + cls, model_name: KlingImageGenModelName, prompt: str, negative_prompt: str, @@ -1689,10 +1470,9 @@ class KlingImageGenerationNode(KlingImageGenerationBase): n: int, aspect_ratio: KlingImageGenAspectRatio, image: Optional[torch.Tensor] = None, - unique_id: Optional[str] = None, - **kwargs, - ): - self.validate_prompt(prompt, negative_prompt) + ) -> IO.NodeOutput: + validate_string(prompt, field_name="prompt", min_length=1, max_length=MAX_PROMPT_LENGTH_IMAGE_GEN) + validate_string(negative_prompt, field_name="negative_prompt", max_length=MAX_PROMPT_LENGTH_IMAGE_GEN) if image is None: image_type = None @@ -1701,14 +1481,11 @@ class KlingImageGenerationNode(KlingImageGenerationBase): else: image = tensor_to_base64_string(image) - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_IMAGE_GENERATIONS, - method=HttpMethod.POST, - request_model=KlingImageGenerationsRequest, - response_model=KlingImageGenerationsResponse, - ), - request=KlingImageGenerationsRequest( + task_creation_response = await sync_op( + cls, + ApiEndpoint(path=PATH_IMAGE_GENERATIONS, method="POST"), + response_model=KlingImageGenerationsResponse, + data=KlingImageGenerationsRequest( model_name=model_name, prompt=prompt, negative_prompt=negative_prompt, @@ -1719,50 +1496,43 @@ class KlingImageGenerationNode(KlingImageGenerationBase): n=n, aspect_ratio=aspect_ratio, ), - auth_kwargs=kwargs, ) - task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) task_id = task_creation_response.data.task_id - final_response = await self.get_response( - task_id, auth_kwargs=kwargs, node_id=unique_id + final_response = await poll_op( + cls, + ApiEndpoint(path=f"{PATH_IMAGE_GENERATIONS}/{task_id}"), + response_model=KlingImageGenerationsResponse, + estimated_duration=AVERAGE_DURATION_IMAGE_GEN, + status_extractor=lambda r: (r.data.task_status.value if r.data and r.data.task_status else None), ) validate_image_result_response(final_response) images = get_images_from_response(final_response) - return (await image_result_to_node_output(images),) + return IO.NodeOutput(await image_result_to_node_output(images)) -NODE_CLASS_MAPPINGS = { - "KlingCameraControls": KlingCameraControls, - "KlingTextToVideoNode": KlingTextToVideoNode, - "KlingImage2VideoNode": KlingImage2VideoNode, - "KlingCameraControlI2VNode": KlingCameraControlI2VNode, - "KlingCameraControlT2VNode": KlingCameraControlT2VNode, - "KlingStartEndFrameNode": KlingStartEndFrameNode, - "KlingVideoExtendNode": KlingVideoExtendNode, - "KlingLipSyncAudioToVideoNode": KlingLipSyncAudioToVideoNode, - "KlingLipSyncTextToVideoNode": KlingLipSyncTextToVideoNode, - "KlingVirtualTryOnNode": KlingVirtualTryOnNode, - "KlingImageGenerationNode": KlingImageGenerationNode, - "KlingSingleImageVideoEffectNode": KlingSingleImageVideoEffectNode, - "KlingDualCharacterVideoEffectNode": KlingDualCharacterVideoEffectNode, -} +class KlingExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + KlingCameraControls, + KlingTextToVideoNode, + KlingImage2VideoNode, + KlingCameraControlI2VNode, + KlingCameraControlT2VNode, + KlingStartEndFrameNode, + KlingVideoExtendNode, + KlingLipSyncAudioToVideoNode, + KlingLipSyncTextToVideoNode, + KlingVirtualTryOnNode, + KlingImageGenerationNode, + KlingSingleImageVideoEffectNode, + KlingDualCharacterVideoEffectNode, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "KlingCameraControls": "Kling Camera Controls", - "KlingTextToVideoNode": "Kling Text to Video", - "KlingImage2VideoNode": "Kling Image to Video", - "KlingCameraControlI2VNode": "Kling Image to Video (Camera Control)", - "KlingCameraControlT2VNode": "Kling Text to Video (Camera Control)", - "KlingStartEndFrameNode": "Kling Start-End Frame to Video", - "KlingVideoExtendNode": "Kling Video Extend", - "KlingLipSyncAudioToVideoNode": "Kling Lip Sync Video with Audio", - "KlingLipSyncTextToVideoNode": "Kling Lip Sync Video with Text", - "KlingVirtualTryOnNode": "Kling Virtual Try On", - "KlingImageGenerationNode": "Kling Image Generation", - "KlingSingleImageVideoEffectNode": "Kling Video Effects", - "KlingDualCharacterVideoEffectNode": "Kling Dual Character Video Effects", -} + +async def comfy_entrypoint() -> KlingExtension: + return KlingExtension() diff --git a/comfy_api_nodes/nodes_ltxv.py b/comfy_api_nodes/nodes_ltxv.py new file mode 100644 index 000000000..0b757a62b --- /dev/null +++ b/comfy_api_nodes/nodes_ltxv.py @@ -0,0 +1,199 @@ +from io import BytesIO +from typing import Optional + +import torch +from pydantic import BaseModel, Field +from typing_extensions import override + +from comfy_api.input_impl import VideoFromFile +from comfy_api.latest import IO, ComfyExtension +from comfy_api_nodes.util import ( + ApiEndpoint, + get_number_of_images, + sync_op_raw, + upload_images_to_comfyapi, + validate_string, +) + +MODELS_MAP = { + "LTX-2 (Pro)": "ltx-2-pro", + "LTX-2 (Fast)": "ltx-2-fast", +} + + +class ExecuteTaskRequest(BaseModel): + prompt: str = Field(...) + model: str = Field(...) + duration: int = Field(...) + resolution: str = Field(...) + fps: Optional[int] = Field(25) + generate_audio: Optional[bool] = Field(True) + image_uri: Optional[str] = Field(None) + + +class TextToVideoNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="LtxvApiTextToVideo", + display_name="LTXV Text To Video", + category="api node/video/LTXV", + description="Professional-quality videos with customizable duration and resolution.", + inputs=[ + IO.Combo.Input("model", options=list(MODELS_MAP.keys())), + IO.String.Input( + "prompt", + multiline=True, + default="", + ), + IO.Combo.Input("duration", options=[6, 8, 10, 12, 14, 16, 18, 20], default=8), + IO.Combo.Input( + "resolution", + options=[ + "1920x1080", + "2560x1440", + "3840x2160", + ], + ), + IO.Combo.Input("fps", options=[25, 50], default=25), + IO.Boolean.Input( + "generate_audio", + default=False, + optional=True, + tooltip="When true, the generated video will include AI-generated audio matching the scene.", + ), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + duration: int, + resolution: str, + fps: int = 25, + generate_audio: bool = False, + ) -> IO.NodeOutput: + validate_string(prompt, min_length=1, max_length=10000) + if duration > 10 and (model != "LTX-2 (Fast)" or resolution != "1920x1080" or fps != 25): + raise ValueError( + "Durations over 10s are only available for the Fast model at 1920x1080 resolution and 25 FPS." + ) + response = await sync_op_raw( + cls, + ApiEndpoint("/proxy/ltx/v1/text-to-video", "POST"), + data=ExecuteTaskRequest( + prompt=prompt, + model=MODELS_MAP[model], + duration=duration, + resolution=resolution, + fps=fps, + generate_audio=generate_audio, + ), + as_binary=True, + max_retries=1, + ) + return IO.NodeOutput(VideoFromFile(BytesIO(response))) + + +class ImageToVideoNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="LtxvApiImageToVideo", + display_name="LTXV Image To Video", + category="api node/video/LTXV", + description="Professional-quality videos with customizable duration and resolution based on start image.", + inputs=[ + IO.Image.Input("image", tooltip="First frame to be used for the video."), + IO.Combo.Input("model", options=list(MODELS_MAP.keys())), + IO.String.Input( + "prompt", + multiline=True, + default="", + ), + IO.Combo.Input("duration", options=[6, 8, 10, 12, 14, 16, 18, 20], default=8), + IO.Combo.Input( + "resolution", + options=[ + "1920x1080", + "2560x1440", + "3840x2160", + ], + ), + IO.Combo.Input("fps", options=[25, 50], default=25), + IO.Boolean.Input( + "generate_audio", + default=False, + optional=True, + tooltip="When true, the generated video will include AI-generated audio matching the scene.", + ), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + image: torch.Tensor, + model: str, + prompt: str, + duration: int, + resolution: str, + fps: int = 25, + generate_audio: bool = False, + ) -> IO.NodeOutput: + validate_string(prompt, min_length=1, max_length=10000) + if duration > 10 and (model != "LTX-2 (Fast)" or resolution != "1920x1080" or fps != 25): + raise ValueError( + "Durations over 10s are only available for the Fast model at 1920x1080 resolution and 25 FPS." + ) + if get_number_of_images(image) != 1: + raise ValueError("Currently only one input image is supported.") + response = await sync_op_raw( + cls, + ApiEndpoint("/proxy/ltx/v1/image-to-video", "POST"), + data=ExecuteTaskRequest( + image_uri=(await upload_images_to_comfyapi(cls, image, max_images=1, mime_type="image/png"))[0], + prompt=prompt, + model=MODELS_MAP[model], + duration=duration, + resolution=resolution, + fps=fps, + generate_audio=generate_audio, + ), + as_binary=True, + max_retries=1, + ) + return IO.NodeOutput(VideoFromFile(BytesIO(response))) + + +class LtxvApiExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + TextToVideoNode, + ImageToVideoNode, + ] + + +async def comfy_entrypoint() -> LtxvApiExtension: + return LtxvApiExtension() diff --git a/comfy_api_nodes/nodes_luma.py b/comfy_api_nodes/nodes_luma.py index b3c32bed5..894f2b08c 100644 --- a/comfy_api_nodes/nodes_luma.py +++ b/comfy_api_nodes/nodes_luma.py @@ -1,266 +1,228 @@ -from __future__ import annotations -from inspect import cleandoc from typing import Optional -from comfy.comfy_types.node_typing import IO, ComfyNodeABC -from comfy_api.input_impl.video_types import VideoFromFile + +import torch +from typing_extensions import override + +from comfy_api.latest import IO, ComfyExtension from comfy_api_nodes.apis.luma_api import ( - LumaImageModel, - LumaVideoModel, - LumaVideoOutputResolution, - LumaVideoModelOutputDuration, LumaAspectRatio, - LumaState, - LumaImageGenerationRequest, - LumaGenerationRequest, - LumaGeneration, LumaCharacterRef, - LumaModifyImageRef, + LumaConceptChain, + LumaGeneration, + LumaGenerationRequest, + LumaImageGenerationRequest, LumaImageIdentity, + LumaImageModel, + LumaImageReference, + LumaIO, + LumaKeyframes, + LumaModifyImageRef, LumaReference, LumaReferenceChain, - LumaImageReference, - LumaKeyframes, - LumaConceptChain, - LumaIO, + LumaVideoModel, + LumaVideoModelOutputDuration, + LumaVideoOutputResolution, get_luma_concepts, ) -from comfy_api_nodes.apis.client import ( +from comfy_api_nodes.util import ( ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, - EmptyRequest, -) -from comfy_api_nodes.apinode_utils import ( + download_url_to_image_tensor, + download_url_to_video_output, + poll_op, + sync_op, upload_images_to_comfyapi, - process_image_response, validate_string, ) -from server import PromptServer - -import aiohttp -import torch -from io import BytesIO LUMA_T2V_AVERAGE_DURATION = 105 LUMA_I2V_AVERAGE_DURATION = 100 -def image_result_url_extractor(response: LumaGeneration): - return response.assets.image if hasattr(response, "assets") and hasattr(response.assets, "image") else None -def video_result_url_extractor(response: LumaGeneration): - return response.assets.video if hasattr(response, "assets") and hasattr(response.assets, "video") else None - -class LumaReferenceNode(ComfyNodeABC): - """ - Holds an image and weight for use with Luma Generate Image node. - """ - - RETURN_TYPES = (LumaIO.LUMA_REF,) - RETURN_NAMES = ("luma_ref",) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "create_luma_reference" - CATEGORY = "api node/image/Luma" +class LumaReferenceNode(IO.ComfyNode): + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="LumaReferenceNode", + display_name="Luma Reference", + category="api node/image/Luma", + description="Holds an image and weight for use with Luma Generate Image node.", + inputs=[ + IO.Image.Input( + "image", + tooltip="Image to use as reference.", + ), + IO.Float.Input( + "weight", + default=1.0, + min=0.0, + max=1.0, + step=0.01, + tooltip="Weight of image reference.", + ), + IO.Custom(LumaIO.LUMA_REF).Input( + "luma_ref", + optional=True, + ), + ], + outputs=[IO.Custom(LumaIO.LUMA_REF).Output(display_name="luma_ref")], + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": ( - IO.IMAGE, - { - "tooltip": "Image to use as reference.", - }, - ), - "weight": ( - IO.FLOAT, - { - "default": 1.0, - "min": 0.0, - "max": 1.0, - "step": 0.01, - "tooltip": "Weight of image reference.", - }, - ), - }, - "optional": {"luma_ref": (LumaIO.LUMA_REF,)}, - } - - def create_luma_reference( - self, image: torch.Tensor, weight: float, luma_ref: LumaReferenceChain = None - ): + def execute(cls, image: torch.Tensor, weight: float, luma_ref: LumaReferenceChain = None) -> IO.NodeOutput: if luma_ref is not None: luma_ref = luma_ref.clone() else: luma_ref = LumaReferenceChain() luma_ref.add(LumaReference(image=image, weight=round(weight, 2))) - return (luma_ref,) + return IO.NodeOutput(luma_ref) -class LumaConceptsNode(ComfyNodeABC): - """ - Holds one or more Camera Concepts for use with Luma Text to Video and Luma Image to Video nodes. - """ - - RETURN_TYPES = (LumaIO.LUMA_CONCEPTS,) - RETURN_NAMES = ("luma_concepts",) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "create_concepts" - CATEGORY = "api node/video/Luma" +class LumaConceptsNode(IO.ComfyNode): + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="LumaConceptsNode", + display_name="Luma Concepts", + category="api node/video/Luma", + description="Camera Concepts for use with Luma Text to Video and Luma Image to Video nodes.", + inputs=[ + IO.Combo.Input( + "concept1", + options=get_luma_concepts(include_none=True), + ), + IO.Combo.Input( + "concept2", + options=get_luma_concepts(include_none=True), + ), + IO.Combo.Input( + "concept3", + options=get_luma_concepts(include_none=True), + ), + IO.Combo.Input( + "concept4", + options=get_luma_concepts(include_none=True), + ), + IO.Custom(LumaIO.LUMA_CONCEPTS).Input( + "luma_concepts", + tooltip="Optional Camera Concepts to add to the ones chosen here.", + optional=True, + ), + ], + outputs=[IO.Custom(LumaIO.LUMA_CONCEPTS).Output(display_name="luma_concepts")], + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "concept1": (get_luma_concepts(include_none=True),), - "concept2": (get_luma_concepts(include_none=True),), - "concept3": (get_luma_concepts(include_none=True),), - "concept4": (get_luma_concepts(include_none=True),), - }, - "optional": { - "luma_concepts": ( - LumaIO.LUMA_CONCEPTS, - { - "tooltip": "Optional Camera Concepts to add to the ones chosen here." - }, - ), - }, - } - - def create_concepts( - self, + def execute( + cls, concept1: str, concept2: str, concept3: str, concept4: str, luma_concepts: LumaConceptChain = None, - ): + ) -> IO.NodeOutput: chain = LumaConceptChain(str_list=[concept1, concept2, concept3, concept4]) if luma_concepts is not None: chain = luma_concepts.clone_and_merge(chain) - return (chain,) + return IO.NodeOutput(chain) -class LumaImageGenerationNode(ComfyNodeABC): - """ - Generates images synchronously based on prompt and aspect ratio. - """ - - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Luma" +class LumaImageGenerationNode(IO.ComfyNode): + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="LumaImageNode", + display_name="Luma Text to Image", + category="api node/image/Luma", + description="Generates images synchronously based on prompt and aspect ratio.", + inputs=[ + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the image generation", + ), + IO.Combo.Input( + "model", + options=LumaImageModel, + ), + IO.Combo.Input( + "aspect_ratio", + options=LumaAspectRatio, + default=LumaAspectRatio.ratio_16_9, + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", + ), + IO.Float.Input( + "style_image_weight", + default=1.0, + min=0.0, + max=1.0, + step=0.01, + tooltip="Weight of style image. Ignored if no style_image provided.", + ), + IO.Custom(LumaIO.LUMA_REF).Input( + "image_luma_ref", + tooltip="Luma Reference node connection to influence generation with input images; up to 4 images can be considered.", + optional=True, + ), + IO.Image.Input( + "style_image", + tooltip="Style reference image; only 1 image will be used.", + optional=True, + ), + IO.Image.Input( + "character_image", + tooltip="Character reference images; can be a batch of multiple, up to 4 images can be considered.", + optional=True, + ), + ], + outputs=[IO.Image.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation", - }, - ), - "model": ([model.value for model in LumaImageModel],), - "aspect_ratio": ( - [ratio.value for ratio in LumaAspectRatio], - { - "default": LumaAspectRatio.ratio_16_9, - }, - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", - }, - ), - "style_image_weight": ( - IO.FLOAT, - { - "default": 1.0, - "min": 0.0, - "max": 1.0, - "step": 0.01, - "tooltip": "Weight of style image. Ignored if no style_image provided.", - }, - ), - }, - "optional": { - "image_luma_ref": ( - LumaIO.LUMA_REF, - { - "tooltip": "Luma Reference node connection to influence generation with input images; up to 4 images can be considered." - }, - ), - "style_image": ( - IO.IMAGE, - {"tooltip": "Style reference image; only 1 image will be used."}, - ), - "character_image": ( - IO.IMAGE, - { - "tooltip": "Character reference images; can be a batch of multiple, up to 4 images can be considered." - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - async def api_call( - self, + async def execute( + cls, prompt: str, model: str, aspect_ratio: str, seed, style_image_weight: float, - image_luma_ref: LumaReferenceChain = None, - style_image: torch.Tensor = None, - character_image: torch.Tensor = None, - unique_id: str = None, - **kwargs, - ): + image_luma_ref: Optional[LumaReferenceChain] = None, + style_image: Optional[torch.Tensor] = None, + character_image: Optional[torch.Tensor] = None, + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=True, min_length=3) # handle image_luma_ref api_image_ref = None if image_luma_ref is not None: - api_image_ref = await self._convert_luma_refs( - image_luma_ref, max_refs=4, auth_kwargs=kwargs, - ) + api_image_ref = await cls._convert_luma_refs(image_luma_ref, max_refs=4) # handle style_luma_ref api_style_ref = None if style_image is not None: - api_style_ref = await self._convert_style_image( - style_image, weight=style_image_weight, auth_kwargs=kwargs, - ) + api_style_ref = await cls._convert_style_image(style_image, weight=style_image_weight) # handle character_ref images character_ref = None if character_image is not None: - download_urls = await upload_images_to_comfyapi( - character_image, max_images=4, auth_kwargs=kwargs, - ) - character_ref = LumaCharacterRef( - identity0=LumaImageIdentity(images=download_urls) - ) + download_urls = await upload_images_to_comfyapi(cls, character_image, max_images=4) + character_ref = LumaCharacterRef(identity0=LumaImageIdentity(images=download_urls)) - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/luma/generations/image", - method=HttpMethod.POST, - request_model=LumaImageGenerationRequest, - response_model=LumaGeneration, - ), - request=LumaImageGenerationRequest( + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/luma/generations/image", method="POST"), + response_model=LumaGeneration, + data=LumaImageGenerationRequest( prompt=prompt, model=model, aspect_ratio=aspect_ratio, @@ -268,236 +230,176 @@ class LumaImageGenerationNode(ComfyNodeABC): style_ref=api_style_ref, character_ref=character_ref, ), - auth_kwargs=kwargs, ) - response_api: LumaGeneration = await operation.execute() - - operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"/proxy/luma/generations/{response_api.id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=LumaGeneration, - ), - completed_statuses=[LumaState.completed], - failed_statuses=[LumaState.failed], + response_poll = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/luma/generations/{response_api.id}"), + response_model=LumaGeneration, status_extractor=lambda x: x.state, - result_url_extractor=image_result_url_extractor, - node_id=unique_id, - auth_kwargs=kwargs, ) - response_poll = await operation.execute() + return IO.NodeOutput(await download_url_to_image_tensor(response_poll.assets.image)) - async with aiohttp.ClientSession() as session: - async with session.get(response_poll.assets.image) as img_response: - img = process_image_response(await img_response.content.read()) - return (img,) - - async def _convert_luma_refs( - self, luma_ref: LumaReferenceChain, max_refs: int, auth_kwargs: Optional[dict[str,str]] = None - ): + @classmethod + async def _convert_luma_refs(cls, luma_ref: LumaReferenceChain, max_refs: int): luma_urls = [] ref_count = 0 for ref in luma_ref.refs: - download_urls = await upload_images_to_comfyapi( - ref.image, max_images=1, auth_kwargs=auth_kwargs - ) + download_urls = await upload_images_to_comfyapi(cls, ref.image, max_images=1) luma_urls.append(download_urls[0]) ref_count += 1 if ref_count >= max_refs: break return luma_ref.create_api_model(download_urls=luma_urls, max_refs=max_refs) - async def _convert_style_image( - self, style_image: torch.Tensor, weight: float, auth_kwargs: Optional[dict[str,str]] = None - ): - chain = LumaReferenceChain( - first_ref=LumaReference(image=style_image, weight=weight) + @classmethod + async def _convert_style_image(cls, style_image: torch.Tensor, weight: float): + chain = LumaReferenceChain(first_ref=LumaReference(image=style_image, weight=weight)) + return await cls._convert_luma_refs(chain, max_refs=1) + + +class LumaImageModifyNode(IO.ComfyNode): + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="LumaImageModifyNode", + display_name="Luma Image to Image", + category="api node/image/Luma", + description="Modifies images synchronously based on prompt and aspect ratio.", + inputs=[ + IO.Image.Input( + "image", + ), + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the image generation", + ), + IO.Float.Input( + "image_weight", + default=0.1, + min=0.0, + max=0.98, + step=0.01, + tooltip="Weight of the image; the closer to 1.0, the less the image will be modified.", + ), + IO.Combo.Input( + "model", + options=LumaImageModel, + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", + ), + ], + outputs=[IO.Image.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, ) - return await self._convert_luma_refs(chain, max_refs=1, auth_kwargs=auth_kwargs) - - -class LumaImageModifyNode(ComfyNodeABC): - """ - Modifies images synchronously based on prompt and aspect ratio. - """ - - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Luma" @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE,), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation", - }, - ), - "image_weight": ( - IO.FLOAT, - { - "default": 0.1, - "min": 0.0, - "max": 0.98, - "step": 0.01, - "tooltip": "Weight of the image; the closer to 1.0, the less the image will be modified.", - }, - ), - "model": ([model.value for model in LumaImageModel],), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", - }, - ), - }, - "optional": {}, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - async def api_call( - self, + async def execute( + cls, prompt: str, model: str, image: torch.Tensor, image_weight: float, seed, - unique_id: str = None, - **kwargs, - ): - # first, upload image - download_urls = await upload_images_to_comfyapi( - image, max_images=1, auth_kwargs=kwargs, - ) + ) -> IO.NodeOutput: + download_urls = await upload_images_to_comfyapi(cls, image, max_images=1) image_url = download_urls[0] - # next, make Luma call with download url provided - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/luma/generations/image", - method=HttpMethod.POST, - request_model=LumaImageGenerationRequest, - response_model=LumaGeneration, - ), - request=LumaImageGenerationRequest( + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/luma/generations/image", method="POST"), + response_model=LumaGeneration, + data=LumaImageGenerationRequest( prompt=prompt, model=model, modify_image_ref=LumaModifyImageRef( - url=image_url, weight=round(max(min(1.0-image_weight, 0.98), 0.0), 2) + url=image_url, weight=round(max(min(1.0 - image_weight, 0.98), 0.0), 2) ), ), - auth_kwargs=kwargs, ) - response_api: LumaGeneration = await operation.execute() - - operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"/proxy/luma/generations/{response_api.id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=LumaGeneration, - ), - completed_statuses=[LumaState.completed], - failed_statuses=[LumaState.failed], + response_poll = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/luma/generations/{response_api.id}"), + response_model=LumaGeneration, status_extractor=lambda x: x.state, - result_url_extractor=image_result_url_extractor, - node_id=unique_id, - auth_kwargs=kwargs, ) - response_poll = await operation.execute() - - async with aiohttp.ClientSession() as session: - async with session.get(response_poll.assets.image) as img_response: - img = process_image_response(await img_response.content.read()) - return (img,) + return IO.NodeOutput(await download_url_to_image_tensor(response_poll.assets.image)) -class LumaTextToVideoGenerationNode(ComfyNodeABC): - """ - Generates videos synchronously based on prompt and output_size. - """ - - RETURN_TYPES = (IO.VIDEO,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/video/Luma" +class LumaTextToVideoGenerationNode(IO.ComfyNode): + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="LumaVideoNode", + display_name="Luma Text to Video", + category="api node/video/Luma", + description="Generates videos synchronously based on prompt and output_size.", + inputs=[ + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the video generation", + ), + IO.Combo.Input( + "model", + options=LumaVideoModel, + ), + IO.Combo.Input( + "aspect_ratio", + options=LumaAspectRatio, + default=LumaAspectRatio.ratio_16_9, + ), + IO.Combo.Input( + "resolution", + options=LumaVideoOutputResolution, + default=LumaVideoOutputResolution.res_540p, + ), + IO.Combo.Input( + "duration", + options=LumaVideoModelOutputDuration, + ), + IO.Boolean.Input( + "loop", + default=False, + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", + ), + IO.Custom(LumaIO.LUMA_CONCEPTS).Input( + "luma_concepts", + tooltip="Optional Camera Concepts to dictate camera motion via the Luma Concepts node.", + optional=True, + ), + ], + outputs=[IO.Video.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the video generation", - }, - ), - "model": ([model.value for model in LumaVideoModel],), - "aspect_ratio": ( - [ratio.value for ratio in LumaAspectRatio], - { - "default": LumaAspectRatio.ratio_16_9, - }, - ), - "resolution": ( - [resolution.value for resolution in LumaVideoOutputResolution], - { - "default": LumaVideoOutputResolution.res_540p, - }, - ), - "duration": ([dur.value for dur in LumaVideoModelOutputDuration],), - "loop": ( - IO.BOOLEAN, - { - "default": False, - }, - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", - }, - ), - }, - "optional": { - "luma_concepts": ( - LumaIO.LUMA_CONCEPTS, - { - "tooltip": "Optional Camera Concepts to dictate camera motion via the Luma Concepts node." - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - async def api_call( - self, + async def execute( + cls, prompt: str, model: str, aspect_ratio: str, @@ -505,22 +407,17 @@ class LumaTextToVideoGenerationNode(ComfyNodeABC): duration: str, loop: bool, seed, - luma_concepts: LumaConceptChain = None, - unique_id: str = None, - **kwargs, - ): + luma_concepts: Optional[LumaConceptChain] = None, + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False, min_length=3) duration = duration if model != LumaVideoModel.ray_1_6 else None resolution = resolution if model != LumaVideoModel.ray_1_6 else None - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/luma/generations", - method=HttpMethod.POST, - request_model=LumaGenerationRequest, - response_model=LumaGeneration, - ), - request=LumaGenerationRequest( + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/luma/generations", method="POST"), + response_model=LumaGeneration, + data=LumaGenerationRequest( prompt=prompt, model=model, resolution=resolution, @@ -529,108 +426,90 @@ class LumaTextToVideoGenerationNode(ComfyNodeABC): loop=loop, concepts=luma_concepts.create_api_model() if luma_concepts else None, ), - auth_kwargs=kwargs, ) - response_api: LumaGeneration = await operation.execute() - - if unique_id: - PromptServer.instance.send_progress_text(f"Luma video generation started: {response_api.id}", unique_id) - - operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"/proxy/luma/generations/{response_api.id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=LumaGeneration, - ), - completed_statuses=[LumaState.completed], - failed_statuses=[LumaState.failed], + response_poll = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/luma/generations/{response_api.id}"), + response_model=LumaGeneration, status_extractor=lambda x: x.state, - result_url_extractor=video_result_url_extractor, - node_id=unique_id, estimated_duration=LUMA_T2V_AVERAGE_DURATION, - auth_kwargs=kwargs, ) - response_poll = await operation.execute() - - async with aiohttp.ClientSession() as session: - async with session.get(response_poll.assets.video) as vid_response: - return (VideoFromFile(BytesIO(await vid_response.content.read())),) + return IO.NodeOutput(await download_url_to_video_output(response_poll.assets.video)) -class LumaImageToVideoGenerationNode(ComfyNodeABC): - """ - Generates videos synchronously based on prompt, input images, and output_size. - """ - - RETURN_TYPES = (IO.VIDEO,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/video/Luma" +class LumaImageToVideoGenerationNode(IO.ComfyNode): + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="LumaImageToVideoNode", + display_name="Luma Image to Video", + category="api node/video/Luma", + description="Generates videos synchronously based on prompt, input images, and output_size.", + inputs=[ + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the video generation", + ), + IO.Combo.Input( + "model", + options=LumaVideoModel, + ), + # IO.Combo.Input( + # "aspect_ratio", + # options=[ratio.value for ratio in LumaAspectRatio], + # default=LumaAspectRatio.ratio_16_9, + # ), + IO.Combo.Input( + "resolution", + options=LumaVideoOutputResolution, + default=LumaVideoOutputResolution.res_540p, + ), + IO.Combo.Input( + "duration", + options=[dur.value for dur in LumaVideoModelOutputDuration], + ), + IO.Boolean.Input( + "loop", + default=False, + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", + ), + IO.Image.Input( + "first_image", + tooltip="First frame of generated video.", + optional=True, + ), + IO.Image.Input( + "last_image", + tooltip="Last frame of generated video.", + optional=True, + ), + IO.Custom(LumaIO.LUMA_CONCEPTS).Input( + "luma_concepts", + tooltip="Optional Camera Concepts to dictate camera motion via the Luma Concepts node.", + optional=True, + ), + ], + outputs=[IO.Video.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the video generation", - }, - ), - "model": ([model.value for model in LumaVideoModel],), - # "aspect_ratio": ([ratio.value for ratio in LumaAspectRatio], { - # "default": LumaAspectRatio.ratio_16_9, - # }), - "resolution": ( - [resolution.value for resolution in LumaVideoOutputResolution], - { - "default": LumaVideoOutputResolution.res_540p, - }, - ), - "duration": ([dur.value for dur in LumaVideoModelOutputDuration],), - "loop": ( - IO.BOOLEAN, - { - "default": False, - }, - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", - }, - ), - }, - "optional": { - "first_image": ( - IO.IMAGE, - {"tooltip": "First frame of generated video."}, - ), - "last_image": (IO.IMAGE, {"tooltip": "Last frame of generated video."}), - "luma_concepts": ( - LumaIO.LUMA_CONCEPTS, - { - "tooltip": "Optional Camera Concepts to dictate camera motion via the Luma Concepts node." - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - async def api_call( - self, + async def execute( + cls, prompt: str, model: str, resolution: str, @@ -640,25 +519,17 @@ class LumaImageToVideoGenerationNode(ComfyNodeABC): first_image: torch.Tensor = None, last_image: torch.Tensor = None, luma_concepts: LumaConceptChain = None, - unique_id: str = None, - **kwargs, - ): + ) -> IO.NodeOutput: if first_image is None and last_image is None: - raise Exception( - "At least one of first_image and last_image requires an input." - ) - keyframes = await self._convert_to_keyframes(first_image, last_image, auth_kwargs=kwargs) + raise Exception("At least one of first_image and last_image requires an input.") + keyframes = await cls._convert_to_keyframes(first_image, last_image) duration = duration if model != LumaVideoModel.ray_1_6 else None resolution = resolution if model != LumaVideoModel.ray_1_6 else None - - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/luma/generations", - method=HttpMethod.POST, - request_model=LumaGenerationRequest, - response_model=LumaGeneration, - ), - request=LumaGenerationRequest( + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/luma/generations", method="POST"), + response_model=LumaGeneration, + data=LumaGenerationRequest( prompt=prompt, model=model, aspect_ratio=LumaAspectRatio.ratio_16_9, # ignored, but still needed by the API for some reason @@ -668,74 +539,47 @@ class LumaImageToVideoGenerationNode(ComfyNodeABC): keyframes=keyframes, concepts=luma_concepts.create_api_model() if luma_concepts else None, ), - auth_kwargs=kwargs, ) - response_api: LumaGeneration = await operation.execute() - - if unique_id: - PromptServer.instance.send_progress_text(f"Luma video generation started: {response_api.id}", unique_id) - - operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"/proxy/luma/generations/{response_api.id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=LumaGeneration, - ), - completed_statuses=[LumaState.completed], - failed_statuses=[LumaState.failed], + response_poll = await poll_op( + cls, + poll_endpoint=ApiEndpoint(path=f"/proxy/luma/generations/{response_api.id}"), + response_model=LumaGeneration, status_extractor=lambda x: x.state, - result_url_extractor=video_result_url_extractor, - node_id=unique_id, estimated_duration=LUMA_I2V_AVERAGE_DURATION, - auth_kwargs=kwargs, ) - response_poll = await operation.execute() - - async with aiohttp.ClientSession() as session: - async with session.get(response_poll.assets.video) as vid_response: - return (VideoFromFile(BytesIO(await vid_response.content.read())),) + return IO.NodeOutput(await download_url_to_video_output(response_poll.assets.video)) + @classmethod async def _convert_to_keyframes( - self, + cls, first_image: torch.Tensor = None, last_image: torch.Tensor = None, - auth_kwargs: Optional[dict[str,str]] = None, ): if first_image is None and last_image is None: return None frame0 = None frame1 = None if first_image is not None: - download_urls = await upload_images_to_comfyapi( - first_image, max_images=1, auth_kwargs=auth_kwargs, - ) + download_urls = await upload_images_to_comfyapi(cls, first_image, max_images=1) frame0 = LumaImageReference(type="image", url=download_urls[0]) if last_image is not None: - download_urls = await upload_images_to_comfyapi( - last_image, max_images=1, auth_kwargs=auth_kwargs, - ) + download_urls = await upload_images_to_comfyapi(cls, last_image, max_images=1) frame1 = LumaImageReference(type="image", url=download_urls[0]) return LumaKeyframes(frame0=frame0, frame1=frame1) -# A dictionary that contains all nodes you want to export with their names -# NOTE: names should be globally unique -NODE_CLASS_MAPPINGS = { - "LumaImageNode": LumaImageGenerationNode, - "LumaImageModifyNode": LumaImageModifyNode, - "LumaVideoNode": LumaTextToVideoGenerationNode, - "LumaImageToVideoNode": LumaImageToVideoGenerationNode, - "LumaReferenceNode": LumaReferenceNode, - "LumaConceptsNode": LumaConceptsNode, -} +class LumaExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + LumaImageGenerationNode, + LumaImageModifyNode, + LumaTextToVideoGenerationNode, + LumaImageToVideoGenerationNode, + LumaReferenceNode, + LumaConceptsNode, + ] -# A dictionary that contains the friendly/humanly readable titles for the nodes -NODE_DISPLAY_NAME_MAPPINGS = { - "LumaImageNode": "Luma Text to Image", - "LumaImageModifyNode": "Luma Image to Image", - "LumaVideoNode": "Luma Text to Video", - "LumaImageToVideoNode": "Luma Image to Video", - "LumaReferenceNode": "Luma Reference", - "LumaConceptsNode": "Luma Concepts", -} + +async def comfy_entrypoint() -> LumaExtension: + return LumaExtension() diff --git a/comfy_api_nodes/nodes_minimax.py b/comfy_api_nodes/nodes_minimax.py index bf560661c..05cbb700f 100644 --- a/comfy_api_nodes/nodes_minimax.py +++ b/comfy_api_nodes/nodes_minimax.py @@ -1,71 +1,57 @@ -from inspect import cleandoc from typing import Optional -import logging -import torch +import torch from typing_extensions import override -from comfy_api.latest import ComfyExtension, io as comfy_io -from comfy_api.input_impl.video_types import VideoFromFile -from comfy_api_nodes.apis import ( + +from comfy_api.latest import IO, ComfyExtension +from comfy_api_nodes.apis.minimax_api import ( + MinimaxFileRetrieveResponse, + MiniMaxModel, + MinimaxTaskResultResponse, MinimaxVideoGenerationRequest, MinimaxVideoGenerationResponse, - MinimaxFileRetrieveResponse, - MinimaxTaskResultResponse, SubjectReferenceItem, - MiniMaxModel, ) -from comfy_api_nodes.apis.client import ( +from comfy_api_nodes.util import ( ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, - EmptyRequest, -) -from comfy_api_nodes.apinode_utils import ( - download_url_to_bytesio, + download_url_to_video_output, + poll_op, + sync_op, upload_images_to_comfyapi, validate_string, ) -from server import PromptServer - I2V_AVERAGE_DURATION = 114 T2V_AVERAGE_DURATION = 234 async def _generate_mm_video( + cls: type[IO.ComfyNode], *, - auth: dict[str, str], - node_id: str, prompt_text: str, seed: int, model: str, - image: Optional[torch.Tensor] = None, # used for ImageToVideo - subject: Optional[torch.Tensor] = None, # used for SubjectToVideo + image: Optional[torch.Tensor] = None, # used for ImageToVideo + subject: Optional[torch.Tensor] = None, # used for SubjectToVideo average_duration: Optional[int] = None, -) -> comfy_io.NodeOutput: +) -> IO.NodeOutput: if image is None: validate_string(prompt_text, field_name="prompt_text") - # upload image, if passed in image_url = None if image is not None: - image_url = (await upload_images_to_comfyapi(image, max_images=1, auth_kwargs=auth))[0] + image_url = (await upload_images_to_comfyapi(cls, image, max_images=1))[0] # TODO: figure out how to deal with subject properly, API returns invalid params when using S2V-01 model subject_reference = None if subject is not None: - subject_url = (await upload_images_to_comfyapi(subject, max_images=1, auth_kwargs=auth))[0] + subject_url = (await upload_images_to_comfyapi(cls, subject, max_images=1))[0] subject_reference = [SubjectReferenceItem(image=subject_url)] - - video_generate_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/minimax/video_generation", - method=HttpMethod.POST, - request_model=MinimaxVideoGenerationRequest, - response_model=MinimaxVideoGenerationResponse, - ), - request=MinimaxVideoGenerationRequest( + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/minimax/video_generation", method="POST"), + response_model=MinimaxVideoGenerationResponse, + data=MinimaxVideoGenerationRequest( model=MiniMaxModel(model), prompt=prompt_text, callback_url=None, @@ -73,95 +59,64 @@ async def _generate_mm_video( subject_reference=subject_reference, prompt_optimizer=None, ), - auth_kwargs=auth, ) - response = await video_generate_operation.execute() task_id = response.task_id if not task_id: raise Exception(f"MiniMax generation failed: {response.base_resp}") - video_generate_operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path="/proxy/minimax/query/video_generation", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=MinimaxTaskResultResponse, - query_params={"task_id": task_id}, - ), - completed_statuses=["Success"], - failed_statuses=["Fail"], + task_result = await poll_op( + cls, + ApiEndpoint(path="/proxy/minimax/query/video_generation", query_params={"task_id": task_id}), + response_model=MinimaxTaskResultResponse, status_extractor=lambda x: x.status.value, estimated_duration=average_duration, - node_id=node_id, - auth_kwargs=auth, ) - task_result = await video_generate_operation.execute() file_id = task_result.file_id if file_id is None: raise Exception("Request was not successful. Missing file ID.") - file_retrieve_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/minimax/files/retrieve", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=MinimaxFileRetrieveResponse, - query_params={"file_id": int(file_id)}, - ), - request=EmptyRequest(), - auth_kwargs=auth, + file_result = await sync_op( + cls, + ApiEndpoint(path="/proxy/minimax/files/retrieve", query_params={"file_id": int(file_id)}), + response_model=MinimaxFileRetrieveResponse, ) - file_result = await file_retrieve_operation.execute() file_url = file_result.file.download_url if file_url is None: - raise Exception( - f"No video was found in the response. Full response: {file_result.model_dump()}" - ) - logging.info("Generated video URL: %s", file_url) - if node_id: - if hasattr(file_result.file, "backup_download_url"): - message = f"Result URL: {file_url}\nBackup URL: {file_result.file.backup_download_url}" - else: - message = f"Result URL: {file_url}" - PromptServer.instance.send_progress_text(message, node_id) - - # Download and return as VideoFromFile - video_io = await download_url_to_bytesio(file_url) - if video_io is None: - error_msg = f"Failed to download video from {file_url}" - logging.error(error_msg) - raise Exception(error_msg) - return comfy_io.NodeOutput(VideoFromFile(video_io)) + raise Exception(f"No video was found in the response. Full response: {file_result.model_dump()}") + if file_result.file.backup_download_url: + try: + return IO.NodeOutput(await download_url_to_video_output(file_url, timeout=10, max_retries=2)) + except Exception: # if we have a second URL to retrieve the result, try again using that one + return IO.NodeOutput( + await download_url_to_video_output(file_result.file.backup_download_url, max_retries=3) + ) + return IO.NodeOutput(await download_url_to_video_output(file_url)) -class MinimaxTextToVideoNode(comfy_io.ComfyNode): - """ - Generates videos synchronously based on a prompt, and optional parameters using MiniMax's API. - """ - +class MinimaxTextToVideoNode(IO.ComfyNode): @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="MinimaxTextToVideoNode", display_name="MiniMax Text to Video", category="api node/video/MiniMax", - description=cleandoc(cls.__doc__ or ""), + description="Generates videos synchronously based on a prompt, and optional parameters.", inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt_text", multiline=True, default="", tooltip="Text prompt to guide the video generation", ), - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["T2V-01", "T2V-01-Director"], default="T2V-01", tooltip="Model to use for video generation", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, @@ -172,11 +127,11 @@ class MinimaxTextToVideoNode(comfy_io.ComfyNode): optional=True, ), ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -187,13 +142,9 @@ class MinimaxTextToVideoNode(comfy_io.ComfyNode): prompt_text: str, model: str = "T2V-01", seed: int = 0, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: return await _generate_mm_video( - auth={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - node_id=cls.hidden.unique_id, + cls, prompt_text=prompt_text, seed=seed, model=model, @@ -203,36 +154,32 @@ class MinimaxTextToVideoNode(comfy_io.ComfyNode): ) -class MinimaxImageToVideoNode(comfy_io.ComfyNode): - """ - Generates videos synchronously based on an image and prompt, and optional parameters using MiniMax's API. - """ - +class MinimaxImageToVideoNode(IO.ComfyNode): @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="MinimaxImageToVideoNode", display_name="MiniMax Image to Video", category="api node/video/MiniMax", - description=cleandoc(cls.__doc__ or ""), + description="Generates videos synchronously based on an image and prompt, and optional parameters.", inputs=[ - comfy_io.Image.Input( + IO.Image.Input( "image", tooltip="Image to use as first frame of video generation", ), - comfy_io.String.Input( + IO.String.Input( "prompt_text", multiline=True, default="", tooltip="Text prompt to guide the video generation", ), - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["I2V-01-Director", "I2V-01", "I2V-01-live"], default="I2V-01", tooltip="Model to use for video generation", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, @@ -243,11 +190,11 @@ class MinimaxImageToVideoNode(comfy_io.ComfyNode): optional=True, ), ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -259,13 +206,9 @@ class MinimaxImageToVideoNode(comfy_io.ComfyNode): prompt_text: str, model: str = "I2V-01", seed: int = 0, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: return await _generate_mm_video( - auth={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - node_id=cls.hidden.unique_id, + cls, prompt_text=prompt_text, seed=seed, model=model, @@ -275,36 +218,32 @@ class MinimaxImageToVideoNode(comfy_io.ComfyNode): ) -class MinimaxSubjectToVideoNode(comfy_io.ComfyNode): - """ - Generates videos synchronously based on an image and prompt, and optional parameters using MiniMax's API. - """ - +class MinimaxSubjectToVideoNode(IO.ComfyNode): @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="MinimaxSubjectToVideoNode", display_name="MiniMax Subject to Video", category="api node/video/MiniMax", - description=cleandoc(cls.__doc__ or ""), + description="Generates videos synchronously based on an image and prompt, and optional parameters.", inputs=[ - comfy_io.Image.Input( + IO.Image.Input( "subject", tooltip="Image of subject to reference for video generation", ), - comfy_io.String.Input( + IO.String.Input( "prompt_text", multiline=True, default="", tooltip="Text prompt to guide the video generation", ), - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["S2V-01"], default="S2V-01", tooltip="Model to use for video generation", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, @@ -315,11 +254,11 @@ class MinimaxSubjectToVideoNode(comfy_io.ComfyNode): optional=True, ), ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -331,13 +270,9 @@ class MinimaxSubjectToVideoNode(comfy_io.ComfyNode): prompt_text: str, model: str = "S2V-01", seed: int = 0, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: return await _generate_mm_video( - auth={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - node_id=cls.hidden.unique_id, + cls, prompt_text=prompt_text, seed=seed, model=model, @@ -347,24 +282,22 @@ class MinimaxSubjectToVideoNode(comfy_io.ComfyNode): ) -class MinimaxHailuoVideoNode(comfy_io.ComfyNode): - """Generates videos from prompt, with optional start frame using the new MiniMax Hailuo-02 model.""" - +class MinimaxHailuoVideoNode(IO.ComfyNode): @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="MinimaxHailuoVideoNode", display_name="MiniMax Hailuo Video", category="api node/video/MiniMax", - description=cleandoc(cls.__doc__ or ""), + description="Generates videos from prompt, with optional start frame using the new MiniMax Hailuo-02 model.", inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt_text", multiline=True, default="", tooltip="Text prompt to guide the video generation.", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, @@ -374,25 +307,25 @@ class MinimaxHailuoVideoNode(comfy_io.ComfyNode): tooltip="The random seed used for creating the noise.", optional=True, ), - comfy_io.Image.Input( + IO.Image.Input( "first_frame_image", tooltip="Optional image to use as the first frame to generate a video.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "prompt_optimizer", default=True, tooltip="Optimize prompt to improve generation quality when needed.", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "duration", options=[6, 10], default=6, tooltip="The length of the output video in seconds.", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=["768P", "1080P"], default="768P", @@ -400,11 +333,11 @@ class MinimaxHailuoVideoNode(comfy_io.ComfyNode): optional=True, ), ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -419,11 +352,7 @@ class MinimaxHailuoVideoNode(comfy_io.ComfyNode): duration: int = 6, resolution: str = "768P", model: str = "MiniMax-Hailuo-02", - ) -> comfy_io.NodeOutput: - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } + ) -> IO.NodeOutput: if first_frame_image is None: validate_string(prompt_text, field_name="prompt_text") @@ -435,16 +364,13 @@ class MinimaxHailuoVideoNode(comfy_io.ComfyNode): # upload image, if passed in image_url = None if first_frame_image is not None: - image_url = (await upload_images_to_comfyapi(first_frame_image, max_images=1, auth_kwargs=auth))[0] + image_url = (await upload_images_to_comfyapi(cls, first_frame_image, max_images=1))[0] - video_generate_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/minimax/video_generation", - method=HttpMethod.POST, - request_model=MinimaxVideoGenerationRequest, - response_model=MinimaxVideoGenerationResponse, - ), - request=MinimaxVideoGenerationRequest( + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/minimax/video_generation", method="POST"), + response_model=MinimaxVideoGenerationResponse, + data=MinimaxVideoGenerationRequest( model=MiniMaxModel(model), prompt=prompt_text, callback_url=None, @@ -453,72 +379,47 @@ class MinimaxHailuoVideoNode(comfy_io.ComfyNode): duration=duration, resolution=resolution, ), - auth_kwargs=auth, ) - response = await video_generate_operation.execute() task_id = response.task_id if not task_id: raise Exception(f"MiniMax generation failed: {response.base_resp}") average_duration = 120 if resolution == "768P" else 240 - video_generate_operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path="/proxy/minimax/query/video_generation", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=MinimaxTaskResultResponse, - query_params={"task_id": task_id}, - ), - completed_statuses=["Success"], - failed_statuses=["Fail"], + task_result = await poll_op( + cls, + ApiEndpoint(path="/proxy/minimax/query/video_generation", query_params={"task_id": task_id}), + response_model=MinimaxTaskResultResponse, status_extractor=lambda x: x.status.value, estimated_duration=average_duration, - node_id=cls.hidden.unique_id, - auth_kwargs=auth, ) - task_result = await video_generate_operation.execute() file_id = task_result.file_id if file_id is None: raise Exception("Request was not successful. Missing file ID.") - file_retrieve_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/minimax/files/retrieve", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=MinimaxFileRetrieveResponse, - query_params={"file_id": int(file_id)}, - ), - request=EmptyRequest(), - auth_kwargs=auth, + file_result = await sync_op( + cls, + ApiEndpoint(path="/proxy/minimax/files/retrieve", query_params={"file_id": int(file_id)}), + response_model=MinimaxFileRetrieveResponse, ) - file_result = await file_retrieve_operation.execute() file_url = file_result.file.download_url if file_url is None: - raise Exception( - f"No video was found in the response. Full response: {file_result.model_dump()}" - ) - logging.info(f"Generated video URL: {file_url}") - if cls.hidden.unique_id: - if hasattr(file_result.file, "backup_download_url"): - message = f"Result URL: {file_url}\nBackup URL: {file_result.file.backup_download_url}" - else: - message = f"Result URL: {file_url}" - PromptServer.instance.send_progress_text(message, cls.hidden.unique_id) + raise Exception(f"No video was found in the response. Full response: {file_result.model_dump()}") - video_io = await download_url_to_bytesio(file_url) - if video_io is None: - error_msg = f"Failed to download video from {file_url}" - logging.error(error_msg) - raise Exception(error_msg) - return comfy_io.NodeOutput(VideoFromFile(video_io)) + if file_result.file.backup_download_url: + try: + return IO.NodeOutput(await download_url_to_video_output(file_url, timeout=10, max_retries=2)) + except Exception: # if we have a second URL to retrieve the result, try again using that one + return IO.NodeOutput( + await download_url_to_video_output(file_result.file.backup_download_url, max_retries=3) + ) + return IO.NodeOutput(await download_url_to_video_output(file_url)) class MinimaxExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ MinimaxTextToVideoNode, MinimaxImageToVideoNode, diff --git a/comfy_api_nodes/nodes_moonvalley.py b/comfy_api_nodes/nodes_moonvalley.py index 08e838fef..7c31d95b3 100644 --- a/comfy_api_nodes/nodes_moonvalley.py +++ b/comfy_api_nodes/nodes_moonvalley.py @@ -1,37 +1,30 @@ import logging -from typing import Any, Callable, Optional, TypeVar +from typing import Optional + import torch from typing_extensions import override -from comfy_api_nodes.util.validation_utils import ( - get_image_dimensions, - validate_image_dimensions, -) - - -from comfy_api_nodes.apis import ( - MoonvalleyTextToVideoRequest, - MoonvalleyTextToVideoInferenceParams, - MoonvalleyVideoToVideoInferenceParams, - MoonvalleyVideoToVideoRequest, - MoonvalleyPromptResponse, -) -from comfy_api_nodes.apis.client import ( - ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, - EmptyRequest, -) -from comfy_api_nodes.apinode_utils import ( - download_url_to_video_output, - upload_images_to_comfyapi, - upload_video_to_comfyapi, -) from comfy_api.input import VideoInput -from comfy_api.latest import ComfyExtension, InputImpl, io as comfy_io -import av -import io +from comfy_api.latest import IO, ComfyExtension +from comfy_api_nodes.apis import ( + MoonvalleyPromptResponse, + MoonvalleyTextToVideoInferenceParams, + MoonvalleyTextToVideoRequest, + MoonvalleyVideoToVideoInferenceParams, + MoonvalleyVideoToVideoRequest, +) +from comfy_api_nodes.util import ( + ApiEndpoint, + download_url_to_video_output, + poll_op, + sync_op, + trim_video, + upload_images_to_comfyapi, + upload_video_to_comfyapi, + validate_container_format_is_mp4, + validate_image_dimensions, + validate_string, +) API_UPLOADS_ENDPOINT = "/proxy/moonvalley/uploads" API_PROMPTS_ENDPOINT = "/proxy/moonvalley/prompts" @@ -54,13 +47,6 @@ MAX_VID_HEIGHT = 10000 MAX_VIDEO_SIZE = 1024 * 1024 * 1024 # 1 GB max for in-memory video processing MOONVALLEY_MAREY_MAX_PROMPT_LENGTH = 5000 -R = TypeVar("R") - - -class MoonvalleyApiError(Exception): - """Base exception for Moonvalley API errors.""" - - pass def is_valid_task_creation_response(response: MoonvalleyPromptResponse) -> bool: @@ -72,105 +58,7 @@ def validate_task_creation_response(response) -> None: if not is_valid_task_creation_response(response): error_msg = f"Moonvalley Marey API: Initial request failed. Code: {response.code}, Message: {response.message}, Data: {response}" logging.error(error_msg) - raise MoonvalleyApiError(error_msg) - - -def get_video_from_response(response): - video = response.output_url - logging.info( - "Moonvalley Marey API: Task %s succeeded. Video URL: %s", response.id, video - ) - return video - - -def get_video_url_from_response(response) -> Optional[str]: - """Returns the first video url from the Moonvalley video generation task result. - Will not raise an error if the response is not valid. - """ - if response: - return str(get_video_from_response(response)) - else: - return None - - -async def poll_until_finished( - auth_kwargs: dict[str, str], - api_endpoint: ApiEndpoint[Any, R], - result_url_extractor: Optional[Callable[[R], str]] = None, - node_id: Optional[str] = None, -) -> R: - """Polls the Moonvalley API endpoint until the task reaches a terminal state, then returns the response.""" - return await PollingOperation( - poll_endpoint=api_endpoint, - completed_statuses=[ - "completed", - ], - max_poll_attempts=240, # 64 minutes with 16s interval - poll_interval=16.0, - failed_statuses=["error"], - status_extractor=lambda response: ( - response.status if response and response.status else None - ), - auth_kwargs=auth_kwargs, - result_url_extractor=result_url_extractor, - node_id=node_id, - ).execute() - - -def validate_prompts( - prompt: str, negative_prompt: str, max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH -): - """Verifies that the prompt isn't empty and that neither prompt is too long.""" - if not prompt: - raise ValueError("Positive prompt is empty") - if len(prompt) > max_length: - raise ValueError(f"Positive prompt is too long: {len(prompt)} characters") - if negative_prompt and len(negative_prompt) > max_length: - raise ValueError( - f"Negative prompt is too long: {len(negative_prompt)} characters" - ) - return True - - -def validate_input_media(width, height, with_frame_conditioning, num_frames_in=None): - # inference validation - # T = num_frames - # in all cases, the following must be true: T divisible by 16 and H,W by 8. in addition... - # with image conditioning: H*W must be divisible by 8192 - # without image conditioning: T divisible by 32 - if num_frames_in and not num_frames_in % 16 == 0: - return False, ("The input video total frame count must be divisible by 16!") - - if height % 8 != 0 or width % 8 != 0: - return False, ( - f"Height ({height}) and width ({width}) must be " "divisible by 8" - ) - - if with_frame_conditioning: - if (height * width) % 8192 != 0: - return False, ( - f"Height * width ({height * width}) must be " - "divisible by 8192 for frame conditioning" - ) - else: - if num_frames_in and not num_frames_in % 32 == 0: - return False, ("The input video total frame count must be divisible by 32!") - - -def validate_input_image( - image: torch.Tensor, with_frame_conditioning: bool = False -) -> None: - """ - Validates the input image adheres to the expectations of the API: - - The image resolution should not be less than 300*300px - - The aspect ratio of the image should be between 1:2.5 ~ 2.5:1 - - """ - height, width = get_image_dimensions(image) - validate_input_media(width, height, with_frame_conditioning) - validate_image_dimensions( - image, min_width=300, min_height=300, max_height=MAX_HEIGHT, max_width=MAX_WIDTH - ) + raise RuntimeError(error_msg) def validate_video_to_video_input(video: VideoInput) -> VideoInput: @@ -189,7 +77,7 @@ def validate_video_to_video_input(video: VideoInput) -> VideoInput: """ width, height = _get_video_dimensions(video) _validate_video_dimensions(width, height) - _validate_container_format(video) + validate_container_format_is_mp4(video) return _validate_and_trim_duration(video) @@ -214,21 +102,8 @@ def _validate_video_dimensions(width: int, height: int) -> None: } if (width, height) not in supported_resolutions: - supported_list = ", ".join( - [f"{w}x{h}" for w, h in sorted(supported_resolutions)] - ) - raise ValueError( - f"Resolution {width}x{height} not supported. Supported: {supported_list}" - ) - - -def _validate_container_format(video: VideoInput) -> None: - """Validates video container format is MP4.""" - container_format = video.get_container_format() - if container_format not in ["mp4", "mov,mp4,m4a,3gp,3g2,mj2"]: - raise ValueError( - f"Only MP4 container format supported. Got: {container_format}" - ) + supported_list = ", ".join([f"{w}x{h}" for w, h in sorted(supported_resolutions)]) + raise ValueError(f"Resolution {width}x{height} not supported. Supported: {supported_list}") def _validate_and_trim_duration(video: VideoInput) -> VideoInput: @@ -241,7 +116,7 @@ def _validate_and_trim_duration(video: VideoInput) -> VideoInput: def _validate_minimum_duration(duration: float) -> None: """Ensures video is at least 5 seconds long.""" if duration < 5: - raise MoonvalleyApiError("Input video must be at least 5 seconds long.") + raise ValueError("Input video must be at least 5 seconds long.") def _trim_if_too_long(video: VideoInput, duration: float) -> VideoInput: @@ -251,127 +126,6 @@ def _trim_if_too_long(video: VideoInput, duration: float) -> VideoInput: return video -def trim_video(video: VideoInput, duration_sec: float) -> VideoInput: - """ - Returns a new VideoInput object trimmed from the beginning to the specified duration, - using av to avoid loading entire video into memory. - - Args: - video: Input video to trim - duration_sec: Duration in seconds to keep from the beginning - - Returns: - VideoFromFile object that owns the output buffer - """ - output_buffer = io.BytesIO() - - input_container = None - output_container = None - - try: - # Get the stream source - this avoids loading entire video into memory - # when the source is already a file path - input_source = video.get_stream_source() - - # Open containers - input_container = av.open(input_source, mode="r") - output_container = av.open(output_buffer, mode="w", format="mp4") - - # Set up output streams for re-encoding - video_stream = None - audio_stream = None - - for stream in input_container.streams: - logging.info(f"Found stream: type={stream.type}, class={type(stream)}") - if isinstance(stream, av.VideoStream): - # Create output video stream with same parameters - video_stream = output_container.add_stream( - "h264", rate=stream.average_rate - ) - video_stream.width = stream.width - video_stream.height = stream.height - video_stream.pix_fmt = "yuv420p" - logging.info( - f"Added video stream: {stream.width}x{stream.height} @ {stream.average_rate}fps" - ) - elif isinstance(stream, av.AudioStream): - # Create output audio stream with same parameters - audio_stream = output_container.add_stream( - "aac", rate=stream.sample_rate - ) - audio_stream.sample_rate = stream.sample_rate - audio_stream.layout = stream.layout - logging.info( - f"Added audio stream: {stream.sample_rate}Hz, {stream.channels} channels" - ) - - # Calculate target frame count that's divisible by 16 - fps = input_container.streams.video[0].average_rate - estimated_frames = int(duration_sec * fps) - target_frames = ( - estimated_frames // 16 - ) * 16 # Round down to nearest multiple of 16 - - if target_frames == 0: - raise ValueError("Video too short: need at least 16 frames for Moonvalley") - - frame_count = 0 - audio_frame_count = 0 - - # Decode and re-encode video frames - if video_stream: - for frame in input_container.decode(video=0): - if frame_count >= target_frames: - break - - # Re-encode frame - for packet in video_stream.encode(frame): - output_container.mux(packet) - frame_count += 1 - - # Flush encoder - for packet in video_stream.encode(): - output_container.mux(packet) - - logging.info( - f"Encoded {frame_count} video frames (target: {target_frames})" - ) - - # Decode and re-encode audio frames - if audio_stream: - input_container.seek(0) # Reset to beginning for audio - for frame in input_container.decode(audio=0): - if frame.time >= duration_sec: - break - - # Re-encode frame - for packet in audio_stream.encode(frame): - output_container.mux(packet) - audio_frame_count += 1 - - # Flush encoder - for packet in audio_stream.encode(): - output_container.mux(packet) - - logging.info(f"Encoded {audio_frame_count} audio frames") - - # Close containers - output_container.close() - input_container.close() - - # Return as VideoFromFile using the buffer - output_buffer.seek(0) - return InputImpl.VideoFromFile(output_buffer) - - except Exception as e: - # Clean up on error - if input_container is not None: - input_container.close() - if output_container is not None: - output_container.close() - raise RuntimeError(f"Failed to trim video: {str(e)}") from e - - def parse_width_height_from_res(resolution: str): # Accepts a string like "16:9 (1920 x 1080)" and returns width, height as a dict res_map = { @@ -380,7 +134,7 @@ def parse_width_height_from_res(resolution: str): "1:1 (1152 x 1152)": {"width": 1152, "height": 1152}, "4:3 (1536 x 1152)": {"width": 1536, "height": 1152}, "3:4 (1152 x 1536)": {"width": 1152, "height": 1536}, - "21:9 (2560 x 1080)": {"width": 2560, "height": 1080}, + # "21:9 (2560 x 1080)": {"width": 2560, "height": 1080}, } return res_map.get(resolution, {"width": 1920, "height": 1080}) @@ -395,52 +149,47 @@ def parse_control_parameter(value): return control_map.get(value, control_map["Motion Transfer"]) -async def get_response( - task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None -) -> MoonvalleyPromptResponse: - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{API_PROMPTS_ENDPOINT}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=MoonvalleyPromptResponse, - ), - result_url_extractor=get_video_url_from_response, - node_id=node_id, +async def get_response(cls: type[IO.ComfyNode], task_id: str) -> MoonvalleyPromptResponse: + return await poll_op( + cls, + ApiEndpoint(path=f"{API_PROMPTS_ENDPOINT}/{task_id}"), + response_model=MoonvalleyPromptResponse, + status_extractor=lambda r: (r.status if r and r.status else None), + poll_interval=16.0, + max_poll_attempts=240, ) -class MoonvalleyImg2VideoNode(comfy_io.ComfyNode): +class MoonvalleyImg2VideoNode(IO.ComfyNode): @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="MoonvalleyImg2VideoNode", display_name="Moonvalley Marey Image to Video", category="api node/video/Moonvalley Marey", description="Moonvalley Marey Image to Video Node", inputs=[ - comfy_io.Image.Input( + IO.Image.Input( "image", tooltip="The reference image used to generate the video", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", multiline=True, default=" gopro, bright, contrast, static, overexposed, vignette, " - "artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, " - "flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, " - "cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, " - "blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, " - "wobbly, weird, low quality, plastic, stock footage, video camera, boring", + "artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, " + "flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, " + "cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, " + "blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, " + "wobbly, weird, low quality, plastic, stock footage, video camera, boring", tooltip="Negative prompt text", ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=[ "16:9 (1920 x 1080)", @@ -448,42 +197,43 @@ class MoonvalleyImg2VideoNode(comfy_io.ComfyNode): "1:1 (1152 x 1152)", "4:3 (1536 x 1152)", "3:4 (1152 x 1536)", - "21:9 (2560 x 1080)", + # "21:9 (2560 x 1080)", ], default="16:9 (1920 x 1080)", tooltip="Resolution of the output video", ), - comfy_io.Float.Input( + IO.Float.Input( "prompt_adherence", - default=10.0, + default=4.5, min=1.0, max=20.0, step=1.0, tooltip="Guidance scale for generation control", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=9, min=0, max=4294967295, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Random seed value", + control_after_generate=True, ), - comfy_io.Int.Input( + IO.Int.Input( "steps", - default=100, + default=33, min=1, max=100, step=1, tooltip="Number of denoising steps", ), ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -498,108 +248,87 @@ class MoonvalleyImg2VideoNode(comfy_io.ComfyNode): prompt_adherence: float, seed: int, steps: int, - ) -> comfy_io.NodeOutput: - validate_input_image(image, True) - validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) + ) -> IO.NodeOutput: + validate_image_dimensions(image, min_width=300, min_height=300, max_height=MAX_HEIGHT, max_width=MAX_WIDTH) + validate_string(prompt, min_length=1, max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) + validate_string(negative_prompt, field_name="negative_prompt", max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) width_height = parse_width_height_from_res(resolution) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - inference_params = MoonvalleyTextToVideoInferenceParams( negative_prompt=negative_prompt, steps=steps, seed=seed, guidance_scale=prompt_adherence, - num_frames=128, width=width_height["width"], height=width_height["height"], use_negative_prompts=True, ) - """Upload image to comfy backend to have a URL available for further processing""" + # Get MIME type from tensor - assuming PNG format for image tensors mime_type = "image/png" - - image_url = ( - await upload_images_to_comfyapi( - image, max_images=1, auth_kwargs=auth, mime_type=mime_type - ) - )[0] - - request = MoonvalleyTextToVideoRequest( - image_url=image_url, prompt_text=prompt, inference_params=inference_params - ) - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=API_IMG2VIDEO_ENDPOINT, - method=HttpMethod.POST, - request_model=MoonvalleyTextToVideoRequest, - response_model=MoonvalleyPromptResponse, + image_url = (await upload_images_to_comfyapi(cls, image, max_images=1, mime_type=mime_type))[0] + task_creation_response = await sync_op( + cls, + endpoint=ApiEndpoint(path=API_IMG2VIDEO_ENDPOINT, method="POST"), + response_model=MoonvalleyPromptResponse, + data=MoonvalleyTextToVideoRequest( + image_url=image_url, prompt_text=prompt, inference_params=inference_params ), - request=request, - auth_kwargs=auth, ) - task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) - task_id = task_creation_response.id - - final_response = await get_response( - task_id, auth_kwargs=auth, node_id=cls.hidden.unique_id - ) + final_response = await get_response(cls, task_creation_response.id) video = await download_url_to_video_output(final_response.output_url) - return comfy_io.NodeOutput(video) + return IO.NodeOutput(video) -class MoonvalleyVideo2VideoNode(comfy_io.ComfyNode): +class MoonvalleyVideo2VideoNode(IO.ComfyNode): @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="MoonvalleyVideo2VideoNode", display_name="Moonvalley Marey Video to Video", category="api node/video/Moonvalley Marey", description="", inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, tooltip="Describes the video to generate", ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", multiline=True, default=" gopro, bright, contrast, static, overexposed, vignette, " - "artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, " - "flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, " - "cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, " - "blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, " - "wobbly, weird, low quality, plastic, stock footage, video camera, boring", + "artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, " + "flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, " + "cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, " + "blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, " + "wobbly, weird, low quality, plastic, stock footage, video camera, boring", tooltip="Negative prompt text", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=9, min=0, max=4294967295, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Random seed value", control_after_generate=False, ), - comfy_io.Video.Input( + IO.Video.Input( "video", tooltip="The reference video used to generate the output video. Must be at least 5 seconds long. " - "Videos longer than 5s will be automatically trimmed. Only MP4 format supported.", + "Videos longer than 5s will be automatically trimmed. Only MP4 format supported.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "control_type", options=["Motion Transfer", "Pose Transfer"], default="Motion Transfer", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "motion_intensity", default=100, min=0, @@ -608,12 +337,21 @@ class MoonvalleyVideo2VideoNode(comfy_io.ComfyNode): tooltip="Only used if control_type is 'Motion Transfer'", optional=True, ), + IO.Int.Input( + "steps", + default=33, + min=1, + max=100, + step=1, + display_mode=IO.NumberDisplay.number, + tooltip="Number of inference steps", + ), ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -627,17 +365,13 @@ class MoonvalleyVideo2VideoNode(comfy_io.ComfyNode): video: Optional[VideoInput] = None, control_type: str = "Motion Transfer", motion_intensity: Optional[int] = 100, - ) -> comfy_io.NodeOutput: - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - + steps=33, + prompt_adherence=4.5, + ) -> IO.NodeOutput: validated_video = validate_video_to_video_input(video) - video_url = await upload_video_to_comfyapi(validated_video, auth_kwargs=auth) - - """Validate prompts and inference input""" - validate_prompts(prompt, negative_prompt) + video_url = await upload_video_to_comfyapi(cls, validated_video) + validate_string(prompt, min_length=1, max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) + validate_string(negative_prompt, field_name="negative_prompt", max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) # Only include motion_intensity for Motion Transfer control_params = {} @@ -648,65 +382,52 @@ class MoonvalleyVideo2VideoNode(comfy_io.ComfyNode): negative_prompt=negative_prompt, seed=seed, control_params=control_params, + steps=steps, + guidance_scale=prompt_adherence, ) - control = parse_control_parameter(control_type) - - request = MoonvalleyVideoToVideoRequest( - control_type=control, - video_url=video_url, - prompt_text=prompt, - inference_params=inference_params, - ) - - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=API_VIDEO2VIDEO_ENDPOINT, - method=HttpMethod.POST, - request_model=MoonvalleyVideoToVideoRequest, - response_model=MoonvalleyPromptResponse, + task_creation_response = await sync_op( + cls, + endpoint=ApiEndpoint(path=API_VIDEO2VIDEO_ENDPOINT, method="POST"), + response_model=MoonvalleyPromptResponse, + data=MoonvalleyVideoToVideoRequest( + control_type=parse_control_parameter(control_type), + video_url=video_url, + prompt_text=prompt, + inference_params=inference_params, ), - request=request, - auth_kwargs=auth, ) - task_creation_response = await initial_operation.execute() validate_task_creation_response(task_creation_response) - task_id = task_creation_response.id - - final_response = await get_response( - task_id, auth_kwargs=auth, node_id=cls.hidden.unique_id - ) - - video = await download_url_to_video_output(final_response.output_url) - return comfy_io.NodeOutput(video) + final_response = await get_response(cls, task_creation_response.id) + return IO.NodeOutput(await download_url_to_video_output(final_response.output_url)) -class MoonvalleyTxt2VideoNode(comfy_io.ComfyNode): +class MoonvalleyTxt2VideoNode(IO.ComfyNode): @classmethod - def define_schema(cls) -> comfy_io.Schema: - return comfy_io.Schema( + def define_schema(cls) -> IO.Schema: + return IO.Schema( node_id="MoonvalleyTxt2VideoNode", display_name="Moonvalley Marey Text to Video", category="api node/video/Moonvalley Marey", description="", inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", multiline=True, default=" gopro, bright, contrast, static, overexposed, vignette, " - "artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, " - "flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, " - "cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, " - "blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, " - "wobbly, weird, low quality, plastic, stock footage, video camera, boring", + "artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, " + "flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, " + "cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, " + "blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, " + "wobbly, weird, low quality, plastic, stock footage, video camera, boring", tooltip="Negative prompt text", ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=[ "16:9 (1920 x 1080)", @@ -719,37 +440,38 @@ class MoonvalleyTxt2VideoNode(comfy_io.ComfyNode): default="16:9 (1920 x 1080)", tooltip="Resolution of the output video", ), - comfy_io.Float.Input( + IO.Float.Input( "prompt_adherence", - default=10.0, + default=4.0, min=1.0, max=20.0, step=1.0, tooltip="Guidance scale for generation control", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=9, min=0, max=4294967295, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, tooltip="Random seed value", ), - comfy_io.Int.Input( + IO.Int.Input( "steps", - default=100, + default=33, min=1, max=100, step=1, tooltip="Inference steps", ), ], - outputs=[comfy_io.Video.Output()], + outputs=[IO.Video.Output()], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -763,15 +485,11 @@ class MoonvalleyTxt2VideoNode(comfy_io.ComfyNode): prompt_adherence: float, seed: int, steps: int, - ) -> comfy_io.NodeOutput: - validate_prompts(prompt, negative_prompt, MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) + ) -> IO.NodeOutput: + validate_string(prompt, min_length=1, max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) + validate_string(negative_prompt, field_name="negative_prompt", max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH) width_height = parse_width_height_from_res(resolution) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - inference_params = MoonvalleyTextToVideoInferenceParams( negative_prompt=negative_prompt, steps=steps, @@ -781,35 +499,21 @@ class MoonvalleyTxt2VideoNode(comfy_io.ComfyNode): width=width_height["width"], height=width_height["height"], ) - request = MoonvalleyTextToVideoRequest( - prompt_text=prompt, inference_params=inference_params - ) - init_op = SynchronousOperation( - endpoint=ApiEndpoint( - path=API_TXT2VIDEO_ENDPOINT, - method=HttpMethod.POST, - request_model=MoonvalleyTextToVideoRequest, - response_model=MoonvalleyPromptResponse, - ), - request=request, - auth_kwargs=auth, + task_creation_response = await sync_op( + cls, + endpoint=ApiEndpoint(path=API_TXT2VIDEO_ENDPOINT, method="POST"), + response_model=MoonvalleyPromptResponse, + data=MoonvalleyTextToVideoRequest(prompt_text=prompt, inference_params=inference_params), ) - task_creation_response = await init_op.execute() validate_task_creation_response(task_creation_response) - task_id = task_creation_response.id - - final_response = await get_response( - task_id, auth_kwargs=auth, node_id=cls.hidden.unique_id - ) - - video = await download_url_to_video_output(final_response.output_url) - return comfy_io.NodeOutput(video) + final_response = await get_response(cls, task_creation_response.id) + return IO.NodeOutput(await download_url_to_video_output(final_response.output_url)) class MoonvalleyExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ MoonvalleyImg2VideoNode, MoonvalleyTxt2VideoNode, diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py index e3b81de75..acf35d276 100644 --- a/comfy_api_nodes/nodes_openai.py +++ b/comfy_api_nodes/nodes_openai.py @@ -1,18 +1,19 @@ -import io -from typing import TypedDict, Optional +from io import BytesIO +from typing import Optional, Union import json import os import time -import re import uuid from enum import Enum from inspect import cleandoc import numpy as np import torch from PIL import Image -from comfy.comfy_types.node_typing import IO, ComfyNodeABC, InputTypeDict from server import PromptServer import folder_paths +import base64 +from comfy_api.latest import IO, ComfyExtension +from typing_extensions import override from comfy_api_nodes.apis import ( @@ -23,7 +24,6 @@ from comfy_api_nodes.apis import ( OpenAIResponse, CreateModelResponseProperties, Item, - Includable, OutputContent, InputImageContent, Detail, @@ -34,43 +34,22 @@ from comfy_api_nodes.apis import ( InputFileContent, ) -from comfy_api_nodes.apis.client import ( - ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, - EmptyRequest, -) - -from comfy_api_nodes.apinode_utils import ( +from comfy_api_nodes.util import ( downscale_image_tensor, - validate_and_cast_response, + download_url_to_bytesio, validate_string, tensor_to_base64_string, + ApiEndpoint, + sync_op, + poll_op, text_filepath_to_data_uri, ) -from comfy_api_nodes.mapper_utils import model_field_to_node_input RESPONSES_ENDPOINT = "/proxy/openai/v1/responses" STARTING_POINT_ID_PATTERN = r"" -class HistoryEntry(TypedDict): - """Type definition for a single history entry in the chat.""" - - prompt: str - response: str - response_id: str - timestamp: float - - -class ChatHistory(TypedDict): - """Type definition for the chat history dictionary.""" - - __annotations__: dict[str, list[HistoryEntry]] - - class SupportedOpenAIModel(str, Enum): o4_mini = "o4-mini" o1 = "o1" @@ -85,98 +64,123 @@ class SupportedOpenAIModel(str, Enum): gpt_5_nano = "gpt-5-nano" -class OpenAIDalle2(ComfyNodeABC): +async def validate_and_cast_response(response, timeout: int = None) -> torch.Tensor: + """Validates and casts a response to a torch.Tensor. + + Args: + response: The response to validate and cast. + timeout: Request timeout in seconds. Defaults to None (no timeout). + + Returns: + A torch.Tensor representing the image (1, H, W, C). + + Raises: + ValueError: If the response is not valid. + """ + # validate raw JSON response + data = response.data + if not data or len(data) == 0: + raise ValueError("No images returned from API endpoint") + + # Initialize list to store image tensors + image_tensors: list[torch.Tensor] = [] + + # Process each image in the data array + for img_data in data: + if img_data.b64_json: + img_io = BytesIO(base64.b64decode(img_data.b64_json)) + elif img_data.url: + img_io = BytesIO() + await download_url_to_bytesio(img_data.url, img_io, timeout=timeout) + else: + raise ValueError("Invalid image payload – neither URL nor base64 data present.") + + pil_img = Image.open(img_io).convert("RGBA") + arr = np.asarray(pil_img).astype(np.float32) / 255.0 + image_tensors.append(torch.from_numpy(arr)) + + return torch.stack(image_tensors, dim=0) + + +class OpenAIDalle2(IO.ComfyNode): """ Generates images synchronously via OpenAI's DALL·E 2 endpoint. """ - def __init__(self): - pass + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="OpenAIDalle2", + display_name="OpenAI DALL·E 2", + category="api node/image/OpenAI", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + IO.String.Input( + "prompt", + default="", + multiline=True, + tooltip="Text prompt for DALL·E", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2**31 - 1, + step=1, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + tooltip="not implemented yet in backend", + optional=True, + ), + IO.Combo.Input( + "size", + default="1024x1024", + options=["256x256", "512x512", "1024x1024"], + tooltip="Image size", + optional=True, + ), + IO.Int.Input( + "n", + default=1, + min=1, + max=8, + step=1, + tooltip="How many images to generate", + display_mode=IO.NumberDisplay.number, + optional=True, + ), + IO.Image.Input( + "image", + tooltip="Optional reference image for image editing.", + optional=True, + ), + IO.Mask.Input( + "mask", + tooltip="Optional mask for inpainting (white areas will be replaced)", + optional=True, + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Text prompt for DALL·E", - }, - ), - }, - "optional": { - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2**31 - 1, - "step": 1, - "display": "number", - "control_after_generate": True, - "tooltip": "not implemented yet in backend", - }, - ), - "size": ( - IO.COMBO, - { - "options": ["256x256", "512x512", "1024x1024"], - "default": "1024x1024", - "tooltip": "Image size", - }, - ), - "n": ( - IO.INT, - { - "default": 1, - "min": 1, - "max": 8, - "step": 1, - "display": "number", - "tooltip": "How many images to generate", - }, - ), - "image": ( - IO.IMAGE, - { - "default": None, - "tooltip": "Optional reference image for image editing.", - }, - ), - "mask": ( - IO.MASK, - { - "default": None, - "tooltip": "Optional mask for inpainting (white areas will be replaced)", - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - RETURN_TYPES = (IO.IMAGE,) - FUNCTION = "api_call" - CATEGORY = "api node/image/OpenAI" - DESCRIPTION = cleandoc(__doc__ or "") - API_NODE = True - - async def api_call( - self, + async def execute( + cls, prompt, seed=0, image=None, mask=None, n=1, size="1024x1024", - unique_id=None, - **kwargs, - ): + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) model = "dall-e-2" path = "/proxy/openai/images/generations" @@ -202,7 +206,7 @@ class OpenAIDalle2(ComfyNodeABC): image_np = (rgba_tensor.numpy() * 255).astype(np.uint8) img = Image.fromarray(image_np) - img_byte_arr = io.BytesIO() + img_byte_arr = BytesIO() img.save(img_byte_arr, format="PNG") img_byte_arr.seek(0) img_binary = img_byte_arr # .getvalue() @@ -210,15 +214,11 @@ class OpenAIDalle2(ComfyNodeABC): elif image is not None or mask is not None: raise Exception("Dall-E 2 image editing requires an image AND a mask") - # Build the operation - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=path, - method=HttpMethod.POST, - request_model=request_class, - response_model=OpenAIImageGenerationResponse, - ), - request=request_class( + response = await sync_op( + cls, + ApiEndpoint(path=path, method="POST"), + response_model=OpenAIImageGenerationResponse, + data=request_class( model=model, prompt=prompt, n=n, @@ -227,115 +227,98 @@ class OpenAIDalle2(ComfyNodeABC): ), files=( { - "image": img_binary, + "image": ("image.png", img_binary, "image/png"), } if img_binary else None ), content_type=content_type, - auth_kwargs=kwargs, ) - response = await operation.execute() - - img_tensor = await validate_and_cast_response(response, node_id=unique_id) - return (img_tensor,) + return IO.NodeOutput(await validate_and_cast_response(response)) -class OpenAIDalle3(ComfyNodeABC): +class OpenAIDalle3(IO.ComfyNode): """ Generates images synchronously via OpenAI's DALL·E 3 endpoint. """ - def __init__(self): - pass + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="OpenAIDalle3", + display_name="OpenAI DALL·E 3", + category="api node/image/OpenAI", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + IO.String.Input( + "prompt", + default="", + multiline=True, + tooltip="Text prompt for DALL·E", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2 ** 31 - 1, + step=1, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + tooltip="not implemented yet in backend", + optional=True, + ), + IO.Combo.Input( + "quality", + default="standard", + options=["standard", "hd"], + tooltip="Image quality", + optional=True, + ), + IO.Combo.Input( + "style", + default="natural", + options=["natural", "vivid"], + tooltip="Vivid causes the model to lean towards generating hyper-real and dramatic images. Natural causes the model to produce more natural, less hyper-real looking images.", + optional=True, + ), + IO.Combo.Input( + "size", + default="1024x1024", + options=["1024x1024", "1024x1792", "1792x1024"], + tooltip="Image size", + optional=True, + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Text prompt for DALL·E", - }, - ), - }, - "optional": { - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2**31 - 1, - "step": 1, - "display": "number", - "control_after_generate": True, - "tooltip": "not implemented yet in backend", - }, - ), - "quality": ( - IO.COMBO, - { - "options": ["standard", "hd"], - "default": "standard", - "tooltip": "Image quality", - }, - ), - "style": ( - IO.COMBO, - { - "options": ["natural", "vivid"], - "default": "natural", - "tooltip": "Vivid causes the model to lean towards generating hyper-real and dramatic images. Natural causes the model to produce more natural, less hyper-real looking images.", - }, - ), - "size": ( - IO.COMBO, - { - "options": ["1024x1024", "1024x1792", "1792x1024"], - "default": "1024x1024", - "tooltip": "Image size", - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - RETURN_TYPES = (IO.IMAGE,) - FUNCTION = "api_call" - CATEGORY = "api node/image/OpenAI" - DESCRIPTION = cleandoc(__doc__ or "") - API_NODE = True - - async def api_call( - self, + async def execute( + cls, prompt, seed=0, style="natural", quality="standard", size="1024x1024", - unique_id=None, - **kwargs, - ): + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) model = "dall-e-3" # build the operation - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/openai/images/generations", - method=HttpMethod.POST, - request_model=OpenAIImageGenerationRequest, - response_model=OpenAIImageGenerationResponse, - ), - request=OpenAIImageGenerationRequest( + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/openai/images/generations", method="POST"), + response_model=OpenAIImageGenerationResponse, + data=OpenAIImageGenerationRequest( model=model, prompt=prompt, quality=quality, @@ -343,114 +326,97 @@ class OpenAIDalle3(ComfyNodeABC): style=style, seed=seed, ), - auth_kwargs=kwargs, ) - response = await operation.execute() - - img_tensor = await validate_and_cast_response(response, node_id=unique_id) - return (img_tensor,) + return IO.NodeOutput(await validate_and_cast_response(response)) -class OpenAIGPTImage1(ComfyNodeABC): +class OpenAIGPTImage1(IO.ComfyNode): """ Generates images synchronously via OpenAI's GPT Image 1 endpoint. """ - def __init__(self): - pass + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="OpenAIGPTImage1", + display_name="OpenAI GPT Image 1", + category="api node/image/OpenAI", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + IO.String.Input( + "prompt", + default="", + multiline=True, + tooltip="Text prompt for GPT Image 1", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2 ** 31 - 1, + step=1, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + tooltip="not implemented yet in backend", + optional=True, + ), + IO.Combo.Input( + "quality", + default="low", + options=["low", "medium", "high"], + tooltip="Image quality, affects cost and generation time.", + optional=True, + ), + IO.Combo.Input( + "background", + default="opaque", + options=["opaque", "transparent"], + tooltip="Return image with or without background", + optional=True, + ), + IO.Combo.Input( + "size", + default="auto", + options=["auto", "1024x1024", "1024x1536", "1536x1024"], + tooltip="Image size", + optional=True, + ), + IO.Int.Input( + "n", + default=1, + min=1, + max=8, + step=1, + tooltip="How many images to generate", + display_mode=IO.NumberDisplay.number, + optional=True, + ), + IO.Image.Input( + "image", + tooltip="Optional reference image for image editing.", + optional=True, + ), + IO.Mask.Input( + "mask", + tooltip="Optional mask for inpainting (white areas will be replaced)", + optional=True, + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Text prompt for GPT Image 1", - }, - ), - }, - "optional": { - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2**31 - 1, - "step": 1, - "display": "number", - "control_after_generate": True, - "tooltip": "not implemented yet in backend", - }, - ), - "quality": ( - IO.COMBO, - { - "options": ["low", "medium", "high"], - "default": "low", - "tooltip": "Image quality, affects cost and generation time.", - }, - ), - "background": ( - IO.COMBO, - { - "options": ["opaque", "transparent"], - "default": "opaque", - "tooltip": "Return image with or without background", - }, - ), - "size": ( - IO.COMBO, - { - "options": ["auto", "1024x1024", "1024x1536", "1536x1024"], - "default": "auto", - "tooltip": "Image size", - }, - ), - "n": ( - IO.INT, - { - "default": 1, - "min": 1, - "max": 8, - "step": 1, - "display": "number", - "tooltip": "How many images to generate", - }, - ), - "image": ( - IO.IMAGE, - { - "default": None, - "tooltip": "Optional reference image for image editing.", - }, - ), - "mask": ( - IO.MASK, - { - "default": None, - "tooltip": "Optional mask for inpainting (white areas will be replaced)", - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - RETURN_TYPES = (IO.IMAGE,) - FUNCTION = "api_call" - CATEGORY = "api node/image/OpenAI" - DESCRIPTION = cleandoc(__doc__ or "") - API_NODE = True - - async def api_call( - self, + async def execute( + cls, prompt, seed=0, quality="low", @@ -459,9 +425,7 @@ class OpenAIGPTImage1(ComfyNodeABC): mask=None, n=1, size="1024x1024", - unique_id=None, - **kwargs, - ): + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) model = "gpt-image-1" path = "/proxy/openai/images/generations" @@ -482,7 +446,7 @@ class OpenAIGPTImage1(ComfyNodeABC): image_np = (scaled_image.numpy() * 255).astype(np.uint8) img = Image.fromarray(image_np) - img_byte_arr = io.BytesIO() + img_byte_arr = BytesIO() img.save(img_byte_arr, format="PNG") img_byte_arr.seek(0) @@ -506,20 +470,17 @@ class OpenAIGPTImage1(ComfyNodeABC): mask_np = (scaled_mask.numpy() * 255).astype(np.uint8) mask_img = Image.fromarray(mask_np) - mask_img_byte_arr = io.BytesIO() + mask_img_byte_arr = BytesIO() mask_img.save(mask_img_byte_arr, format="PNG") mask_img_byte_arr.seek(0) files.append(("mask", ("mask.png", mask_img_byte_arr, "image/png"))) # Build the operation - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=path, - method=HttpMethod.POST, - request_model=request_class, - response_model=OpenAIImageGenerationResponse, - ), - request=request_class( + response = await sync_op( + cls, + ApiEndpoint(path=path, method="POST"), + response_model=OpenAIImageGenerationResponse, + data=request_class( model=model, prompt=prompt, quality=quality, @@ -530,127 +491,70 @@ class OpenAIGPTImage1(ComfyNodeABC): ), files=files if files else None, content_type=content_type, - auth_kwargs=kwargs, ) - response = await operation.execute() - - img_tensor = await validate_and_cast_response(response, node_id=unique_id) - return (img_tensor,) + return IO.NodeOutput(await validate_and_cast_response(response)) -class OpenAITextNode(ComfyNodeABC): - """ - Base class for OpenAI text generation nodes. - """ - - RETURN_TYPES = (IO.STRING,) - FUNCTION = "api_call" - CATEGORY = "api node/text/OpenAI" - API_NODE = True - - -class OpenAIChatNode(OpenAITextNode): +class OpenAIChatNode(IO.ComfyNode): """ Node to generate text responses from an OpenAI model. """ - def __init__(self) -> None: - """Initialize the chat node with a new session ID and empty history.""" - self.current_session_id: str = str(uuid.uuid4()) - self.history: dict[str, list[HistoryEntry]] = {} - self.previous_response_id: Optional[str] = None + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="OpenAIChatNode", + display_name="OpenAI ChatGPT", + category="api node/text/OpenAI", + description="Generate text responses from an OpenAI model.", + inputs=[ + IO.String.Input( + "prompt", + default="", + multiline=True, + tooltip="Text inputs to the model, used to generate a response.", + ), + IO.Boolean.Input( + "persist_context", + default=False, + tooltip="This parameter is deprecated and has no effect.", + ), + IO.Combo.Input( + "model", + options=SupportedOpenAIModel, + tooltip="The model used to generate the response", + ), + IO.Image.Input( + "images", + tooltip="Optional image(s) to use as context for the model. To include multiple images, you can use the Batch Images node.", + optional=True, + ), + IO.Custom("OPENAI_INPUT_FILES").Input( + "files", + optional=True, + tooltip="Optional file(s) to use as context for the model. Accepts inputs from the OpenAI Chat Input Files node.", + ), + IO.Custom("OPENAI_CHAT_CONFIG").Input( + "advanced_options", + optional=True, + tooltip="Optional configuration for the model. Accepts inputs from the OpenAI Chat Advanced Options node.", + ), + ], + outputs=[ + IO.String.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Text inputs to the model, used to generate a response.", - }, - ), - "persist_context": ( - IO.BOOLEAN, - { - "default": True, - "tooltip": "Persist chat context between calls (multi-turn conversation)", - }, - ), - "model": model_field_to_node_input( - IO.COMBO, - OpenAICreateResponse, - "model", - enum_type=SupportedOpenAIModel, - ), - }, - "optional": { - "images": ( - IO.IMAGE, - { - "default": None, - "tooltip": "Optional image(s) to use as context for the model. To include multiple images, you can use the Batch Images node.", - }, - ), - "files": ( - "OPENAI_INPUT_FILES", - { - "default": None, - "tooltip": "Optional file(s) to use as context for the model. Accepts inputs from the OpenAI Chat Input Files node.", - }, - ), - "advanced_options": ( - "OPENAI_CHAT_CONFIG", - { - "default": None, - "tooltip": "Optional configuration for the model. Accepts inputs from the OpenAI Chat Advanced Options node.", - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - DESCRIPTION = "Generate text responses from an OpenAI model." - - async def get_result_response( - self, - response_id: str, - include: Optional[list[Includable]] = None, - auth_kwargs: Optional[dict[str, str]] = None, - ) -> OpenAIResponse: - """ - Retrieve a model response with the given ID from the OpenAI API. - - Args: - response_id (str): The ID of the response to retrieve. - include (Optional[List[Includable]]): Additional fields to include - in the response. See the `include` parameter for Response - creation above for more information. - - """ - return await PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"{RESPONSES_ENDPOINT}/{response_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=OpenAIResponse, - query_params={"include": include}, - ), - completed_statuses=["completed"], - failed_statuses=["failed"], - status_extractor=lambda response: response.status, - auth_kwargs=auth_kwargs, - ).execute() - def get_message_content_from_response( - self, response: OpenAIResponse + cls, response: OpenAIResponse ) -> list[OutputContent]: """Extract message content from the API response.""" for output in response.output: @@ -658,8 +562,9 @@ class OpenAIChatNode(OpenAITextNode): return output.root.content raise TypeError("No output message found in response") + @classmethod def get_text_from_message_content( - self, message_content: list[OutputContent] + cls, message_content: list[OutputContent] ) -> str: """Extract text content from message content.""" for content_item in message_content: @@ -667,58 +572,9 @@ class OpenAIChatNode(OpenAITextNode): return str(content_item.root.text) return "No text output found in response" - def get_history_text(self, session_id: str) -> str: - """Convert the entire history for a given session to JSON string.""" - return json.dumps(self.history[session_id]) - - def display_history_on_node(self, session_id: str, node_id: str) -> None: - """Display formatted chat history on the node UI.""" - render_spec = { - "node_id": node_id, - "component": "ChatHistoryWidget", - "props": { - "history": self.get_history_text(session_id), - }, - } - PromptServer.instance.send_sync( - "display_component", - render_spec, - ) - - def add_to_history( - self, session_id: str, prompt: str, output_text: str, response_id: str - ) -> None: - """Add a new entry to the chat history.""" - if session_id not in self.history: - self.history[session_id] = [] - self.history[session_id].append( - { - "prompt": prompt, - "response": output_text, - "response_id": response_id, - "timestamp": time.time(), - } - ) - - def parse_output_text_from_response(self, response: OpenAIResponse) -> str: - """Extract text output from the API response.""" - message_contents = self.get_message_content_from_response(response) - return self.get_text_from_message_content(message_contents) - - def generate_new_session_id(self) -> str: - """Generate a new unique session ID.""" - return str(uuid.uuid4()) - - def get_session_id(self, persist_context: bool) -> str: - """Get the current or generate a new session ID based on context persistence.""" - return ( - self.current_session_id - if persist_context - else self.generate_new_session_id() - ) - + @classmethod def tensor_to_input_image_content( - self, image: torch.Tensor, detail_level: Detail = "auto" + cls, image: torch.Tensor, detail_level: Detail = "auto" ) -> InputImageContent: """Convert a tensor to an input image content object.""" return InputImageContent( @@ -727,21 +583,27 @@ class OpenAIChatNode(OpenAITextNode): type="input_image", ) + @classmethod def create_input_message_contents( - self, + cls, prompt: str, image: Optional[torch.Tensor] = None, files: Optional[list[InputFileContent]] = None, ) -> InputMessageContentList: """Create a list of input message contents from prompt and optional image.""" - content_list: list[InputContent] = [ + content_list: list[Union[InputContent, InputTextContent, InputImageContent, InputFileContent]] = [ InputTextContent(text=prompt, type="input_text"), ] if image is not None: for i in range(image.shape[0]): content_list.append( - self.tensor_to_input_image_content(image[i].unsqueeze(0)) + InputImageContent( + detail="auto", + image_url=f"data:image/png;base64,{tensor_to_base64_string(image[i].unsqueeze(0))}", + type="input_image", + ) ) + if files is not None: content_list.extend(files) @@ -749,80 +611,28 @@ class OpenAIChatNode(OpenAITextNode): root=content_list, ) - def parse_response_id_from_prompt(self, prompt: str) -> Optional[str]: - """Extract response ID from prompt if it exists.""" - parsed_id = re.search(STARTING_POINT_ID_PATTERN, prompt) - return parsed_id.group(1) if parsed_id else None - - def strip_response_tag_from_prompt(self, prompt: str) -> str: - """Remove the response ID tag from the prompt.""" - return re.sub(STARTING_POINT_ID_PATTERN, "", prompt.strip()) - - def delete_history_after_response_id( - self, new_start_id: str, session_id: str - ) -> None: - """Delete history entries after a specific response ID.""" - if session_id not in self.history: - return - - new_history = [] - i = 0 - while ( - i < len(self.history[session_id]) - and self.history[session_id][i]["response_id"] != new_start_id - ): - new_history.append(self.history[session_id][i]) - i += 1 - - # Since it's the new starting point (not the response being edited), we include it as well - if i < len(self.history[session_id]): - new_history.append(self.history[session_id][i]) - - self.history[session_id] = new_history - - async def api_call( - self, + @classmethod + async def execute( + cls, prompt: str, - persist_context: bool, - model: SupportedOpenAIModel, - unique_id: Optional[str] = None, + persist_context: bool = False, + model: SupportedOpenAIModel = SupportedOpenAIModel.gpt_5.value, images: Optional[torch.Tensor] = None, files: Optional[list[InputFileContent]] = None, advanced_options: Optional[CreateModelResponseProperties] = None, - **kwargs, - ) -> tuple[str]: - # Validate inputs + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) - session_id = self.get_session_id(persist_context) - response_id_override = self.parse_response_id_from_prompt(prompt) - if response_id_override: - is_starting_from_beginning = response_id_override == "start" - if is_starting_from_beginning: - self.history[session_id] = [] - previous_response_id = None - else: - previous_response_id = response_id_override - self.delete_history_after_response_id(response_id_override, session_id) - prompt = self.strip_response_tag_from_prompt(prompt) - elif persist_context: - previous_response_id = self.previous_response_id - else: - previous_response_id = None - # Create response - create_response = await SynchronousOperation( - endpoint=ApiEndpoint( - path=RESPONSES_ENDPOINT, - method=HttpMethod.POST, - request_model=OpenAICreateResponse, - response_model=OpenAIResponse, - ), - request=OpenAICreateResponse( + create_response = await sync_op( + cls, + ApiEndpoint(path=RESPONSES_ENDPOINT, method="POST"), + response_model=OpenAIResponse, + data=OpenAICreateResponse( input=[ Item( root=InputMessage( - content=self.create_input_message_contents( + content=cls.create_input_message_contents( prompt, images, files ), role="user", @@ -832,36 +642,57 @@ class OpenAIChatNode(OpenAITextNode): store=True, stream=False, model=model, - previous_response_id=previous_response_id, + previous_response_id=None, **( advanced_options.model_dump(exclude_none=True) if advanced_options else {} ), ), - auth_kwargs=kwargs, - ).execute() + ) response_id = create_response.id # Get result output - result_response = await self.get_result_response(response_id, auth_kwargs=kwargs) - output_text = self.parse_output_text_from_response(result_response) + result_response = await poll_op( + cls, + ApiEndpoint(path=f"{RESPONSES_ENDPOINT}/{response_id}"), + response_model=OpenAIResponse, + status_extractor=lambda response: response.status, + completed_statuses=["incomplete", "completed"] + ) + output_text = cls.get_text_from_message_content(cls.get_message_content_from_response(result_response)) # Update history - self.add_to_history(session_id, prompt, output_text, response_id) - self.display_history_on_node(session_id, unique_id) - self.previous_response_id = response_id - - return (output_text,) + render_spec = { + "node_id": cls.hidden.unique_id, + "component": "ChatHistoryWidget", + "props": { + "history": json.dumps( + [ + { + "prompt": prompt, + "response": output_text, + "response_id": str(uuid.uuid4()), + "timestamp": time.time(), + } + ] + ), + }, + } + PromptServer.instance.send_sync( + "display_component", + render_spec, + ) + return IO.NodeOutput(output_text) -class OpenAIInputFiles(ComfyNodeABC): +class OpenAIInputFiles(IO.ComfyNode): """ Loads and formats input files for OpenAI API. """ @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: + def define_schema(cls): """ For details about the supported file input types, see: https://platform.openai.com/docs/guides/pdf-files?api-mode=responses @@ -876,97 +707,92 @@ class OpenAIInputFiles(ComfyNodeABC): ] input_files = sorted(input_files, key=lambda x: x.name) input_files = [f.name for f in input_files] - return { - "required": { - "file": ( - IO.COMBO, - { - "tooltip": "Input files to include as context for the model. Only accepts text (.txt) and PDF (.pdf) files for now.", - "options": input_files, - "default": input_files[0] if input_files else None, - }, + return IO.Schema( + node_id="OpenAIInputFiles", + display_name="OpenAI ChatGPT Input Files", + category="api node/text/OpenAI", + description="Loads and prepares input files (text, pdf, etc.) to include as inputs for the OpenAI Chat Node. The files will be read by the OpenAI model when generating a response. 🛈 TIP: Can be chained together with other OpenAI Input File nodes.", + inputs=[ + IO.Combo.Input( + "file", + options=input_files, + default=input_files[0] if input_files else None, + tooltip="Input files to include as context for the model. Only accepts text (.txt) and PDF (.pdf) files for now.", ), - }, - "optional": { - "OPENAI_INPUT_FILES": ( + IO.Custom("OPENAI_INPUT_FILES").Input( "OPENAI_INPUT_FILES", - { - "tooltip": "An optional additional file(s) to batch together with the file loaded from this node. Allows chaining of input files so that a single message can include multiple input files.", - "default": None, - }, + tooltip="An optional additional file(s) to batch together with the file loaded from this node. Allows chaining of input files so that a single message can include multiple input files.", + optional=True, ), - }, - } + ], + outputs=[ + IO.Custom("OPENAI_INPUT_FILES").Output(), + ], + ) - DESCRIPTION = "Loads and prepares input files (text, pdf, etc.) to include as inputs for the OpenAI Chat Node. The files will be read by the OpenAI model when generating a response. 🛈 TIP: Can be chained together with other OpenAI Input File nodes." - RETURN_TYPES = ("OPENAI_INPUT_FILES",) - FUNCTION = "prepare_files" - CATEGORY = "api node/text/OpenAI" - - def create_input_file_content(self, file_path: str) -> InputFileContent: + @classmethod + def create_input_file_content(cls, file_path: str) -> InputFileContent: return InputFileContent( file_data=text_filepath_to_data_uri(file_path), filename=os.path.basename(file_path), type="input_file", ) - def prepare_files( - self, file: str, OPENAI_INPUT_FILES: list[InputFileContent] = [] - ) -> tuple[list[InputFileContent]]: + @classmethod + def execute(cls, file: str, OPENAI_INPUT_FILES: list[InputFileContent] = []) -> IO.NodeOutput: """ Loads and formats input files for OpenAI API. """ file_path = folder_paths.get_annotated_filepath(file) - input_file_content = self.create_input_file_content(file_path) + input_file_content = cls.create_input_file_content(file_path) files = [input_file_content] + OPENAI_INPUT_FILES - return (files,) + return IO.NodeOutput(files) -class OpenAIChatConfig(ComfyNodeABC): +class OpenAIChatConfig(IO.ComfyNode): """Allows setting additional configuration for the OpenAI Chat Node.""" - RETURN_TYPES = ("OPENAI_CHAT_CONFIG",) - FUNCTION = "configure" - DESCRIPTION = ( - "Allows specifying advanced configuration options for the OpenAI Chat Nodes." - ) - CATEGORY = "api node/text/OpenAI" - @classmethod - def INPUT_TYPES(cls) -> InputTypeDict: - return { - "required": { - "truncation": ( - IO.COMBO, - { - "options": ["auto", "disabled"], - "default": "auto", - "tooltip": "The truncation strategy to use for the model response. auto: If the context of this response and previous ones exceeds the model's context window size, the model will truncate the response to fit the context window by dropping input items in the middle of the conversation.disabled: If a model response will exceed the context window size for a model, the request will fail with a 400 error", - }, + def define_schema(cls): + return IO.Schema( + node_id="OpenAIChatConfig", + display_name="OpenAI ChatGPT Advanced Options", + category="api node/text/OpenAI", + description="Allows specifying advanced configuration options for the OpenAI Chat Nodes.", + inputs=[ + IO.Combo.Input( + "truncation", + options=["auto", "disabled"], + default="auto", + tooltip="The truncation strategy to use for the model response. auto: If the context of this response and previous ones exceeds the model's context window size, the model will truncate the response to fit the context window by dropping input items in the middle of the conversation.disabled: If a model response will exceed the context window size for a model, the request will fail with a 400 error", ), - }, - "optional": { - "max_output_tokens": model_field_to_node_input( - IO.INT, - OpenAICreateResponse, + IO.Int.Input( "max_output_tokens", min=16, default=4096, max=16384, tooltip="An upper bound for the number of tokens that can be generated for a response, including visible output tokens", + optional=True, ), - "instructions": model_field_to_node_input( - IO.STRING, OpenAICreateResponse, "instructions", multiline=True + IO.String.Input( + "instructions", + multiline=True, + optional=True, + tooltip="Instructions for the model on how to generate the response", ), - }, - } + ], + outputs=[ + IO.Custom("OPENAI_CHAT_CONFIG").Output(), + ], + ) - def configure( - self, + @classmethod + def execute( + cls, truncation: bool, instructions: Optional[str] = None, max_output_tokens: Optional[int] = None, - ) -> tuple[CreateModelResponseProperties]: + ) -> IO.NodeOutput: """ Configure advanced options for the OpenAI Chat Node. @@ -976,29 +802,27 @@ class OpenAIChatConfig(ComfyNodeABC): They are not exposed as inputs at all to avoid having to manually remove depending on model choice. """ - return ( + return IO.NodeOutput( CreateModelResponseProperties( instructions=instructions, truncation=truncation, max_output_tokens=max_output_tokens, - ), + ) ) -NODE_CLASS_MAPPINGS = { - "OpenAIDalle2": OpenAIDalle2, - "OpenAIDalle3": OpenAIDalle3, - "OpenAIGPTImage1": OpenAIGPTImage1, - "OpenAIChatNode": OpenAIChatNode, - "OpenAIInputFiles": OpenAIInputFiles, - "OpenAIChatConfig": OpenAIChatConfig, -} +class OpenAIExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + OpenAIDalle2, + OpenAIDalle3, + OpenAIGPTImage1, + OpenAIChatNode, + OpenAIInputFiles, + OpenAIChatConfig, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "OpenAIDalle2": "OpenAI DALL·E 2", - "OpenAIDalle3": "OpenAI DALL·E 3", - "OpenAIGPTImage1": "OpenAI GPT Image 1", - "OpenAIChatNode": "OpenAI ChatGPT", - "OpenAIInputFiles": "OpenAI ChatGPT Input Files", - "OpenAIChatConfig": "OpenAI ChatGPT Advanced Options", -} + +async def comfy_entrypoint() -> OpenAIExtension: + return OpenAIExtension() diff --git a/comfy_api_nodes/nodes_pika.py b/comfy_api_nodes/nodes_pika.py index a8dc43cb3..51148211b 100644 --- a/comfy_api_nodes/nodes_pika.py +++ b/comfy_api_nodes/nodes_pika.py @@ -5,45 +5,25 @@ Pika API docs: https://pika-827374fb.mintlify.app/api-reference """ from __future__ import annotations -import io +from io import BytesIO import logging -from typing import Optional, TypeVar +from typing import Optional -import numpy as np import torch -from comfy.comfy_types.node_typing import IO, ComfyNodeABC, InputTypeOptions -from comfy_api.input_impl import VideoFromFile +from typing_extensions import override +from comfy_api.latest import ComfyExtension, IO from comfy_api.input_impl.video_types import VideoCodec, VideoContainer, VideoInput -from comfy_api_nodes.apinode_utils import ( +from comfy_api_nodes.apis import pika_api as pika_defs +from comfy_api_nodes.util import ( + validate_string, download_url_to_video_output, tensor_to_bytesio, -) -from comfy_api_nodes.apis import ( - IngredientsMode, - PikaBodyGenerate22C2vGenerate22PikascenesPost, - PikaBodyGenerate22I2vGenerate22I2vPost, - PikaBodyGenerate22KeyframeGenerate22PikaframesPost, - PikaBodyGenerate22T2vGenerate22T2vPost, - PikaBodyGeneratePikadditionsGeneratePikadditionsPost, - PikaBodyGeneratePikaffectsGeneratePikaffectsPost, - PikaBodyGeneratePikaswapsGeneratePikaswapsPost, - PikaDurationEnum, - Pikaffect, - PikaGenerateResponse, - PikaResolutionEnum, - PikaVideoResponse, -) -from comfy_api_nodes.apis.client import ( ApiEndpoint, - EmptyRequest, - HttpMethod, - PollingOperation, - SynchronousOperation, + sync_op, + poll_op, ) -from comfy_api_nodes.mapper_utils import model_field_to_node_input -R = TypeVar("R") PATH_PIKADDITIONS = "/proxy/pika/generate/pikadditions" PATH_PIKASWAPS = "/proxy/pika/generate/pikaswaps" @@ -58,248 +38,137 @@ PATH_PIKASCENES = f"/proxy/pika/generate/{PIKA_API_VERSION}/pikascenes" PATH_VIDEO_GET = "/proxy/pika/videos" -class PikaApiError(Exception): - """Exception for Pika API errors.""" - - pass +async def execute_task( + task_id: str, + cls: type[IO.ComfyNode], +) -> IO.NodeOutput: + final_response: pika_defs.PikaVideoResponse = await poll_op( + cls, + ApiEndpoint(path=f"{PATH_VIDEO_GET}/{task_id}"), + response_model=pika_defs.PikaVideoResponse, + status_extractor=lambda response: (response.status.value if response.status else None), + progress_extractor=lambda response: (response.progress if hasattr(response, "progress") else None), + estimated_duration=60, + max_poll_attempts=240, + ) + if not final_response.url: + error_msg = f"Pika task {task_id} succeeded but no video data found in response:\n{final_response}" + logging.error(error_msg) + raise Exception(error_msg) + video_url = final_response.url + logging.info("Pika task %s succeeded. Video URL: %s", task_id, video_url) + return IO.NodeOutput(await download_url_to_video_output(video_url)) -def is_valid_video_response(response: PikaVideoResponse) -> bool: - """Check if the video response is valid.""" - return hasattr(response, "url") and response.url is not None +def get_base_inputs_types() -> list[IO.Input]: + """Get the base required inputs types common to all Pika nodes.""" + return [ + IO.String.Input("prompt_text", multiline=True), + IO.String.Input("negative_prompt", multiline=True), + IO.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True), + IO.Combo.Input("resolution", options=["1080p", "720p"], default="1080p"), + IO.Combo.Input("duration", options=[5, 10], default=5), + ] -def is_valid_initial_response(response: PikaGenerateResponse) -> bool: - """Check if the initial response is valid.""" - return hasattr(response, "video_id") and response.video_id is not None - - -class PikaNodeBase(ComfyNodeABC): - """Base class for Pika nodes.""" - - @classmethod - def get_base_inputs_types( - cls, request_model - ) -> dict[str, tuple[IO, InputTypeOptions]]: - """Get the base required inputs types common to all Pika nodes.""" - return { - "prompt_text": model_field_to_node_input( - IO.STRING, - request_model, - "promptText", - multiline=True, - ), - "negative_prompt": model_field_to_node_input( - IO.STRING, - request_model, - "negativePrompt", - multiline=True, - ), - "seed": model_field_to_node_input( - IO.INT, - request_model, - "seed", - min=0, - max=0xFFFFFFFF, - control_after_generate=True, - ), - "resolution": model_field_to_node_input( - IO.COMBO, - request_model, - "resolution", - enum_type=PikaResolutionEnum, - ), - "duration": model_field_to_node_input( - IO.COMBO, - request_model, - "duration", - enum_type=PikaDurationEnum, - ), - } - - CATEGORY = "api node/video/Pika" - API_NODE = True - FUNCTION = "api_call" - RETURN_TYPES = ("VIDEO",) - - async def poll_for_task_status( - self, - task_id: str, - auth_kwargs: Optional[dict[str, str]] = None, - node_id: Optional[str] = None, - ) -> PikaGenerateResponse: - polling_operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"{PATH_VIDEO_GET}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=PikaVideoResponse, - ), - completed_statuses=[ - "finished", - ], - failed_statuses=["failed", "cancelled"], - status_extractor=lambda response: ( - response.status.value if response.status else None - ), - progress_extractor=lambda response: ( - response.progress if hasattr(response, "progress") else None - ), - auth_kwargs=auth_kwargs, - result_url_extractor=lambda response: ( - response.url if hasattr(response, "url") else None - ), - node_id=node_id, - estimated_duration=60 - ) - return await polling_operation.execute() - - async def execute_task( - self, - initial_operation: SynchronousOperation[R, PikaGenerateResponse], - auth_kwargs: Optional[dict[str, str]] = None, - node_id: Optional[str] = None, - ) -> tuple[VideoFromFile]: - """Executes the initial operation then polls for the task status until it is completed. - - Args: - initial_operation: The initial operation to execute. - auth_kwargs: The authentication token(s) to use for the API call. - - Returns: - A tuple containing the video file as a VIDEO output. - """ - initial_response = await initial_operation.execute() - if not is_valid_initial_response(initial_response): - error_msg = f"Pika initial request failed. Code: {initial_response.code}, Message: {initial_response.message}, Data: {initial_response.data}" - logging.error(error_msg) - raise PikaApiError(error_msg) - - task_id = initial_response.video_id - final_response = await self.poll_for_task_status(task_id, auth_kwargs) - if not is_valid_video_response(final_response): - error_msg = ( - f"Pika task {task_id} succeeded but no video data found in response." - ) - logging.error(error_msg) - raise PikaApiError(error_msg) - - video_url = str(final_response.url) - logging.info("Pika task %s succeeded. Video URL: %s", task_id, video_url) - - return (await download_url_to_video_output(video_url),) - - -class PikaImageToVideoV2_2(PikaNodeBase): +class PikaImageToVideo(IO.ComfyNode): """Pika 2.2 Image to Video Node.""" @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "image": ( - IO.IMAGE, - {"tooltip": "The image to convert to video"}, - ), - **cls.get_base_inputs_types(PikaBodyGenerate22I2vGenerate22I2vPost), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="PikaImageToVideoNode2_2", + display_name="Pika Image to Video", + description="Sends an image and prompt to the Pika API v2.2 to generate a video.", + category="api node/video/Pika", + inputs=[ + IO.Image.Input("image", tooltip="The image to convert to video"), + *get_base_inputs_types(), + ], + outputs=[IO.Video.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Sends an image and prompt to the Pika API v2.2 to generate a video." - - async def api_call( - self, + @classmethod + async def execute( + cls, image: torch.Tensor, prompt_text: str, negative_prompt: str, seed: int, resolution: str, duration: int, - unique_id: str, - **kwargs, - ) -> tuple[VideoFromFile]: - # Convert image to BytesIO + ) -> IO.NodeOutput: image_bytes_io = tensor_to_bytesio(image) - image_bytes_io.seek(0) - pika_files = {"image": ("image.png", image_bytes_io, "image/png")} - - # Prepare non-file data - pika_request_data = PikaBodyGenerate22I2vGenerate22I2vPost( + pika_request_data = pika_defs.PikaBodyGenerate22I2vGenerate22I2vPost( promptText=prompt_text, negativePrompt=negative_prompt, seed=seed, resolution=resolution, duration=duration, ) - - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_IMAGE_TO_VIDEO, - method=HttpMethod.POST, - request_model=PikaBodyGenerate22I2vGenerate22I2vPost, - response_model=PikaGenerateResponse, - ), - request=pika_request_data, + initial_operation = await sync_op( + cls, + ApiEndpoint(path=PATH_IMAGE_TO_VIDEO, method="POST"), + response_model=pika_defs.PikaGenerateResponse, + data=pika_request_data, files=pika_files, content_type="multipart/form-data", - auth_kwargs=kwargs, ) - - return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await execute_task(initial_operation.video_id, cls) -class PikaTextToVideoNodeV2_2(PikaNodeBase): +class PikaTextToVideoNode(IO.ComfyNode): """Pika Text2Video v2.2 Node.""" @classmethod - def INPUT_TYPES(cls): - return { - "required": { - **cls.get_base_inputs_types(PikaBodyGenerate22T2vGenerate22T2vPost), - "aspect_ratio": model_field_to_node_input( - IO.FLOAT, - PikaBodyGenerate22T2vGenerate22T2vPost, - "aspectRatio", + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="PikaTextToVideoNode2_2", + display_name="Pika Text to Video", + description="Sends a text prompt to the Pika API v2.2 to generate a video.", + category="api node/video/Pika", + inputs=[ + *get_base_inputs_types(), + IO.Float.Input( + "aspect_ratio", step=0.001, min=0.4, max=2.5, default=1.7777777777777777, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + tooltip="Aspect ratio (width / height)", + ) + ], + outputs=[IO.Video.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Sends a text prompt to the Pika API v2.2 to generate a video." - - async def api_call( - self, + @classmethod + async def execute( + cls, prompt_text: str, negative_prompt: str, seed: int, resolution: str, duration: int, aspect_ratio: float, - unique_id: str, - **kwargs, - ) -> tuple[VideoFromFile]: - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_TEXT_TO_VIDEO, - method=HttpMethod.POST, - request_model=PikaBodyGenerate22T2vGenerate22T2vPost, - response_model=PikaGenerateResponse, - ), - request=PikaBodyGenerate22T2vGenerate22T2vPost( + ) -> IO.NodeOutput: + initial_operation = await sync_op( + cls, + ApiEndpoint(path=PATH_TEXT_TO_VIDEO, method="POST"), + response_model=pika_defs.PikaGenerateResponse, + data=pika_defs.PikaBodyGenerate22T2vGenerate22T2vPost( promptText=prompt_text, negativePrompt=negative_prompt, seed=seed, @@ -307,62 +176,74 @@ class PikaTextToVideoNodeV2_2(PikaNodeBase): duration=duration, aspectRatio=aspect_ratio, ), - auth_kwargs=kwargs, content_type="application/x-www-form-urlencoded", ) - - return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await execute_task(initial_operation.video_id, cls) -class PikaScenesV2_2(PikaNodeBase): +class PikaScenes(IO.ComfyNode): """PikaScenes v2.2 Node.""" @classmethod - def INPUT_TYPES(cls): - image_ingredient_input = ( - IO.IMAGE, - {"tooltip": "Image that will be used as ingredient to create a video."}, - ) - return { - "required": { - **cls.get_base_inputs_types( - PikaBodyGenerate22C2vGenerate22PikascenesPost, - ), - "ingredients_mode": model_field_to_node_input( - IO.COMBO, - PikaBodyGenerate22C2vGenerate22PikascenesPost, - "ingredientsMode", - enum_type=IngredientsMode, + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="PikaScenesV2_2", + display_name="Pika Scenes (Video Image Composition)", + description="Combine your images to create a video with the objects in them. Upload multiple images as ingredients and generate a high-quality video that incorporates all of them.", + category="api node/video/Pika", + inputs=[ + *get_base_inputs_types(), + IO.Combo.Input( + "ingredients_mode", + options=["creative", "precise"], default="creative", ), - "aspect_ratio": model_field_to_node_input( - IO.FLOAT, - PikaBodyGenerate22C2vGenerate22PikascenesPost, - "aspectRatio", + IO.Float.Input( + "aspect_ratio", step=0.001, min=0.4, max=2.5, default=1.7777777777777777, + tooltip="Aspect ratio (width / height)", ), - }, - "optional": { - "image_ingredient_1": image_ingredient_input, - "image_ingredient_2": image_ingredient_input, - "image_ingredient_3": image_ingredient_input, - "image_ingredient_4": image_ingredient_input, - "image_ingredient_5": image_ingredient_input, - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + IO.Image.Input( + "image_ingredient_1", + optional=True, + tooltip="Image that will be used as ingredient to create a video.", + ), + IO.Image.Input( + "image_ingredient_2", + optional=True, + tooltip="Image that will be used as ingredient to create a video.", + ), + IO.Image.Input( + "image_ingredient_3", + optional=True, + tooltip="Image that will be used as ingredient to create a video.", + ), + IO.Image.Input( + "image_ingredient_4", + optional=True, + tooltip="Image that will be used as ingredient to create a video.", + ), + IO.Image.Input( + "image_ingredient_5", + optional=True, + tooltip="Image that will be used as ingredient to create a video.", + ), + ], + outputs=[IO.Video.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Combine your images to create a video with the objects in them. Upload multiple images as ingredients and generate a high-quality video that incorporates all of them." - - async def api_call( - self, + @classmethod + async def execute( + cls, prompt_text: str, negative_prompt: str, seed: int, @@ -370,15 +251,12 @@ class PikaScenesV2_2(PikaNodeBase): duration: int, ingredients_mode: str, aspect_ratio: float, - unique_id: str, image_ingredient_1: Optional[torch.Tensor] = None, image_ingredient_2: Optional[torch.Tensor] = None, image_ingredient_3: Optional[torch.Tensor] = None, image_ingredient_4: Optional[torch.Tensor] = None, image_ingredient_5: Optional[torch.Tensor] = None, - **kwargs, - ) -> tuple[VideoFromFile]: - # Convert all passed images to BytesIO + ) -> IO.NodeOutput: all_image_bytes_io = [] for image in [ image_ingredient_1, @@ -388,16 +266,14 @@ class PikaScenesV2_2(PikaNodeBase): image_ingredient_5, ]: if image is not None: - image_bytes_io = tensor_to_bytesio(image) - image_bytes_io.seek(0) - all_image_bytes_io.append(image_bytes_io) + all_image_bytes_io.append(tensor_to_bytesio(image)) pika_files = [ ("images", (f"image_{i}.png", image_bytes_io, "image/png")) for i, image_bytes_io in enumerate(all_image_bytes_io) ] - pika_request_data = PikaBodyGenerate22C2vGenerate22PikascenesPost( + pika_request_data = pika_defs.PikaBodyGenerate22C2vGenerate22PikascenesPost( ingredientsMode=ingredients_mode, promptText=prompt_text, negativePrompt=negative_prompt, @@ -406,283 +282,206 @@ class PikaScenesV2_2(PikaNodeBase): duration=duration, aspectRatio=aspect_ratio, ) - - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_PIKASCENES, - method=HttpMethod.POST, - request_model=PikaBodyGenerate22C2vGenerate22PikascenesPost, - response_model=PikaGenerateResponse, - ), - request=pika_request_data, + initial_operation = await sync_op( + cls, + ApiEndpoint(path=PATH_PIKASCENES, method="POST"), + response_model=pika_defs.PikaGenerateResponse, + data=pika_request_data, files=pika_files, content_type="multipart/form-data", - auth_kwargs=kwargs, ) - return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await execute_task(initial_operation.video_id, cls) -class PikAdditionsNode(PikaNodeBase): +class PikAdditionsNode(IO.ComfyNode): """Pika Pikadditions Node. Add an image into a video.""" @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "video": (IO.VIDEO, {"tooltip": "The video to add an image to."}), - "image": (IO.IMAGE, {"tooltip": "The image to add to the video."}), - "prompt_text": model_field_to_node_input( - IO.STRING, - PikaBodyGeneratePikadditionsGeneratePikadditionsPost, - "promptText", - multiline=True, - ), - "negative_prompt": model_field_to_node_input( - IO.STRING, - PikaBodyGeneratePikadditionsGeneratePikadditionsPost, - "negativePrompt", - multiline=True, - ), - "seed": model_field_to_node_input( - IO.INT, - PikaBodyGeneratePikadditionsGeneratePikadditionsPost, + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="Pikadditions", + display_name="Pikadditions (Video Object Insertion)", + description="Add any object or image into your video. Upload a video and specify what you'd like to add to create a seamlessly integrated result.", + category="api node/video/Pika", + inputs=[ + IO.Video.Input("video", tooltip="The video to add an image to."), + IO.Image.Input("image", tooltip="The image to add to the video."), + IO.String.Input("prompt_text", multiline=True), + IO.String.Input("negative_prompt", multiline=True), + IO.Int.Input( "seed", min=0, max=0xFFFFFFFF, control_after_generate=True, ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[IO.Video.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Add any object or image into your video. Upload a video and specify what you'd like to add to create a seamlessly integrated result." - - async def api_call( - self, + @classmethod + async def execute( + cls, video: VideoInput, image: torch.Tensor, prompt_text: str, negative_prompt: str, seed: int, - unique_id: str, - **kwargs, - ) -> tuple[VideoFromFile]: - # Convert video to BytesIO - video_bytes_io = io.BytesIO() + ) -> IO.NodeOutput: + video_bytes_io = BytesIO() video.save_to(video_bytes_io, format=VideoContainer.MP4, codec=VideoCodec.H264) video_bytes_io.seek(0) - # Convert image to BytesIO image_bytes_io = tensor_to_bytesio(image) - image_bytes_io.seek(0) - pika_files = { "video": ("video.mp4", video_bytes_io, "video/mp4"), "image": ("image.png", image_bytes_io, "image/png"), } - - # Prepare non-file data - pika_request_data = PikaBodyGeneratePikadditionsGeneratePikadditionsPost( + pika_request_data = pika_defs.PikaBodyGeneratePikadditionsGeneratePikadditionsPost( promptText=prompt_text, negativePrompt=negative_prompt, seed=seed, ) - - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_PIKADDITIONS, - method=HttpMethod.POST, - request_model=PikaBodyGeneratePikadditionsGeneratePikadditionsPost, - response_model=PikaGenerateResponse, - ), - request=pika_request_data, + initial_operation = await sync_op( + cls, + ApiEndpoint(path=PATH_PIKADDITIONS, method="POST"), + response_model=pika_defs.PikaGenerateResponse, + data=pika_request_data, files=pika_files, content_type="multipart/form-data", - auth_kwargs=kwargs, ) - return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await execute_task(initial_operation.video_id, cls) -class PikaSwapsNode(PikaNodeBase): +class PikaSwapsNode(IO.ComfyNode): """Pika Pikaswaps Node.""" @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "video": (IO.VIDEO, {"tooltip": "The video to swap an object in."}), - "image": ( - IO.IMAGE, - { - "tooltip": "The image used to replace the masked object in the video." - }, + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="Pikaswaps", + display_name="Pika Swaps (Video Object Replacement)", + description="Swap out any object or region of your video with a new image or object. Define areas to replace either with a mask or coordinates.", + category="api node/video/Pika", + inputs=[ + IO.Video.Input("video", tooltip="The video to swap an object in."), + IO.Image.Input( + "image", + tooltip="The image used to replace the masked object in the video.", + optional=True, ), - "mask": ( - IO.MASK, - {"tooltip": "Use the mask to define areas in the video to replace"}, + IO.Mask.Input( + "mask", + tooltip="Use the mask to define areas in the video to replace.", + optional=True, ), - "prompt_text": model_field_to_node_input( - IO.STRING, - PikaBodyGeneratePikaswapsGeneratePikaswapsPost, - "promptText", + IO.String.Input("prompt_text", multiline=True, optional=True), + IO.String.Input("negative_prompt", multiline=True, optional=True), + IO.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True, optional=True), + IO.String.Input( + "region_to_modify", multiline=True, + optional=True, + tooltip="Plaintext description of the object / region to modify.", ), - "negative_prompt": model_field_to_node_input( - IO.STRING, - PikaBodyGeneratePikaswapsGeneratePikaswapsPost, - "negativePrompt", - multiline=True, - ), - "seed": model_field_to_node_input( - IO.INT, - PikaBodyGeneratePikaswapsGeneratePikaswapsPost, - "seed", - min=0, - max=0xFFFFFFFF, - control_after_generate=True, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + ], + outputs=[IO.Video.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Swap out any object or region of your video with a new image or object. Define areas to replace either with a mask or coordinates." - RETURN_TYPES = ("VIDEO",) - - async def api_call( - self, + @classmethod + async def execute( + cls, video: VideoInput, - image: torch.Tensor, - mask: torch.Tensor, - prompt_text: str, - negative_prompt: str, - seed: int, - unique_id: str, - **kwargs, - ) -> tuple[VideoFromFile]: - # Convert video to BytesIO - video_bytes_io = io.BytesIO() + image: Optional[torch.Tensor] = None, + mask: Optional[torch.Tensor] = None, + prompt_text: str = "", + negative_prompt: str = "", + seed: int = 0, + region_to_modify: str = "", + ) -> IO.NodeOutput: + video_bytes_io = BytesIO() video.save_to(video_bytes_io, format=VideoContainer.MP4, codec=VideoCodec.H264) video_bytes_io.seek(0) - - # Convert mask to binary mask with three channels - mask = torch.round(mask) - mask = mask.repeat(1, 3, 1, 1) - - # Convert 3-channel binary mask to BytesIO - mask_bytes_io = io.BytesIO() - mask_bytes_io.write(mask.numpy().astype(np.uint8)) - mask_bytes_io.seek(0) - - # Convert image to BytesIO - image_bytes_io = tensor_to_bytesio(image) - image_bytes_io.seek(0) - pika_files = { "video": ("video.mp4", video_bytes_io, "video/mp4"), - "image": ("image.png", image_bytes_io, "image/png"), - "modifyRegionMask": ("mask.png", mask_bytes_io, "image/png"), } + if mask is not None: + pika_files["modifyRegionMask"] = ("mask.png", tensor_to_bytesio(mask), "image/png") + if image is not None: + pika_files["image"] = ("image.png", tensor_to_bytesio(image), "image/png") - # Prepare non-file data - pika_request_data = PikaBodyGeneratePikaswapsGeneratePikaswapsPost( + pika_request_data = pika_defs.PikaBodyGeneratePikaswapsGeneratePikaswapsPost( promptText=prompt_text, negativePrompt=negative_prompt, seed=seed, + modifyRegionRoi=region_to_modify if region_to_modify else None, ) - - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_PIKADDITIONS, - method=HttpMethod.POST, - request_model=PikaBodyGeneratePikadditionsGeneratePikadditionsPost, - response_model=PikaGenerateResponse, - ), - request=pika_request_data, + initial_operation = await sync_op( + cls, + ApiEndpoint(path=PATH_PIKASWAPS, method="POST"), + response_model=pika_defs.PikaGenerateResponse, + data=pika_request_data, files=pika_files, content_type="multipart/form-data", - auth_kwargs=kwargs, ) - - return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await execute_task(initial_operation.video_id, cls) -class PikaffectsNode(PikaNodeBase): +class PikaffectsNode(IO.ComfyNode): """Pika Pikaffects Node.""" @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "image": ( - IO.IMAGE, - {"tooltip": "The reference image to apply the Pikaffect to."}, + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="Pikaffects", + display_name="Pikaffects (Video Effects)", + description="Generate a video with a specific Pikaffect. Supported Pikaffects: Cake-ify, Crumble, Crush, Decapitate, Deflate, Dissolve, Explode, Eye-pop, Inflate, Levitate, Melt, Peel, Poke, Squish, Ta-da, Tear", + category="api node/video/Pika", + inputs=[ + IO.Image.Input("image", tooltip="The reference image to apply the Pikaffect to."), + IO.Combo.Input( + "pikaffect", options=pika_defs.Pikaffect, default="Cake-ify" ), - "pikaffect": model_field_to_node_input( - IO.COMBO, - PikaBodyGeneratePikaffectsGeneratePikaffectsPost, - "pikaffect", - enum_type=Pikaffect, - default="Cake-ify", - ), - "prompt_text": model_field_to_node_input( - IO.STRING, - PikaBodyGeneratePikaffectsGeneratePikaffectsPost, - "promptText", - multiline=True, - ), - "negative_prompt": model_field_to_node_input( - IO.STRING, - PikaBodyGeneratePikaffectsGeneratePikaffectsPost, - "negativePrompt", - multiline=True, - ), - "seed": model_field_to_node_input( - IO.INT, - PikaBodyGeneratePikaffectsGeneratePikaffectsPost, - "seed", - min=0, - max=0xFFFFFFFF, - control_after_generate=True, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + IO.String.Input("prompt_text", multiline=True), + IO.String.Input("negative_prompt", multiline=True), + IO.Int.Input("seed", min=0, max=0xFFFFFFFF, control_after_generate=True), + ], + outputs=[IO.Video.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Generate a video with a specific Pikaffect. Supported Pikaffects: Cake-ify, Crumble, Crush, Decapitate, Deflate, Dissolve, Explode, Eye-pop, Inflate, Levitate, Melt, Peel, Poke, Squish, Ta-da, Tear" - - async def api_call( - self, + @classmethod + async def execute( + cls, image: torch.Tensor, pikaffect: str, prompt_text: str, negative_prompt: str, seed: int, - unique_id: str, - **kwargs, - ) -> tuple[VideoFromFile]: - - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_PIKAFFECTS, - method=HttpMethod.POST, - request_model=PikaBodyGeneratePikaffectsGeneratePikaffectsPost, - response_model=PikaGenerateResponse, - ), - request=PikaBodyGeneratePikaffectsGeneratePikaffectsPost( + ) -> IO.NodeOutput: + initial_operation = await sync_op( + cls, + ApiEndpoint(path=PATH_PIKAFFECTS, method="POST"), + response_model=pika_defs.PikaGenerateResponse, + data=pika_defs.PikaBodyGeneratePikaffectsGeneratePikaffectsPost( pikaffect=pikaffect, promptText=prompt_text, negativePrompt=negative_prompt, @@ -690,36 +489,37 @@ class PikaffectsNode(PikaNodeBase): ), files={"image": ("image.png", tensor_to_bytesio(image), "image/png")}, content_type="multipart/form-data", - auth_kwargs=kwargs, ) - - return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await execute_task(initial_operation.video_id, cls) -class PikaStartEndFrameNode2_2(PikaNodeBase): +class PikaStartEndFrameNode(IO.ComfyNode): """PikaFrames v2.2 Node.""" @classmethod - def INPUT_TYPES(cls): - return { - "required": { - "image_start": (IO.IMAGE, {"tooltip": "The first image to combine."}), - "image_end": (IO.IMAGE, {"tooltip": "The last image to combine."}), - **cls.get_base_inputs_types( - PikaBodyGenerate22KeyframeGenerate22PikaframesPost - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="PikaStartEndFrameNode2_2", + display_name="Pika Start and End Frame to Video", + description="Generate a video by combining your first and last frame. Upload two images to define the start and end points, and let the AI create a smooth transition between them.", + category="api node/video/Pika", + inputs=[ + IO.Image.Input("image_start", tooltip="The first image to combine."), + IO.Image.Input("image_end", tooltip="The last image to combine."), + *get_base_inputs_types(), + ], + outputs=[IO.Video.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) - DESCRIPTION = "Generate a video by combining your first and last frame. Upload two images to define the start and end points, and let the AI create a smooth transition between them." - - async def api_call( - self, + @classmethod + async def execute( + cls, image_start: torch.Tensor, image_end: torch.Tensor, prompt_text: str, @@ -727,23 +527,17 @@ class PikaStartEndFrameNode2_2(PikaNodeBase): seed: int, resolution: str, duration: int, - unique_id: str, - **kwargs, - ) -> tuple[VideoFromFile]: - + ) -> IO.NodeOutput: + validate_string(prompt_text, field_name="prompt_text", min_length=1) pika_files = [ ("keyFrames", ("image_start.png", tensor_to_bytesio(image_start), "image/png")), ("keyFrames", ("image_end.png", tensor_to_bytesio(image_end), "image/png")), ] - - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_PIKAFRAMES, - method=HttpMethod.POST, - request_model=PikaBodyGenerate22KeyframeGenerate22PikaframesPost, - response_model=PikaGenerateResponse, - ), - request=PikaBodyGenerate22KeyframeGenerate22PikaframesPost( + initial_operation = await sync_op( + cls, + ApiEndpoint(path=PATH_PIKAFRAMES, method="POST"), + response_model=pika_defs.PikaGenerateResponse, + data=pika_defs.PikaBodyGenerate22KeyframeGenerate22PikaframesPost( promptText=prompt_text, negativePrompt=negative_prompt, seed=seed, @@ -752,28 +546,23 @@ class PikaStartEndFrameNode2_2(PikaNodeBase): ), files=pika_files, content_type="multipart/form-data", - auth_kwargs=kwargs, ) - - return await self.execute_task(initial_operation, auth_kwargs=kwargs, node_id=unique_id) + return await execute_task(initial_operation.video_id, cls) -NODE_CLASS_MAPPINGS = { - "PikaImageToVideoNode2_2": PikaImageToVideoV2_2, - "PikaTextToVideoNode2_2": PikaTextToVideoNodeV2_2, - "PikaScenesV2_2": PikaScenesV2_2, - "Pikadditions": PikAdditionsNode, - "Pikaswaps": PikaSwapsNode, - "Pikaffects": PikaffectsNode, - "PikaStartEndFrameNode2_2": PikaStartEndFrameNode2_2, -} +class PikaApiNodesExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + PikaImageToVideo, + PikaTextToVideoNode, + PikaScenes, + PikAdditionsNode, + PikaSwapsNode, + PikaffectsNode, + PikaStartEndFrameNode, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "PikaImageToVideoNode2_2": "Pika Image to Video", - "PikaTextToVideoNode2_2": "Pika Text to Video", - "PikaScenesV2_2": "Pika Scenes (Video Image Composition)", - "Pikadditions": "Pikadditions (Video Object Insertion)", - "Pikaswaps": "Pika Swaps (Video Object Replacement)", - "Pikaffects": "Pikaffects (Video Effects)", - "PikaStartEndFrameNode2_2": "Pika Start and End Frame to Video", -} + +async def comfy_entrypoint() -> PikaApiNodesExtension: + return PikaApiNodesExtension() diff --git a/comfy_api_nodes/nodes_pixverse.py b/comfy_api_nodes/nodes_pixverse.py index 7c5a52feb..6e1686af0 100644 --- a/comfy_api_nodes/nodes_pixverse.py +++ b/comfy_api_nodes/nodes_pixverse.py @@ -1,5 +1,6 @@ -from inspect import cleandoc -from typing import Optional +import torch +from typing_extensions import override +from comfy_api.latest import IO, ComfyExtension from comfy_api_nodes.apis.pixverse_api import ( PixverseTextVideoRequest, PixverseImageVideoRequest, @@ -15,157 +16,123 @@ from comfy_api_nodes.apis.pixverse_api import ( PixverseIO, pixverse_templates, ) -from comfy_api_nodes.apis.client import ( +from comfy_api_nodes.util import ( ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, - EmptyRequest, -) -from comfy_api_nodes.apinode_utils import ( + download_url_to_video_output, + poll_op, + sync_op, tensor_to_bytesio, validate_string, ) -from comfy.comfy_types.node_typing import IO, ComfyNodeABC -from comfy_api.input_impl import VideoFromFile - -import torch -import aiohttp -from io import BytesIO - AVERAGE_DURATION_T2V = 32 AVERAGE_DURATION_I2V = 30 AVERAGE_DURATION_T2T = 52 -def get_video_url_from_response( - response: PixverseGenerationStatusResponse, -) -> Optional[str]: - if response.Resp is None or response.Resp.url is None: - return None - return str(response.Resp.url) - - -async def upload_image_to_pixverse(image: torch.Tensor, auth_kwargs=None): - # first, upload image to Pixverse and get image id to use in actual generation call - files = {"image": tensor_to_bytesio(image)} - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/pixverse/image/upload", - method=HttpMethod.POST, - request_model=EmptyRequest, - response_model=PixverseImageUploadResponse, - ), - request=EmptyRequest(), - files=files, +async def upload_image_to_pixverse(cls: type[IO.ComfyNode], image: torch.Tensor): + response_upload = await sync_op( + cls, + ApiEndpoint(path="/proxy/pixverse/image/upload", method="POST"), + response_model=PixverseImageUploadResponse, + files={"image": tensor_to_bytesio(image)}, content_type="multipart/form-data", - auth_kwargs=auth_kwargs, ) - response_upload: PixverseImageUploadResponse = await operation.execute() - if response_upload.Resp is None: - raise Exception( - f"PixVerse image upload request failed: '{response_upload.ErrMsg}'" - ) - + raise Exception(f"PixVerse image upload request failed: '{response_upload.ErrMsg}'") return response_upload.Resp.img_id -class PixverseTemplateNode: +class PixverseTemplateNode(IO.ComfyNode): """ Select template for PixVerse Video generation. """ - RETURN_TYPES = (PixverseIO.TEMPLATE,) - RETURN_NAMES = ("pixverse_template",) - FUNCTION = "create_template" - CATEGORY = "api node/video/PixVerse" + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="PixverseTemplateNode", + display_name="PixVerse Template", + category="api node/video/PixVerse", + inputs=[ + IO.Combo.Input("template", options=list(pixverse_templates.keys())), + ], + outputs=[IO.Custom(PixverseIO.TEMPLATE).Output(display_name="pixverse_template")], + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "template": (list(pixverse_templates.keys()),), - } - } - - def create_template(self, template: str): + def execute(cls, template: str) -> IO.NodeOutput: template_id = pixverse_templates.get(template, None) if template_id is None: raise Exception(f"Template '{template}' is not recognized.") - # just return the integer - return (template_id,) + return IO.NodeOutput(template_id) -class PixverseTextToVideoNode(ComfyNodeABC): - """ - Generates videos based on prompt and output_size. - """ - - RETURN_TYPES = (IO.VIDEO,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/video/PixVerse" +class PixverseTextToVideoNode(IO.ComfyNode): + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="PixverseTextToVideoNode", + display_name="PixVerse Text to Video", + category="api node/video/PixVerse", + description="Generates videos based on prompt and output_size.", + inputs=[ + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the video generation", + ), + IO.Combo.Input( + "aspect_ratio", + options=PixverseAspectRatio, + ), + IO.Combo.Input( + "quality", + options=PixverseQuality, + default=PixverseQuality.res_540p, + ), + IO.Combo.Input( + "duration_seconds", + options=PixverseDuration, + ), + IO.Combo.Input( + "motion_mode", + options=PixverseMotionMode, + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + control_after_generate=True, + tooltip="Seed for video generation.", + ), + IO.String.Input( + "negative_prompt", + default="", + multiline=True, + tooltip="An optional text description of undesired elements on an image.", + optional=True, + ), + IO.Custom(PixverseIO.TEMPLATE).Input( + "pixverse_template", + tooltip="An optional template to influence style of generation, created by the PixVerse Template node.", + optional=True, + ), + ], + outputs=[IO.Video.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the video generation", - }, - ), - "aspect_ratio": ([ratio.value for ratio in PixverseAspectRatio],), - "quality": ( - [resolution.value for resolution in PixverseQuality], - { - "default": PixverseQuality.res_540p, - }, - ), - "duration_seconds": ([dur.value for dur in PixverseDuration],), - "motion_mode": ([mode.value for mode in PixverseMotionMode],), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2147483647, - "control_after_generate": True, - "tooltip": "Seed for video generation.", - }, - ), - }, - "optional": { - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "An optional text description of undesired elements on an image.", - }, - ), - "pixverse_template": ( - PixverseIO.TEMPLATE, - { - "tooltip": "An optional template to influence style of generation, created by the PixVerse Template node." - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - async def api_call( - self, + async def execute( + cls, prompt: str, aspect_ratio: str, quality: str, @@ -174,10 +141,8 @@ class PixverseTextToVideoNode(ComfyNodeABC): seed, negative_prompt: str = None, pixverse_template: int = None, - unique_id: Optional[str] = None, - **kwargs, - ): - validate_string(prompt, strip_whitespace=False) + ) -> IO.NodeOutput: + validate_string(prompt, strip_whitespace=False, min_length=1) # 1080p is limited to 5 seconds duration # only normal motion_mode supported for 1080p or for non-5 second duration if quality == PixverseQuality.res_1080p: @@ -186,14 +151,11 @@ class PixverseTextToVideoNode(ComfyNodeABC): elif duration_seconds != PixverseDuration.dur_5: motion_mode = PixverseMotionMode.normal - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/pixverse/video/text/generate", - method=HttpMethod.POST, - request_model=PixverseTextVideoRequest, - response_model=PixverseVideoResponse, - ), - request=PixverseTextVideoRequest( + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/pixverse/video/text/generate", method="POST"), + response_model=PixverseVideoResponse, + data=PixverseTextVideoRequest( prompt=prompt, aspect_ratio=aspect_ratio, quality=quality, @@ -203,20 +165,14 @@ class PixverseTextToVideoNode(ComfyNodeABC): template_id=pixverse_template, seed=seed, ), - auth_kwargs=kwargs, ) - response_api = await operation.execute() - if response_api.Resp is None: raise Exception(f"PixVerse request failed: '{response_api.ErrMsg}'") - operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"/proxy/pixverse/video/result/{response_api.Resp.video_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=PixverseGenerationStatusResponse, - ), + response_poll = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/pixverse/video/result/{response_api.Resp.video_id}"), + response_model=PixverseGenerationStatusResponse, completed_statuses=[PixverseStatus.successful], failed_statuses=[ PixverseStatus.contents_moderation, @@ -224,86 +180,73 @@ class PixverseTextToVideoNode(ComfyNodeABC): PixverseStatus.deleted, ], status_extractor=lambda x: x.Resp.status, - auth_kwargs=kwargs, - node_id=unique_id, - result_url_extractor=get_video_url_from_response, estimated_duration=AVERAGE_DURATION_T2V, ) - response_poll = await operation.execute() - - async with aiohttp.ClientSession() as session: - async with session.get(response_poll.Resp.url) as vid_response: - return (VideoFromFile(BytesIO(await vid_response.content.read())),) + return IO.NodeOutput(await download_url_to_video_output(response_poll.Resp.url)) -class PixverseImageToVideoNode(ComfyNodeABC): - """ - Generates videos based on prompt and output_size. - """ - - RETURN_TYPES = (IO.VIDEO,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/video/PixVerse" +class PixverseImageToVideoNode(IO.ComfyNode): + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="PixverseImageToVideoNode", + display_name="PixVerse Image to Video", + category="api node/video/PixVerse", + description="Generates videos based on prompt and output_size.", + inputs=[ + IO.Image.Input("image"), + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the video generation", + ), + IO.Combo.Input( + "quality", + options=PixverseQuality, + default=PixverseQuality.res_540p, + ), + IO.Combo.Input( + "duration_seconds", + options=PixverseDuration, + ), + IO.Combo.Input( + "motion_mode", + options=PixverseMotionMode, + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + control_after_generate=True, + tooltip="Seed for video generation.", + ), + IO.String.Input( + "negative_prompt", + default="", + multiline=True, + tooltip="An optional text description of undesired elements on an image.", + optional=True, + ), + IO.Custom(PixverseIO.TEMPLATE).Input( + "pixverse_template", + tooltip="An optional template to influence style of generation, created by the PixVerse Template node.", + optional=True, + ), + ], + outputs=[IO.Video.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE,), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the video generation", - }, - ), - "quality": ( - [resolution.value for resolution in PixverseQuality], - { - "default": PixverseQuality.res_540p, - }, - ), - "duration_seconds": ([dur.value for dur in PixverseDuration],), - "motion_mode": ([mode.value for mode in PixverseMotionMode],), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2147483647, - "control_after_generate": True, - "tooltip": "Seed for video generation.", - }, - ), - }, - "optional": { - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "An optional text description of undesired elements on an image.", - }, - ), - "pixverse_template": ( - PixverseIO.TEMPLATE, - { - "tooltip": "An optional template to influence style of generation, created by the PixVerse Template node." - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - async def api_call( - self, + async def execute( + cls, image: torch.Tensor, prompt: str, quality: str, @@ -312,11 +255,9 @@ class PixverseImageToVideoNode(ComfyNodeABC): seed, negative_prompt: str = None, pixverse_template: int = None, - unique_id: Optional[str] = None, - **kwargs, - ): + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) - img_id = await upload_image_to_pixverse(image, auth_kwargs=kwargs) + img_id = await upload_image_to_pixverse(cls, image) # 1080p is limited to 5 seconds duration # only normal motion_mode supported for 1080p or for non-5 second duration @@ -326,14 +267,11 @@ class PixverseImageToVideoNode(ComfyNodeABC): elif duration_seconds != PixverseDuration.dur_5: motion_mode = PixverseMotionMode.normal - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/pixverse/video/img/generate", - method=HttpMethod.POST, - request_model=PixverseImageVideoRequest, - response_model=PixverseVideoResponse, - ), - request=PixverseImageVideoRequest( + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/pixverse/video/img/generate", method="POST"), + response_model=PixverseVideoResponse, + data=PixverseImageVideoRequest( img_id=img_id, prompt=prompt, quality=quality, @@ -343,20 +281,15 @@ class PixverseImageToVideoNode(ComfyNodeABC): template_id=pixverse_template, seed=seed, ), - auth_kwargs=kwargs, ) - response_api = await operation.execute() if response_api.Resp is None: raise Exception(f"PixVerse request failed: '{response_api.ErrMsg}'") - operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"/proxy/pixverse/video/result/{response_api.Resp.video_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=PixverseGenerationStatusResponse, - ), + response_poll = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/pixverse/video/result/{response_api.Resp.video_id}"), + response_model=PixverseGenerationStatusResponse, completed_statuses=[PixverseStatus.successful], failed_statuses=[ PixverseStatus.contents_moderation, @@ -364,81 +297,69 @@ class PixverseImageToVideoNode(ComfyNodeABC): PixverseStatus.deleted, ], status_extractor=lambda x: x.Resp.status, - auth_kwargs=kwargs, - node_id=unique_id, - result_url_extractor=get_video_url_from_response, estimated_duration=AVERAGE_DURATION_I2V, ) - response_poll = await operation.execute() - - async with aiohttp.ClientSession() as session: - async with session.get(response_poll.Resp.url) as vid_response: - return (VideoFromFile(BytesIO(await vid_response.content.read())),) + return IO.NodeOutput(await download_url_to_video_output(response_poll.Resp.url)) -class PixverseTransitionVideoNode(ComfyNodeABC): - """ - Generates videos based on prompt and output_size. - """ - - RETURN_TYPES = (IO.VIDEO,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/video/PixVerse" +class PixverseTransitionVideoNode(IO.ComfyNode): + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="PixverseTransitionVideoNode", + display_name="PixVerse Transition Video", + category="api node/video/PixVerse", + description="Generates videos based on prompt and output_size.", + inputs=[ + IO.Image.Input("first_frame"), + IO.Image.Input("last_frame"), + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt for the video generation", + ), + IO.Combo.Input( + "quality", + options=PixverseQuality, + default=PixverseQuality.res_540p, + ), + IO.Combo.Input( + "duration_seconds", + options=PixverseDuration, + ), + IO.Combo.Input( + "motion_mode", + options=PixverseMotionMode, + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + control_after_generate=True, + tooltip="Seed for video generation.", + ), + IO.String.Input( + "negative_prompt", + default="", + multiline=True, + tooltip="An optional text description of undesired elements on an image.", + optional=True, + ), + ], + outputs=[IO.Video.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "first_frame": (IO.IMAGE,), - "last_frame": (IO.IMAGE,), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the video generation", - }, - ), - "quality": ( - [resolution.value for resolution in PixverseQuality], - { - "default": PixverseQuality.res_540p, - }, - ), - "duration_seconds": ([dur.value for dur in PixverseDuration],), - "motion_mode": ([mode.value for mode in PixverseMotionMode],), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 2147483647, - "control_after_generate": True, - "tooltip": "Seed for video generation.", - }, - ), - }, - "optional": { - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "An optional text description of undesired elements on an image.", - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - async def api_call( - self, + async def execute( + cls, first_frame: torch.Tensor, last_frame: torch.Tensor, prompt: str, @@ -447,12 +368,10 @@ class PixverseTransitionVideoNode(ComfyNodeABC): motion_mode: str, seed, negative_prompt: str = None, - unique_id: Optional[str] = None, - **kwargs, - ): + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) - first_frame_id = await upload_image_to_pixverse(first_frame, auth_kwargs=kwargs) - last_frame_id = await upload_image_to_pixverse(last_frame, auth_kwargs=kwargs) + first_frame_id = await upload_image_to_pixverse(cls, first_frame) + last_frame_id = await upload_image_to_pixverse(cls, last_frame) # 1080p is limited to 5 seconds duration # only normal motion_mode supported for 1080p or for non-5 second duration @@ -462,14 +381,11 @@ class PixverseTransitionVideoNode(ComfyNodeABC): elif duration_seconds != PixverseDuration.dur_5: motion_mode = PixverseMotionMode.normal - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/pixverse/video/transition/generate", - method=HttpMethod.POST, - request_model=PixverseTransitionVideoRequest, - response_model=PixverseVideoResponse, - ), - request=PixverseTransitionVideoRequest( + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/pixverse/video/transition/generate", method="POST"), + response_model=PixverseVideoResponse, + data=PixverseTransitionVideoRequest( first_frame_img=first_frame_id, last_frame_img=last_frame_id, prompt=prompt, @@ -479,20 +395,15 @@ class PixverseTransitionVideoNode(ComfyNodeABC): negative_prompt=negative_prompt if negative_prompt else None, seed=seed, ), - auth_kwargs=kwargs, ) - response_api = await operation.execute() if response_api.Resp is None: raise Exception(f"PixVerse request failed: '{response_api.ErrMsg}'") - operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"/proxy/pixverse/video/result/{response_api.Resp.video_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=PixverseGenerationStatusResponse, - ), + response_poll = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/pixverse/video/result/{response_api.Resp.video_id}"), + response_model=PixverseGenerationStatusResponse, completed_statuses=[PixverseStatus.successful], failed_statuses=[ PixverseStatus.contents_moderation, @@ -500,28 +411,21 @@ class PixverseTransitionVideoNode(ComfyNodeABC): PixverseStatus.deleted, ], status_extractor=lambda x: x.Resp.status, - auth_kwargs=kwargs, - node_id=unique_id, - result_url_extractor=get_video_url_from_response, estimated_duration=AVERAGE_DURATION_T2V, ) - response_poll = await operation.execute() - - async with aiohttp.ClientSession() as session: - async with session.get(response_poll.Resp.url) as vid_response: - return (VideoFromFile(BytesIO(await vid_response.content.read())),) + return IO.NodeOutput(await download_url_to_video_output(response_poll.Resp.url)) -NODE_CLASS_MAPPINGS = { - "PixverseTextToVideoNode": PixverseTextToVideoNode, - "PixverseImageToVideoNode": PixverseImageToVideoNode, - "PixverseTransitionVideoNode": PixverseTransitionVideoNode, - "PixverseTemplateNode": PixverseTemplateNode, -} +class PixVerseExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + PixverseTextToVideoNode, + PixverseImageToVideoNode, + PixverseTransitionVideoNode, + PixverseTemplateNode, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "PixverseTextToVideoNode": "PixVerse Text to Video", - "PixverseImageToVideoNode": "PixVerse Image to Video", - "PixverseTransitionVideoNode": "PixVerse Transition Video", - "PixverseTemplateNode": "PixVerse Template", -} + +async def comfy_entrypoint() -> PixVerseExtension: + return PixVerseExtension() diff --git a/comfy_api_nodes/nodes_recraft.py b/comfy_api_nodes/nodes_recraft.py index c8516b368..e3440b946 100644 --- a/comfy_api_nodes/nodes_recraft.py +++ b/comfy_api_nodes/nodes_recraft.py @@ -1,91 +1,83 @@ -from __future__ import annotations -from inspect import cleandoc -from typing import Optional +from io import BytesIO +from typing import Optional, Union + +import aiohttp +import torch +from PIL import UnidentifiedImageError +from typing_extensions import override + from comfy.utils import ProgressBar -from comfy_extras.nodes_images import SVG # Added -from comfy.comfy_types.node_typing import IO +from comfy_api.latest import IO, ComfyExtension from comfy_api_nodes.apis.recraft_api import ( - RecraftImageGenerationRequest, - RecraftImageGenerationResponse, - RecraftImageSize, - RecraftModel, - RecraftStyle, - RecraftStyleV3, RecraftColor, RecraftColorChain, RecraftControls, + RecraftImageGenerationRequest, + RecraftImageGenerationResponse, + RecraftImageSize, RecraftIO, + RecraftModel, + RecraftStyle, + RecraftStyleV3, get_v3_substyles, ) -from comfy_api_nodes.apis.client import ( +from comfy_api_nodes.util import ( ApiEndpoint, - HttpMethod, - SynchronousOperation, - EmptyRequest, -) -from comfy_api_nodes.apinode_utils import ( bytesio_to_image_tensor, - download_url_to_bytesio, - tensor_to_bytesio, + download_url_as_bytesio, resize_mask_to_image, + sync_op, + tensor_to_bytesio, validate_string, ) -from server import PromptServer - -import torch -from io import BytesIO -from PIL import UnidentifiedImageError +from comfy_extras.nodes_images import SVG async def handle_recraft_file_request( - image: torch.Tensor, - path: str, - mask: torch.Tensor=None, - total_pixels=4096*4096, - timeout=1024, - request=None, - auth_kwargs: dict[str,str] = None, - ) -> list[BytesIO]: - """ - Handle sending common Recraft file-only request to get back file bytes. - """ - if request is None: - request = EmptyRequest() + cls: type[IO.ComfyNode], + image: torch.Tensor, + path: str, + mask: Optional[torch.Tensor] = None, + total_pixels: int = 4096 * 4096, + timeout: int = 1024, + request=None, +) -> list[BytesIO]: + """Handle sending common Recraft file-only request to get back file bytes.""" - files = { - 'image': tensor_to_bytesio(image, total_pixels=total_pixels).read() - } - if mask is not None: - files['mask'] = tensor_to_bytesio(mask, total_pixels=total_pixels).read() + files = {"image": tensor_to_bytesio(image, total_pixels=total_pixels).read()} + if mask is not None: + files["mask"] = tensor_to_bytesio(mask, total_pixels=total_pixels).read() - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=path, - method=HttpMethod.POST, - request_model=type(request), - response_model=RecraftImageGenerationResponse, - ), - request=request, - files=files, - content_type="multipart/form-data", - auth_kwargs=auth_kwargs, - multipart_parser=recraft_multipart_parser, - ) - response: RecraftImageGenerationResponse = await operation.execute() - all_bytesio = [] - if response.image is not None: - all_bytesio.append(await download_url_to_bytesio(response.image.url, timeout=timeout)) - else: - for data in response.data: - all_bytesio.append(await download_url_to_bytesio(data.url, timeout=timeout)) + response = await sync_op( + cls, + endpoint=ApiEndpoint(path=path, method="POST"), + response_model=RecraftImageGenerationResponse, + data=request if request else None, + files=files, + content_type="multipart/form-data", + multipart_parser=recraft_multipart_parser, + max_retries=1, + ) + all_bytesio = [] + if response.image is not None: + all_bytesio.append(await download_url_as_bytesio(response.image.url, timeout=timeout)) + else: + for data in response.data: + all_bytesio.append(await download_url_as_bytesio(data.url, timeout=timeout)) - return all_bytesio + return all_bytesio -def recraft_multipart_parser(data, parent_key=None, formatter: callable=None, converted_to_check: list[list]=None, is_list=False) -> dict: +def recraft_multipart_parser( + data, + parent_key=None, + formatter: Optional[type[callable]] = None, + converted_to_check: Optional[list[list]] = None, + is_list: bool = False, + return_mode: str = "formdata", # "dict" | "formdata" +) -> Union[dict, aiohttp.FormData]: """ - Formats data such that multipart/form-data will work with requests library - when both files and data are present. + Formats data such that multipart/form-data will work with aiohttp library when both files and data are present. The OpenAI client that Recraft uses has a bizarre way of serializing lists: @@ -103,24 +95,24 @@ def recraft_multipart_parser(data, parent_key=None, formatter: callable=None, co # Modification of a function that handled a different type of multipart parsing, big ups: # https://gist.github.com/kazqvaizer/4cebebe5db654a414132809f9f88067b - def handle_converted_lists(data, parent_key, lists_to_check=tuple[list]): - # if list already exists exists, just extend list with data + def handle_converted_lists(item, parent_key, lists_to_check=list[list]): + # if list already exists, just extend list with data for check_list in lists_to_check: for conv_tuple in check_list: - if conv_tuple[0] == parent_key and type(conv_tuple[1]) is list: - conv_tuple[1].append(formatter(data)) + if conv_tuple[0] == parent_key and isinstance(conv_tuple[1], list): + conv_tuple[1].append(formatter(item)) return True return False if converted_to_check is None: converted_to_check = [] - + effective_mode = return_mode if parent_key is None else "dict" if formatter is None: formatter = lambda v: v # Multipart representation of value - if type(data) is not dict: - # if list already exists exists, just extend list with data + if not isinstance(data, dict): + # if list already exists, just extend list with data added = handle_converted_lists(data, parent_key, converted_to_check) if added: return {} @@ -136,15 +128,26 @@ def recraft_multipart_parser(data, parent_key=None, formatter: callable=None, co for key, value in data.items(): current_key = key if parent_key is None else f"{parent_key}[{key}]" - if type(value) is dict: + if isinstance(value, dict): converted.extend(recraft_multipart_parser(value, current_key, formatter, next_check).items()) - elif type(value) is list: + elif isinstance(value, list): for ind, list_value in enumerate(value): iter_key = f"{current_key}[]" - converted.extend(recraft_multipart_parser(list_value, iter_key, formatter, next_check, is_list=True).items()) + converted.extend( + recraft_multipart_parser(list_value, iter_key, formatter, next_check, is_list=True).items() + ) else: converted.append((current_key, formatter(value))) + if effective_mode == "formdata": + fd = aiohttp.FormData() + for k, v in dict(converted).items(): + if isinstance(v, list): + for item in v: + fd.add_field(k, str(item)) + else: + fd.add_field(k, str(v)) + return fd return dict(converted) @@ -152,6 +155,7 @@ class handle_recraft_image_output: """ Catch an exception related to receiving SVG data instead of image, when Infinite Style Library style_id is in use. """ + def __init__(self): pass @@ -160,243 +164,225 @@ class handle_recraft_image_output: def __exit__(self, exc_type, exc_val, exc_tb): if exc_type is not None and exc_type is UnidentifiedImageError: - raise Exception("Received output data was not an image; likely an SVG. If you used style_id, make sure it is not a Vector art style.") + raise Exception( + "Received output data was not an image; likely an SVG. " + "If you used style_id, make sure it is not a Vector art style." + ) -class RecraftColorRGBNode: - """ - Create Recraft Color by choosing specific RGB values. - """ - - RETURN_TYPES = (RecraftIO.COLOR,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - RETURN_NAMES = ("recraft_color",) - FUNCTION = "create_color" - CATEGORY = "api node/image/Recraft" +class RecraftColorRGBNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftColorRGB", + display_name="Recraft Color RGB", + category="api node/image/Recraft", + description="Create Recraft Color by choosing specific RGB values.", + inputs=[ + IO.Int.Input("r", default=0, min=0, max=255, tooltip="Red value of color."), + IO.Int.Input("g", default=0, min=0, max=255, tooltip="Green value of color."), + IO.Int.Input("b", default=0, min=0, max=255, tooltip="Blue value of color."), + IO.Custom(RecraftIO.COLOR).Input("recraft_color", optional=True), + ], + outputs=[ + IO.Custom(RecraftIO.COLOR).Output(display_name="recraft_color"), + ], + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "r": (IO.INT, { - "default": 0, - "min": 0, - "max": 255, - "tooltip": "Red value of color." - }), - "g": (IO.INT, { - "default": 0, - "min": 0, - "max": 255, - "tooltip": "Green value of color." - }), - "b": (IO.INT, { - "default": 0, - "min": 0, - "max": 255, - "tooltip": "Blue value of color." - }), - }, - "optional": { - "recraft_color": (RecraftIO.COLOR,), - } - } - - def create_color(self, r: int, g: int, b: int, recraft_color: RecraftColorChain=None): + def execute(cls, r: int, g: int, b: int, recraft_color: RecraftColorChain = None) -> IO.NodeOutput: recraft_color = recraft_color.clone() if recraft_color else RecraftColorChain() recraft_color.add(RecraftColor(r, g, b)) - return (recraft_color, ) + return IO.NodeOutput(recraft_color) -class RecraftControlsNode: - """ - Create Recraft Controls for customizing Recraft generation. - """ - - RETURN_TYPES = (RecraftIO.CONTROLS,) - RETURN_NAMES = ("recraft_controls",) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "create_controls" - CATEGORY = "api node/image/Recraft" +class RecraftControlsNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftControls", + display_name="Recraft Controls", + category="api node/image/Recraft", + description="Create Recraft Controls for customizing Recraft generation.", + inputs=[ + IO.Custom(RecraftIO.COLOR).Input("colors", optional=True), + IO.Custom(RecraftIO.COLOR).Input("background_color", optional=True), + ], + outputs=[ + IO.Custom(RecraftIO.CONTROLS).Output(display_name="recraft_controls"), + ], + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - }, - "optional": { - "colors": (RecraftIO.COLOR,), - "background_color": (RecraftIO.COLOR,), - } - } - - def create_controls(self, colors: RecraftColorChain=None, background_color: RecraftColorChain=None): - return (RecraftControls(colors=colors, background_color=background_color), ) + def execute(cls, colors: RecraftColorChain = None, background_color: RecraftColorChain = None) -> IO.NodeOutput: + return IO.NodeOutput(RecraftControls(colors=colors, background_color=background_color)) -class RecraftStyleV3RealisticImageNode: - """ - Select realistic_image style and optional substyle. - """ - - RETURN_TYPES = (RecraftIO.STYLEV3,) - RETURN_NAMES = ("recraft_style",) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "create_style" - CATEGORY = "api node/image/Recraft" - +class RecraftStyleV3RealisticImageNode(IO.ComfyNode): RECRAFT_STYLE = RecraftStyleV3.realistic_image @classmethod - def INPUT_TYPES(s): - return { - "required": { - "substyle": (get_v3_substyles(s.RECRAFT_STYLE),), - } - } + def define_schema(cls): + return IO.Schema( + node_id="RecraftStyleV3RealisticImage", + display_name="Recraft Style - Realistic Image", + category="api node/image/Recraft", + description="Select realistic_image style and optional substyle.", + inputs=[ + IO.Combo.Input("substyle", options=get_v3_substyles(cls.RECRAFT_STYLE)), + ], + outputs=[ + IO.Custom(RecraftIO.STYLEV3).Output(display_name="recraft_style"), + ], + ) - def create_style(self, substyle: str): + @classmethod + def execute(cls, substyle: str) -> IO.NodeOutput: if substyle == "None": substyle = None - return (RecraftStyle(self.RECRAFT_STYLE, substyle),) + return IO.NodeOutput(RecraftStyle(cls.RECRAFT_STYLE, substyle)) class RecraftStyleV3DigitalIllustrationNode(RecraftStyleV3RealisticImageNode): - """ - Select digital_illustration style and optional substyle. - """ - RECRAFT_STYLE = RecraftStyleV3.digital_illustration + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftStyleV3DigitalIllustration", + display_name="Recraft Style - Digital Illustration", + category="api node/image/Recraft", + description="Select realistic_image style and optional substyle.", + inputs=[ + IO.Combo.Input("substyle", options=get_v3_substyles(cls.RECRAFT_STYLE)), + ], + outputs=[ + IO.Custom(RecraftIO.STYLEV3).Output(display_name="recraft_style"), + ], + ) + class RecraftStyleV3VectorIllustrationNode(RecraftStyleV3RealisticImageNode): - """ - Select vector_illustration style and optional substyle. - """ - RECRAFT_STYLE = RecraftStyleV3.vector_illustration + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftStyleV3VectorIllustrationNode", + display_name="Recraft Style - Realistic Image", + category="api node/image/Recraft", + description="Select realistic_image style and optional substyle.", + inputs=[ + IO.Combo.Input("substyle", options=get_v3_substyles(cls.RECRAFT_STYLE)), + ], + outputs=[ + IO.Custom(RecraftIO.STYLEV3).Output(display_name="recraft_style"), + ], + ) + class RecraftStyleV3LogoRasterNode(RecraftStyleV3RealisticImageNode): - """ - Select vector_illustration style and optional substyle. - """ - - @classmethod - def INPUT_TYPES(s): - return { - "required": { - "substyle": (get_v3_substyles(s.RECRAFT_STYLE, include_none=False),), - } - } - RECRAFT_STYLE = RecraftStyleV3.logo_raster + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftStyleV3LogoRaster", + display_name="Recraft Style - Logo Raster", + category="api node/image/Recraft", + description="Select realistic_image style and optional substyle.", + inputs=[ + IO.Combo.Input("substyle", options=get_v3_substyles(cls.RECRAFT_STYLE, include_none=False)), + ], + outputs=[ + IO.Custom(RecraftIO.STYLEV3).Output(display_name="recraft_style"), + ], + ) -class RecraftStyleInfiniteStyleLibrary: - """ - Select style based on preexisting UUID from Recraft's Infinite Style Library. - """ - RETURN_TYPES = (RecraftIO.STYLEV3,) - RETURN_NAMES = ("recraft_style",) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "create_style" - CATEGORY = "api node/image/Recraft" +class RecraftStyleInfiniteStyleLibrary(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftStyleV3InfiniteStyleLibrary", + display_name="Recraft Style - Infinite Style Library", + category="api node/image/Recraft", + description="Select style based on preexisting UUID from Recraft's Infinite Style Library.", + inputs=[ + IO.String.Input("style_id", default="", tooltip="UUID of style from Infinite Style Library."), + ], + outputs=[ + IO.Custom(RecraftIO.STYLEV3).Output(display_name="recraft_style"), + ], + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "style_id": (IO.STRING, { - "default": "", - "tooltip": "UUID of style from Infinite Style Library.", - }) - } - } - - def create_style(self, style_id: str): + def execute(cls, style_id: str) -> IO.NodeOutput: if not style_id: raise Exception("The style_id input cannot be empty.") - return (RecraftStyle(style_id=style_id),) + return IO.NodeOutput(RecraftStyle(style_id=style_id)) -class RecraftTextToImageNode: - """ - Generates images synchronously based on prompt and resolution. - """ - - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Recraft" +class RecraftTextToImageNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftTextToImageNode", + display_name="Recraft Text to Image", + category="api node/image/Recraft", + description="Generates images synchronously based on prompt and resolution.", + inputs=[ + IO.String.Input("prompt", multiline=True, default="", tooltip="Prompt for the image generation."), + IO.Combo.Input( + "size", + options=[res.value for res in RecraftImageSize], + default=RecraftImageSize.res_1024x1024, + tooltip="The size of the generated image.", + ), + IO.Int.Input( + "n", + default=1, + min=1, + max=6, + tooltip="The number of images to generate.", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; " + "actual results are nondeterministic regardless of seed.", + ), + IO.Custom(RecraftIO.STYLEV3).Input("recraft_style", optional=True), + IO.String.Input( + "negative_prompt", + default="", + force_input=True, + tooltip="An optional text description of undesired elements on an image.", + optional=True, + ), + IO.Custom(RecraftIO.CONTROLS).Input( + "recraft_controls", + tooltip="Optional additional controls over the generation via the Recraft Controls node.", + optional=True, + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation.", - }, - ), - "size": ( - [res.value for res in RecraftImageSize], - { - "default": RecraftImageSize.res_1024x1024, - "tooltip": "The size of the generated image.", - }, - ), - "n": ( - IO.INT, - { - "default": 1, - "min": 1, - "max": 6, - "tooltip": "The number of images to generate.", - }, - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", - }, - ), - }, - "optional": { - "recraft_style": (RecraftIO.STYLEV3,), - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "An optional text description of undesired elements on an image.", - }, - ), - "recraft_controls": ( - RecraftIO.CONTROLS, - { - "tooltip": "Optional additional controls over the generation via the Recraft Controls node." - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - async def api_call( - self, + async def execute( + cls, prompt: str, size: str, n: int, @@ -404,9 +390,7 @@ class RecraftTextToImageNode: recraft_style: RecraftStyle = None, negative_prompt: str = None, recraft_controls: RecraftControls = None, - unique_id: Optional[str] = None, - **kwargs, - ): + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False, max_length=1000) default_style = RecraftStyle(RecraftStyleV3.realistic_image) if recraft_style is None: @@ -419,14 +403,11 @@ class RecraftTextToImageNode: if not negative_prompt: negative_prompt = None - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/recraft/image_generation", - method=HttpMethod.POST, - request_model=RecraftImageGenerationRequest, - response_model=RecraftImageGenerationResponse, - ), - request=RecraftImageGenerationRequest( + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/recraft/image_generation", method="POST"), + response_model=RecraftImageGenerationResponse, + data=RecraftImageGenerationRequest( prompt=prompt, negative_prompt=negative_prompt, model=RecraftModel.recraftv3, @@ -437,109 +418,83 @@ class RecraftTextToImageNode: style_id=recraft_style.style_id, controls=controls_api, ), - auth_kwargs=kwargs, + max_retries=1, ) - response: RecraftImageGenerationResponse = await operation.execute() images = [] - urls = [] for data in response.data: with handle_recraft_image_output(): - if unique_id and data.url: - urls.append(data.url) - urls_string = '\n'.join(urls) - PromptServer.instance.send_progress_text( - f"Result URL: {urls_string}", unique_id - ) - image = bytesio_to_image_tensor( - await download_url_to_bytesio(data.url, timeout=1024) - ) + image = bytesio_to_image_tensor(await download_url_as_bytesio(data.url, timeout=1024)) if len(image.shape) < 4: image = image.unsqueeze(0) images.append(image) - output_image = torch.cat(images, dim=0) - return (output_image,) + return IO.NodeOutput(torch.cat(images, dim=0)) -class RecraftImageToImageNode: - """ - Modify image based on prompt and strength. - """ - - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Recraft" +class RecraftImageToImageNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftImageToImageNode", + display_name="Recraft Image to Image", + category="api node/image/Recraft", + description="Modify image based on prompt and strength.", + inputs=[ + IO.Image.Input("image"), + IO.String.Input("prompt", multiline=True, default="", tooltip="Prompt for the image generation."), + IO.Int.Input( + "n", + default=1, + min=1, + max=6, + tooltip="The number of images to generate.", + ), + IO.Float.Input( + "strength", + default=0.5, + min=0.0, + max=1.0, + step=0.01, + tooltip="Defines the difference with the original image, should lie in [0, 1], " + "where 0 means almost identical, and 1 means miserable similarity.", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; " + "actual results are nondeterministic regardless of seed.", + ), + IO.Custom(RecraftIO.STYLEV3).Input("recraft_style", optional=True), + IO.String.Input( + "negative_prompt", + default="", + force_input=True, + tooltip="An optional text description of undesired elements on an image.", + optional=True, + ), + IO.Custom(RecraftIO.CONTROLS).Input( + "recraft_controls", + tooltip="Optional additional controls over the generation via the Recraft Controls node.", + optional=True, + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE, ), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation.", - }, - ), - "n": ( - IO.INT, - { - "default": 1, - "min": 1, - "max": 6, - "tooltip": "The number of images to generate.", - }, - ), - "strength": ( - IO.FLOAT, - { - "default": 0.5, - "min": 0.0, - "max": 1.0, - "step": 0.01, - "tooltip": "Defines the difference with the original image, should lie in [0, 1], where 0 means almost identical, and 1 means miserable similarity." - } - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", - }, - ), - }, - "optional": { - "recraft_style": (RecraftIO.STYLEV3,), - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "An optional text description of undesired elements on an image.", - }, - ), - "recraft_controls": ( - RecraftIO.CONTROLS, - { - "tooltip": "Optional additional controls over the generation via the Recraft Controls node." - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } - - async def api_call( - self, + async def execute( + cls, image: torch.Tensor, prompt: str, n: int, @@ -548,8 +503,7 @@ class RecraftImageToImageNode: recraft_style: RecraftStyle = None, negative_prompt: str = None, recraft_controls: RecraftControls = None, - **kwargs, - ): + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False, max_length=1000) default_style = RecraftStyle(RecraftStyleV3.realistic_image) if recraft_style is None: @@ -579,83 +533,69 @@ class RecraftImageToImageNode: pbar = ProgressBar(total) for i in range(total): sub_bytes = await handle_recraft_file_request( + cls, image=image[i], path="/proxy/recraft/images/imageToImage", request=request, - auth_kwargs=kwargs, ) with handle_recraft_image_output(): images.append(torch.cat([bytesio_to_image_tensor(x) for x in sub_bytes], dim=0)) pbar.update(1) - images_tensor = torch.cat(images, dim=0) - return (images_tensor, ) + return IO.NodeOutput(torch.cat(images, dim=0)) -class RecraftImageInpaintingNode: - """ - Modify image based on prompt and mask. - """ - - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Recraft" +class RecraftImageInpaintingNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftImageInpaintingNode", + display_name="Recraft Image Inpainting", + category="api node/image/Recraft", + description="Modify image based on prompt and mask.", + inputs=[ + IO.Image.Input("image"), + IO.Mask.Input("mask"), + IO.String.Input("prompt", multiline=True, default="", tooltip="Prompt for the image generation."), + IO.Int.Input( + "n", + default=1, + min=1, + max=6, + tooltip="The number of images to generate.", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; " + "actual results are nondeterministic regardless of seed.", + ), + IO.Custom(RecraftIO.STYLEV3).Input("recraft_style", optional=True), + IO.String.Input( + "negative_prompt", + default="", + force_input=True, + tooltip="An optional text description of undesired elements on an image.", + optional=True, + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE, ), - "mask": (IO.MASK, ), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation.", - }, - ), - "n": ( - IO.INT, - { - "default": 1, - "min": 1, - "max": 6, - "tooltip": "The number of images to generate.", - }, - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", - }, - ), - }, - "optional": { - "recraft_style": (RecraftIO.STYLEV3,), - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "An optional text description of undesired elements on an image.", - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } - - async def api_call( - self, + async def execute( + cls, image: torch.Tensor, mask: torch.Tensor, prompt: str, @@ -663,8 +603,7 @@ class RecraftImageInpaintingNode: seed, recraft_style: RecraftStyle = None, negative_prompt: str = None, - **kwargs, - ): + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False, max_length=1000) default_style = RecraftStyle(RecraftStyleV3.realistic_image) if recraft_style is None: @@ -691,96 +630,73 @@ class RecraftImageInpaintingNode: pbar = ProgressBar(total) for i in range(total): sub_bytes = await handle_recraft_file_request( + cls, image=image[i], - mask=mask[i:i+1], + mask=mask[i : i + 1], path="/proxy/recraft/images/inpaint", request=request, - auth_kwargs=kwargs, ) with handle_recraft_image_output(): images.append(torch.cat([bytesio_to_image_tensor(x) for x in sub_bytes], dim=0)) pbar.update(1) - images_tensor = torch.cat(images, dim=0) - return (images_tensor, ) + return IO.NodeOutput(torch.cat(images, dim=0)) -class RecraftTextToVectorNode: - """ - Generates SVG synchronously based on prompt and resolution. - """ - - RETURN_TYPES = ("SVG",) # Changed - DESCRIPTION = cleandoc(__doc__ or "") if 'cleandoc' in globals() else __doc__ # Keep cleandoc if other nodes use it - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Recraft" +class RecraftTextToVectorNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftTextToVectorNode", + display_name="Recraft Text to Vector", + category="api node/image/Recraft", + description="Generates SVG synchronously based on prompt and resolution.", + inputs=[ + IO.String.Input("prompt", default="", tooltip="Prompt for the image generation.", multiline=True), + IO.Combo.Input("substyle", options=get_v3_substyles(RecraftStyleV3.vector_illustration)), + IO.Combo.Input( + "size", + options=[res.value for res in RecraftImageSize], + default=RecraftImageSize.res_1024x1024, + tooltip="The size of the generated image.", + ), + IO.Int.Input("n", default=1, min=1, max=6, tooltip="The number of images to generate."), + IO.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; " + "actual results are nondeterministic regardless of seed.", + ), + IO.String.Input( + "negative_prompt", + default="", + force_input=True, + tooltip="An optional text description of undesired elements on an image.", + optional=True, + ), + IO.Custom(RecraftIO.CONTROLS).Input( + "recraft_controls", + tooltip="Optional additional controls over the generation via the Recraft Controls node.", + optional=True, + ), + ], + outputs=[ + IO.SVG.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation.", - }, - ), - "substyle": (get_v3_substyles(RecraftStyleV3.vector_illustration),), - "size": ( - [res.value for res in RecraftImageSize], - { - "default": RecraftImageSize.res_1024x1024, - "tooltip": "The size of the generated image.", - }, - ), - "n": ( - IO.INT, - { - "default": 1, - "min": 1, - "max": 6, - "tooltip": "The number of images to generate.", - }, - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", - }, - ), - }, - "optional": { - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "An optional text description of undesired elements on an image.", - }, - ), - "recraft_controls": ( - RecraftIO.CONTROLS, - { - "tooltip": "Optional additional controls over the generation via the Recraft Controls node." - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - async def api_call( - self, + async def execute( + cls, prompt: str, substyle: str, size: str, @@ -788,9 +704,7 @@ class RecraftTextToVectorNode: seed, negative_prompt: str = None, recraft_controls: RecraftControls = None, - unique_id: Optional[str] = None, - **kwargs, - ): + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False, max_length=1000) # create RecraftStyle so strings will be formatted properly (i.e. "None" will become None) recraft_style = RecraftStyle(RecraftStyleV3.vector_illustration, substyle=substyle) @@ -802,14 +716,11 @@ class RecraftTextToVectorNode: if not negative_prompt: negative_prompt = None - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/recraft/image_generation", - method=HttpMethod.POST, - request_model=RecraftImageGenerationRequest, - response_model=RecraftImageGenerationResponse, - ), - request=RecraftImageGenerationRequest( + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/recraft/image_generation", method="POST"), + response_model=RecraftImageGenerationResponse, + data=RecraftImageGenerationRequest( prompt=prompt, negative_prompt=negative_prompt, model=RecraftModel.recraftv3, @@ -819,139 +730,105 @@ class RecraftTextToVectorNode: substyle=recraft_style.substyle, controls=controls_api, ), - auth_kwargs=kwargs, + max_retries=1, ) - response: RecraftImageGenerationResponse = await operation.execute() svg_data = [] - urls = [] for data in response.data: - if unique_id and data.url: - urls.append(data.url) - # Print result on each iteration in case of error - PromptServer.instance.send_progress_text( - f"Result URL: {' '.join(urls)}", unique_id - ) - svg_data.append(await download_url_to_bytesio(data.url, timeout=1024)) + svg_data.append(await download_url_as_bytesio(data.url, timeout=1024)) - return (SVG(svg_data),) + return IO.NodeOutput(SVG(svg_data)) -class RecraftVectorizeImageNode: - """ - Generates SVG synchronously from an input image. - """ - - RETURN_TYPES = ("SVG",) # Changed - DESCRIPTION = cleandoc(__doc__ or "") if 'cleandoc' in globals() else __doc__ # Keep cleandoc if other nodes use it - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Recraft" +class RecraftVectorizeImageNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftVectorizeImageNode", + display_name="Recraft Vectorize Image", + category="api node/image/Recraft", + description="Generates SVG synchronously from an input image.", + inputs=[ + IO.Image.Input("image"), + ], + outputs=[ + IO.SVG.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE, ), - }, - "optional": { - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } - - async def api_call( - self, - image: torch.Tensor, - **kwargs, - ): + async def execute(cls, image: torch.Tensor) -> IO.NodeOutput: svgs = [] total = image.shape[0] pbar = ProgressBar(total) for i in range(total): sub_bytes = await handle_recraft_file_request( + cls, image=image[i], path="/proxy/recraft/images/vectorize", - auth_kwargs=kwargs, ) svgs.append(SVG(sub_bytes)) pbar.update(1) - return (SVG.combine_all(svgs), ) + return IO.NodeOutput(SVG.combine_all(svgs)) -class RecraftReplaceBackgroundNode: - """ - Replace background on image, based on provided prompt. - """ - - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Recraft" +class RecraftReplaceBackgroundNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftReplaceBackgroundNode", + display_name="Recraft Replace Background", + category="api node/image/Recraft", + description="Replace background on image, based on provided prompt.", + inputs=[ + IO.Image.Input("image"), + IO.String.Input("prompt", tooltip="Prompt for the image generation.", default="", multiline=True), + IO.Int.Input("n", default=1, min=1, max=6, tooltip="The number of images to generate."), + IO.Int.Input( + "seed", + default=0, + min=0, + max=0xFFFFFFFFFFFFFFFF, + control_after_generate=True, + tooltip="Seed to determine if node should re-run; " + "actual results are nondeterministic regardless of seed.", + ), + IO.Custom(RecraftIO.STYLEV3).Input("recraft_style", optional=True), + IO.String.Input( + "negative_prompt", + default="", + force_input=True, + tooltip="An optional text description of undesired elements on an image.", + optional=True, + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE, ), - "prompt": ( - IO.STRING, - { - "multiline": True, - "default": "", - "tooltip": "Prompt for the image generation.", - }, - ), - "n": ( - IO.INT, - { - "default": 1, - "min": 1, - "max": 6, - "tooltip": "The number of images to generate.", - }, - ), - "seed": ( - IO.INT, - { - "default": 0, - "min": 0, - "max": 0xFFFFFFFFFFFFFFFF, - "control_after_generate": True, - "tooltip": "Seed to determine if node should re-run; actual results are nondeterministic regardless of seed.", - }, - ), - }, - "optional": { - "recraft_style": (RecraftIO.STYLEV3,), - "negative_prompt": ( - IO.STRING, - { - "default": "", - "forceInput": True, - "tooltip": "An optional text description of undesired elements on an image.", - }, - ), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } - - async def api_call( - self, + async def execute( + cls, image: torch.Tensor, prompt: str, n: int, seed, recraft_style: RecraftStyle = None, negative_prompt: str = None, - **kwargs, - ): + ) -> IO.NodeOutput: default_style = RecraftStyle(RecraftStyleV3.realistic_image) if recraft_style is None: recraft_style = default_style @@ -974,165 +851,151 @@ class RecraftReplaceBackgroundNode: pbar = ProgressBar(total) for i in range(total): sub_bytes = await handle_recraft_file_request( + cls, image=image[i], path="/proxy/recraft/images/replaceBackground", request=request, - auth_kwargs=kwargs, ) images.append(torch.cat([bytesio_to_image_tensor(x) for x in sub_bytes], dim=0)) pbar.update(1) - images_tensor = torch.cat(images, dim=0) - return (images_tensor, ) + return IO.NodeOutput(torch.cat(images, dim=0)) -class RecraftRemoveBackgroundNode: - """ - Remove background from image, and return processed image and mask. - """ - - RETURN_TYPES = (IO.IMAGE, IO.MASK) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Recraft" +class RecraftRemoveBackgroundNode(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftRemoveBackgroundNode", + display_name="Recraft Remove Background", + category="api node/image/Recraft", + description="Remove background from image, and return processed image and mask.", + inputs=[ + IO.Image.Input("image"), + ], + outputs=[ + IO.Image.Output(), + IO.Mask.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE, ), - }, - "optional": { - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } - - async def api_call( - self, - image: torch.Tensor, - **kwargs, - ): + async def execute(cls, image: torch.Tensor) -> IO.NodeOutput: images = [] total = image.shape[0] pbar = ProgressBar(total) for i in range(total): sub_bytes = await handle_recraft_file_request( + cls, image=image[i], path="/proxy/recraft/images/removeBackground", - auth_kwargs=kwargs, ) images.append(torch.cat([bytesio_to_image_tensor(x) for x in sub_bytes], dim=0)) pbar.update(1) images_tensor = torch.cat(images, dim=0) # use alpha channel as masks, in B,H,W format - masks_tensor = images_tensor[:,:,:,-1:].squeeze(-1) - return (images_tensor, masks_tensor) + masks_tensor = images_tensor[:, :, :, -1:].squeeze(-1) + return IO.NodeOutput(images_tensor, masks_tensor) -class RecraftCrispUpscaleNode: - """ - Upscale image synchronously. - Enhances a given raster image using ‘crisp upscale’ tool, increasing image resolution, making the image sharper and cleaner. - """ - - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Recraft" - +class RecraftCrispUpscaleNode(IO.ComfyNode): RECRAFT_PATH = "/proxy/recraft/images/crispUpscale" @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": (IO.IMAGE, ), - }, - "optional": { - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } + def define_schema(cls): + return IO.Schema( + node_id="RecraftCrispUpscaleNode", + display_name="Recraft Crisp Upscale Image", + category="api node/image/Recraft", + description="Upscale image synchronously.\n" + "Enhances a given raster image using ‘crisp upscale’ tool, " + "increasing image resolution, making the image sharper and cleaner.", + inputs=[ + IO.Image.Input("image"), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) - async def api_call( - self, - image: torch.Tensor, - **kwargs, - ): + @classmethod + async def execute(cls, image: torch.Tensor) -> IO.NodeOutput: images = [] total = image.shape[0] pbar = ProgressBar(total) for i in range(total): sub_bytes = await handle_recraft_file_request( + cls, image=image[i], - path=self.RECRAFT_PATH, - auth_kwargs=kwargs, + path=cls.RECRAFT_PATH, ) images.append(torch.cat([bytesio_to_image_tensor(x) for x in sub_bytes], dim=0)) pbar.update(1) - images_tensor = torch.cat(images, dim=0) - return (images_tensor,) + return IO.NodeOutput(torch.cat(images, dim=0)) class RecraftCreativeUpscaleNode(RecraftCrispUpscaleNode): - """ - Upscale image synchronously. - Enhances a given raster image using ‘creative upscale’ tool, boosting resolution with a focus on refining small details and faces. - """ - - RETURN_TYPES = (IO.IMAGE,) - DESCRIPTION = cleandoc(__doc__ or "") # Handle potential None value - FUNCTION = "api_call" - API_NODE = True - CATEGORY = "api node/image/Recraft" - RECRAFT_PATH = "/proxy/recraft/images/creativeUpscale" + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="RecraftCreativeUpscaleNode", + display_name="Recraft Creative Upscale Image", + category="api node/image/Recraft", + description="Upscale image synchronously.\n" + "Enhances a given raster image using ‘creative upscale’ tool, " + "boosting resolution with a focus on refining small details and faces.", + inputs=[ + IO.Image.Input("image"), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) -# A dictionary that contains all nodes you want to export with their names -# NOTE: names should be globally unique -NODE_CLASS_MAPPINGS = { - "RecraftTextToImageNode": RecraftTextToImageNode, - "RecraftImageToImageNode": RecraftImageToImageNode, - "RecraftImageInpaintingNode": RecraftImageInpaintingNode, - "RecraftTextToVectorNode": RecraftTextToVectorNode, - "RecraftVectorizeImageNode": RecraftVectorizeImageNode, - "RecraftRemoveBackgroundNode": RecraftRemoveBackgroundNode, - "RecraftReplaceBackgroundNode": RecraftReplaceBackgroundNode, - "RecraftCrispUpscaleNode": RecraftCrispUpscaleNode, - "RecraftCreativeUpscaleNode": RecraftCreativeUpscaleNode, - "RecraftStyleV3RealisticImage": RecraftStyleV3RealisticImageNode, - "RecraftStyleV3DigitalIllustration": RecraftStyleV3DigitalIllustrationNode, - "RecraftStyleV3LogoRaster": RecraftStyleV3LogoRasterNode, - "RecraftStyleV3InfiniteStyleLibrary": RecraftStyleInfiniteStyleLibrary, - "RecraftColorRGB": RecraftColorRGBNode, - "RecraftControls": RecraftControlsNode, -} -# A dictionary that contains the friendly/humanly readable titles for the nodes -NODE_DISPLAY_NAME_MAPPINGS = { - "RecraftTextToImageNode": "Recraft Text to Image", - "RecraftImageToImageNode": "Recraft Image to Image", - "RecraftImageInpaintingNode": "Recraft Image Inpainting", - "RecraftTextToVectorNode": "Recraft Text to Vector", - "RecraftVectorizeImageNode": "Recraft Vectorize Image", - "RecraftRemoveBackgroundNode": "Recraft Remove Background", - "RecraftReplaceBackgroundNode": "Recraft Replace Background", - "RecraftCrispUpscaleNode": "Recraft Crisp Upscale Image", - "RecraftCreativeUpscaleNode": "Recraft Creative Upscale Image", - "RecraftStyleV3RealisticImage": "Recraft Style - Realistic Image", - "RecraftStyleV3DigitalIllustration": "Recraft Style - Digital Illustration", - "RecraftStyleV3LogoRaster": "Recraft Style - Logo Raster", - "RecraftStyleV3InfiniteStyleLibrary": "Recraft Style - Infinite Style Library", - "RecraftColorRGB": "Recraft Color RGB", - "RecraftControls": "Recraft Controls", -} +class RecraftExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + RecraftTextToImageNode, + RecraftImageToImageNode, + RecraftImageInpaintingNode, + RecraftTextToVectorNode, + RecraftVectorizeImageNode, + RecraftRemoveBackgroundNode, + RecraftReplaceBackgroundNode, + RecraftCrispUpscaleNode, + RecraftCreativeUpscaleNode, + RecraftStyleV3RealisticImageNode, + RecraftStyleV3DigitalIllustrationNode, + RecraftStyleV3LogoRasterNode, + RecraftStyleInfiniteStyleLibrary, + RecraftColorRGBNode, + RecraftControlsNode, + ] + + +async def comfy_entrypoint() -> RecraftExtension: + return RecraftExtension() diff --git a/comfy_api_nodes/nodes_rodin.py b/comfy_api_nodes/nodes_rodin.py index 1af393eba..e60e7a6d6 100644 --- a/comfy_api_nodes/nodes_rodin.py +++ b/comfy_api_nodes/nodes_rodin.py @@ -5,17 +5,14 @@ Rodin API docs: https://developer.hyper3d.ai/ """ -from __future__ import annotations from inspect import cleandoc -from comfy.comfy_types.node_typing import IO import folder_paths as comfy_paths -import aiohttp import os -import datetime -import asyncio -import io import logging import math +from typing import Optional +from io import BytesIO +from typing_extensions import override from PIL import Image from comfy_api_nodes.apis.rodin_api import ( Rodin3DGenerateRequest, @@ -26,540 +23,498 @@ from comfy_api_nodes.apis.rodin_api import ( Rodin3DDownloadResponse, JobStatus, ) -from comfy_api_nodes.apis.client import ( +from comfy_api_nodes.util import ( + sync_op, + poll_op, ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, + download_url_to_bytesio, ) +from comfy_api.latest import ComfyExtension, IO -COMMON_PARAMETERS = { - "Seed": ( - IO.INT, - { - "default":0, - "min":0, - "max":65535, - "display":"number" - } +COMMON_PARAMETERS = [ + IO.Int.Input( + "Seed", + default=0, + min=0, + max=65535, + display_mode=IO.NumberDisplay.number, + optional=True, ), - "Material_Type": ( - IO.COMBO, - { - "options": ["PBR", "Shaded"], - "default": "PBR" - } + IO.Combo.Input("Material_Type", options=["PBR", "Shaded"], default="PBR", optional=True), + IO.Combo.Input( + "Polygon_count", + options=["4K-Quad", "8K-Quad", "18K-Quad", "50K-Quad", "200K-Triangle"], + default="18K-Quad", + optional=True, ), - "Polygon_count": ( - IO.COMBO, - { - "options": ["4K-Quad", "8K-Quad", "18K-Quad", "50K-Quad", "200K-Triangle"], - "default": "18K-Quad" - } +] + + +def get_quality_mode(poly_count): + polycount = poly_count.split("-") + poly = polycount[1] + count = polycount[0] + if poly == "Triangle": + mesh_mode = "Raw" + elif poly == "Quad": + mesh_mode = "Quad" + else: + mesh_mode = "Quad" + + if count == "4K": + quality_override = 4000 + elif count == "8K": + quality_override = 8000 + elif count == "18K": + quality_override = 18000 + elif count == "50K": + quality_override = 50000 + elif count == "2K": + quality_override = 2000 + elif count == "20K": + quality_override = 20000 + elif count == "150K": + quality_override = 150000 + elif count == "500K": + quality_override = 500000 + else: + quality_override = 18000 + + return mesh_mode, quality_override + + +def tensor_to_filelike(tensor, max_pixels: int = 2048*2048): + """ + Converts a PyTorch tensor to a file-like object. + + Args: + - tensor (torch.Tensor): A tensor representing an image of shape (H, W, C) + where C is the number of channels (3 for RGB), H is height, and W is width. + + Returns: + - io.BytesIO: A file-like object containing the image data. + """ + array = tensor.cpu().numpy() + array = (array * 255).astype('uint8') + image = Image.fromarray(array, 'RGB') + + original_width, original_height = image.size + original_pixels = original_width * original_height + if original_pixels > max_pixels: + scale = math.sqrt(max_pixels / original_pixels) + new_width = int(original_width * scale) + new_height = int(original_height * scale) + else: + new_width, new_height = original_width, original_height + + if new_width != original_width or new_height != original_height: + image = image.resize((new_width, new_height), Image.Resampling.LANCZOS) + + img_byte_arr = BytesIO() + image.save(img_byte_arr, format='PNG') # PNG is used for lossless compression + img_byte_arr.seek(0) + return img_byte_arr + + +async def create_generate_task( + cls: type[IO.ComfyNode], + images=None, + seed=1, + material="PBR", + quality_override=18000, + tier="Regular", + mesh_mode="Quad", + ta_pose: bool = False, +): + if images is None: + raise Exception("Rodin 3D generate requires at least 1 image.") + if len(images) > 5: + raise Exception("Rodin 3D generate requires up to 5 image.") + + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/rodin/api/v2/rodin", method="POST"), + response_model=Rodin3DGenerateResponse, + data=Rodin3DGenerateRequest( + seed=seed, + tier=tier, + material=material, + quality_override=quality_override, + mesh_mode=mesh_mode, + TAPose=ta_pose, + ), + files=[ + ( + "images", + open(image, "rb") if isinstance(image, str) else tensor_to_filelike(image) + ) + for image in images if image is not None + ], + content_type="multipart/form-data", ) -} -def create_task_error(response: Rodin3DGenerateResponse): - """Check if the response has error""" - return hasattr(response, "error") + if hasattr(response, "error"): + error_message = f"Rodin3D Create 3D generate Task Failed. Message: {response.message}, error: {response.error}" + logging.error(error_message) + raise Exception(error_message) + + logging.info("[ Rodin3D API - Submit Jobs ] Submit Generate Task Success!") + subscription_key = response.jobs.subscription_key + task_uuid = response.uuid + logging.info("[ Rodin3D API - Submit Jobs ] UUID: %s", task_uuid) + return task_uuid, subscription_key -class Rodin3DAPI: - """ - Generate 3D Assets using Rodin API - """ - RETURN_TYPES = (IO.STRING,) - RETURN_NAMES = ("3D Model Path",) - CATEGORY = "api node/3d/Rodin" - DESCRIPTION = cleandoc(__doc__ or "") - FUNCTION = "api_call" - API_NODE = True +def check_rodin_status(response: Rodin3DCheckStatusResponse) -> str: + all_done = all(job.status == JobStatus.Done for job in response.jobs) + status_list = [str(job.status) for job in response.jobs] + logging.info("[ Rodin3D API - CheckStatus ] Generate Status: %s", status_list) + if any(job.status == JobStatus.Failed for job in response.jobs): + logging.error("[ Rodin3D API - CheckStatus ] Generate Failed: %s, Please try again.", status_list) + raise Exception("[ Rodin3D API ] Generate Failed, Please Try again.") + if all_done: + return "DONE" + return "Generating" - def tensor_to_filelike(self, tensor, max_pixels: int = 2048*2048): - """ - Converts a PyTorch tensor to a file-like object. - - Args: - - tensor (torch.Tensor): A tensor representing an image of shape (H, W, C) - where C is the number of channels (3 for RGB), H is height, and W is width. - - Returns: - - io.BytesIO: A file-like object containing the image data. - """ - array = tensor.cpu().numpy() - array = (array * 255).astype('uint8') - image = Image.fromarray(array, 'RGB') - - original_width, original_height = image.size - original_pixels = original_width * original_height - if original_pixels > max_pixels: - scale = math.sqrt(max_pixels / original_pixels) - new_width = int(original_width * scale) - new_height = int(original_height * scale) - else: - new_width, new_height = original_width, original_height - - if new_width != original_width or new_height != original_height: - image = image.resize((new_width, new_height), Image.Resampling.LANCZOS) - - img_byte_arr = io.BytesIO() - image.save(img_byte_arr, format='PNG') # PNG is used for lossless compression - img_byte_arr.seek(0) - return img_byte_arr - - def check_rodin_status(self, response: Rodin3DCheckStatusResponse) -> str: - has_failed = any(job.status == JobStatus.Failed for job in response.jobs) - all_done = all(job.status == JobStatus.Done for job in response.jobs) - status_list = [str(job.status) for job in response.jobs] - logging.info(f"[ Rodin3D API - CheckStatus ] Generate Status: {status_list}") - if has_failed: - logging.error(f"[ Rodin3D API - CheckStatus ] Generate Failed: {status_list}, Please try again.") - raise Exception("[ Rodin3D API ] Generate Failed, Please Try again.") - elif all_done: - return "DONE" - else: - return "Generating" - - async def create_generate_task(self, images=None, seed=1, material="PBR", quality_override=18000, tier="Regular", mesh_mode="Quad", TAPose = False, **kwargs): - if images is None: - raise Exception("Rodin 3D generate requires at least 1 image.") - if len(images) > 5: - raise Exception("Rodin 3D generate requires up to 5 image.") - - path = "/proxy/rodin/api/v2/rodin" - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=path, - method=HttpMethod.POST, - request_model=Rodin3DGenerateRequest, - response_model=Rodin3DGenerateResponse, - ), - request=Rodin3DGenerateRequest( - seed=seed, - tier=tier, - material=material, - quality_override=quality_override, - mesh_mode=mesh_mode, - TAPose=TAPose, - ), - files=[ - ( - "images", - open(image, "rb") if isinstance(image, str) else self.tensor_to_filelike(image) - ) - for image in images if image is not None - ], - content_type = "multipart/form-data", - auth_kwargs=kwargs, - ) - - response = await operation.execute() - - if create_task_error(response): - error_message = f"Rodin3D Create 3D generate Task Failed. Message: {response.message}, error: {response.error}" - logging.error(error_message) - raise Exception(error_message) - - logging.info("[ Rodin3D API - Submit Jobs ] Submit Generate Task Success!") - subscription_key = response.jobs.subscription_key - task_uuid = response.uuid - logging.info(f"[ Rodin3D API - Submit Jobs ] UUID: {task_uuid}") - return task_uuid, subscription_key - - async def poll_for_task_status(self, subscription_key, **kwargs) -> Rodin3DCheckStatusResponse: - - path = "/proxy/rodin/api/v2/status" - - poll_operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path = path, - method=HttpMethod.POST, - request_model=Rodin3DCheckStatusRequest, - response_model=Rodin3DCheckStatusResponse, - ), - request=Rodin3DCheckStatusRequest( - subscription_key = subscription_key - ), - completed_statuses=["DONE"], - failed_statuses=["FAILED"], - status_extractor=self.check_rodin_status, - poll_interval=3.0, - auth_kwargs=kwargs, - ) - - logging.info("[ Rodin3D API - CheckStatus ] Generate Start!") - - return await poll_operation.execute() - - async def get_rodin_download_list(self, uuid, **kwargs) -> Rodin3DDownloadResponse: - logging.info("[ Rodin3D API - Downloading ] Generate Successfully!") - - path = "/proxy/rodin/api/v2/download" - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=path, - method=HttpMethod.POST, - request_model=Rodin3DDownloadRequest, - response_model=Rodin3DDownloadResponse, - ), - request=Rodin3DDownloadRequest( - task_uuid=uuid - ), - auth_kwargs=kwargs - ) - - return await operation.execute() - - def get_quality_mode(self, poly_count): - polycount = poly_count.split("-") - poly = polycount[1] - count = polycount[0] - if poly == "Triangle": - mesh_mode = "Raw" - elif poly == "Quad": - mesh_mode = "Quad" - else: - mesh_mode = "Quad" - - if count == "4K": - quality_override = 4000 - elif count == "8K": - quality_override = 8000 - elif count == "18K": - quality_override = 18000 - elif count == "50K": - quality_override = 50000 - elif count == "2K": - quality_override = 2000 - elif count == "20K": - quality_override = 20000 - elif count == "150K": - quality_override = 150000 - elif count == "500K": - quality_override = 500000 - else: - quality_override = 18000 - - return mesh_mode, quality_override - - async def download_files(self, url_list): - save_path = os.path.join(comfy_paths.get_output_directory(), "Rodin3D", datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) - os.makedirs(save_path, exist_ok=True) - model_file_path = None - async with aiohttp.ClientSession() as session: - for i in url_list.list: - url = i.url - file_name = i.name - file_path = os.path.join(save_path, file_name) - if file_path.endswith(".glb"): - model_file_path = file_path - logging.info(f"[ Rodin3D API - download_files ] Downloading file: {file_path}") - max_retries = 5 - for attempt in range(max_retries): - try: - async with session.get(url) as resp: - resp.raise_for_status() - with open(file_path, "wb") as f: - async for chunk in resp.content.iter_chunked(32 * 1024): - f.write(chunk) - break - except Exception as e: - logging.info(f"[ Rodin3D API - download_files ] Error downloading {file_path}:{e}") - if attempt < max_retries - 1: - logging.info("Retrying...") - await asyncio.sleep(2) - else: - logging.info( - "[ Rodin3D API - download_files ] Failed to download %s after %s attempts.", - file_path, - max_retries, - ) - - return model_file_path +def extract_progress(response: Rodin3DCheckStatusResponse) -> Optional[int]: + if not response.jobs: + return None + completed_count = sum(1 for job in response.jobs if job.status == JobStatus.Done) + return int((completed_count / len(response.jobs)) * 100) -class Rodin3D_Regular(Rodin3DAPI): +async def poll_for_task_status(subscription_key: str, cls: type[IO.ComfyNode]) -> Rodin3DCheckStatusResponse: + logging.info("[ Rodin3D API - CheckStatus ] Generate Start!") + return await poll_op( + cls, + ApiEndpoint(path="/proxy/rodin/api/v2/status", method="POST"), + response_model=Rodin3DCheckStatusResponse, + data=Rodin3DCheckStatusRequest(subscription_key=subscription_key), + status_extractor=check_rodin_status, + progress_extractor=extract_progress, + ) + + +async def get_rodin_download_list(uuid: str, cls: type[IO.ComfyNode]) -> Rodin3DDownloadResponse: + logging.info("[ Rodin3D API - Downloading ] Generate Successfully!") + return await sync_op( + cls, + ApiEndpoint(path="/proxy/rodin/api/v2/download", method="POST"), + response_model=Rodin3DDownloadResponse, + data=Rodin3DDownloadRequest(task_uuid=uuid), + monitor_progress=False, + ) + + +async def download_files(url_list, task_uuid: str): + result_folder_name = f"Rodin3D_{task_uuid}" + save_path = os.path.join(comfy_paths.get_output_directory(), result_folder_name) + os.makedirs(save_path, exist_ok=True) + model_file_path = None + for i in url_list.list: + file_path = os.path.join(save_path, i.name) + if file_path.endswith(".glb"): + model_file_path = os.path.join(result_folder_name, i.name) + await download_url_to_bytesio(i.url, file_path) + return model_file_path + + +class Rodin3D_Regular(IO.ComfyNode): + """Generate 3D Assets using Rodin API""" + @classmethod - def INPUT_TYPES(s): - return { - "required": { - "Images": - ( - IO.IMAGE, - { - "forceInput":True, - } - ) - }, - "optional": { - **COMMON_PARAMETERS - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="Rodin3D_Regular", + display_name="Rodin 3D Generate - Regular Generate", + category="api node/3d/Rodin", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + IO.Image.Input("Images"), + *COMMON_PARAMETERS, + ], + outputs=[IO.String.Output(display_name="3D Model Path")], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) - async def api_call( - self, + @classmethod + async def execute( + cls, Images, Seed, Material_Type, Polygon_count, - **kwargs - ): + ) -> IO.NodeOutput: tier = "Regular" num_images = Images.shape[0] m_images = [] for i in range(num_images): m_images.append(Images[i]) - mesh_mode, quality_override = self.get_quality_mode(Polygon_count) - task_uuid, subscription_key = await self.create_generate_task(images=m_images, seed=Seed, material=Material_Type, - quality_override=quality_override, tier=tier, mesh_mode=mesh_mode, - **kwargs) - await self.poll_for_task_status(subscription_key, **kwargs) - download_list = await self.get_rodin_download_list(task_uuid, **kwargs) - model = await self.download_files(download_list) + mesh_mode, quality_override = get_quality_mode(Polygon_count) + task_uuid, subscription_key = await create_generate_task( + cls, + images=m_images, + seed=Seed, + material=Material_Type, + quality_override=quality_override, + tier=tier, + mesh_mode=mesh_mode, + ) + await poll_for_task_status(subscription_key, cls) + download_list = await get_rodin_download_list(task_uuid, cls) + model = await download_files(download_list, task_uuid) - return (model,) + return IO.NodeOutput(model) -class Rodin3D_Detail(Rodin3DAPI): +class Rodin3D_Detail(IO.ComfyNode): + """Generate 3D Assets using Rodin API""" + @classmethod - def INPUT_TYPES(s): - return { - "required": { - "Images": - ( - IO.IMAGE, - { - "forceInput":True, - } - ) - }, - "optional": { - **COMMON_PARAMETERS - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="Rodin3D_Detail", + display_name="Rodin 3D Generate - Detail Generate", + category="api node/3d/Rodin", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + IO.Image.Input("Images"), + *COMMON_PARAMETERS, + ], + outputs=[IO.String.Output(display_name="3D Model Path")], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) - async def api_call( - self, + @classmethod + async def execute( + cls, Images, Seed, Material_Type, Polygon_count, - **kwargs - ): + ) -> IO.NodeOutput: tier = "Detail" num_images = Images.shape[0] m_images = [] for i in range(num_images): m_images.append(Images[i]) - mesh_mode, quality_override = self.get_quality_mode(Polygon_count) - task_uuid, subscription_key = await self.create_generate_task(images=m_images, seed=Seed, material=Material_Type, - quality_override=quality_override, tier=tier, mesh_mode=mesh_mode, - **kwargs) - await self.poll_for_task_status(subscription_key, **kwargs) - download_list = await self.get_rodin_download_list(task_uuid, **kwargs) - model = await self.download_files(download_list) + mesh_mode, quality_override = get_quality_mode(Polygon_count) + task_uuid, subscription_key = await create_generate_task( + cls, + images=m_images, + seed=Seed, + material=Material_Type, + quality_override=quality_override, + tier=tier, + mesh_mode=mesh_mode, + ) + await poll_for_task_status(subscription_key, cls) + download_list = await get_rodin_download_list(task_uuid, cls) + model = await download_files(download_list, task_uuid) - return (model,) + return IO.NodeOutput(model) -class Rodin3D_Smooth(Rodin3DAPI): +class Rodin3D_Smooth(IO.ComfyNode): + """Generate 3D Assets using Rodin API""" + @classmethod - def INPUT_TYPES(s): - return { - "required": { - "Images": - ( - IO.IMAGE, - { - "forceInput":True, - } - ) - }, - "optional": { - **COMMON_PARAMETERS - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="Rodin3D_Smooth", + display_name="Rodin 3D Generate - Smooth Generate", + category="api node/3d/Rodin", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + IO.Image.Input("Images"), + *COMMON_PARAMETERS, + ], + outputs=[IO.String.Output(display_name="3D Model Path")], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) - async def api_call( - self, + @classmethod + async def execute( + cls, Images, Seed, Material_Type, Polygon_count, - **kwargs - ): - tier = "Smooth" + ) -> IO.NodeOutput: num_images = Images.shape[0] m_images = [] for i in range(num_images): m_images.append(Images[i]) - mesh_mode, quality_override = self.get_quality_mode(Polygon_count) - task_uuid, subscription_key = await self.create_generate_task(images=m_images, seed=Seed, material=Material_Type, - quality_override=quality_override, tier=tier, mesh_mode=mesh_mode, - **kwargs) - await self.poll_for_task_status(subscription_key, **kwargs) - download_list = await self.get_rodin_download_list(task_uuid, **kwargs) - model = await self.download_files(download_list) + mesh_mode, quality_override = get_quality_mode(Polygon_count) + task_uuid, subscription_key = await create_generate_task( + cls, + images=m_images, + seed=Seed, + material=Material_Type, + quality_override=quality_override, + tier="Smooth", + mesh_mode=mesh_mode, + ) + await poll_for_task_status(subscription_key, cls) + download_list = await get_rodin_download_list(task_uuid, cls) + model = await download_files(download_list, task_uuid) - return (model,) + return IO.NodeOutput(model) -class Rodin3D_Sketch(Rodin3DAPI): +class Rodin3D_Sketch(IO.ComfyNode): + """Generate 3D Assets using Rodin API""" + @classmethod - def INPUT_TYPES(s): - return { - "required": { - "Images": - ( - IO.IMAGE, - { - "forceInput":True, - } - ) - }, - "optional": { - "Seed": - ( - IO.INT, - { - "default":0, - "min":0, - "max":65535, - "display":"number" - } - ) - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="Rodin3D_Sketch", + display_name="Rodin 3D Generate - Sketch Generate", + category="api node/3d/Rodin", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + IO.Image.Input("Images"), + IO.Int.Input( + "Seed", + default=0, + min=0, + max=65535, + display_mode=IO.NumberDisplay.number, + optional=True, + ), + ], + outputs=[IO.String.Output(display_name="3D Model Path")], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) - async def api_call( - self, + @classmethod + async def execute( + cls, Images, Seed, - **kwargs - ): - tier = "Sketch" + ) -> IO.NodeOutput: num_images = Images.shape[0] m_images = [] for i in range(num_images): m_images.append(Images[i]) - material_type = "PBR" - quality_override = 18000 - mesh_mode = "Quad" - task_uuid, subscription_key = await self.create_generate_task( - images=m_images, seed=Seed, material=material_type, quality_override=quality_override, tier=tier, mesh_mode=mesh_mode, **kwargs + task_uuid, subscription_key = await create_generate_task( + cls, + images=m_images, + seed=Seed, + material="PBR", + quality_override=18000, + tier="Sketch", + mesh_mode="Quad", ) - await self.poll_for_task_status(subscription_key, **kwargs) - download_list = await self.get_rodin_download_list(task_uuid, **kwargs) - model = await self.download_files(download_list) + await poll_for_task_status(subscription_key, cls) + download_list = await get_rodin_download_list(task_uuid, cls) + model = await download_files(download_list, task_uuid) - return (model,) + return IO.NodeOutput(model) + + +class Rodin3D_Gen2(IO.ComfyNode): + """Generate 3D Assets using Rodin API""" -class Rodin3D_Gen2(Rodin3DAPI): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "Images": - ( - IO.IMAGE, - { - "forceInput":True, - } - ) - }, - "optional": { - "Seed": ( - IO.INT, - { - "default":0, - "min":0, - "max":65535, - "display":"number" - } + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="Rodin3D_Gen2", + display_name="Rodin 3D Generate - Gen-2 Generate", + category="api node/3d/Rodin", + description=cleandoc(cls.__doc__ or ""), + inputs=[ + IO.Image.Input("Images"), + IO.Int.Input( + "Seed", + default=0, + min=0, + max=65535, + display_mode=IO.NumberDisplay.number, + optional=True, ), - "Material_Type": ( - IO.COMBO, - { - "options": ["PBR", "Shaded"], - "default": "PBR" - } + IO.Combo.Input("Material_Type", options=["PBR", "Shaded"], default="PBR", optional=True), + IO.Combo.Input( + "Polygon_count", + options=["4K-Quad", "8K-Quad", "18K-Quad", "50K-Quad", "2K-Triangle", "20K-Triangle", "150K-Triangle", "500K-Triangle"], + default="500K-Triangle", + optional=True, ), - "Polygon_count": ( - IO.COMBO, - { - "options": ["4K-Quad", "8K-Quad", "18K-Quad", "50K-Quad", "2K-Triangle", "20K-Triangle", "150K-Triangle", "500K-Triangle"], - "default": "500K-Triangle" - } - ), - "TAPose": ( - IO.BOOLEAN, - { - "default": False, - } - ) - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - }, - } + IO.Boolean.Input("TAPose", default=False), + ], + outputs=[IO.String.Output(display_name="3D Model Path")], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) - async def api_call( - self, + @classmethod + async def execute( + cls, Images, Seed, Material_Type, Polygon_count, TAPose, - **kwargs - ): + ) -> IO.NodeOutput: tier = "Gen-2" num_images = Images.shape[0] m_images = [] for i in range(num_images): m_images.append(Images[i]) - mesh_mode, quality_override = self.get_quality_mode(Polygon_count) - task_uuid, subscription_key = await self.create_generate_task(images=m_images, seed=Seed, material=Material_Type, - quality_override=quality_override, tier=tier, mesh_mode=mesh_mode, TAPose=TAPose, - **kwargs) - await self.poll_for_task_status(subscription_key, **kwargs) - download_list = await self.get_rodin_download_list(task_uuid, **kwargs) - model = await self.download_files(download_list) + mesh_mode, quality_override = get_quality_mode(Polygon_count) + task_uuid, subscription_key = await create_generate_task( + cls, + images=m_images, + seed=Seed, + material=Material_Type, + quality_override=quality_override, + tier=tier, + mesh_mode=mesh_mode, + ta_pose=TAPose, + ) + await poll_for_task_status(subscription_key, cls) + download_list = await get_rodin_download_list(task_uuid, cls) + model = await download_files(download_list, task_uuid) - return (model,) + return IO.NodeOutput(model) -# A dictionary that contains all nodes you want to export with their names -# NOTE: names should be globally unique -NODE_CLASS_MAPPINGS = { - "Rodin3D_Regular": Rodin3D_Regular, - "Rodin3D_Detail": Rodin3D_Detail, - "Rodin3D_Smooth": Rodin3D_Smooth, - "Rodin3D_Sketch": Rodin3D_Sketch, - "Rodin3D_Gen2": Rodin3D_Gen2, -} -# A dictionary that contains the friendly/humanly readable titles for the nodes -NODE_DISPLAY_NAME_MAPPINGS = { - "Rodin3D_Regular": "Rodin 3D Generate - Regular Generate", - "Rodin3D_Detail": "Rodin 3D Generate - Detail Generate", - "Rodin3D_Smooth": "Rodin 3D Generate - Smooth Generate", - "Rodin3D_Sketch": "Rodin 3D Generate - Sketch Generate", - "Rodin3D_Gen2": "Rodin 3D Generate - Gen-2 Generate", -} +class Rodin3DExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + Rodin3D_Regular, + Rodin3D_Detail, + Rodin3D_Smooth, + Rodin3D_Sketch, + Rodin3D_Gen2, + ] + + +async def comfy_entrypoint() -> Rodin3DExtension: + return Rodin3DExtension() diff --git a/comfy_api_nodes/nodes_runway.py b/comfy_api_nodes/nodes_runway.py index 27b2bf748..2fdafbbfe 100644 --- a/comfy_api_nodes/nodes_runway.py +++ b/comfy_api_nodes/nodes_runway.py @@ -11,7 +11,7 @@ User Guides: """ -from typing import Union, Optional, Any +from typing import Union, Optional from typing_extensions import override from enum import Enum @@ -21,7 +21,6 @@ from comfy_api_nodes.apis import ( RunwayImageToVideoRequest, RunwayImageToVideoResponse, RunwayTaskStatusResponse as TaskStatusResponse, - RunwayTaskStatusEnum as TaskStatus, RunwayModelEnum as Model, RunwayDurationEnum as Duration, RunwayAspectRatioEnum as AspectRatio, @@ -33,23 +32,20 @@ from comfy_api_nodes.apis import ( ReferenceImage, RunwayTextToImageAspectRatioEnum, ) -from comfy_api_nodes.apis.client import ( - ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, - EmptyRequest, -) -from comfy_api_nodes.apinode_utils import ( - upload_images_to_comfyapi, - download_url_to_video_output, +from comfy_api_nodes.util import ( image_tensor_pair_to_batch, validate_string, + validate_image_dimensions, + validate_image_aspect_ratio, + upload_images_to_comfyapi, + download_url_to_video_output, download_url_to_image_tensor, + ApiEndpoint, + sync_op, + poll_op, ) from comfy_api.input_impl import VideoFromFile -from comfy_api.latest import ComfyExtension, io as comfy_io -from comfy_api_nodes.util.validation_utils import validate_image_dimensions, validate_image_aspect_ratio +from comfy_api.latest import ComfyExtension, IO PATH_IMAGE_TO_VIDEO = "/proxy/runway/image_to_video" PATH_TEXT_TO_IMAGE = "/proxy/runway/text_to_image" @@ -91,31 +87,6 @@ def get_video_url_from_task_status(response: TaskStatusResponse) -> Union[str, N return None -async def poll_until_finished( - auth_kwargs: dict[str, str], - api_endpoint: ApiEndpoint[Any, TaskStatusResponse], - estimated_duration: Optional[int] = None, - node_id: Optional[str] = None, -) -> TaskStatusResponse: - """Polls the Runway API endpoint until the task reaches a terminal state, then returns the response.""" - return await PollingOperation( - poll_endpoint=api_endpoint, - completed_statuses=[ - TaskStatus.SUCCEEDED.value, - ], - failed_statuses=[ - TaskStatus.FAILED.value, - TaskStatus.CANCELLED.value, - ], - status_extractor=lambda response: response.status.value, - auth_kwargs=auth_kwargs, - result_url_extractor=get_video_url_from_task_status, - estimated_duration=estimated_duration, - node_id=node_id, - progress_extractor=extract_progress_from_task_status, - ).execute() - - def extract_progress_from_task_status( response: TaskStatusResponse, ) -> Union[float, None]: @@ -132,42 +103,32 @@ def get_image_url_from_task_status(response: TaskStatusResponse) -> Union[str, N async def get_response( - task_id: str, auth_kwargs: dict[str, str], node_id: Optional[str] = None, estimated_duration: Optional[int] = None + cls: type[IO.ComfyNode], task_id: str, estimated_duration: Optional[int] = None ) -> TaskStatusResponse: """Poll the task status until it is finished then get the response.""" - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=f"{PATH_GET_TASK_STATUS}/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=TaskStatusResponse, - ), + return await poll_op( + cls, + ApiEndpoint(path=f"{PATH_GET_TASK_STATUS}/{task_id}"), + response_model=TaskStatusResponse, + status_extractor=lambda r: r.status.value, estimated_duration=estimated_duration, - node_id=node_id, + progress_extractor=extract_progress_from_task_status, ) async def generate_video( + cls: type[IO.ComfyNode], request: RunwayImageToVideoRequest, - auth_kwargs: dict[str, str], - node_id: Optional[str] = None, estimated_duration: Optional[int] = None, ) -> VideoFromFile: - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_IMAGE_TO_VIDEO, - method=HttpMethod.POST, - request_model=RunwayImageToVideoRequest, - response_model=RunwayImageToVideoResponse, - ), - request=request, - auth_kwargs=auth_kwargs, + initial_response = await sync_op( + cls, + endpoint=ApiEndpoint(path=PATH_IMAGE_TO_VIDEO, method="POST"), + response_model=RunwayImageToVideoResponse, + data=request, ) - initial_response = await initial_operation.execute() - - final_response = await get_response(initial_response.id, auth_kwargs, node_id, estimated_duration) + final_response = await get_response(cls, initial_response.id, estimated_duration) if not final_response.output: raise RunwayApiError("Runway task succeeded but no video data found in response.") @@ -175,55 +136,55 @@ async def generate_video( return await download_url_to_video_output(video_url) -class RunwayImageToVideoNodeGen3a(comfy_io.ComfyNode): +class RunwayImageToVideoNodeGen3a(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="RunwayImageToVideoNodeGen3a", display_name="Runway Image to Video (Gen3a Turbo)", category="api node/video/Runway", description="Generate a video from a single starting frame using Gen3a Turbo model. " - "Before diving in, review these best practices to ensure that " - "your input selections will set your generation up for success: " - "https://help.runwayml.com/hc/en-us/articles/33927968552339-Creating-with-Act-One-on-Gen-3-Alpha-and-Turbo.", + "Before diving in, review these best practices to ensure that " + "your input selections will set your generation up for success: " + "https://help.runwayml.com/hc/en-us/articles/33927968552339-Creating-with-Act-One-on-Gen-3-Alpha-and-Turbo.", inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Text prompt for the generation", ), - comfy_io.Image.Input( + IO.Image.Input( "start_frame", tooltip="Start frame to be used for the video", ), - comfy_io.Combo.Input( + IO.Combo.Input( "duration", - options=[model.value for model in Duration], + options=Duration, ), - comfy_io.Combo.Input( + IO.Combo.Input( "ratio", - options=[model.value for model in RunwayGen3aAspectRatio], + options=RunwayGen3aAspectRatio, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=4294967295, step=1, control_after_generate=True, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Random seed for generation", ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -236,25 +197,21 @@ class RunwayImageToVideoNodeGen3a(comfy_io.ComfyNode): duration: str, ratio: str, seed: int, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, min_length=1) validate_image_dimensions(start_frame, max_width=7999, max_height=7999) - validate_image_aspect_ratio(start_frame, min_aspect_ratio=0.5, max_aspect_ratio=2.0) - - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } + validate_image_aspect_ratio(start_frame, (1, 2), (2, 1)) download_urls = await upload_images_to_comfyapi( + cls, start_frame, max_images=1, mime_type="image/png", - auth_kwargs=auth_kwargs, ) - return comfy_io.NodeOutput( + return IO.NodeOutput( await generate_video( + cls, RunwayImageToVideoRequest( promptText=prompt, seed=seed, @@ -262,68 +219,62 @@ class RunwayImageToVideoNodeGen3a(comfy_io.ComfyNode): duration=Duration(duration), ratio=AspectRatio(ratio), promptImage=RunwayPromptImageObject( - root=[ - RunwayPromptImageDetailedObject( - uri=str(download_urls[0]), position="first" - ) - ] + root=[RunwayPromptImageDetailedObject(uri=str(download_urls[0]), position="first")] ), ), - auth_kwargs=auth_kwargs, - node_id=cls.hidden.unique_id, ) ) -class RunwayImageToVideoNodeGen4(comfy_io.ComfyNode): +class RunwayImageToVideoNodeGen4(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="RunwayImageToVideoNodeGen4", display_name="Runway Image to Video (Gen4 Turbo)", category="api node/video/Runway", description="Generate a video from a single starting frame using Gen4 Turbo model. " - "Before diving in, review these best practices to ensure that " - "your input selections will set your generation up for success: " - "https://help.runwayml.com/hc/en-us/articles/37327109429011-Creating-with-Gen-4-Video.", + "Before diving in, review these best practices to ensure that " + "your input selections will set your generation up for success: " + "https://help.runwayml.com/hc/en-us/articles/37327109429011-Creating-with-Gen-4-Video.", inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Text prompt for the generation", ), - comfy_io.Image.Input( + IO.Image.Input( "start_frame", tooltip="Start frame to be used for the video", ), - comfy_io.Combo.Input( + IO.Combo.Input( "duration", - options=[model.value for model in Duration], + options=Duration, ), - comfy_io.Combo.Input( + IO.Combo.Input( "ratio", - options=[model.value for model in RunwayGen4TurboAspectRatio], + options=RunwayGen4TurboAspectRatio, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=4294967295, step=1, control_after_generate=True, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Random seed for generation", ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -336,25 +287,21 @@ class RunwayImageToVideoNodeGen4(comfy_io.ComfyNode): duration: str, ratio: str, seed: int, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, min_length=1) validate_image_dimensions(start_frame, max_width=7999, max_height=7999) - validate_image_aspect_ratio(start_frame, min_aspect_ratio=0.5, max_aspect_ratio=2.0) - - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } + validate_image_aspect_ratio(start_frame, (1, 2), (2, 1)) download_urls = await upload_images_to_comfyapi( + cls, start_frame, max_images=1, mime_type="image/png", - auth_kwargs=auth_kwargs, ) - return comfy_io.NodeOutput( + return IO.NodeOutput( await generate_video( + cls, RunwayImageToVideoRequest( promptText=prompt, seed=seed, @@ -362,76 +309,70 @@ class RunwayImageToVideoNodeGen4(comfy_io.ComfyNode): duration=Duration(duration), ratio=AspectRatio(ratio), promptImage=RunwayPromptImageObject( - root=[ - RunwayPromptImageDetailedObject( - uri=str(download_urls[0]), position="first" - ) - ] + root=[RunwayPromptImageDetailedObject(uri=str(download_urls[0]), position="first")] ), ), - auth_kwargs=auth_kwargs, - node_id=cls.hidden.unique_id, estimated_duration=AVERAGE_DURATION_FLF_SECONDS, ) ) -class RunwayFirstLastFrameNode(comfy_io.ComfyNode): +class RunwayFirstLastFrameNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="RunwayFirstLastFrameNode", display_name="Runway First-Last-Frame to Video", category="api node/video/Runway", description="Upload first and last keyframes, draft a prompt, and generate a video. " - "More complex transitions, such as cases where the Last frame is completely different " - "from the First frame, may benefit from the longer 10s duration. " - "This would give the generation more time to smoothly transition between the two inputs. " - "Before diving in, review these best practices to ensure that your input selections " - "will set your generation up for success: " - "https://help.runwayml.com/hc/en-us/articles/34170748696595-Creating-with-Keyframes-on-Gen-3.", + "More complex transitions, such as cases where the Last frame is completely different " + "from the First frame, may benefit from the longer 10s duration. " + "This would give the generation more time to smoothly transition between the two inputs. " + "Before diving in, review these best practices to ensure that your input selections " + "will set your generation up for success: " + "https://help.runwayml.com/hc/en-us/articles/34170748696595-Creating-with-Keyframes-on-Gen-3.", inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Text prompt for the generation", ), - comfy_io.Image.Input( + IO.Image.Input( "start_frame", tooltip="Start frame to be used for the video", ), - comfy_io.Image.Input( + IO.Image.Input( "end_frame", tooltip="End frame to be used for the video. Supported for gen3a_turbo only.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "duration", - options=[model.value for model in Duration], + options=Duration, ), - comfy_io.Combo.Input( + IO.Combo.Input( "ratio", - options=[model.value for model in RunwayGen3aAspectRatio], + options=RunwayGen3aAspectRatio, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=4294967295, step=1, control_after_generate=True, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Random seed for generation", ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -445,30 +386,26 @@ class RunwayFirstLastFrameNode(comfy_io.ComfyNode): duration: str, ratio: str, seed: int, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, min_length=1) validate_image_dimensions(start_frame, max_width=7999, max_height=7999) validate_image_dimensions(end_frame, max_width=7999, max_height=7999) - validate_image_aspect_ratio(start_frame, min_aspect_ratio=0.5, max_aspect_ratio=2.0) - validate_image_aspect_ratio(end_frame, min_aspect_ratio=0.5, max_aspect_ratio=2.0) - - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } + validate_image_aspect_ratio(start_frame, (1, 2), (2, 1)) + validate_image_aspect_ratio(end_frame, (1, 2), (2, 1)) stacked_input_images = image_tensor_pair_to_batch(start_frame, end_frame) download_urls = await upload_images_to_comfyapi( + cls, stacked_input_images, max_images=2, mime_type="image/png", - auth_kwargs=auth_kwargs, ) if len(download_urls) != 2: raise RunwayApiError("Failed to upload one or more images to comfy api.") - return comfy_io.NodeOutput( + return IO.NodeOutput( await generate_video( + cls, RunwayImageToVideoRequest( promptText=prompt, seed=seed, @@ -477,56 +414,50 @@ class RunwayFirstLastFrameNode(comfy_io.ComfyNode): ratio=AspectRatio(ratio), promptImage=RunwayPromptImageObject( root=[ - RunwayPromptImageDetailedObject( - uri=str(download_urls[0]), position="first" - ), - RunwayPromptImageDetailedObject( - uri=str(download_urls[1]), position="last" - ), + RunwayPromptImageDetailedObject(uri=str(download_urls[0]), position="first"), + RunwayPromptImageDetailedObject(uri=str(download_urls[1]), position="last"), ] ), ), - auth_kwargs=auth_kwargs, - node_id=cls.hidden.unique_id, estimated_duration=AVERAGE_DURATION_FLF_SECONDS, ) ) -class RunwayTextToImageNode(comfy_io.ComfyNode): +class RunwayTextToImageNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="RunwayTextToImageNode", display_name="Runway Text to Image", category="api node/image/Runway", description="Generate an image from a text prompt using Runway's Gen 4 model. " - "You can also include reference image to guide the generation.", + "You can also include reference image to guide the generation.", inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Text prompt for the generation", ), - comfy_io.Combo.Input( + IO.Combo.Input( "ratio", options=[model.value for model in RunwayTextToImageAspectRatioEnum], ), - comfy_io.Image.Input( + IO.Image.Input( "reference_image", tooltip="Optional reference image to guide the generation", optional=True, ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -537,63 +468,48 @@ class RunwayTextToImageNode(comfy_io.ComfyNode): prompt: str, ratio: str, reference_image: Optional[torch.Tensor] = None, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, min_length=1) - auth_kwargs = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - # Prepare reference images if provided reference_images = None if reference_image is not None: validate_image_dimensions(reference_image, max_width=7999, max_height=7999) - validate_image_aspect_ratio(reference_image, min_aspect_ratio=0.5, max_aspect_ratio=2.0) + validate_image_aspect_ratio(reference_image, (1, 2), (2, 1)) download_urls = await upload_images_to_comfyapi( + cls, reference_image, max_images=1, mime_type="image/png", - auth_kwargs=auth_kwargs, ) reference_images = [ReferenceImage(uri=str(download_urls[0]))] - request = RunwayTextToImageRequest( - promptText=prompt, - model=Model4.gen4_image, - ratio=ratio, - referenceImages=reference_images, - ) - - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=PATH_TEXT_TO_IMAGE, - method=HttpMethod.POST, - request_model=RunwayTextToImageRequest, - response_model=RunwayTextToImageResponse, + initial_response = await sync_op( + cls, + endpoint=ApiEndpoint(path=PATH_TEXT_TO_IMAGE, method="POST"), + response_model=RunwayTextToImageResponse, + data=RunwayTextToImageRequest( + promptText=prompt, + model=Model4.gen4_image, + ratio=ratio, + referenceImages=reference_images, ), - request=request, - auth_kwargs=auth_kwargs, ) - initial_response = await initial_operation.execute() - - # Poll for completion final_response = await get_response( + cls, initial_response.id, - auth_kwargs=auth_kwargs, - node_id=cls.hidden.unique_id, estimated_duration=AVERAGE_DURATION_T2I_SECONDS, ) if not final_response.output: raise RunwayApiError("Runway task succeeded but no image data found in response.") - return comfy_io.NodeOutput(await download_url_to_image_tensor(get_image_url_from_task_status(final_response))) + return IO.NodeOutput(await download_url_to_image_tensor(get_image_url_from_task_status(final_response))) class RunwayExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ RunwayFirstLastFrameNode, RunwayImageToVideoNodeGen3a, @@ -601,5 +517,6 @@ class RunwayExtension(ComfyExtension): RunwayTextToImageNode, ] + async def comfy_entrypoint() -> RunwayExtension: return RunwayExtension() diff --git a/comfy_api_nodes/nodes_sora.py b/comfy_api_nodes/nodes_sora.py new file mode 100644 index 000000000..92b225d40 --- /dev/null +++ b/comfy_api_nodes/nodes_sora.py @@ -0,0 +1,151 @@ +from typing import Optional + +import torch +from pydantic import BaseModel, Field +from typing_extensions import override + +from comfy_api.latest import IO, ComfyExtension +from comfy_api_nodes.util import ( + ApiEndpoint, + download_url_to_video_output, + get_number_of_images, + poll_op, + sync_op, + tensor_to_bytesio, +) + + +class Sora2GenerationRequest(BaseModel): + prompt: str = Field(...) + model: str = Field(...) + seconds: str = Field(...) + size: str = Field(...) + + +class Sora2GenerationResponse(BaseModel): + id: str = Field(...) + error: Optional[dict] = Field(None) + status: Optional[str] = Field(None) + + +class OpenAIVideoSora2(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="OpenAIVideoSora2", + display_name="OpenAI Sora - Video", + category="api node/video/Sora", + description="OpenAI video and audio generation.", + inputs=[ + IO.Combo.Input( + "model", + options=["sora-2", "sora-2-pro"], + default="sora-2", + ), + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Guiding text; may be empty if an input image is present.", + ), + IO.Combo.Input( + "size", + options=[ + "720x1280", + "1280x720", + "1024x1792", + "1792x1024", + ], + default="1280x720", + ), + IO.Combo.Input( + "duration", + options=[4, 8, 12], + default=8, + ), + IO.Image.Input( + "image", + optional=True, + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + optional=True, + tooltip="Seed to determine if node should re-run; " + "actual results are nondeterministic regardless of seed.", + ), + ], + outputs=[ + IO.Video.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + prompt: str, + size: str = "1280x720", + duration: int = 8, + seed: int = 0, + image: Optional[torch.Tensor] = None, + ): + if model == "sora-2" and size not in ("720x1280", "1280x720"): + raise ValueError("Invalid size for sora-2 model, only 720x1280 and 1280x720 are supported.") + files_input = None + if image is not None: + if get_number_of_images(image) != 1: + raise ValueError("Currently only one input image is supported.") + files_input = {"input_reference": ("image.png", tensor_to_bytesio(image), "image/png")} + initial_response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/openai/v1/videos", method="POST"), + data=Sora2GenerationRequest( + model=model, + prompt=prompt, + seconds=str(duration), + size=size, + ), + files=files_input, + response_model=Sora2GenerationResponse, + content_type="multipart/form-data", + ) + if initial_response.error: + raise Exception(initial_response.error["message"]) + + model_time_multiplier = 1 if model == "sora-2" else 2 + await poll_op( + cls, + poll_endpoint=ApiEndpoint(path=f"/proxy/openai/v1/videos/{initial_response.id}"), + response_model=Sora2GenerationResponse, + status_extractor=lambda x: x.status, + poll_interval=8.0, + max_poll_attempts=160, + estimated_duration=int(45 * (duration / 4) * model_time_multiplier), + ) + return IO.NodeOutput( + await download_url_to_video_output(f"/proxy/openai/v1/videos/{initial_response.id}/content", cls=cls), + ) + + +class OpenAISoraExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + OpenAIVideoSora2, + ] + + +async def comfy_entrypoint() -> OpenAISoraExtension: + return OpenAISoraExtension() diff --git a/comfy_api_nodes/nodes_stability.py b/comfy_api_nodes/nodes_stability.py index 5ba5ed986..bb7ceed78 100644 --- a/comfy_api_nodes/nodes_stability.py +++ b/comfy_api_nodes/nodes_stability.py @@ -2,7 +2,7 @@ from inspect import cleandoc from typing import Optional from typing_extensions import override -from comfy_api.latest import ComfyExtension, Input, io as comfy_io +from comfy_api.latest import ComfyExtension, Input, IO from comfy_api_nodes.apis.stability_api import ( StabilityUpscaleConservativeRequest, StabilityUpscaleCreativeRequest, @@ -20,21 +20,17 @@ from comfy_api_nodes.apis.stability_api import ( StabilityAudioInpaintRequest, StabilityAudioResponse, ) -from comfy_api_nodes.apis.client import ( - ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, - EmptyRequest, -) -from comfy_api_nodes.apinode_utils import ( +from comfy_api_nodes.util import ( + validate_audio_duration, + validate_string, + audio_input_to_mp3, bytesio_to_image_tensor, tensor_to_bytesio, - validate_string, audio_bytes_to_audio_input, - audio_input_to_mp3, + sync_op, + poll_op, + ApiEndpoint, ) -from comfy_api_nodes.util.validation_utils import validate_audio_duration import torch import base64 @@ -56,20 +52,20 @@ def get_async_dummy_status(x: StabilityResultsGetResponse): return StabilityPollStatus.in_progress -class StabilityStableImageUltraNode(comfy_io.ComfyNode): +class StabilityStableImageUltraNode(IO.ComfyNode): """ Generates images synchronously based on prompt and resolution. """ @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="StabilityStableImageUltraNode", display_name="Stability AI Stable Image Ultra", category="api node/image/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", @@ -80,39 +76,39 @@ class StabilityStableImageUltraNode(comfy_io.ComfyNode): "is a value between 0 and 1. For example: `The sky was a crisp (blue:0.3) and (green:0.8)`" + "would convey a sky that was blue and green, but more green than blue.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", - options=[x.value for x in StabilityAspectRatio], - default=StabilityAspectRatio.ratio_1_1.value, + options=StabilityAspectRatio, + default=StabilityAspectRatio.ratio_1_1, tooltip="Aspect ratio of generated image.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "style_preset", options=get_stability_style_presets(), tooltip="Optional desired style of generated image.", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=4294967294, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="The random seed used for creating the noise.", ), - comfy_io.Image.Input( + IO.Image.Input( "image", optional=True, ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", default="", tooltip="A blurb of text describing what you do not wish to see in the output image. This is an advanced feature.", force_input=True, optional=True, ), - comfy_io.Float.Input( + IO.Float.Input( "image_denoise", default=0.5, min=0.0, @@ -123,12 +119,12 @@ class StabilityStableImageUltraNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -143,7 +139,7 @@ class StabilityStableImageUltraNode(comfy_io.ComfyNode): image: Optional[torch.Tensor] = None, negative_prompt: str = "", image_denoise: Optional[float] = 0.5, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) # prepare image binary if image present image_binary = None @@ -161,19 +157,11 @@ class StabilityStableImageUltraNode(comfy_io.ComfyNode): "image": image_binary } - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/stability/v2beta/stable-image/generate/ultra", - method=HttpMethod.POST, - request_model=StabilityStableUltraRequest, - response_model=StabilityStableUltraResponse, - ), - request=StabilityStableUltraRequest( + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/stability/v2beta/stable-image/generate/ultra", method="POST"), + response_model=StabilityStableUltraResponse, + data=StabilityStableUltraRequest( prompt=prompt, negative_prompt=negative_prompt, aspect_ratio=aspect_ratio, @@ -183,9 +171,7 @@ class StabilityStableImageUltraNode(comfy_io.ComfyNode): ), files=files, content_type="multipart/form-data", - auth_kwargs=auth, ) - response_api = await operation.execute() if response_api.finish_reason != "SUCCESS": raise Exception(f"Stable Image Ultra generation failed: {response_api.finish_reason}.") @@ -193,44 +179,44 @@ class StabilityStableImageUltraNode(comfy_io.ComfyNode): image_data = base64.b64decode(response_api.image) returned_image = bytesio_to_image_tensor(BytesIO(image_data)) - return comfy_io.NodeOutput(returned_image) + return IO.NodeOutput(returned_image) -class StabilityStableImageSD_3_5Node(comfy_io.ComfyNode): +class StabilityStableImageSD_3_5Node(IO.ComfyNode): """ Generates images synchronously based on prompt and resolution. """ @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="StabilityStableImageSD_3_5Node", display_name="Stability AI Stable Diffusion 3.5 Image", category="api node/image/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="What you wish to see in the output image. A strong, descriptive prompt that clearly defines elements, colors, and subjects will lead to better results.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "model", - options=[x.value for x in Stability_SD3_5_Model], + options=Stability_SD3_5_Model, ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", - options=[x.value for x in StabilityAspectRatio], - default=StabilityAspectRatio.ratio_1_1.value, + options=StabilityAspectRatio, + default=StabilityAspectRatio.ratio_1_1, tooltip="Aspect ratio of generated image.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "style_preset", options=get_stability_style_presets(), tooltip="Optional desired style of generated image.", ), - comfy_io.Float.Input( + IO.Float.Input( "cfg_scale", default=4.0, min=1.0, @@ -238,28 +224,28 @@ class StabilityStableImageSD_3_5Node(comfy_io.ComfyNode): step=0.1, tooltip="How strictly the diffusion process adheres to the prompt text (higher values keep your image closer to your prompt)", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=4294967294, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="The random seed used for creating the noise.", ), - comfy_io.Image.Input( + IO.Image.Input( "image", optional=True, ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", default="", tooltip="Keywords of what you do not wish to see in the output image. This is an advanced feature.", force_input=True, optional=True, ), - comfy_io.Float.Input( + IO.Float.Input( "image_denoise", default=0.5, min=0.0, @@ -270,12 +256,12 @@ class StabilityStableImageSD_3_5Node(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -292,7 +278,7 @@ class StabilityStableImageSD_3_5Node(comfy_io.ComfyNode): image: Optional[torch.Tensor] = None, negative_prompt: str = "", image_denoise: Optional[float] = 0.5, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) # prepare image binary if image present image_binary = None @@ -313,19 +299,11 @@ class StabilityStableImageSD_3_5Node(comfy_io.ComfyNode): "image": image_binary } - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/stability/v2beta/stable-image/generate/sd3", - method=HttpMethod.POST, - request_model=StabilityStable3_5Request, - response_model=StabilityStableUltraResponse, - ), - request=StabilityStable3_5Request( + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/stability/v2beta/stable-image/generate/sd3", method="POST"), + response_model=StabilityStableUltraResponse, + data=StabilityStable3_5Request( prompt=prompt, negative_prompt=negative_prompt, aspect_ratio=aspect_ratio, @@ -338,9 +316,7 @@ class StabilityStableImageSD_3_5Node(comfy_io.ComfyNode): ), files=files, content_type="multipart/form-data", - auth_kwargs=auth, ) - response_api = await operation.execute() if response_api.finish_reason != "SUCCESS": raise Exception(f"Stable Diffusion 3.5 Image generation failed: {response_api.finish_reason}.") @@ -348,30 +324,30 @@ class StabilityStableImageSD_3_5Node(comfy_io.ComfyNode): image_data = base64.b64decode(response_api.image) returned_image = bytesio_to_image_tensor(BytesIO(image_data)) - return comfy_io.NodeOutput(returned_image) + return IO.NodeOutput(returned_image) -class StabilityUpscaleConservativeNode(comfy_io.ComfyNode): +class StabilityUpscaleConservativeNode(IO.ComfyNode): """ Upscale image with minimal alterations to 4K resolution. """ @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="StabilityUpscaleConservativeNode", display_name="Stability AI Upscale Conservative", category="api node/image/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Image.Input("image"), - comfy_io.String.Input( + IO.Image.Input("image"), + IO.String.Input( "prompt", multiline=True, default="", tooltip="What you wish to see in the output image. A strong, descriptive prompt that clearly defines elements, colors, and subjects will lead to better results.", ), - comfy_io.Float.Input( + IO.Float.Input( "creativity", default=0.35, min=0.2, @@ -379,17 +355,17 @@ class StabilityUpscaleConservativeNode(comfy_io.ComfyNode): step=0.01, tooltip="Controls the likelihood of creating additional details not heavily conditioned by the init image.", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=4294967294, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="The random seed used for creating the noise.", ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", default="", tooltip="Keywords of what you do not wish to see in the output image. This is an advanced feature.", @@ -398,12 +374,12 @@ class StabilityUpscaleConservativeNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -416,7 +392,7 @@ class StabilityUpscaleConservativeNode(comfy_io.ComfyNode): creativity: float, seed: int, negative_prompt: str = "", - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) image_binary = tensor_to_bytesio(image, total_pixels=1024*1024).read() @@ -427,19 +403,11 @@ class StabilityUpscaleConservativeNode(comfy_io.ComfyNode): "image": image_binary } - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/stability/v2beta/stable-image/upscale/conservative", - method=HttpMethod.POST, - request_model=StabilityUpscaleConservativeRequest, - response_model=StabilityStableUltraResponse, - ), - request=StabilityUpscaleConservativeRequest( + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/stability/v2beta/stable-image/upscale/conservative", method="POST"), + response_model=StabilityStableUltraResponse, + data=StabilityUpscaleConservativeRequest( prompt=prompt, negative_prompt=negative_prompt, creativity=round(creativity,2), @@ -447,9 +415,7 @@ class StabilityUpscaleConservativeNode(comfy_io.ComfyNode): ), files=files, content_type="multipart/form-data", - auth_kwargs=auth, ) - response_api = await operation.execute() if response_api.finish_reason != "SUCCESS": raise Exception(f"Stability Upscale Conservative generation failed: {response_api.finish_reason}.") @@ -457,30 +423,30 @@ class StabilityUpscaleConservativeNode(comfy_io.ComfyNode): image_data = base64.b64decode(response_api.image) returned_image = bytesio_to_image_tensor(BytesIO(image_data)) - return comfy_io.NodeOutput(returned_image) + return IO.NodeOutput(returned_image) -class StabilityUpscaleCreativeNode(comfy_io.ComfyNode): +class StabilityUpscaleCreativeNode(IO.ComfyNode): """ Upscale image with minimal alterations to 4K resolution. """ @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="StabilityUpscaleCreativeNode", display_name="Stability AI Upscale Creative", category="api node/image/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Image.Input("image"), - comfy_io.String.Input( + IO.Image.Input("image"), + IO.String.Input( "prompt", multiline=True, default="", tooltip="What you wish to see in the output image. A strong, descriptive prompt that clearly defines elements, colors, and subjects will lead to better results.", ), - comfy_io.Float.Input( + IO.Float.Input( "creativity", default=0.3, min=0.1, @@ -488,22 +454,22 @@ class StabilityUpscaleCreativeNode(comfy_io.ComfyNode): step=0.01, tooltip="Controls the likelihood of creating additional details not heavily conditioned by the init image.", ), - comfy_io.Combo.Input( + IO.Combo.Input( "style_preset", options=get_stability_style_presets(), tooltip="Optional desired style of generated image.", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=4294967294, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="The random seed used for creating the noise.", ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", default="", tooltip="Keywords of what you do not wish to see in the output image. This is an advanced feature.", @@ -512,12 +478,12 @@ class StabilityUpscaleCreativeNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -531,7 +497,7 @@ class StabilityUpscaleCreativeNode(comfy_io.ComfyNode): style_preset: str, seed: int, negative_prompt: str = "", - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, strip_whitespace=False) image_binary = tensor_to_bytesio(image, total_pixels=1024*1024).read() @@ -544,19 +510,11 @@ class StabilityUpscaleCreativeNode(comfy_io.ComfyNode): "image": image_binary } - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/stability/v2beta/stable-image/upscale/creative", - method=HttpMethod.POST, - request_model=StabilityUpscaleCreativeRequest, - response_model=StabilityAsyncResponse, - ), - request=StabilityUpscaleCreativeRequest( + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/stability/v2beta/stable-image/upscale/creative", method="POST"), + response_model=StabilityAsyncResponse, + data=StabilityUpscaleCreativeRequest( prompt=prompt, negative_prompt=negative_prompt, creativity=round(creativity,2), @@ -565,25 +523,15 @@ class StabilityUpscaleCreativeNode(comfy_io.ComfyNode): ), files=files, content_type="multipart/form-data", - auth_kwargs=auth, ) - response_api = await operation.execute() - operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"/proxy/stability/v2beta/results/{response_api.id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=StabilityResultsGetResponse, - ), + response_poll = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/stability/v2beta/results/{response_api.id}"), + response_model=StabilityResultsGetResponse, poll_interval=3, - completed_statuses=[StabilityPollStatus.finished], - failed_statuses=[StabilityPollStatus.failed], status_extractor=lambda x: get_async_dummy_status(x), - auth_kwargs=auth, - node_id=cls.hidden.unique_id, ) - response_poll: StabilityResultsGetResponse = await operation.execute() if response_poll.finish_reason != "SUCCESS": raise Exception(f"Stability Upscale Creative generation failed: {response_poll.finish_reason}.") @@ -591,61 +539,50 @@ class StabilityUpscaleCreativeNode(comfy_io.ComfyNode): image_data = base64.b64decode(response_poll.result) returned_image = bytesio_to_image_tensor(BytesIO(image_data)) - return comfy_io.NodeOutput(returned_image) + return IO.NodeOutput(returned_image) -class StabilityUpscaleFastNode(comfy_io.ComfyNode): +class StabilityUpscaleFastNode(IO.ComfyNode): """ Quickly upscales an image via Stability API call to 4x its original size; intended for upscaling low-quality/compressed images. """ @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="StabilityUpscaleFastNode", display_name="Stability AI Upscale Fast", category="api node/image/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Image.Input("image"), + IO.Image.Input("image"), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @classmethod - async def execute(cls, image: torch.Tensor) -> comfy_io.NodeOutput: + async def execute(cls, image: torch.Tensor) -> IO.NodeOutput: image_binary = tensor_to_bytesio(image, total_pixels=4096*4096).read() files = { "image": image_binary } - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/stability/v2beta/stable-image/upscale/fast", - method=HttpMethod.POST, - request_model=EmptyRequest, - response_model=StabilityStableUltraResponse, - ), - request=EmptyRequest(), + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/stability/v2beta/stable-image/upscale/fast", method="POST"), + response_model=StabilityStableUltraResponse, files=files, content_type="multipart/form-data", - auth_kwargs=auth, ) - response_api = await operation.execute() if response_api.finish_reason != "SUCCESS": raise Exception(f"Stability Upscale Fast failed: {response_api.finish_reason}.") @@ -653,26 +590,26 @@ class StabilityUpscaleFastNode(comfy_io.ComfyNode): image_data = base64.b64decode(response_api.image) returned_image = bytesio_to_image_tensor(BytesIO(image_data)) - return comfy_io.NodeOutput(returned_image) + return IO.NodeOutput(returned_image) -class StabilityTextToAudio(comfy_io.ComfyNode): +class StabilityTextToAudio(IO.ComfyNode): """Generates high-quality music and sound effects from text descriptions.""" @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="StabilityTextToAudio", display_name="Stability AI Text To Audio", category="api node/audio/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["stable-audio-2.5"], ), - comfy_io.String.Input("prompt", multiline=True, default=""), - comfy_io.Int.Input( + IO.String.Input("prompt", multiline=True, default=""), + IO.Int.Input( "duration", default=190, min=1, @@ -681,18 +618,18 @@ class StabilityTextToAudio(comfy_io.ComfyNode): tooltip="Controls the duration in seconds of the generated audio.", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=4294967294, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="The random seed used for generation.", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "steps", default=8, min=4, @@ -703,58 +640,50 @@ class StabilityTextToAudio(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Audio.Output(), + IO.Audio.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @classmethod - async def execute(cls, model: str, prompt: str, duration: int, seed: int, steps: int) -> comfy_io.NodeOutput: + async def execute(cls, model: str, prompt: str, duration: int, seed: int, steps: int) -> IO.NodeOutput: validate_string(prompt, max_length=10000) payload = StabilityTextToAudioRequest(prompt=prompt, model=model, duration=duration, seed=seed, steps=steps) - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/stability/v2beta/audio/stable-audio-2/text-to-audio", - method=HttpMethod.POST, - request_model=StabilityTextToAudioRequest, - response_model=StabilityAudioResponse, - ), - request=payload, + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/stability/v2beta/audio/stable-audio-2/text-to-audio", method="POST"), + response_model=StabilityAudioResponse, + data=payload, content_type="multipart/form-data", - auth_kwargs= { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, ) - response_api = await operation.execute() if not response_api.audio: raise ValueError("No audio file was received in response.") - return comfy_io.NodeOutput(audio_bytes_to_audio_input(base64.b64decode(response_api.audio))) + return IO.NodeOutput(audio_bytes_to_audio_input(base64.b64decode(response_api.audio))) -class StabilityAudioToAudio(comfy_io.ComfyNode): +class StabilityAudioToAudio(IO.ComfyNode): """Transforms existing audio samples into new high-quality compositions using text instructions.""" @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="StabilityAudioToAudio", display_name="Stability AI Audio To Audio", category="api node/audio/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["stable-audio-2.5"], ), - comfy_io.String.Input("prompt", multiline=True, default=""), - comfy_io.Audio.Input("audio", tooltip="Audio must be between 6 and 190 seconds long."), - comfy_io.Int.Input( + IO.String.Input("prompt", multiline=True, default=""), + IO.Audio.Input("audio", tooltip="Audio must be between 6 and 190 seconds long."), + IO.Int.Input( "duration", default=190, min=1, @@ -763,18 +692,18 @@ class StabilityAudioToAudio(comfy_io.ComfyNode): tooltip="Controls the duration in seconds of the generated audio.", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=4294967294, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="The random seed used for generation.", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "steps", default=8, min=4, @@ -783,24 +712,24 @@ class StabilityAudioToAudio(comfy_io.ComfyNode): tooltip="Controls the number of sampling steps.", optional=True, ), - comfy_io.Float.Input( + IO.Float.Input( "strength", default=1, min=0.01, max=1.0, step=0.01, - display_mode=comfy_io.NumberDisplay.slider, + display_mode=IO.NumberDisplay.slider, tooltip="Parameter controls how much influence the audio parameter has on the generated audio.", optional=True, ), ], outputs=[ - comfy_io.Audio.Output(), + IO.Audio.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -808,51 +737,43 @@ class StabilityAudioToAudio(comfy_io.ComfyNode): @classmethod async def execute( cls, model: str, prompt: str, audio: Input.Audio, duration: int, seed: int, steps: int, strength: float - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, max_length=10000) validate_audio_duration(audio, 6, 190) payload = StabilityAudioToAudioRequest( prompt=prompt, model=model, duration=duration, seed=seed, steps=steps, strength=strength ) - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/stability/v2beta/audio/stable-audio-2/audio-to-audio", - method=HttpMethod.POST, - request_model=StabilityAudioToAudioRequest, - response_model=StabilityAudioResponse, - ), - request=payload, + response_api = await sync_op( + cls, + ApiEndpoint(path="/proxy/stability/v2beta/audio/stable-audio-2/audio-to-audio", method="POST"), + response_model=StabilityAudioResponse, + data=payload, content_type="multipart/form-data", files={"audio": audio_input_to_mp3(audio)}, - auth_kwargs= { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, ) - response_api = await operation.execute() if not response_api.audio: raise ValueError("No audio file was received in response.") - return comfy_io.NodeOutput(audio_bytes_to_audio_input(base64.b64decode(response_api.audio))) + return IO.NodeOutput(audio_bytes_to_audio_input(base64.b64decode(response_api.audio))) -class StabilityAudioInpaint(comfy_io.ComfyNode): +class StabilityAudioInpaint(IO.ComfyNode): """Transforms part of existing audio sample using text instructions.""" @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="StabilityAudioInpaint", display_name="Stability AI Audio Inpaint", category="api node/audio/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["stable-audio-2.5"], ), - comfy_io.String.Input("prompt", multiline=True, default=""), - comfy_io.Audio.Input("audio", tooltip="Audio must be between 6 and 190 seconds long."), - comfy_io.Int.Input( + IO.String.Input("prompt", multiline=True, default=""), + IO.Audio.Input("audio", tooltip="Audio must be between 6 and 190 seconds long."), + IO.Int.Input( "duration", default=190, min=1, @@ -861,18 +782,18 @@ class StabilityAudioInpaint(comfy_io.ComfyNode): tooltip="Controls the duration in seconds of the generated audio.", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=4294967294, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="The random seed used for generation.", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "steps", default=8, min=4, @@ -881,7 +802,7 @@ class StabilityAudioInpaint(comfy_io.ComfyNode): tooltip="Controls the number of sampling steps.", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "mask_start", default=30, min=0, @@ -889,7 +810,7 @@ class StabilityAudioInpaint(comfy_io.ComfyNode): step=1, optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "mask_end", default=190, min=0, @@ -899,12 +820,12 @@ class StabilityAudioInpaint(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Audio.Output(), + IO.Audio.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -920,7 +841,7 @@ class StabilityAudioInpaint(comfy_io.ComfyNode): steps: int, mask_start: int, mask_end: int, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: validate_string(prompt, max_length=10000) if mask_end <= mask_start: raise ValueError(f"Value of mask_end({mask_end}) should be greater then mask_start({mask_start})") @@ -935,30 +856,22 @@ class StabilityAudioInpaint(comfy_io.ComfyNode): mask_start=mask_start, mask_end=mask_end, ) - operation = SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/stability/v2beta/audio/stable-audio-2/inpaint", - method=HttpMethod.POST, - request_model=StabilityAudioInpaintRequest, - response_model=StabilityAudioResponse, - ), - request=payload, + response_api = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/stability/v2beta/audio/stable-audio-2/inpaint", method="POST"), + response_model=StabilityAudioResponse, + data=payload, content_type="multipart/form-data", files={"audio": audio_input_to_mp3(audio)}, - auth_kwargs={ - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, ) - response_api = await operation.execute() if not response_api.audio: raise ValueError("No audio file was received in response.") - return comfy_io.NodeOutput(audio_bytes_to_audio_input(base64.b64decode(response_api.audio))) + return IO.NodeOutput(audio_bytes_to_audio_input(base64.b64decode(response_api.audio))) class StabilityExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ StabilityStableImageUltraNode, StabilityStableImageSD_3_5Node, diff --git a/comfy_api_nodes/nodes_tripo.py b/comfy_api_nodes/nodes_tripo.py index d08cf9007..697100ff2 100644 --- a/comfy_api_nodes/nodes_tripo.py +++ b/comfy_api_nodes/nodes_tripo.py @@ -1,46 +1,39 @@ import os -from folder_paths import get_output_directory -from comfy_api_nodes.mapper_utils import model_field_to_node_input -from comfy.comfy_types.node_typing import IO -from comfy_api_nodes.apis import ( - TripoOrientation, - TripoModelVersion, -) +from typing import Optional + +import torch +from typing_extensions import override + +from comfy_api.latest import IO, ComfyExtension from comfy_api_nodes.apis.tripo_api import ( - TripoTaskType, - TripoStyle, - TripoFileReference, + TripoAnimateRetargetRequest, + TripoAnimateRigRequest, + TripoConvertModelRequest, TripoFileEmptyReference, - TripoUrlReference, + TripoFileReference, + TripoImageToModelRequest, + TripoModelVersion, + TripoMultiviewToModelRequest, + TripoOrientation, + TripoRefineModelRequest, + TripoStyle, TripoTaskResponse, TripoTaskStatus, + TripoTaskType, TripoTextToModelRequest, - TripoImageToModelRequest, - TripoMultiviewToModelRequest, TripoTextureModelRequest, - TripoRefineModelRequest, - TripoAnimateRigRequest, - TripoAnimateRetargetRequest, - TripoConvertModelRequest, + TripoUrlReference, ) - -from comfy_api_nodes.apis.client import ( +from comfy_api_nodes.util import ( ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, - EmptyRequest, -) -from comfy_api_nodes.apinode_utils import ( + download_url_as_bytesio, + poll_op, + sync_op, upload_images_to_comfyapi, - download_url_to_bytesio, ) +from folder_paths import get_output_directory -async def upload_image_to_tripo(image, **kwargs): - urls = await upload_images_to_comfyapi(image, max_images=1, auth_kwargs=kwargs) - return TripoFileReference(TripoUrlReference(url=urls[0], type="jpeg")) - def get_model_url_from_response(response: TripoTaskResponse) -> str: if response.data is not None: for key in ["pbr_model", "model", "base_model"]: @@ -50,20 +43,18 @@ def get_model_url_from_response(response: TripoTaskResponse) -> str: async def poll_until_finished( - kwargs: dict[str, str], + node_cls: type[IO.ComfyNode], response: TripoTaskResponse, -) -> tuple[str, str]: + average_duration: Optional[int] = None, +) -> IO.NodeOutput: """Polls the Tripo API endpoint until the task reaches a terminal state, then returns the response.""" if response.code != 0: raise RuntimeError(f"Failed to generate mesh: {response.error}") task_id = response.data.task_id - response_poll = await PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"/proxy/tripo/v2/openapi/task/{task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=TripoTaskResponse, - ), + response_poll = await poll_op( + node_cls, + poll_endpoint=ApiEndpoint(path=f"/proxy/tripo/v2/openapi/task/{task_id}"), + response_model=TripoTaskResponse, completed_statuses=[TripoTaskStatus.SUCCESS], failed_statuses=[ TripoTaskStatus.FAILED, @@ -73,72 +64,84 @@ async def poll_until_finished( TripoTaskStatus.EXPIRED, ], status_extractor=lambda x: x.data.status, - auth_kwargs=kwargs, - node_id=kwargs["unique_id"], - result_url_extractor=get_model_url_from_response, progress_extractor=lambda x: x.data.progress, - ).execute() + estimated_duration=average_duration, + ) if response_poll.data.status == TripoTaskStatus.SUCCESS: url = get_model_url_from_response(response_poll) - bytesio = await download_url_to_bytesio(url) + bytesio = await download_url_as_bytesio(url) # Save the downloaded model file model_file = f"tripo_model_{task_id}.glb" with open(os.path.join(get_output_directory(), model_file), "wb") as f: f.write(bytesio.getvalue()) - return model_file, task_id + return IO.NodeOutput(model_file, task_id) raise RuntimeError(f"Failed to generate mesh: {response_poll}") -class TripoTextToModelNode: +class TripoTextToModelNode(IO.ComfyNode): """ Generates 3D models synchronously based on a text prompt using Tripo's API. """ - AVERAGE_DURATION = 80 + @classmethod - def INPUT_TYPES(s): - return { - "required": { - "prompt": ("STRING", {"multiline": True}), - }, - "optional": { - "negative_prompt": ("STRING", {"multiline": True}), - "model_version": model_field_to_node_input(IO.COMBO, TripoTextToModelRequest, "model_version", enum_type=TripoModelVersion), - "style": model_field_to_node_input(IO.COMBO, TripoTextToModelRequest, "style", enum_type=TripoStyle, default="None"), - "texture": ("BOOLEAN", {"default": True}), - "pbr": ("BOOLEAN", {"default": True}), - "image_seed": ("INT", {"default": 42}), - "model_seed": ("INT", {"default": 42}), - "texture_seed": ("INT", {"default": 42}), - "texture_quality": (["standard", "detailed"], {"default": "standard"}), - "face_limit": ("INT", {"min": -1, "max": 500000, "default": -1}), - "quad": ("BOOLEAN", {"default": False}) - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + def define_schema(cls): + return IO.Schema( + node_id="TripoTextToModelNode", + display_name="Tripo: Text to Model", + category="api node/3d/Tripo", + inputs=[ + IO.String.Input("prompt", multiline=True), + IO.String.Input("negative_prompt", multiline=True, optional=True), + IO.Combo.Input( + "model_version", options=TripoModelVersion, default=TripoModelVersion.v2_5_20250123, optional=True + ), + IO.Combo.Input("style", options=TripoStyle, default="None", optional=True), + IO.Boolean.Input("texture", default=True, optional=True), + IO.Boolean.Input("pbr", default=True, optional=True), + IO.Int.Input("image_seed", default=42, optional=True), + IO.Int.Input("model_seed", default=42, optional=True), + IO.Int.Input("texture_seed", default=42, optional=True), + IO.Combo.Input("texture_quality", default="standard", options=["standard", "detailed"], optional=True), + IO.Int.Input("face_limit", default=-1, min=-1, max=500000, optional=True), + IO.Boolean.Input("quad", default=False, optional=True), + ], + outputs=[ + IO.String.Output(display_name="model_file"), + IO.Custom("MODEL_TASK_ID").Output(display_name="model task_id"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + is_output_node=True, + ) - RETURN_TYPES = ("STRING", "MODEL_TASK_ID",) - RETURN_NAMES = ("model_file", "model task_id") - FUNCTION = "generate_mesh" - CATEGORY = "api node/3d/Tripo" - API_NODE = True - OUTPUT_NODE = True - - async def generate_mesh(self, prompt, negative_prompt=None, model_version=None, style=None, texture=None, pbr=None, image_seed=None, model_seed=None, texture_seed=None, texture_quality=None, face_limit=None, quad=None, **kwargs): + @classmethod + async def execute( + cls, + prompt: str, + negative_prompt: Optional[str] = None, + model_version=None, + style: Optional[str] = None, + texture: Optional[bool] = None, + pbr: Optional[bool] = None, + image_seed: Optional[int] = None, + model_seed: Optional[int] = None, + texture_seed: Optional[int] = None, + texture_quality: Optional[str] = None, + face_limit: Optional[int] = None, + quad: Optional[bool] = None, + ) -> IO.NodeOutput: style_enum = None if style == "None" else style if not prompt: raise RuntimeError("Prompt is required") - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/tripo/v2/openapi/task", - method=HttpMethod.POST, - request_model=TripoTextToModelRequest, - response_model=TripoTaskResponse, - ), - request=TripoTextToModelRequest( + response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/tripo/v2/openapi/task", method="POST"), + response_model=TripoTaskResponse, + data=TripoTextToModelRequest( type=TripoTaskType.TEXT_TO_MODEL, prompt=prompt, negative_prompt=negative_prompt if negative_prompt else None, @@ -152,64 +155,89 @@ class TripoTextToModelNode: texture_quality=texture_quality, face_limit=face_limit, auto_size=True, - quad=quad + quad=quad, ), - auth_kwargs=kwargs, - ).execute() - return await poll_until_finished(kwargs, response) + ) + return await poll_until_finished(cls, response, average_duration=80) -class TripoImageToModelNode: +class TripoImageToModelNode(IO.ComfyNode): """ Generates 3D models synchronously based on a single image using Tripo's API. """ - AVERAGE_DURATION = 80 + @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": ("IMAGE",), - }, - "optional": { - "model_version": model_field_to_node_input(IO.COMBO, TripoImageToModelRequest, "model_version", enum_type=TripoModelVersion), - "style": model_field_to_node_input(IO.COMBO, TripoTextToModelRequest, "style", enum_type=TripoStyle, default="None"), - "texture": ("BOOLEAN", {"default": True}), - "pbr": ("BOOLEAN", {"default": True}), - "model_seed": ("INT", {"default": 42}), - "orientation": model_field_to_node_input(IO.COMBO, TripoImageToModelRequest, "orientation", enum_type=TripoOrientation), - "texture_seed": ("INT", {"default": 42}), - "texture_quality": (["standard", "detailed"], {"default": "standard"}), - "texture_alignment": (["original_image", "geometry"], {"default": "original_image"}), - "face_limit": ("INT", {"min": -1, "max": 500000, "default": -1}), - "quad": ("BOOLEAN", {"default": False}) - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + def define_schema(cls): + return IO.Schema( + node_id="TripoImageToModelNode", + display_name="Tripo: Image to Model", + category="api node/3d/Tripo", + inputs=[ + IO.Image.Input("image"), + IO.Combo.Input( + "model_version", + options=TripoModelVersion, + tooltip="The model version to use for generation", + optional=True, + ), + IO.Combo.Input("style", options=TripoStyle, default="None", optional=True), + IO.Boolean.Input("texture", default=True, optional=True), + IO.Boolean.Input("pbr", default=True, optional=True), + IO.Int.Input("model_seed", default=42, optional=True), + IO.Combo.Input( + "orientation", options=TripoOrientation, default=TripoOrientation.DEFAULT, optional=True + ), + IO.Int.Input("texture_seed", default=42, optional=True), + IO.Combo.Input("texture_quality", default="standard", options=["standard", "detailed"], optional=True), + IO.Combo.Input( + "texture_alignment", default="original_image", options=["original_image", "geometry"], optional=True + ), + IO.Int.Input("face_limit", default=-1, min=-1, max=500000, optional=True), + IO.Boolean.Input("quad", default=False, optional=True), + ], + outputs=[ + IO.String.Output(display_name="model_file"), + IO.Custom("MODEL_TASK_ID").Output(display_name="model task_id"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + is_output_node=True, + ) - RETURN_TYPES = ("STRING", "MODEL_TASK_ID",) - RETURN_NAMES = ("model_file", "model task_id") - FUNCTION = "generate_mesh" - CATEGORY = "api node/3d/Tripo" - API_NODE = True - OUTPUT_NODE = True - - async def generate_mesh(self, image, model_version=None, style=None, texture=None, pbr=None, model_seed=None, orientation=None, texture_alignment=None, texture_seed=None, texture_quality=None, face_limit=None, quad=None, **kwargs): + @classmethod + async def execute( + cls, + image: torch.Tensor, + model_version: Optional[str] = None, + style: Optional[str] = None, + texture: Optional[bool] = None, + pbr: Optional[bool] = None, + model_seed: Optional[int] = None, + orientation=None, + texture_seed: Optional[int] = None, + texture_quality: Optional[str] = None, + texture_alignment: Optional[str] = None, + face_limit: Optional[int] = None, + quad: Optional[bool] = None, + ) -> IO.NodeOutput: style_enum = None if style == "None" else style if image is None: raise RuntimeError("Image is required") - tripo_file = await upload_image_to_tripo(image, **kwargs) - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/tripo/v2/openapi/task", - method=HttpMethod.POST, - request_model=TripoImageToModelRequest, - response_model=TripoTaskResponse, - ), - request=TripoImageToModelRequest( + tripo_file = TripoFileReference( + root=TripoUrlReference( + url=(await upload_images_to_comfyapi(cls, image, max_images=1))[0], + type="jpeg", + ) + ) + response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/tripo/v2/openapi/task", method="POST"), + response_model=TripoTaskResponse, + data=TripoImageToModelRequest( type=TripoTaskType.IMAGE_TO_MODEL, file=tripo_file, model_version=model_version, @@ -223,80 +251,105 @@ class TripoImageToModelNode: texture_quality=texture_quality, face_limit=face_limit, auto_size=True, - quad=quad + quad=quad, ), - auth_kwargs=kwargs, - ).execute() - return await poll_until_finished(kwargs, response) + ) + return await poll_until_finished(cls, response, average_duration=80) -class TripoMultiviewToModelNode: +class TripoMultiviewToModelNode(IO.ComfyNode): """ Generates 3D models synchronously based on up to four images (front, left, back, right) using Tripo's API. """ - AVERAGE_DURATION = 80 + @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": ("IMAGE",), - }, - "optional": { - "image_left": ("IMAGE",), - "image_back": ("IMAGE",), - "image_right": ("IMAGE",), - "model_version": model_field_to_node_input(IO.COMBO, TripoMultiviewToModelRequest, "model_version", enum_type=TripoModelVersion), - "orientation": model_field_to_node_input(IO.COMBO, TripoImageToModelRequest, "orientation", enum_type=TripoOrientation), - "texture": ("BOOLEAN", {"default": True}), - "pbr": ("BOOLEAN", {"default": True}), - "model_seed": ("INT", {"default": 42}), - "texture_seed": ("INT", {"default": 42}), - "texture_quality": (["standard", "detailed"], {"default": "standard"}), - "texture_alignment": (["original_image", "geometry"], {"default": "original_image"}), - "face_limit": ("INT", {"min": -1, "max": 500000, "default": -1}), - "quad": ("BOOLEAN", {"default": False}) - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + def define_schema(cls): + return IO.Schema( + node_id="TripoMultiviewToModelNode", + display_name="Tripo: Multiview to Model", + category="api node/3d/Tripo", + inputs=[ + IO.Image.Input("image"), + IO.Image.Input("image_left", optional=True), + IO.Image.Input("image_back", optional=True), + IO.Image.Input("image_right", optional=True), + IO.Combo.Input( + "model_version", + options=TripoModelVersion, + optional=True, + tooltip="The model version to use for generation", + ), + IO.Combo.Input( + "orientation", + options=TripoOrientation, + default=TripoOrientation.DEFAULT, + optional=True, + ), + IO.Boolean.Input("texture", default=True, optional=True), + IO.Boolean.Input("pbr", default=True, optional=True), + IO.Int.Input("model_seed", default=42, optional=True), + IO.Int.Input("texture_seed", default=42, optional=True), + IO.Combo.Input("texture_quality", default="standard", options=["standard", "detailed"], optional=True), + IO.Combo.Input( + "texture_alignment", default="original_image", options=["original_image", "geometry"], optional=True + ), + IO.Int.Input("face_limit", default=-1, min=-1, max=500000, optional=True), + IO.Boolean.Input("quad", default=False, optional=True), + ], + outputs=[ + IO.String.Output(display_name="model_file"), + IO.Custom("MODEL_TASK_ID").Output(display_name="model task_id"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + is_output_node=True, + ) - RETURN_TYPES = ("STRING", "MODEL_TASK_ID",) - RETURN_NAMES = ("model_file", "model task_id") - FUNCTION = "generate_mesh" - CATEGORY = "api node/3d/Tripo" - API_NODE = True - OUTPUT_NODE = True - - async def generate_mesh(self, image, image_left=None, image_back=None, image_right=None, model_version=None, orientation=None, texture=None, pbr=None, model_seed=None, texture_seed=None, texture_quality=None, texture_alignment=None, face_limit=None, quad=None, **kwargs): + @classmethod + async def execute( + cls, + image: torch.Tensor, + image_left: Optional[torch.Tensor] = None, + image_back: Optional[torch.Tensor] = None, + image_right: Optional[torch.Tensor] = None, + model_version: Optional[str] = None, + orientation: Optional[str] = None, + texture: Optional[bool] = None, + pbr: Optional[bool] = None, + model_seed: Optional[int] = None, + texture_seed: Optional[int] = None, + texture_quality: Optional[str] = None, + texture_alignment: Optional[str] = None, + face_limit: Optional[int] = None, + quad: Optional[bool] = None, + ) -> IO.NodeOutput: if image is None: raise RuntimeError("front image for multiview is required") images = [] - image_dict = { - "image": image, - "image_left": image_left, - "image_back": image_back, - "image_right": image_right - } + image_dict = {"image": image, "image_left": image_left, "image_back": image_back, "image_right": image_right} if image_left is None and image_back is None and image_right is None: raise RuntimeError("At least one of left, back, or right image must be provided for multiview") for image_name in ["image", "image_left", "image_back", "image_right"]: image_ = image_dict[image_name] if image_ is not None: - tripo_file = await upload_image_to_tripo(image_, **kwargs) - images.append(tripo_file) + images.append( + TripoFileReference( + root=TripoUrlReference( + url=(await upload_images_to_comfyapi(cls, image_, max_images=1))[0], type="jpeg" + ) + ) + ) else: images.append(TripoFileEmptyReference()) - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/tripo/v2/openapi/task", - method=HttpMethod.POST, - request_model=TripoMultiviewToModelRequest, - response_model=TripoTaskResponse, - ), - request=TripoMultiviewToModelRequest( + response = await sync_op( + cls, + ApiEndpoint(path="/proxy/tripo/v2/openapi/task", method="POST"), + response_model=TripoTaskResponse, + data=TripoMultiviewToModelRequest( type=TripoTaskType.MULTIVIEW_TO_MODEL, files=images, model_version=model_version, @@ -310,272 +363,283 @@ class TripoMultiviewToModelNode: face_limit=face_limit, quad=quad, ), - auth_kwargs=kwargs, - ).execute() - return await poll_until_finished(kwargs, response) + ) + return await poll_until_finished(cls, response, average_duration=80) -class TripoTextureNode: +class TripoTextureNode(IO.ComfyNode): + @classmethod - def INPUT_TYPES(s): - return { - "required": { - "model_task_id": ("MODEL_TASK_ID",), - }, - "optional": { - "texture": ("BOOLEAN", {"default": True}), - "pbr": ("BOOLEAN", {"default": True}), - "texture_seed": ("INT", {"default": 42}), - "texture_quality": (["standard", "detailed"], {"default": "standard"}), - "texture_alignment": (["original_image", "geometry"], {"default": "original_image"}), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + def define_schema(cls): + return IO.Schema( + node_id="TripoTextureNode", + display_name="Tripo: Texture model", + category="api node/3d/Tripo", + inputs=[ + IO.Custom("MODEL_TASK_ID").Input("model_task_id"), + IO.Boolean.Input("texture", default=True, optional=True), + IO.Boolean.Input("pbr", default=True, optional=True), + IO.Int.Input("texture_seed", default=42, optional=True), + IO.Combo.Input("texture_quality", default="standard", options=["standard", "detailed"], optional=True), + IO.Combo.Input( + "texture_alignment", default="original_image", options=["original_image", "geometry"], optional=True + ), + ], + outputs=[ + IO.String.Output(display_name="model_file"), + IO.Custom("MODEL_TASK_ID").Output(display_name="model task_id"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + is_output_node=True, + ) - RETURN_TYPES = ("STRING", "MODEL_TASK_ID",) - RETURN_NAMES = ("model_file", "model task_id") - FUNCTION = "generate_mesh" - CATEGORY = "api node/3d/Tripo" - API_NODE = True - OUTPUT_NODE = True - AVERAGE_DURATION = 80 - - async def generate_mesh(self, model_task_id, texture=None, pbr=None, texture_seed=None, texture_quality=None, texture_alignment=None, **kwargs): - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/tripo/v2/openapi/task", - method=HttpMethod.POST, - request_model=TripoTextureModelRequest, - response_model=TripoTaskResponse, - ), - request=TripoTextureModelRequest( + @classmethod + async def execute( + cls, + model_task_id, + texture: Optional[bool] = None, + pbr: Optional[bool] = None, + texture_seed: Optional[int] = None, + texture_quality: Optional[str] = None, + texture_alignment: Optional[str] = None, + ) -> IO.NodeOutput: + response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/tripo/v2/openapi/task", method="POST"), + response_model=TripoTaskResponse, + data=TripoTextureModelRequest( original_model_task_id=model_task_id, texture=texture, pbr=pbr, texture_seed=texture_seed, texture_quality=texture_quality, - texture_alignment=texture_alignment + texture_alignment=texture_alignment, ), - auth_kwargs=kwargs, - ).execute() - return await poll_until_finished(kwargs, response) + ) + return await poll_until_finished(cls, response, average_duration=80) -class TripoRefineNode: +class TripoRefineNode(IO.ComfyNode): + @classmethod - def INPUT_TYPES(s): - return { - "required": { - "model_task_id": ("MODEL_TASK_ID", { - "tooltip": "Must be a v1.4 Tripo model" - }), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + def define_schema(cls): + return IO.Schema( + node_id="TripoRefineNode", + display_name="Tripo: Refine Draft model", + category="api node/3d/Tripo", + description="Refine a draft model created by v1.4 Tripo models only.", + inputs=[ + IO.Custom("MODEL_TASK_ID").Input("model_task_id", tooltip="Must be a v1.4 Tripo model"), + ], + outputs=[ + IO.String.Output(display_name="model_file"), + IO.Custom("MODEL_TASK_ID").Output(display_name="model task_id"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + is_output_node=True, + ) - DESCRIPTION = "Refine a draft model created by v1.4 Tripo models only." - - RETURN_TYPES = ("STRING", "MODEL_TASK_ID",) - RETURN_NAMES = ("model_file", "model task_id") - FUNCTION = "generate_mesh" - CATEGORY = "api node/3d/Tripo" - API_NODE = True - OUTPUT_NODE = True - AVERAGE_DURATION = 240 - - async def generate_mesh(self, model_task_id, **kwargs): - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/tripo/v2/openapi/task", - method=HttpMethod.POST, - request_model=TripoRefineModelRequest, - response_model=TripoTaskResponse, - ), - request=TripoRefineModelRequest( - draft_model_task_id=model_task_id - ), - auth_kwargs=kwargs, - ).execute() - return await poll_until_finished(kwargs, response) - - -class TripoRigNode: @classmethod - def INPUT_TYPES(s): - return { - "required": { - "original_model_task_id": ("MODEL_TASK_ID",), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } - - RETURN_TYPES = ("STRING", "RIG_TASK_ID") - RETURN_NAMES = ("model_file", "rig task_id") - FUNCTION = "generate_mesh" - CATEGORY = "api node/3d/Tripo" - API_NODE = True - OUTPUT_NODE = True - AVERAGE_DURATION = 180 - - async def generate_mesh(self, original_model_task_id, **kwargs): - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/tripo/v2/openapi/task", - method=HttpMethod.POST, - request_model=TripoAnimateRigRequest, - response_model=TripoTaskResponse, - ), - request=TripoAnimateRigRequest( - original_model_task_id=original_model_task_id, - out_format="glb", - spec="tripo" - ), - auth_kwargs=kwargs, - ).execute() - return await poll_until_finished(kwargs, response) + async def execute(cls, model_task_id) -> IO.NodeOutput: + response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/tripo/v2/openapi/task", method="POST"), + response_model=TripoTaskResponse, + data=TripoRefineModelRequest(draft_model_task_id=model_task_id), + ) + return await poll_until_finished(cls, response, average_duration=240) -class TripoRetargetNode: +class TripoRigNode(IO.ComfyNode): + @classmethod - def INPUT_TYPES(s): - return { - "required": { - "original_model_task_id": ("RIG_TASK_ID",), - "animation": ([ - "preset:idle", - "preset:walk", - "preset:climb", - "preset:jump", - "preset:slash", - "preset:shoot", - "preset:hurt", - "preset:fall", - "preset:turn", - ],), - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } + def define_schema(cls): + return IO.Schema( + node_id="TripoRigNode", + display_name="Tripo: Rig model", + category="api node/3d/Tripo", + inputs=[IO.Custom("MODEL_TASK_ID").Input("original_model_task_id")], + outputs=[ + IO.String.Output(display_name="model_file"), + IO.Custom("RIG_TASK_ID").Output(display_name="rig task_id"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + is_output_node=True, + ) - RETURN_TYPES = ("STRING", "RETARGET_TASK_ID") - RETURN_NAMES = ("model_file", "retarget task_id") - FUNCTION = "generate_mesh" - CATEGORY = "api node/3d/Tripo" - API_NODE = True - OUTPUT_NODE = True - AVERAGE_DURATION = 30 + @classmethod + async def execute(cls, original_model_task_id) -> IO.NodeOutput: + response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/tripo/v2/openapi/task", method="POST"), + response_model=TripoTaskResponse, + data=TripoAnimateRigRequest(original_model_task_id=original_model_task_id, out_format="glb", spec="tripo"), + ) + return await poll_until_finished(cls, response, average_duration=180) - async def generate_mesh(self, animation, original_model_task_id, **kwargs): - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/tripo/v2/openapi/task", - method=HttpMethod.POST, - request_model=TripoAnimateRetargetRequest, - response_model=TripoTaskResponse, - ), - request=TripoAnimateRetargetRequest( + +class TripoRetargetNode(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="TripoRetargetNode", + display_name="Tripo: Retarget rigged model", + category="api node/3d/Tripo", + inputs=[ + IO.Custom("RIG_TASK_ID").Input("original_model_task_id"), + IO.Combo.Input( + "animation", + options=[ + "preset:idle", + "preset:walk", + "preset:climb", + "preset:jump", + "preset:slash", + "preset:shoot", + "preset:hurt", + "preset:fall", + "preset:turn", + ], + ), + ], + outputs=[ + IO.String.Output(display_name="model_file"), + IO.Custom("RETARGET_TASK_ID").Output(display_name="retarget task_id"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + is_output_node=True, + ) + + @classmethod + async def execute(cls, original_model_task_id, animation: str) -> IO.NodeOutput: + response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/tripo/v2/openapi/task", method="POST"), + response_model=TripoTaskResponse, + data=TripoAnimateRetargetRequest( original_model_task_id=original_model_task_id, animation=animation, out_format="glb", - bake_animation=True + bake_animation=True, ), - auth_kwargs=kwargs, - ).execute() - return await poll_until_finished(kwargs, response) + ) + return await poll_until_finished(cls, response, average_duration=30) -class TripoConversionNode: - @classmethod - def INPUT_TYPES(s): - return { - "required": { - "original_model_task_id": ("MODEL_TASK_ID,RIG_TASK_ID,RETARGET_TASK_ID",), - "format": (["GLTF", "USDZ", "FBX", "OBJ", "STL", "3MF"],), - }, - "optional": { - "quad": ("BOOLEAN", {"default": False}), - "face_limit": ("INT", {"min": -1, "max": 500000, "default": -1}), - "texture_size": ("INT", {"min": 128, "max": 4096, "default": 4096}), - "texture_format": (["BMP", "DPX", "HDR", "JPEG", "OPEN_EXR", "PNG", "TARGA", "TIFF", "WEBP"], {"default": "JPEG"}) - }, - "hidden": { - "auth_token": "AUTH_TOKEN_COMFY_ORG", - "comfy_api_key": "API_KEY_COMFY_ORG", - "unique_id": "UNIQUE_ID", - }, - } +class TripoConversionNode(IO.ComfyNode): @classmethod - def VALIDATE_INPUTS(cls, input_types): + def define_schema(cls): + return IO.Schema( + node_id="TripoConversionNode", + display_name="Tripo: Convert model", + category="api node/3d/Tripo", + inputs=[ + IO.Custom("MODEL_TASK_ID,RIG_TASK_ID,RETARGET_TASK_ID").Input("original_model_task_id"), + IO.Combo.Input("format", options=["GLTF", "USDZ", "FBX", "OBJ", "STL", "3MF"]), + IO.Boolean.Input("quad", default=False, optional=True), + IO.Int.Input( + "face_limit", + default=-1, + min=-1, + max=500000, + optional=True, + ), + IO.Int.Input( + "texture_size", + default=4096, + min=128, + max=4096, + optional=True, + ), + IO.Combo.Input( + "texture_format", + options=["BMP", "DPX", "HDR", "JPEG", "OPEN_EXR", "PNG", "TARGA", "TIFF", "WEBP"], + default="JPEG", + optional=True, + ), + ], + outputs=[], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + is_output_node=True, + ) + + @classmethod + def validate_inputs(cls, input_types): # The min and max of input1 and input2 are still validated because # we didn't take `input1` or `input2` as arguments if input_types["original_model_task_id"] not in ("MODEL_TASK_ID", "RIG_TASK_ID", "RETARGET_TASK_ID"): return "original_model_task_id must be MODEL_TASK_ID, RIG_TASK_ID or RETARGET_TASK_ID type" return True - RETURN_TYPES = () - FUNCTION = "generate_mesh" - CATEGORY = "api node/3d/Tripo" - API_NODE = True - OUTPUT_NODE = True - AVERAGE_DURATION = 30 - - async def generate_mesh(self, original_model_task_id, format, quad, face_limit, texture_size, texture_format, **kwargs): + @classmethod + async def execute( + cls, + original_model_task_id, + format: str, + quad: bool, + face_limit: int, + texture_size: int, + texture_format: str, + ) -> IO.NodeOutput: if not original_model_task_id: raise RuntimeError("original_model_task_id is required") - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path="/proxy/tripo/v2/openapi/task", - method=HttpMethod.POST, - request_model=TripoConvertModelRequest, - response_model=TripoTaskResponse, - ), - request=TripoConvertModelRequest( + response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/tripo/v2/openapi/task", method="POST"), + response_model=TripoTaskResponse, + data=TripoConvertModelRequest( original_model_task_id=original_model_task_id, format=format, quad=quad if quad else None, face_limit=face_limit if face_limit != -1 else None, texture_size=texture_size if texture_size != 4096 else None, - texture_format=texture_format if texture_format != "JPEG" else None + texture_format=texture_format if texture_format != "JPEG" else None, ), - auth_kwargs=kwargs, - ).execute() - return await poll_until_finished(kwargs, response) + ) + return await poll_until_finished(cls, response, average_duration=30) -NODE_CLASS_MAPPINGS = { - "TripoTextToModelNode": TripoTextToModelNode, - "TripoImageToModelNode": TripoImageToModelNode, - "TripoMultiviewToModelNode": TripoMultiviewToModelNode, - "TripoTextureNode": TripoTextureNode, - "TripoRefineNode": TripoRefineNode, - "TripoRigNode": TripoRigNode, - "TripoRetargetNode": TripoRetargetNode, - "TripoConversionNode": TripoConversionNode, -} +class TripoExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + TripoTextToModelNode, + TripoImageToModelNode, + TripoMultiviewToModelNode, + TripoTextureNode, + TripoRefineNode, + TripoRigNode, + TripoRetargetNode, + TripoConversionNode, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "TripoTextToModelNode": "Tripo: Text to Model", - "TripoImageToModelNode": "Tripo: Image to Model", - "TripoMultiviewToModelNode": "Tripo: Multiview to Model", - "TripoTextureNode": "Tripo: Texture model", - "TripoRefineNode": "Tripo: Refine Draft model", - "TripoRigNode": "Tripo: Rig model", - "TripoRetargetNode": "Tripo: Retarget rigged model", - "TripoConversionNode": "Tripo: Convert model", -} + +async def comfy_entrypoint() -> TripoExtension: + return TripoExtension() diff --git a/comfy_api_nodes/nodes_veo2.py b/comfy_api_nodes/nodes_veo2.py index 251aecd42..d37e9e9b4 100644 --- a/comfy_api_nodes/nodes_veo2.py +++ b/comfy_api_nodes/nodes_veo2.py @@ -1,57 +1,35 @@ -import logging import base64 -import aiohttp -import torch from io import BytesIO -from typing import Optional + from typing_extensions import override -from comfy_api.latest import ComfyExtension, io as comfy_io from comfy_api.input_impl.video_types import VideoFromFile -from comfy_api_nodes.apis import ( - VeoGenVidRequest, - VeoGenVidResponse, +from comfy_api.latest import IO, ComfyExtension +from comfy_api_nodes.apis.veo_api import ( VeoGenVidPollRequest, VeoGenVidPollResponse, + VeoGenVidRequest, + VeoGenVidResponse, ) -from comfy_api_nodes.apis.client import ( +from comfy_api_nodes.util import ( ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, -) - -from comfy_api_nodes.apinode_utils import ( - downscale_image_tensor, + download_url_to_video_output, + poll_op, + sync_op, tensor_to_base64_string, ) AVERAGE_DURATION_VIDEO_GEN = 32 - -def convert_image_to_base64(image: torch.Tensor): - if image is None: - return None - - scaled_image = downscale_image_tensor(image, total_pixels=2048*2048) - return tensor_to_base64_string(scaled_image) +MODELS_MAP = { + "veo-2.0-generate-001": "veo-2.0-generate-001", + "veo-3.1-generate": "veo-3.1-generate-preview", + "veo-3.1-fast-generate": "veo-3.1-fast-generate-preview", + "veo-3.0-generate-001": "veo-3.0-generate-001", + "veo-3.0-fast-generate-001": "veo-3.0-fast-generate-001", +} -def get_video_url_from_response(poll_response: VeoGenVidPollResponse) -> Optional[str]: - if ( - poll_response.response - and hasattr(poll_response.response, "videos") - and poll_response.response.videos - and len(poll_response.response.videos) > 0 - ): - video = poll_response.response.videos[0] - else: - return None - if hasattr(video, "gcsUri") and video.gcsUri: - return str(video.gcsUri) - return None - - -class VeoVideoGenerationNode(comfy_io.ComfyNode): +class VeoVideoGenerationNode(IO.ComfyNode): """ Generates videos from text prompts using Google's Veo API. @@ -61,71 +39,71 @@ class VeoVideoGenerationNode(comfy_io.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="VeoVideoGenerationNode", display_name="Google Veo 2 Video Generation", category="api node/video/Veo", description="Generates videos from text prompts using Google's Veo 2 API", inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Text description of the video", ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=["16:9", "9:16"], default="16:9", tooltip="Aspect ratio of the output video", ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", multiline=True, default="", tooltip="Negative text prompt to guide what to avoid in the video", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "duration_seconds", default=5, min=5, max=8, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Duration of the output video in seconds", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "enhance_prompt", default=True, tooltip="Whether to enhance the prompt with AI assistance", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "person_generation", options=["ALLOW", "BLOCK"], default="ALLOW", tooltip="Whether to allow generating people in the video", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=0xFFFFFFFF, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed for video generation (0 for random)", optional=True, ), - comfy_io.Image.Input( + IO.Image.Input( "image", tooltip="Optional reference image to guide video generation", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["veo-2.0-generate-001"], default="veo-2.0-generate-001", @@ -134,12 +112,12 @@ class VeoVideoGenerationNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -158,21 +136,17 @@ class VeoVideoGenerationNode(comfy_io.ComfyNode): model="veo-2.0-generate-001", generate_audio=False, ): + model = MODELS_MAP[model] # Prepare the instances for the request instances = [] - instance = { - "prompt": prompt - } + instance = {"prompt": prompt} # Add image if provided if image is not None: - image_base64 = convert_image_to_base64(image) + image_base64 = tensor_to_base64_string(image) if image_base64: - instance["image"] = { - "bytesBase64Encoded": image_base64, - "mimeType": "image/png" - } + instance["image"] = {"bytesBase64Encoded": image_base64, "mimeType": "image/png"} instances.append(instance) @@ -190,119 +164,77 @@ class VeoVideoGenerationNode(comfy_io.ComfyNode): if seed > 0: parameters["seed"] = seed # Only add generateAudio for Veo 3 models - if "veo-3.0" in model: + if model.find("veo-2.0") == -1: parameters["generateAudio"] = generate_audio - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - # Initial request to start video generation - initial_operation = SynchronousOperation( - endpoint=ApiEndpoint( - path=f"/proxy/veo/{model}/generate", - method=HttpMethod.POST, - request_model=VeoGenVidRequest, - response_model=VeoGenVidResponse - ), - request=VeoGenVidRequest( + initial_response = await sync_op( + cls, + ApiEndpoint(path=f"/proxy/veo/{model}/generate", method="POST"), + response_model=VeoGenVidResponse, + data=VeoGenVidRequest( instances=instances, - parameters=parameters + parameters=parameters, ), - auth_kwargs=auth, ) - initial_response = await initial_operation.execute() - operation_name = initial_response.name - - logging.info(f"Veo generation started with operation name: {operation_name}") - - # Define status extractor function def status_extractor(response): # Only return "completed" if the operation is done, regardless of success or failure # We'll check for errors after polling completes return "completed" if response.done else "pending" - # Define progress extractor function - def progress_extractor(response): - # Could be enhanced if the API provides progress information - return None - - # Define the polling operation - poll_operation = PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"/proxy/veo/{model}/poll", - method=HttpMethod.POST, - request_model=VeoGenVidPollRequest, - response_model=VeoGenVidPollResponse - ), - completed_statuses=["completed"], - failed_statuses=[], # No failed statuses, we'll handle errors after polling + poll_response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/veo/{model}/poll", method="POST"), + response_model=VeoGenVidPollResponse, status_extractor=status_extractor, - progress_extractor=progress_extractor, - request=VeoGenVidPollRequest( - operationName=operation_name + data=VeoGenVidPollRequest( + operationName=initial_response.name, ), - auth_kwargs=auth, poll_interval=5.0, - result_url_extractor=get_video_url_from_response, - node_id=cls.hidden.unique_id, estimated_duration=AVERAGE_DURATION_VIDEO_GEN, ) - # Execute the polling operation - poll_response = await poll_operation.execute() - # Now check for errors in the final response # Check for error in poll response - if hasattr(poll_response, 'error') and poll_response.error: - error_message = f"Veo API error: {poll_response.error.message} (code: {poll_response.error.code})" - logging.error(error_message) - raise Exception(error_message) + if poll_response.error: + raise Exception(f"Veo API error: {poll_response.error.message} (code: {poll_response.error.code})") # Check for RAI filtered content - if (hasattr(poll_response.response, 'raiMediaFilteredCount') and - poll_response.response.raiMediaFilteredCount > 0): + if ( + hasattr(poll_response.response, "raiMediaFilteredCount") + and poll_response.response.raiMediaFilteredCount > 0 + ): # Extract reason message if available - if (hasattr(poll_response.response, 'raiMediaFilteredReasons') and - poll_response.response.raiMediaFilteredReasons): + if ( + hasattr(poll_response.response, "raiMediaFilteredReasons") + and poll_response.response.raiMediaFilteredReasons + ): reason = poll_response.response.raiMediaFilteredReasons[0] error_message = f"Content filtered by Google's Responsible AI practices: {reason} ({poll_response.response.raiMediaFilteredCount} videos filtered.)" else: error_message = f"Content filtered by Google's Responsible AI practices ({poll_response.response.raiMediaFilteredCount} videos filtered.)" - logging.error(error_message) raise Exception(error_message) # Extract video data - if poll_response.response and hasattr(poll_response.response, 'videos') and poll_response.response.videos and len(poll_response.response.videos) > 0: + if ( + poll_response.response + and hasattr(poll_response.response, "videos") + and poll_response.response.videos + and len(poll_response.response.videos) > 0 + ): video = poll_response.response.videos[0] # Check if video is provided as base64 or URL - if hasattr(video, 'bytesBase64Encoded') and video.bytesBase64Encoded: - # Decode base64 string to bytes - video_data = base64.b64decode(video.bytesBase64Encoded) - elif hasattr(video, 'gcsUri') and video.gcsUri: - # Download from URL - async with aiohttp.ClientSession() as session: - async with session.get(video.gcsUri) as video_response: - video_data = await video_response.content.read() - else: - raise Exception("Video returned but no data or URL was provided") - else: - raise Exception("Video generation completed but no video was returned") + if hasattr(video, "bytesBase64Encoded") and video.bytesBase64Encoded: + return IO.NodeOutput(VideoFromFile(BytesIO(base64.b64decode(video.bytesBase64Encoded)))) - if not video_data: - raise Exception("No video data was returned") + if hasattr(video, "gcsUri") and video.gcsUri: + return IO.NodeOutput(await download_url_to_video_output(video.gcsUri)) - logging.info("Video generation completed successfully") - - # Convert video data to BytesIO object - video_io = BytesIO(video_data) - - # Return VideoFromFile object - return comfy_io.NodeOutput(VideoFromFile(video_io)) + raise Exception("Video returned but no data or URL was provided") + raise Exception("Video generation completed but no video was returned") class Veo3VideoGenerationNode(VeoVideoGenerationNode): @@ -319,78 +251,83 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="Veo3VideoGenerationNode", display_name="Google Veo 3 Video Generation", category="api node/video/Veo", description="Generates videos from text prompts using Google's Veo 3 API", inputs=[ - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Text description of the video", ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", options=["16:9", "9:16"], default="16:9", tooltip="Aspect ratio of the output video", ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", multiline=True, default="", tooltip="Negative text prompt to guide what to avoid in the video", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "duration_seconds", default=8, min=8, max=8, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Duration of the output video in seconds (Veo 3 only supports 8 seconds)", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "enhance_prompt", default=True, tooltip="Whether to enhance the prompt with AI assistance", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "person_generation", options=["ALLOW", "BLOCK"], default="ALLOW", tooltip="Whether to allow generating people in the video", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=0xFFFFFFFF, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed for video generation (0 for random)", optional=True, ), - comfy_io.Image.Input( + IO.Image.Input( "image", tooltip="Optional reference image to guide video generation", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "model", - options=["veo-3.0-generate-001", "veo-3.0-fast-generate-001"], + options=[ + "veo-3.1-generate", + "veo-3.1-fast-generate", + "veo-3.0-generate-001", + "veo-3.0-fast-generate-001", + ], default="veo-3.0-generate-001", tooltip="Veo 3 model to use for video generation", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "generate_audio", default=False, tooltip="Generate audio for the video. Supported by all Veo 3 models.", @@ -398,12 +335,12 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode): ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -411,11 +348,12 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode): class VeoExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ VeoVideoGenerationNode, Veo3VideoGenerationNode, ] + async def comfy_entrypoint() -> VeoExtension: return VeoExtension() diff --git a/comfy_api_nodes/nodes_vidu.py b/comfy_api_nodes/nodes_vidu.py index 2f441948c..7a679f0d9 100644 --- a/comfy_api_nodes/nodes_vidu.py +++ b/comfy_api_nodes/nodes_vidu.py @@ -1,27 +1,23 @@ import logging from enum import Enum -from typing import Any, Callable, Optional, Literal, TypeVar -from typing_extensions import override +from typing import Literal, Optional, TypeVar import torch from pydantic import BaseModel, Field +from typing_extensions import override -from comfy_api.latest import ComfyExtension, io as comfy_io -from comfy_api_nodes.util.validation_utils import ( - validate_aspect_ratio_closeness, - validate_image_dimensions, - validate_image_aspect_ratio_range, - get_number_of_images, -) -from comfy_api_nodes.apis.client import ( +from comfy_api.latest import IO, ComfyExtension +from comfy_api_nodes.util import ( ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, - EmptyRequest, + download_url_to_video_output, + get_number_of_images, + poll_op, + sync_op, + upload_images_to_comfyapi, + validate_image_aspect_ratio, + validate_image_dimensions, + validate_images_aspect_ratio_closeness, ) -from comfy_api_nodes.apinode_utils import download_url_to_video_output, upload_images_to_comfyapi - VIDU_TEXT_TO_VIDEO = "/proxy/vidu/text2video" VIDU_IMAGE_TO_VIDEO = "/proxy/vidu/img2video" @@ -31,8 +27,9 @@ VIDU_GET_GENERATION_STATUS = "/proxy/vidu/tasks/%s/creations" R = TypeVar("R") + class VideoModelName(str, Enum): - vidu_q1 = 'viduq1' + vidu_q1 = "viduq1" class AspectRatio(str, Enum): @@ -63,17 +60,9 @@ class TaskCreationRequest(BaseModel): images: Optional[list[str]] = Field(None, description="Base64 encoded string or image URL") -class TaskStatus(str, Enum): - created = "created" - queueing = "queueing" - processing = "processing" - success = "success" - failed = "failed" - - class TaskCreationResponse(BaseModel): task_id: str = Field(...) - state: TaskStatus = Field(...) + state: str = Field(...) created_at: str = Field(...) code: Optional[int] = Field(None, description="Error code") @@ -85,32 +74,11 @@ class TaskResult(BaseModel): class TaskStatusResponse(BaseModel): - state: TaskStatus = Field(...) + state: str = Field(...) err_code: Optional[str] = Field(None) creations: list[TaskResult] = Field(..., description="Generated results") -async def poll_until_finished( - auth_kwargs: dict[str, str], - api_endpoint: ApiEndpoint[Any, R], - result_url_extractor: Optional[Callable[[R], str]] = None, - estimated_duration: Optional[int] = None, - node_id: Optional[str] = None, -) -> R: - return await PollingOperation( - poll_endpoint=api_endpoint, - completed_statuses=[TaskStatus.success.value], - failed_statuses=[TaskStatus.failed.value], - status_extractor=lambda response: response.state.value, - auth_kwargs=auth_kwargs, - result_url_extractor=result_url_extractor, - estimated_duration=estimated_duration, - node_id=node_id, - poll_interval=16.0, - max_poll_attempts=256, - ).execute() - - def get_video_url_from_response(response) -> Optional[str]: if response.creations: return response.creations[0].url @@ -127,111 +95,101 @@ def get_video_from_response(response) -> TaskResult: async def execute_task( + cls: type[IO.ComfyNode], vidu_endpoint: str, - auth_kwargs: Optional[dict[str, str]], payload: TaskCreationRequest, estimated_duration: int, - node_id: str, ) -> R: - response = await SynchronousOperation( - endpoint=ApiEndpoint( - path=vidu_endpoint, - method=HttpMethod.POST, - request_model=TaskCreationRequest, - response_model=TaskCreationResponse, - ), - request=payload, - auth_kwargs=auth_kwargs, - ).execute() - if response.state == TaskStatus.failed: + response = await sync_op( + cls, + endpoint=ApiEndpoint(path=vidu_endpoint, method="POST"), + response_model=TaskCreationResponse, + data=payload, + ) + if response.state == "failed": error_msg = f"Vidu request failed. Code: {response.code}" logging.error(error_msg) raise RuntimeError(error_msg) - return await poll_until_finished( - auth_kwargs, - ApiEndpoint( - path=VIDU_GET_GENERATION_STATUS % response.task_id, - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=TaskStatusResponse, - ), - result_url_extractor=get_video_url_from_response, + return await poll_op( + cls, + ApiEndpoint(path=VIDU_GET_GENERATION_STATUS % response.task_id), + response_model=TaskStatusResponse, + status_extractor=lambda r: r.state, estimated_duration=estimated_duration, - node_id=node_id, ) -class ViduTextToVideoNode(comfy_io.ComfyNode): +class ViduTextToVideoNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="ViduTextToVideoNode", display_name="Vidu Text To Video Generation", category="api node/video/Vidu", description="Generate video from text prompt", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", - options=[model.value for model in VideoModelName], - default=VideoModelName.vidu_q1.value, + options=VideoModelName, + default=VideoModelName.vidu_q1, tooltip="Model name", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, tooltip="A textual description for video generation", ), - comfy_io.Int.Input( + IO.Int.Input( "duration", default=5, min=5, max=5, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Duration of the output video in seconds", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed for video generation (0 for random)", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", - options=[model.value for model in AspectRatio], - default=AspectRatio.r_16_9.value, + options=AspectRatio, + default=AspectRatio.r_16_9, tooltip="The aspect ratio of the output video", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", - options=[model.value for model in Resolution], - default=Resolution.r_1080p.value, + options=Resolution, + default=Resolution.r_1080p, tooltip="Supported values may vary by model & duration", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "movement_amplitude", - options=[model.value for model in MovementAmplitude], - default=MovementAmplitude.auto.value, + options=MovementAmplitude, + default=MovementAmplitude.auto, tooltip="The movement amplitude of objects in the frame", optional=True, ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -246,7 +204,7 @@ class ViduTextToVideoNode(comfy_io.ComfyNode): aspect_ratio: str, resolution: str, movement_amplitude: str, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: if not prompt: raise ValueError("The prompt field is required and cannot be empty.") payload = TaskCreationRequest( @@ -258,84 +216,80 @@ class ViduTextToVideoNode(comfy_io.ComfyNode): resolution=resolution, movement_amplitude=movement_amplitude, ) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } - results = await execute_task(VIDU_TEXT_TO_VIDEO, auth, payload, 320, cls.hidden.unique_id) - return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + results = await execute_task(cls, VIDU_TEXT_TO_VIDEO, payload, 320) + return IO.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) -class ViduImageToVideoNode(comfy_io.ComfyNode): +class ViduImageToVideoNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="ViduImageToVideoNode", display_name="Vidu Image To Video Generation", category="api node/video/Vidu", description="Generate video from image and optional prompt", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", - options=[model.value for model in VideoModelName], - default=VideoModelName.vidu_q1.value, + options=VideoModelName, + default=VideoModelName.vidu_q1, tooltip="Model name", ), - comfy_io.Image.Input( + IO.Image.Input( "image", tooltip="An image to be used as the start frame of the generated video", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="A textual description for video generation", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "duration", default=5, min=5, max=5, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Duration of the output video in seconds", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed for video generation (0 for random)", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", - options=[model.value for model in Resolution], - default=Resolution.r_1080p.value, + options=Resolution, + default=Resolution.r_1080p, tooltip="Supported values may vary by model & duration", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "movement_amplitude", - options=[model.value for model in MovementAmplitude], + options=MovementAmplitude, default=MovementAmplitude.auto.value, tooltip="The movement amplitude of objects in the frame", optional=True, ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -350,10 +304,10 @@ class ViduImageToVideoNode(comfy_io.ComfyNode): seed: int, resolution: str, movement_amplitude: str, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: if get_number_of_images(image) > 1: raise ValueError("Only one input image is allowed.") - validate_image_aspect_ratio_range(image, (1, 4), (4, 1)) + validate_image_aspect_ratio(image, (1, 4), (4, 1)) payload = TaskCreationRequest( model_name=model, prompt=prompt, @@ -362,81 +316,77 @@ class ViduImageToVideoNode(comfy_io.ComfyNode): resolution=resolution, movement_amplitude=movement_amplitude, ) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } payload.images = await upload_images_to_comfyapi( + cls, image, max_images=1, mime_type="image/png", - auth_kwargs=auth, ) - results = await execute_task(VIDU_IMAGE_TO_VIDEO, auth, payload, 120, cls.hidden.unique_id) - return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + results = await execute_task(cls, VIDU_IMAGE_TO_VIDEO, payload, 120) + return IO.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) -class ViduReferenceVideoNode(comfy_io.ComfyNode): +class ViduReferenceVideoNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="ViduReferenceVideoNode", display_name="Vidu Reference To Video Generation", category="api node/video/Vidu", description="Generate video from multiple images and prompt", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", - options=[model.value for model in VideoModelName], - default=VideoModelName.vidu_q1.value, + options=VideoModelName, + default=VideoModelName.vidu_q1, tooltip="Model name", ), - comfy_io.Image.Input( + IO.Image.Input( "images", tooltip="Images to use as references to generate a video with consistent subjects (max 7 images).", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, tooltip="A textual description for video generation", ), - comfy_io.Int.Input( + IO.Int.Input( "duration", default=5, min=5, max=5, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Duration of the output video in seconds", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed for video generation (0 for random)", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "aspect_ratio", - options=[model.value for model in AspectRatio], - default=AspectRatio.r_16_9.value, + options=AspectRatio, + default=AspectRatio.r_16_9, tooltip="The aspect ratio of the output video", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=[model.value for model in Resolution], default=Resolution.r_1080p.value, tooltip="Supported values may vary by model & duration", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "movement_amplitude", options=[model.value for model in MovementAmplitude], default=MovementAmplitude.auto.value, @@ -445,12 +395,12 @@ class ViduReferenceVideoNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -466,14 +416,14 @@ class ViduReferenceVideoNode(comfy_io.ComfyNode): aspect_ratio: str, resolution: str, movement_amplitude: str, - ) -> comfy_io.NodeOutput: + ) -> IO.NodeOutput: if not prompt: raise ValueError("The prompt field is required and cannot be empty.") a = get_number_of_images(images) if a > 7: raise ValueError("Too many images, maximum allowed is 7.") for image in images: - validate_image_aspect_ratio_range(image, (1, 4), (4, 1)) + validate_image_aspect_ratio(image, (1, 4), (4, 1)) validate_image_dimensions(image, min_width=128, min_height=128) payload = TaskCreationRequest( model_name=model, @@ -484,79 +434,75 @@ class ViduReferenceVideoNode(comfy_io.ComfyNode): resolution=resolution, movement_amplitude=movement_amplitude, ) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } payload.images = await upload_images_to_comfyapi( + cls, images, max_images=7, mime_type="image/png", - auth_kwargs=auth, ) - results = await execute_task(VIDU_REFERENCE_VIDEO, auth, payload, 120, cls.hidden.unique_id) - return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + results = await execute_task(cls, VIDU_REFERENCE_VIDEO, payload, 120) + return IO.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) -class ViduStartEndToVideoNode(comfy_io.ComfyNode): +class ViduStartEndToVideoNode(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="ViduStartEndToVideoNode", display_name="Vidu Start End To Video Generation", category="api node/video/Vidu", description="Generate a video from start and end frames and a prompt", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=[model.value for model in VideoModelName], default=VideoModelName.vidu_q1.value, tooltip="Model name", ), - comfy_io.Image.Input( + IO.Image.Input( "first_frame", tooltip="Start frame", ), - comfy_io.Image.Input( + IO.Image.Input( "end_frame", tooltip="End frame", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, tooltip="A textual description for video generation", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "duration", default=5, min=5, max=5, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Duration of the output video in seconds", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed for video generation (0 for random)", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=[model.value for model in Resolution], default=Resolution.r_1080p.value, tooltip="Supported values may vary by model & duration", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "movement_amplitude", options=[model.value for model in MovementAmplitude], default=MovementAmplitude.auto.value, @@ -565,12 +511,12 @@ class ViduStartEndToVideoNode(comfy_io.ComfyNode): ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -586,8 +532,8 @@ class ViduStartEndToVideoNode(comfy_io.ComfyNode): seed: int, resolution: str, movement_amplitude: str, - ) -> comfy_io.NodeOutput: - validate_aspect_ratio_closeness(first_frame, end_frame, min_rel=0.8, max_rel=1.25, strict=False) + ) -> IO.NodeOutput: + validate_images_aspect_ratio_closeness(first_frame, end_frame, min_rel=0.8, max_rel=1.25, strict=False) payload = TaskCreationRequest( model_name=model, prompt=prompt, @@ -596,21 +542,17 @@ class ViduStartEndToVideoNode(comfy_io.ComfyNode): resolution=resolution, movement_amplitude=movement_amplitude, ) - auth = { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - } payload.images = [ - (await upload_images_to_comfyapi(frame, max_images=1, mime_type="image/png", auth_kwargs=auth))[0] + (await upload_images_to_comfyapi(cls, frame, max_images=1, mime_type="image/png"))[0] for frame in (first_frame, end_frame) ] - results = await execute_task(VIDU_START_END_VIDEO, auth, payload, 96, cls.hidden.unique_id) - return comfy_io.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) + results = await execute_task(cls, VIDU_START_END_VIDEO, payload, 96) + return IO.NodeOutput(await download_url_to_video_output(get_video_from_response(results).url)) class ViduExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ ViduTextToVideoNode, ViduImageToVideoNode, @@ -618,5 +560,6 @@ class ViduExtension(ComfyExtension): ViduStartEndToVideoNode, ] + async def comfy_entrypoint() -> ViduExtension: return ViduExtension() diff --git a/comfy_api_nodes/nodes_wan.py b/comfy_api_nodes/nodes_wan.py index db5bd41c1..2aab3c2ff 100644 --- a/comfy_api_nodes/nodes_wan.py +++ b/comfy_api_nodes/nodes_wan.py @@ -1,33 +1,35 @@ import re -from typing import Optional, Type, Union -from typing_extensions import override +from typing import Optional import torch from pydantic import BaseModel, Field -from comfy_api.latest import ComfyExtension, Input, io as comfy_io -from comfy_api_nodes.apis.client import ( - ApiEndpoint, - HttpMethod, - SynchronousOperation, - PollingOperation, - EmptyRequest, - R, - T, -) -from comfy_api_nodes.util.validation_utils import get_number_of_images, validate_audio_duration +from typing_extensions import override -from comfy_api_nodes.apinode_utils import ( +from comfy_api.latest import IO, ComfyExtension, Input +from comfy_api_nodes.util import ( + ApiEndpoint, + audio_to_base64_string, download_url_to_image_tensor, download_url_to_video_output, + get_number_of_images, + poll_op, + sync_op, tensor_to_base64_string, - audio_to_base64_string, + validate_audio_duration, ) + class Text2ImageInputField(BaseModel): prompt: str = Field(...) negative_prompt: Optional[str] = Field(None) +class Image2ImageInputField(BaseModel): + prompt: str = Field(...) + negative_prompt: Optional[str] = Field(None) + images: list[str] = Field(..., min_length=1, max_length=2) + + class Text2VideoInputField(BaseModel): prompt: str = Field(...) negative_prompt: Optional[str] = Field(None) @@ -49,6 +51,13 @@ class Txt2ImageParametersField(BaseModel): watermark: bool = Field(True) +class Image2ImageParametersField(BaseModel): + size: Optional[str] = Field(None) + n: int = Field(1, description="Number of images to generate.") # we support only value=1 + seed: int = Field(..., ge=0, le=2147483647) + watermark: bool = Field(True) + + class Text2VideoParametersField(BaseModel): size: str = Field(...) seed: int = Field(..., ge=0, le=2147483647) @@ -73,6 +82,12 @@ class Text2ImageTaskCreationRequest(BaseModel): parameters: Txt2ImageParametersField = Field(...) +class Image2ImageTaskCreationRequest(BaseModel): + model: str = Field(...) + input: Image2ImageInputField = Field(...) + parameters: Image2ImageParametersField = Field(...) + + class Text2VideoTaskCreationRequest(BaseModel): model: str = Field(...) input: Text2VideoInputField = Field(...) @@ -127,79 +142,38 @@ class VideoTaskStatusResponse(BaseModel): request_id: str = Field(...) -RES_IN_PARENS = re.compile(r'\((\d+)\s*[x×]\s*(\d+)\)') +RES_IN_PARENS = re.compile(r"\((\d+)\s*[x×]\s*(\d+)\)") -async def process_task( - auth_kwargs: dict[str, str], - url: str, - request_model: Type[T], - response_model: Type[R], - payload: Union[Text2ImageTaskCreationRequest, Text2VideoTaskCreationRequest, Image2VideoTaskCreationRequest], - node_id: str, - estimated_duration: int, - poll_interval: int, -) -> Type[R]: - initial_response = await SynchronousOperation( - endpoint=ApiEndpoint( - path=url, - method=HttpMethod.POST, - request_model=request_model, - response_model=TaskCreationResponse, - ), - request=payload, - auth_kwargs=auth_kwargs, - ).execute() - - if not initial_response.output: - raise Exception(f"Unknown error occurred: {initial_response.code} - {initial_response.message}") - - return await PollingOperation( - poll_endpoint=ApiEndpoint( - path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}", - method=HttpMethod.GET, - request_model=EmptyRequest, - response_model=response_model, - ), - completed_statuses=["SUCCEEDED"], - failed_statuses=["FAILED", "CANCELED", "UNKNOWN"], - status_extractor=lambda x: x.output.task_status, - estimated_duration=estimated_duration, - poll_interval=poll_interval, - node_id=node_id, - auth_kwargs=auth_kwargs, - ).execute() - - -class WanTextToImageApi(comfy_io.ComfyNode): +class WanTextToImageApi(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="WanTextToImageApi", display_name="Wan Text to Image", category="api node/image/Wan", description="Generates image based on text prompt.", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["wan2.5-t2i-preview"], default="wan2.5-t2i-preview", tooltip="Model to use.", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt used to describe the elements and visual features, supports English/Chinese.", ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", multiline=True, default="", tooltip="Negative text prompt to guide what to avoid.", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "width", default=1024, min=768, @@ -207,7 +181,7 @@ class WanTextToImageApi(comfy_io.ComfyNode): step=32, optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "height", default=1024, min=768, @@ -215,37 +189,37 @@ class WanTextToImageApi(comfy_io.ComfyNode): step=32, optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed to use for generation.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "prompt_extend", default=True, tooltip="Whether to enhance the prompt with AI assistance.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "watermark", default=True, - tooltip="Whether to add an \"AI generated\" watermark to the result.", + tooltip='Whether to add an "AI generated" watermark to the result.', optional=True, ), ], outputs=[ - comfy_io.Image.Output(), + IO.Image.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -262,61 +236,187 @@ class WanTextToImageApi(comfy_io.ComfyNode): prompt_extend: bool = True, watermark: bool = True, ): - payload = Text2ImageTaskCreationRequest( - model=model, - input=Text2ImageInputField(prompt=prompt, negative_prompt=negative_prompt), - parameters=Txt2ImageParametersField( - size=f"{width}*{height}", - seed=seed, - prompt_extend=prompt_extend, - watermark=watermark, + initial_response = await sync_op( + cls, + ApiEndpoint(path="/proxy/wan/api/v1/services/aigc/text2image/image-synthesis", method="POST"), + response_model=TaskCreationResponse, + data=Text2ImageTaskCreationRequest( + model=model, + input=Text2ImageInputField(prompt=prompt, negative_prompt=negative_prompt), + parameters=Txt2ImageParametersField( + size=f"{width}*{height}", + seed=seed, + prompt_extend=prompt_extend, + watermark=watermark, + ), ), ) - response = await process_task( - { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - "/proxy/wan/api/v1/services/aigc/text2image/image-synthesis", - request_model=Text2ImageTaskCreationRequest, + if not initial_response.output: + raise Exception(f"Unknown error occurred: {initial_response.code} - {initial_response.message}") + response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}"), response_model=ImageTaskStatusResponse, - payload=payload, - node_id=cls.hidden.unique_id, + status_extractor=lambda x: x.output.task_status, estimated_duration=9, poll_interval=3, ) - return comfy_io.NodeOutput(await download_url_to_image_tensor(str(response.output.results[0].url))) + return IO.NodeOutput(await download_url_to_image_tensor(str(response.output.results[0].url))) -class WanTextToVideoApi(comfy_io.ComfyNode): +class WanImageToImageApi(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( - node_id="WanTextToVideoApi", - display_name="Wan Text to Video", - category="api node/video/Wan", - description="Generates video based on text prompt.", + return IO.Schema( + node_id="WanImageToImageApi", + display_name="Wan Image to Image", + category="api node/image/Wan", + description="Generates an image from one or two input images and a text prompt. " + "The output image is currently fixed at 1.6 MP; its aspect ratio matches the input image(s).", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", - options=["wan2.5-t2v-preview"], - default="wan2.5-t2v-preview", + options=["wan2.5-i2i-preview"], + default="wan2.5-i2i-preview", tooltip="Model to use.", ), - comfy_io.String.Input( + IO.Image.Input( + "image", + tooltip="Single-image editing or multi-image fusion, maximum 2 images.", + ), + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt used to describe the elements and visual features, supports English/Chinese.", ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", multiline=True, default="", tooltip="Negative text prompt to guide what to avoid.", optional=True, ), - comfy_io.Combo.Input( + # redo this later as an optional combo of recommended resolutions + # IO.Int.Input( + # "width", + # default=1280, + # min=384, + # max=1440, + # step=16, + # optional=True, + # ), + # IO.Int.Input( + # "height", + # default=1280, + # min=384, + # max=1440, + # step=16, + # optional=True, + # ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + step=1, + display_mode=IO.NumberDisplay.number, + control_after_generate=True, + tooltip="Seed to use for generation.", + optional=True, + ), + IO.Boolean.Input( + "watermark", + default=True, + tooltip='Whether to add an "AI generated" watermark to the result.', + optional=True, + ), + ], + outputs=[ + IO.Image.Output(), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + ) + + @classmethod + async def execute( + cls, + model: str, + image: torch.Tensor, + prompt: str, + negative_prompt: str = "", + # width: int = 1024, + # height: int = 1024, + seed: int = 0, + watermark: bool = True, + ): + n_images = get_number_of_images(image) + if n_images not in (1, 2): + raise ValueError(f"Expected 1 or 2 input images, got {n_images}.") + images = [] + for i in image: + images.append("data:image/png;base64," + tensor_to_base64_string(i, total_pixels=4096 * 4096)) + initial_response = await sync_op( + cls, + ApiEndpoint(path="/proxy/wan/api/v1/services/aigc/image2image/image-synthesis", method="POST"), + response_model=TaskCreationResponse, + data=Image2ImageTaskCreationRequest( + model=model, + input=Image2ImageInputField(prompt=prompt, negative_prompt=negative_prompt, images=images), + parameters=Image2ImageParametersField( + # size=f"{width}*{height}", + seed=seed, + watermark=watermark, + ), + ), + ) + if not initial_response.output: + raise Exception(f"Unknown error occurred: {initial_response.code} - {initial_response.message}") + response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}"), + response_model=ImageTaskStatusResponse, + status_extractor=lambda x: x.output.task_status, + estimated_duration=42, + poll_interval=4, + ) + return IO.NodeOutput(await download_url_to_image_tensor(str(response.output.results[0].url))) + + +class WanTextToVideoApi(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="WanTextToVideoApi", + display_name="Wan Text to Video", + category="api node/video/Wan", + description="Generates video based on text prompt.", + inputs=[ + IO.Combo.Input( + "model", + options=["wan2.5-t2v-preview"], + default="wan2.5-t2v-preview", + tooltip="Model to use.", + ), + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Prompt used to describe the elements and visual features, supports English/Chinese.", + ), + IO.String.Input( + "negative_prompt", + multiline=True, + default="", + tooltip="Negative text prompt to guide what to avoid.", + optional=True, + ), + IO.Combo.Input( "size", options=[ "480p: 1:1 (624x624)", @@ -336,58 +436,58 @@ class WanTextToVideoApi(comfy_io.ComfyNode): default="480p: 1:1 (624x624)", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "duration", default=5, min=5, max=10, step=5, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Available durations: 5 and 10 seconds", optional=True, ), - comfy_io.Audio.Input( + IO.Audio.Input( "audio", optional=True, tooltip="Audio must contain a clear, loud voice, without extraneous noise, background music.", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed to use for generation.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "generate_audio", default=False, optional=True, tooltip="If there is no audio input, generate audio automatically.", ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "prompt_extend", default=True, tooltip="Whether to enhance the prompt with AI assistance.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "watermark", default=True, - tooltip="Whether to add an \"AI generated\" watermark to the result.", + tooltip='Whether to add an "AI generated" watermark to the result.', optional=True, ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -411,66 +511,69 @@ class WanTextToVideoApi(comfy_io.ComfyNode): if audio is not None: validate_audio_duration(audio, 3.0, 29.0) audio_url = "data:audio/mp3;base64," + audio_to_base64_string(audio, "mp3", "libmp3lame") - payload = Text2VideoTaskCreationRequest( - model=model, - input=Text2VideoInputField(prompt=prompt, negative_prompt=negative_prompt, audio_url=audio_url), - parameters=Text2VideoParametersField( - size=f"{width}*{height}", - duration=duration, - seed=seed, - audio=generate_audio, - prompt_extend=prompt_extend, - watermark=watermark, + + initial_response = await sync_op( + cls, + ApiEndpoint(path="/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis", method="POST"), + response_model=TaskCreationResponse, + data=Text2VideoTaskCreationRequest( + model=model, + input=Text2VideoInputField(prompt=prompt, negative_prompt=negative_prompt, audio_url=audio_url), + parameters=Text2VideoParametersField( + size=f"{width}*{height}", + duration=duration, + seed=seed, + audio=generate_audio, + prompt_extend=prompt_extend, + watermark=watermark, + ), ), ) - response = await process_task( - { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - "/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis", - request_model=Text2VideoTaskCreationRequest, + if not initial_response.output: + raise Exception(f"Unknown error occurred: {initial_response.code} - {initial_response.message}") + response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}"), response_model=VideoTaskStatusResponse, - payload=payload, - node_id=cls.hidden.unique_id, + status_extractor=lambda x: x.output.task_status, estimated_duration=120 * int(duration / 5), poll_interval=6, ) - return comfy_io.NodeOutput(await download_url_to_video_output(response.output.video_url)) + return IO.NodeOutput(await download_url_to_video_output(response.output.video_url)) -class WanImageToVideoApi(comfy_io.ComfyNode): +class WanImageToVideoApi(IO.ComfyNode): @classmethod def define_schema(cls): - return comfy_io.Schema( + return IO.Schema( node_id="WanImageToVideoApi", display_name="Wan Image to Video", category="api node/video/Wan", description="Generates video based on the first frame and text prompt.", inputs=[ - comfy_io.Combo.Input( + IO.Combo.Input( "model", options=["wan2.5-i2v-preview"], default="wan2.5-i2v-preview", tooltip="Model to use.", ), - comfy_io.Image.Input( + IO.Image.Input( "image", ), - comfy_io.String.Input( + IO.String.Input( "prompt", multiline=True, default="", tooltip="Prompt used to describe the elements and visual features, supports English/Chinese.", ), - comfy_io.String.Input( + IO.String.Input( "negative_prompt", multiline=True, default="", tooltip="Negative text prompt to guide what to avoid.", optional=True, ), - comfy_io.Combo.Input( + IO.Combo.Input( "resolution", options=[ "480P", @@ -480,58 +583,58 @@ class WanImageToVideoApi(comfy_io.ComfyNode): default="480P", optional=True, ), - comfy_io.Int.Input( + IO.Int.Input( "duration", default=5, min=5, max=10, step=5, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, tooltip="Available durations: 5 and 10 seconds", optional=True, ), - comfy_io.Audio.Input( + IO.Audio.Input( "audio", optional=True, tooltip="Audio must contain a clear, loud voice, without extraneous noise, background music.", ), - comfy_io.Int.Input( + IO.Int.Input( "seed", default=0, min=0, max=2147483647, step=1, - display_mode=comfy_io.NumberDisplay.number, + display_mode=IO.NumberDisplay.number, control_after_generate=True, tooltip="Seed to use for generation.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "generate_audio", default=False, optional=True, tooltip="If there is no audio input, generate audio automatically.", ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "prompt_extend", default=True, tooltip="Whether to enhance the prompt with AI assistance.", optional=True, ), - comfy_io.Boolean.Input( + IO.Boolean.Input( "watermark", default=True, - tooltip="Whether to add an \"AI generated\" watermark to the result.", + tooltip='Whether to add an "AI generated" watermark to the result.', optional=True, ), ], outputs=[ - comfy_io.Video.Output(), + IO.Video.Output(), ], hidden=[ - comfy_io.Hidden.auth_token_comfy_org, - comfy_io.Hidden.api_key_comfy_org, - comfy_io.Hidden.unique_id, + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, ], is_api_node=True, ) @@ -553,46 +656,49 @@ class WanImageToVideoApi(comfy_io.ComfyNode): ): if get_number_of_images(image) != 1: raise ValueError("Exactly one input image is required.") - image_url = "data:image/png;base64," + tensor_to_base64_string(image, total_pixels=2000*2000) + image_url = "data:image/png;base64," + tensor_to_base64_string(image, total_pixels=2000 * 2000) audio_url = None if audio is not None: validate_audio_duration(audio, 3.0, 29.0) audio_url = "data:audio/mp3;base64," + audio_to_base64_string(audio, "mp3", "libmp3lame") - payload = Image2VideoTaskCreationRequest( - model=model, - input=Image2VideoInputField( - prompt=prompt, negative_prompt=negative_prompt, img_url=image_url, audio_url=audio_url - ), - parameters=Image2VideoParametersField( - resolution=resolution, - duration=duration, - seed=seed, - audio=generate_audio, - prompt_extend=prompt_extend, - watermark=watermark, + initial_response = await sync_op( + cls, + ApiEndpoint(path="/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis", method="POST"), + response_model=TaskCreationResponse, + data=Image2VideoTaskCreationRequest( + model=model, + input=Image2VideoInputField( + prompt=prompt, negative_prompt=negative_prompt, img_url=image_url, audio_url=audio_url + ), + parameters=Image2VideoParametersField( + resolution=resolution, + duration=duration, + seed=seed, + audio=generate_audio, + prompt_extend=prompt_extend, + watermark=watermark, + ), ), ) - response = await process_task( - { - "auth_token": cls.hidden.auth_token_comfy_org, - "comfy_api_key": cls.hidden.api_key_comfy_org, - }, - "/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis", - request_model=Image2VideoTaskCreationRequest, + if not initial_response.output: + raise Exception(f"Unknown error occurred: {initial_response.code} - {initial_response.message}") + response = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}"), response_model=VideoTaskStatusResponse, - payload=payload, - node_id=cls.hidden.unique_id, + status_extractor=lambda x: x.output.task_status, estimated_duration=120 * int(duration / 5), poll_interval=6, ) - return comfy_io.NodeOutput(await download_url_to_video_output(response.output.video_url)) + return IO.NodeOutput(await download_url_to_video_output(response.output.video_url)) class WanApiExtension(ComfyExtension): @override - async def get_node_list(self) -> list[type[comfy_io.ComfyNode]]: + async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ WanTextToImageApi, + WanImageToImageApi, WanTextToVideoApi, WanImageToVideoApi, ] diff --git a/comfy_api_nodes/util/__init__.py b/comfy_api_nodes/util/__init__.py index e69de29bb..21013b591 100644 --- a/comfy_api_nodes/util/__init__.py +++ b/comfy_api_nodes/util/__init__.py @@ -0,0 +1,97 @@ +from ._helpers import get_fs_object_size +from .client import ( + ApiEndpoint, + poll_op, + poll_op_raw, + sync_op, + sync_op_raw, +) +from .conversions import ( + audio_bytes_to_audio_input, + audio_input_to_mp3, + audio_to_base64_string, + bytesio_to_image_tensor, + downscale_image_tensor, + image_tensor_pair_to_batch, + pil_to_bytesio, + resize_mask_to_image, + tensor_to_base64_string, + tensor_to_bytesio, + tensor_to_pil, + text_filepath_to_base64_string, + text_filepath_to_data_uri, + trim_video, + video_to_base64_string, +) +from .download_helpers import ( + download_url_as_bytesio, + download_url_to_bytesio, + download_url_to_image_tensor, + download_url_to_video_output, +) +from .upload_helpers import ( + upload_audio_to_comfyapi, + upload_file_to_comfyapi, + upload_images_to_comfyapi, + upload_video_to_comfyapi, +) +from .validation_utils import ( + get_number_of_images, + validate_aspect_ratio_string, + validate_audio_duration, + validate_container_format_is_mp4, + validate_image_aspect_ratio, + validate_image_dimensions, + validate_images_aspect_ratio_closeness, + validate_string, + validate_video_dimensions, + validate_video_duration, +) + +__all__ = [ + # API client + "ApiEndpoint", + "poll_op", + "poll_op_raw", + "sync_op", + "sync_op_raw", + # Upload helpers + "upload_audio_to_comfyapi", + "upload_file_to_comfyapi", + "upload_images_to_comfyapi", + "upload_video_to_comfyapi", + # Download helpers + "download_url_as_bytesio", + "download_url_to_bytesio", + "download_url_to_image_tensor", + "download_url_to_video_output", + # Conversions + "audio_bytes_to_audio_input", + "audio_input_to_mp3", + "audio_to_base64_string", + "bytesio_to_image_tensor", + "downscale_image_tensor", + "image_tensor_pair_to_batch", + "pil_to_bytesio", + "resize_mask_to_image", + "tensor_to_base64_string", + "tensor_to_bytesio", + "tensor_to_pil", + "text_filepath_to_base64_string", + "text_filepath_to_data_uri", + "trim_video", + "video_to_base64_string", + # Validation utilities + "get_number_of_images", + "validate_aspect_ratio_string", + "validate_audio_duration", + "validate_container_format_is_mp4", + "validate_image_aspect_ratio", + "validate_image_dimensions", + "validate_images_aspect_ratio_closeness", + "validate_string", + "validate_video_dimensions", + "validate_video_duration", + # Misc functions + "get_fs_object_size", +] diff --git a/comfy_api_nodes/util/_helpers.py b/comfy_api_nodes/util/_helpers.py new file mode 100644 index 000000000..328fe5227 --- /dev/null +++ b/comfy_api_nodes/util/_helpers.py @@ -0,0 +1,71 @@ +import asyncio +import contextlib +import os +import time +from io import BytesIO +from typing import Callable, Optional, Union + +from comfy.cli_args import args +from comfy.model_management import processing_interrupted +from comfy_api.latest import IO + +from .common_exceptions import ProcessingInterrupted + + +def is_processing_interrupted() -> bool: + """Return True if user/runtime requested interruption.""" + return processing_interrupted() + + +def get_node_id(node_cls: type[IO.ComfyNode]) -> str: + return node_cls.hidden.unique_id + + +def get_auth_header(node_cls: type[IO.ComfyNode]) -> dict[str, str]: + if node_cls.hidden.auth_token_comfy_org: + return {"Authorization": f"Bearer {node_cls.hidden.auth_token_comfy_org}"} + if node_cls.hidden.api_key_comfy_org: + return {"X-API-KEY": node_cls.hidden.api_key_comfy_org} + return {} + + +def default_base_url() -> str: + return getattr(args, "comfy_api_base", "https://api.comfy.org") + + +async def sleep_with_interrupt( + seconds: float, + node_cls: Optional[type[IO.ComfyNode]], + label: Optional[str] = None, + start_ts: Optional[float] = None, + estimated_total: Optional[int] = None, + *, + display_callback: Optional[Callable[[type[IO.ComfyNode], str, int, Optional[int]], None]] = None, +): + """ + Sleep in 1s slices while: + - Checking for interruption (raises ProcessingInterrupted). + - Optionally emitting time progress via display_callback (if provided). + """ + end = time.monotonic() + seconds + while True: + if is_processing_interrupted(): + raise ProcessingInterrupted("Task cancelled") + now = time.monotonic() + if start_ts is not None and label and display_callback: + with contextlib.suppress(Exception): + display_callback(node_cls, label, int(now - start_ts), estimated_total) + if now >= end: + break + await asyncio.sleep(min(1.0, end - now)) + + +def mimetype_to_extension(mime_type: str) -> str: + """Converts a MIME type to a file extension.""" + return mime_type.split("/")[-1].lower() + + +def get_fs_object_size(path_or_object: Union[str, BytesIO]) -> int: + if isinstance(path_or_object, str): + return os.path.getsize(path_or_object) + return len(path_or_object.getvalue()) diff --git a/comfy_api_nodes/util/client.py b/comfy_api_nodes/util/client.py new file mode 100644 index 000000000..2d5dcd648 --- /dev/null +++ b/comfy_api_nodes/util/client.py @@ -0,0 +1,936 @@ +import asyncio +import contextlib +import json +import logging +import time +import uuid +from dataclasses import dataclass +from enum import Enum +from io import BytesIO +from typing import Any, Callable, Iterable, Literal, Optional, Type, TypeVar, Union +from urllib.parse import urljoin, urlparse + +import aiohttp +from aiohttp.client_exceptions import ClientError, ContentTypeError +from pydantic import BaseModel + +from comfy import utils +from comfy_api.latest import IO +from server import PromptServer + +from . import request_logger +from ._helpers import ( + default_base_url, + get_auth_header, + get_node_id, + is_processing_interrupted, + sleep_with_interrupt, +) +from .common_exceptions import ApiServerError, LocalNetworkError, ProcessingInterrupted + +M = TypeVar("M", bound=BaseModel) + + +class ApiEndpoint: + def __init__( + self, + path: str, + method: Literal["GET", "POST", "PUT", "DELETE", "PATCH"] = "GET", + *, + query_params: Optional[dict[str, Any]] = None, + headers: Optional[dict[str, str]] = None, + ): + self.path = path + self.method = method + self.query_params = query_params or {} + self.headers = headers or {} + + +@dataclass +class _RequestConfig: + node_cls: type[IO.ComfyNode] + endpoint: ApiEndpoint + timeout: float + content_type: str + data: Optional[dict[str, Any]] + files: Optional[Union[dict[str, Any], list[tuple[str, Any]]]] + multipart_parser: Optional[Callable] + max_retries: int + retry_delay: float + retry_backoff: float + wait_label: str = "Waiting" + monitor_progress: bool = True + estimated_total: Optional[int] = None + final_label_on_success: Optional[str] = "Completed" + progress_origin_ts: Optional[float] = None + + +@dataclass +class _PollUIState: + started: float + status_label: str = "Queued" + is_queued: bool = True + price: Optional[float] = None + estimated_duration: Optional[int] = None + base_processing_elapsed: float = 0.0 # sum of completed active intervals + active_since: Optional[float] = None # start time of current active interval (None if queued) + + +_RETRY_STATUS = {408, 429, 500, 502, 503, 504} +COMPLETED_STATUSES = ["succeeded", "succeed", "success", "completed", "finished", "done"] +FAILED_STATUSES = ["cancelled", "canceled", "fail", "failed", "error"] +QUEUED_STATUSES = ["created", "queued", "queueing", "submitted"] + + +async def sync_op( + cls: type[IO.ComfyNode], + endpoint: ApiEndpoint, + *, + response_model: Type[M], + data: Optional[BaseModel] = None, + files: Optional[Union[dict[str, Any], list[tuple[str, Any]]]] = None, + content_type: str = "application/json", + timeout: float = 3600.0, + multipart_parser: Optional[Callable] = None, + max_retries: int = 3, + retry_delay: float = 1.0, + retry_backoff: float = 2.0, + wait_label: str = "Waiting for server", + estimated_duration: Optional[int] = None, + final_label_on_success: Optional[str] = "Completed", + progress_origin_ts: Optional[float] = None, + monitor_progress: bool = True, +) -> M: + raw = await sync_op_raw( + cls, + endpoint, + data=data, + files=files, + content_type=content_type, + timeout=timeout, + multipart_parser=multipart_parser, + max_retries=max_retries, + retry_delay=retry_delay, + retry_backoff=retry_backoff, + wait_label=wait_label, + estimated_duration=estimated_duration, + as_binary=False, + final_label_on_success=final_label_on_success, + progress_origin_ts=progress_origin_ts, + monitor_progress=monitor_progress, + ) + if not isinstance(raw, dict): + raise Exception("Expected JSON response to validate into a Pydantic model, got non-JSON (binary or text).") + return _validate_or_raise(response_model, raw) + + +async def poll_op( + cls: type[IO.ComfyNode], + poll_endpoint: ApiEndpoint, + *, + response_model: Type[M], + status_extractor: Callable[[M], Optional[Union[str, int]]], + progress_extractor: Optional[Callable[[M], Optional[int]]] = None, + price_extractor: Optional[Callable[[M], Optional[float]]] = None, + completed_statuses: Optional[list[Union[str, int]]] = None, + failed_statuses: Optional[list[Union[str, int]]] = None, + queued_statuses: Optional[list[Union[str, int]]] = None, + data: Optional[BaseModel] = None, + poll_interval: float = 5.0, + max_poll_attempts: int = 120, + timeout_per_poll: float = 120.0, + max_retries_per_poll: int = 3, + retry_delay_per_poll: float = 1.0, + retry_backoff_per_poll: float = 2.0, + estimated_duration: Optional[int] = None, + cancel_endpoint: Optional[ApiEndpoint] = None, + cancel_timeout: float = 10.0, +) -> M: + raw = await poll_op_raw( + cls, + poll_endpoint=poll_endpoint, + status_extractor=_wrap_model_extractor(response_model, status_extractor), + progress_extractor=_wrap_model_extractor(response_model, progress_extractor), + price_extractor=_wrap_model_extractor(response_model, price_extractor), + completed_statuses=completed_statuses, + failed_statuses=failed_statuses, + queued_statuses=queued_statuses, + data=data, + poll_interval=poll_interval, + max_poll_attempts=max_poll_attempts, + timeout_per_poll=timeout_per_poll, + max_retries_per_poll=max_retries_per_poll, + retry_delay_per_poll=retry_delay_per_poll, + retry_backoff_per_poll=retry_backoff_per_poll, + estimated_duration=estimated_duration, + cancel_endpoint=cancel_endpoint, + cancel_timeout=cancel_timeout, + ) + if not isinstance(raw, dict): + raise Exception("Expected JSON response to validate into a Pydantic model, got non-JSON (binary or text).") + return _validate_or_raise(response_model, raw) + + +async def sync_op_raw( + cls: type[IO.ComfyNode], + endpoint: ApiEndpoint, + *, + data: Optional[Union[dict[str, Any], BaseModel]] = None, + files: Optional[Union[dict[str, Any], list[tuple[str, Any]]]] = None, + content_type: str = "application/json", + timeout: float = 3600.0, + multipart_parser: Optional[Callable] = None, + max_retries: int = 3, + retry_delay: float = 1.0, + retry_backoff: float = 2.0, + wait_label: str = "Waiting for server", + estimated_duration: Optional[int] = None, + as_binary: bool = False, + final_label_on_success: Optional[str] = "Completed", + progress_origin_ts: Optional[float] = None, + monitor_progress: bool = True, +) -> Union[dict[str, Any], bytes]: + """ + Make a single network request. + - If as_binary=False (default): returns JSON dict (or {'_raw': ''} if non-JSON). + - If as_binary=True: returns bytes. + """ + if isinstance(data, BaseModel): + data = data.model_dump(exclude_none=True) + for k, v in list(data.items()): + if isinstance(v, Enum): + data[k] = v.value + cfg = _RequestConfig( + node_cls=cls, + endpoint=endpoint, + timeout=timeout, + content_type=content_type, + data=data, + files=files, + multipart_parser=multipart_parser, + max_retries=max_retries, + retry_delay=retry_delay, + retry_backoff=retry_backoff, + wait_label=wait_label, + monitor_progress=monitor_progress, + estimated_total=estimated_duration, + final_label_on_success=final_label_on_success, + progress_origin_ts=progress_origin_ts, + ) + return await _request_base(cfg, expect_binary=as_binary) + + +async def poll_op_raw( + cls: type[IO.ComfyNode], + poll_endpoint: ApiEndpoint, + *, + status_extractor: Callable[[dict[str, Any]], Optional[Union[str, int]]], + progress_extractor: Optional[Callable[[dict[str, Any]], Optional[int]]] = None, + price_extractor: Optional[Callable[[dict[str, Any]], Optional[float]]] = None, + completed_statuses: Optional[list[Union[str, int]]] = None, + failed_statuses: Optional[list[Union[str, int]]] = None, + queued_statuses: Optional[list[Union[str, int]]] = None, + data: Optional[Union[dict[str, Any], BaseModel]] = None, + poll_interval: float = 5.0, + max_poll_attempts: int = 120, + timeout_per_poll: float = 120.0, + max_retries_per_poll: int = 3, + retry_delay_per_poll: float = 1.0, + retry_backoff_per_poll: float = 2.0, + estimated_duration: Optional[int] = None, + cancel_endpoint: Optional[ApiEndpoint] = None, + cancel_timeout: float = 10.0, +) -> dict[str, Any]: + """ + Polls an endpoint until the task reaches a terminal state. Displays time while queued/processing, + checks interruption every second, and calls Cancel endpoint (if provided) on interruption. + + Uses default complete, failed and queued states assumption. + + Returns the final JSON response from the poll endpoint. + """ + completed_states = _normalize_statuses(COMPLETED_STATUSES if completed_statuses is None else completed_statuses) + failed_states = _normalize_statuses(FAILED_STATUSES if failed_statuses is None else failed_statuses) + queued_states = _normalize_statuses(QUEUED_STATUSES if queued_statuses is None else queued_statuses) + started = time.monotonic() + consumed_attempts = 0 # counts only non-queued polls + + progress_bar = utils.ProgressBar(100) if progress_extractor else None + last_progress: Optional[int] = None + + state = _PollUIState(started=started, estimated_duration=estimated_duration) + stop_ticker = asyncio.Event() + + async def _ticker(): + """Emit a UI update every second while polling is in progress.""" + try: + while not stop_ticker.is_set(): + if is_processing_interrupted(): + break + now = time.monotonic() + proc_elapsed = state.base_processing_elapsed + ( + (now - state.active_since) if state.active_since is not None else 0.0 + ) + _display_time_progress( + cls, + status=state.status_label, + elapsed_seconds=int(now - state.started), + estimated_total=state.estimated_duration, + price=state.price, + is_queued=state.is_queued, + processing_elapsed_seconds=int(proc_elapsed), + ) + await asyncio.sleep(1.0) + except Exception as exc: + logging.debug("Polling ticker exited: %s", exc) + + ticker_task = asyncio.create_task(_ticker()) + try: + while consumed_attempts < max_poll_attempts: + try: + resp_json = await sync_op_raw( + cls, + poll_endpoint, + data=data, + timeout=timeout_per_poll, + max_retries=max_retries_per_poll, + retry_delay=retry_delay_per_poll, + retry_backoff=retry_backoff_per_poll, + wait_label="Checking", + estimated_duration=None, + as_binary=False, + final_label_on_success=None, + monitor_progress=False, + ) + if not isinstance(resp_json, dict): + raise Exception("Polling endpoint returned non-JSON response.") + except ProcessingInterrupted: + if cancel_endpoint: + with contextlib.suppress(Exception): + await sync_op_raw( + cls, + cancel_endpoint, + timeout=cancel_timeout, + max_retries=0, + wait_label="Cancelling task", + estimated_duration=None, + as_binary=False, + final_label_on_success=None, + monitor_progress=False, + ) + raise + + try: + status = _normalize_status_value(status_extractor(resp_json)) + except Exception as e: + logging.error("Status extraction failed: %s", e) + status = None + + if price_extractor: + new_price = price_extractor(resp_json) + if new_price is not None: + state.price = new_price + + if progress_extractor: + new_progress = progress_extractor(resp_json) + if new_progress is not None and last_progress != new_progress: + progress_bar.update_absolute(new_progress, total=100) + last_progress = new_progress + + now_ts = time.monotonic() + is_queued = status in queued_states + + if is_queued: + if state.active_since is not None: # If we just moved from active -> queued, close the active interval + state.base_processing_elapsed += now_ts - state.active_since + state.active_since = None + else: + if state.active_since is None: # If we just moved from queued -> active, open a new active interval + state.active_since = now_ts + + state.is_queued = is_queued + state.status_label = status or ("Queued" if is_queued else "Processing") + if status in completed_states: + if state.active_since is not None: + state.base_processing_elapsed += now_ts - state.active_since + state.active_since = None + stop_ticker.set() + with contextlib.suppress(Exception): + await ticker_task + + if progress_bar and last_progress != 100: + progress_bar.update_absolute(100, total=100) + + _display_time_progress( + cls, + status=status if status else "Completed", + elapsed_seconds=int(now_ts - started), + estimated_total=estimated_duration, + price=state.price, + is_queued=False, + processing_elapsed_seconds=int(state.base_processing_elapsed), + ) + return resp_json + + if status in failed_states: + msg = f"Task failed: {json.dumps(resp_json)}" + logging.error(msg) + raise Exception(msg) + + try: + await sleep_with_interrupt(poll_interval, cls, None, None, None) + except ProcessingInterrupted: + if cancel_endpoint: + with contextlib.suppress(Exception): + await sync_op_raw( + cls, + cancel_endpoint, + timeout=cancel_timeout, + max_retries=0, + wait_label="Cancelling task", + estimated_duration=None, + as_binary=False, + final_label_on_success=None, + monitor_progress=False, + ) + raise + if not is_queued: + consumed_attempts += 1 + + raise Exception( + f"Polling timed out after {max_poll_attempts} non-queued attempts " + f"(~{int(max_poll_attempts * poll_interval)}s of active polling)." + ) + except ProcessingInterrupted: + raise + except (LocalNetworkError, ApiServerError): + raise + except Exception as e: + raise Exception(f"Polling aborted due to error: {e}") from e + finally: + stop_ticker.set() + with contextlib.suppress(Exception): + await ticker_task + + +def _display_text( + node_cls: type[IO.ComfyNode], + text: Optional[str], + *, + status: Optional[Union[str, int]] = None, + price: Optional[float] = None, +) -> None: + display_lines: list[str] = [] + if status: + display_lines.append(f"Status: {status.capitalize() if isinstance(status, str) else status}") + if price is not None: + display_lines.append(f"Price: ${float(price):,.4f}") + if text is not None: + display_lines.append(text) + if display_lines: + PromptServer.instance.send_progress_text("\n".join(display_lines), get_node_id(node_cls)) + + +def _display_time_progress( + node_cls: type[IO.ComfyNode], + status: Optional[Union[str, int]], + elapsed_seconds: int, + estimated_total: Optional[int] = None, + *, + price: Optional[float] = None, + is_queued: Optional[bool] = None, + processing_elapsed_seconds: Optional[int] = None, +) -> None: + if estimated_total is not None and estimated_total > 0 and is_queued is False: + pe = processing_elapsed_seconds if processing_elapsed_seconds is not None else elapsed_seconds + remaining = max(0, int(estimated_total) - int(pe)) + time_line = f"Time elapsed: {int(elapsed_seconds)}s (~{remaining}s remaining)" + else: + time_line = f"Time elapsed: {int(elapsed_seconds)}s" + _display_text(node_cls, time_line, status=status, price=price) + + +async def _diagnose_connectivity() -> dict[str, bool]: + """Best-effort connectivity diagnostics to distinguish local vs. server issues.""" + results = { + "internet_accessible": False, + "api_accessible": False, + } + timeout = aiohttp.ClientTimeout(total=5.0) + async with aiohttp.ClientSession(timeout=timeout) as session: + with contextlib.suppress(ClientError, OSError): + async with session.get("https://www.google.com") as resp: + results["internet_accessible"] = resp.status < 500 + if not results["internet_accessible"]: + return results + + parsed = urlparse(default_base_url()) + health_url = f"{parsed.scheme}://{parsed.netloc}/health" + with contextlib.suppress(ClientError, OSError): + async with session.get(health_url) as resp: + results["api_accessible"] = resp.status < 500 + return results + + +def _unpack_tuple(t: tuple) -> tuple[str, Any, str]: + """Normalize (filename, value, content_type).""" + if len(t) == 2: + return t[0], t[1], "application/octet-stream" + if len(t) == 3: + return t[0], t[1], t[2] + raise ValueError("files tuple must be (filename, file[, content_type])") + + +def _merge_params(endpoint_params: dict[str, Any], method: str, data: Optional[dict[str, Any]]) -> dict[str, Any]: + params = dict(endpoint_params or {}) + if method.upper() == "GET" and data: + for k, v in data.items(): + if v is not None: + params[k] = v + return params + + +def _friendly_http_message(status: int, body: Any) -> str: + if status == 401: + return "Unauthorized: Please login first to use this node." + if status == 402: + return "Payment Required: Please add credits to your account to use this node." + if status == 409: + return "There is a problem with your account. Please contact support@comfy.org." + if status == 429: + return "Rate Limit Exceeded: Please try again later." + try: + if isinstance(body, dict): + err = body.get("error") + if isinstance(err, dict): + msg = err.get("message") + typ = err.get("type") + if msg and typ: + return f"API Error: {msg} (Type: {typ})" + if msg: + return f"API Error: {msg}" + return f"API Error: {json.dumps(body)}" + else: + txt = str(body) + if len(txt) <= 200: + return f"API Error (raw): {txt}" + return f"API Error (status {status})" + except Exception: + return f"HTTP {status}: Unknown error" + + +def _generate_operation_id(method: str, path: str, attempt: int) -> str: + slug = path.strip("/").replace("/", "_") or "op" + return f"{method}_{slug}_try{attempt}_{uuid.uuid4().hex[:8]}" + + +def _snapshot_request_body_for_logging( + content_type: str, + method: str, + data: Optional[dict[str, Any]], + files: Optional[Union[dict[str, Any], list[tuple[str, Any]]]], +) -> Optional[Union[dict[str, Any], str]]: + if method.upper() == "GET": + return None + if content_type == "multipart/form-data": + form_fields = sorted([k for k, v in (data or {}).items() if v is not None]) + file_fields: list[dict[str, str]] = [] + if files: + file_iter = files if isinstance(files, list) else list(files.items()) + for field_name, file_obj in file_iter: + if file_obj is None: + continue + if isinstance(file_obj, tuple): + filename = file_obj[0] + else: + filename = getattr(file_obj, "name", field_name) + file_fields.append({"field": field_name, "filename": str(filename or "")}) + return {"_multipart": True, "form_fields": form_fields, "file_fields": file_fields} + if content_type == "application/x-www-form-urlencoded": + return data or {} + return data or {} + + +async def _request_base(cfg: _RequestConfig, expect_binary: bool): + """Core request with retries, per-second interruption monitoring, true cancellation, and friendly errors.""" + url = cfg.endpoint.path + parsed_url = urlparse(url) + if not parsed_url.scheme and not parsed_url.netloc: # is URL relative? + url = urljoin(default_base_url().rstrip("/") + "/", url.lstrip("/")) + + method = cfg.endpoint.method + params = _merge_params(cfg.endpoint.query_params, method, cfg.data if method == "GET" else None) + + async def _monitor(stop_evt: asyncio.Event, start_ts: float): + """Every second: update elapsed time and signal interruption.""" + try: + while not stop_evt.is_set(): + if is_processing_interrupted(): + return + if cfg.monitor_progress: + _display_time_progress( + cfg.node_cls, cfg.wait_label, int(time.monotonic() - start_ts), cfg.estimated_total + ) + await asyncio.sleep(1.0) + except asyncio.CancelledError: + return # normal shutdown + + start_time = cfg.progress_origin_ts if cfg.progress_origin_ts is not None else time.monotonic() + attempt = 0 + delay = cfg.retry_delay + operation_succeeded: bool = False + final_elapsed_seconds: Optional[int] = None + while True: + attempt += 1 + stop_event = asyncio.Event() + monitor_task: Optional[asyncio.Task] = None + sess: Optional[aiohttp.ClientSession] = None + + operation_id = _generate_operation_id(method, cfg.endpoint.path, attempt) + logging.debug("[DEBUG] HTTP %s %s (attempt %d)", method, url, attempt) + + payload_headers = {"Accept": "*/*"} if expect_binary else {"Accept": "application/json"} + if not parsed_url.scheme and not parsed_url.netloc: # is URL relative? + payload_headers.update(get_auth_header(cfg.node_cls)) + if cfg.endpoint.headers: + payload_headers.update(cfg.endpoint.headers) + + payload_kw: dict[str, Any] = {"headers": payload_headers} + if method == "GET": + payload_headers.pop("Content-Type", None) + request_body_log = _snapshot_request_body_for_logging(cfg.content_type, method, cfg.data, cfg.files) + try: + if cfg.monitor_progress: + monitor_task = asyncio.create_task(_monitor(stop_event, start_time)) + + timeout = aiohttp.ClientTimeout(total=cfg.timeout) + sess = aiohttp.ClientSession(timeout=timeout) + + if cfg.content_type == "multipart/form-data" and method != "GET": + # aiohttp will set Content-Type boundary; remove any fixed Content-Type + payload_headers.pop("Content-Type", None) + if cfg.multipart_parser and cfg.data: + form = cfg.multipart_parser(cfg.data) + if not isinstance(form, aiohttp.FormData): + raise ValueError("multipart_parser must return aiohttp.FormData") + else: + form = aiohttp.FormData(default_to_multipart=True) + if cfg.data: + for k, v in cfg.data.items(): + if v is None: + continue + form.add_field(k, str(v) if not isinstance(v, (bytes, bytearray)) else v) + if cfg.files: + file_iter = cfg.files if isinstance(cfg.files, list) else cfg.files.items() + for field_name, file_obj in file_iter: + if file_obj is None: + continue + if isinstance(file_obj, tuple): + filename, file_value, content_type = _unpack_tuple(file_obj) + else: + filename = getattr(file_obj, "name", field_name) + file_value = file_obj + content_type = "application/octet-stream" + # Attempt to rewind BytesIO for retries + if isinstance(file_value, BytesIO): + with contextlib.suppress(Exception): + file_value.seek(0) + form.add_field(field_name, file_value, filename=filename, content_type=content_type) + payload_kw["data"] = form + elif cfg.content_type == "application/x-www-form-urlencoded" and method != "GET": + payload_headers["Content-Type"] = "application/x-www-form-urlencoded" + payload_kw["data"] = cfg.data or {} + elif method != "GET": + payload_headers["Content-Type"] = "application/json" + payload_kw["json"] = cfg.data or {} + + try: + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + request_headers=dict(payload_headers) if payload_headers else None, + request_params=dict(params) if params else None, + request_data=request_body_log, + ) + except Exception as _log_e: + logging.debug("[DEBUG] request logging failed: %s", _log_e) + + req_coro = sess.request(method, url, params=params, **payload_kw) + req_task = asyncio.create_task(req_coro) + + # Race: request vs. monitor (interruption) + tasks = {req_task} + if monitor_task: + tasks.add(monitor_task) + done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED) + + if monitor_task and monitor_task in done: + # Interrupted – cancel the request and abort + if req_task in pending: + req_task.cancel() + raise ProcessingInterrupted("Task cancelled") + + # Otherwise, request finished + resp = await req_task + async with resp: + if resp.status >= 400: + try: + body = await resp.json() + except (ContentTypeError, json.JSONDecodeError): + body = await resp.text() + if resp.status in _RETRY_STATUS and attempt <= cfg.max_retries: + logging.warning( + "HTTP %s %s -> %s. Retrying in %.2fs (retry %d of %d).", + method, + url, + resp.status, + delay, + attempt, + cfg.max_retries, + ) + try: + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content=body, + error_message=_friendly_http_message(resp.status, body), + ) + except Exception as _log_e: + logging.debug("[DEBUG] response logging failed: %s", _log_e) + + await sleep_with_interrupt( + delay, + cfg.node_cls, + cfg.wait_label if cfg.monitor_progress else None, + start_time if cfg.monitor_progress else None, + cfg.estimated_total, + display_callback=_display_time_progress if cfg.monitor_progress else None, + ) + delay *= cfg.retry_backoff + continue + msg = _friendly_http_message(resp.status, body) + try: + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content=body, + error_message=msg, + ) + except Exception as _log_e: + logging.debug("[DEBUG] response logging failed: %s", _log_e) + raise Exception(msg) + + if expect_binary: + buff = bytearray() + last_tick = time.monotonic() + async for chunk in resp.content.iter_chunked(64 * 1024): + buff.extend(chunk) + now = time.monotonic() + if now - last_tick >= 1.0: + last_tick = now + if is_processing_interrupted(): + raise ProcessingInterrupted("Task cancelled") + if cfg.monitor_progress: + _display_time_progress( + cfg.node_cls, cfg.wait_label, int(now - start_time), cfg.estimated_total + ) + bytes_payload = bytes(buff) + operation_succeeded = True + final_elapsed_seconds = int(time.monotonic() - start_time) + try: + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content=bytes_payload, + ) + except Exception as _log_e: + logging.debug("[DEBUG] response logging failed: %s", _log_e) + return bytes_payload + else: + try: + payload = await resp.json() + response_content_to_log: Any = payload + except (ContentTypeError, json.JSONDecodeError): + text = await resp.text() + try: + payload = json.loads(text) if text else {} + except json.JSONDecodeError: + payload = {"_raw": text} + response_content_to_log = payload if isinstance(payload, dict) else text + operation_succeeded = True + final_elapsed_seconds = int(time.monotonic() - start_time) + try: + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content=response_content_to_log, + ) + except Exception as _log_e: + logging.debug("[DEBUG] response logging failed: %s", _log_e) + return payload + + except ProcessingInterrupted: + logging.debug("Polling was interrupted by user") + raise + except (ClientError, OSError) as e: + if attempt <= cfg.max_retries: + logging.warning( + "Connection error calling %s %s. Retrying in %.2fs (%d/%d): %s", + method, + url, + delay, + attempt, + cfg.max_retries, + str(e), + ) + try: + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + request_headers=dict(payload_headers) if payload_headers else None, + request_params=dict(params) if params else None, + request_data=request_body_log, + error_message=f"{type(e).__name__}: {str(e)} (will retry)", + ) + except Exception as _log_e: + logging.debug("[DEBUG] request error logging failed: %s", _log_e) + await sleep_with_interrupt( + delay, + cfg.node_cls, + cfg.wait_label if cfg.monitor_progress else None, + start_time if cfg.monitor_progress else None, + cfg.estimated_total, + display_callback=_display_time_progress if cfg.monitor_progress else None, + ) + delay *= cfg.retry_backoff + continue + diag = await _diagnose_connectivity() + if not diag["internet_accessible"]: + try: + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + request_headers=dict(payload_headers) if payload_headers else None, + request_params=dict(params) if params else None, + request_data=request_body_log, + error_message=f"LocalNetworkError: {str(e)}", + ) + except Exception as _log_e: + logging.debug("[DEBUG] final error logging failed: %s", _log_e) + raise LocalNetworkError( + "Unable to connect to the API server due to local network issues. " + "Please check your internet connection and try again." + ) from e + try: + request_logger.log_request_response( + operation_id=operation_id, + request_method=method, + request_url=url, + request_headers=dict(payload_headers) if payload_headers else None, + request_params=dict(params) if params else None, + request_data=request_body_log, + error_message=f"ApiServerError: {str(e)}", + ) + except Exception as _log_e: + logging.debug("[DEBUG] final error logging failed: %s", _log_e) + raise ApiServerError( + f"The API server at {default_base_url()} is currently unreachable. " + f"The service may be experiencing issues." + ) from e + finally: + stop_event.set() + if monitor_task: + monitor_task.cancel() + with contextlib.suppress(Exception): + await monitor_task + if sess: + with contextlib.suppress(Exception): + await sess.close() + if operation_succeeded and cfg.monitor_progress and cfg.final_label_on_success: + _display_time_progress( + cfg.node_cls, + status=cfg.final_label_on_success, + elapsed_seconds=( + final_elapsed_seconds + if final_elapsed_seconds is not None + else int(time.monotonic() - start_time) + ), + estimated_total=cfg.estimated_total, + price=None, + is_queued=False, + processing_elapsed_seconds=final_elapsed_seconds, + ) + + +def _validate_or_raise(response_model: Type[M], payload: Any) -> M: + try: + return response_model.model_validate(payload) + except Exception as e: + logging.error( + "Response validation failed for %s: %s", + getattr(response_model, "__name__", response_model), + e, + ) + raise Exception( + f"Response validation failed for {getattr(response_model, '__name__', response_model)}: {e}" + ) from e + + +def _wrap_model_extractor( + response_model: Type[M], + extractor: Optional[Callable[[M], Any]], +) -> Optional[Callable[[dict[str, Any]], Any]]: + """Wrap a typed extractor so it can be used by the dict-based poller. + Validates the dict into `response_model` before invoking `extractor`. + Uses a small per-wrapper cache keyed by `id(dict)` to avoid re-validating + the same response for multiple extractors in a single poll attempt. + """ + if extractor is None: + return None + _cache: dict[int, M] = {} + + def _wrapped(d: dict[str, Any]) -> Any: + try: + key = id(d) + model = _cache.get(key) + if model is None: + model = response_model.model_validate(d) + _cache[key] = model + return extractor(model) + except Exception as e: + logging.error("Extractor failed (typed -> dict wrapper): %s", e) + raise + + return _wrapped + + +def _normalize_statuses(values: Optional[Iterable[Union[str, int]]]) -> set[Union[str, int]]: + if not values: + return set() + out: set[Union[str, int]] = set() + for v in values: + nv = _normalize_status_value(v) + if nv is not None: + out.add(nv) + return out + + +def _normalize_status_value(val: Union[str, int, None]) -> Union[str, int, None]: + if isinstance(val, str): + return val.strip().lower() + return val diff --git a/comfy_api_nodes/util/common_exceptions.py b/comfy_api_nodes/util/common_exceptions.py new file mode 100644 index 000000000..0606a4407 --- /dev/null +++ b/comfy_api_nodes/util/common_exceptions.py @@ -0,0 +1,14 @@ +class NetworkError(Exception): + """Base exception for network-related errors with diagnostic information.""" + + +class LocalNetworkError(NetworkError): + """Exception raised when local network connectivity issues are detected.""" + + +class ApiServerError(NetworkError): + """Exception raised when the API server is unreachable but internet is working.""" + + +class ProcessingInterrupted(Exception): + """Operation was interrupted by user/runtime via processing_interrupted().""" diff --git a/comfy_api_nodes/util/conversions.py b/comfy_api_nodes/util/conversions.py new file mode 100644 index 000000000..971dc57de --- /dev/null +++ b/comfy_api_nodes/util/conversions.py @@ -0,0 +1,470 @@ +import base64 +import logging +import math +import mimetypes +import uuid +from io import BytesIO +from typing import Optional + +import av +import numpy as np +import torch +from PIL import Image + +from comfy.utils import common_upscale +from comfy_api.latest import Input, InputImpl +from comfy_api.util import VideoCodec, VideoContainer + +from ._helpers import mimetype_to_extension + + +def bytesio_to_image_tensor(image_bytesio: BytesIO, mode: str = "RGBA") -> torch.Tensor: + """Converts image data from BytesIO to a torch.Tensor. + + Args: + image_bytesio: BytesIO object containing the image data. + mode: The PIL mode to convert the image to (e.g., "RGB", "RGBA"). + + Returns: + A torch.Tensor representing the image (1, H, W, C). + + Raises: + PIL.UnidentifiedImageError: If the image data cannot be identified. + ValueError: If the specified mode is invalid. + """ + image = Image.open(image_bytesio) + image = image.convert(mode) + image_array = np.array(image).astype(np.float32) / 255.0 + return torch.from_numpy(image_array).unsqueeze(0) + + +def image_tensor_pair_to_batch(image1: torch.Tensor, image2: torch.Tensor) -> torch.Tensor: + """ + Converts a pair of image tensors to a batch tensor. + If the images are not the same size, the smaller image is resized to + match the larger image. + """ + if image1.shape[1:] != image2.shape[1:]: + image2 = common_upscale( + image2.movedim(-1, 1), + image1.shape[2], + image1.shape[1], + "bilinear", + "center", + ).movedim(1, -1) + return torch.cat((image1, image2), dim=0) + + +def tensor_to_bytesio( + image: torch.Tensor, + name: Optional[str] = None, + total_pixels: int = 2048 * 2048, + mime_type: str = "image/png", +) -> BytesIO: + """Converts a torch.Tensor image to a named BytesIO object. + + Args: + image: Input torch.Tensor image. + name: Optional filename for the BytesIO object. + total_pixels: Maximum total pixels for potential downscaling. + mime_type: Target image MIME type (e.g., 'image/png', 'image/jpeg', 'image/webp', 'video/mp4'). + + Returns: + Named BytesIO object containing the image data, with pointer set to the start of buffer. + """ + if not mime_type: + mime_type = "image/png" + + pil_image = tensor_to_pil(image, total_pixels=total_pixels) + img_binary = pil_to_bytesio(pil_image, mime_type=mime_type) + img_binary.name = f"{name if name else uuid.uuid4()}.{mimetype_to_extension(mime_type)}" + return img_binary + + +def tensor_to_pil(image: torch.Tensor, total_pixels: int = 2048 * 2048) -> Image.Image: + """Converts a single torch.Tensor image [H, W, C] to a PIL Image, optionally downscaling.""" + if len(image.shape) > 3: + image = image[0] + # TODO: remove alpha if not allowed and present + input_tensor = image.cpu() + input_tensor = downscale_image_tensor(input_tensor.unsqueeze(0), total_pixels=total_pixels).squeeze() + image_np = (input_tensor.numpy() * 255).astype(np.uint8) + img = Image.fromarray(image_np) + return img + + +def tensor_to_base64_string( + image_tensor: torch.Tensor, + total_pixels: int = 2048 * 2048, + mime_type: str = "image/png", +) -> str: + """Convert [B, H, W, C] or [H, W, C] tensor to a base64 string. + + Args: + image_tensor: Input torch.Tensor image. + total_pixels: Maximum total pixels for potential downscaling. + mime_type: Target image MIME type (e.g., 'image/png', 'image/jpeg', 'image/webp', 'video/mp4'). + + Returns: + Base64 encoded string of the image. + """ + pil_image = tensor_to_pil(image_tensor, total_pixels=total_pixels) + img_byte_arr = pil_to_bytesio(pil_image, mime_type=mime_type) + img_bytes = img_byte_arr.getvalue() + # Encode bytes to base64 string + base64_encoded_string = base64.b64encode(img_bytes).decode("utf-8") + return base64_encoded_string + + +def pil_to_bytesio(img: Image.Image, mime_type: str = "image/png") -> BytesIO: + """Converts a PIL Image to a BytesIO object.""" + if not mime_type: + mime_type = "image/png" + + img_byte_arr = BytesIO() + # Derive PIL format from MIME type (e.g., 'image/png' -> 'PNG') + pil_format = mime_type.split("/")[-1].upper() + if pil_format == "JPG": + pil_format = "JPEG" + img.save(img_byte_arr, format=pil_format) + img_byte_arr.seek(0) + return img_byte_arr + + +def downscale_image_tensor(image, total_pixels=1536 * 1024) -> torch.Tensor: + """Downscale input image tensor to roughly the specified total pixels.""" + samples = image.movedim(-1, 1) + total = int(total_pixels) + scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2])) + if scale_by >= 1: + return image + width = round(samples.shape[3] * scale_by) + height = round(samples.shape[2] * scale_by) + + s = common_upscale(samples, width, height, "lanczos", "disabled") + s = s.movedim(1, -1) + return s + + +def tensor_to_data_uri( + image_tensor: torch.Tensor, + total_pixels: int = 2048 * 2048, + mime_type: str = "image/png", +) -> str: + """Converts a tensor image to a Data URI string. + + Args: + image_tensor: Input torch.Tensor image. + total_pixels: Maximum total pixels for potential downscaling. + mime_type: Target image MIME type (e.g., 'image/png', 'image/jpeg', 'image/webp'). + + Returns: + Data URI string (e.g., 'data:image/png;base64,...'). + """ + base64_string = tensor_to_base64_string(image_tensor, total_pixels, mime_type) + return f"data:{mime_type};base64,{base64_string}" + + +def audio_to_base64_string(audio: Input.Audio, container_format: str = "mp4", codec_name: str = "aac") -> str: + """Converts an audio input to a base64 string.""" + sample_rate: int = audio["sample_rate"] + waveform: torch.Tensor = audio["waveform"] + audio_data_np = audio_tensor_to_contiguous_ndarray(waveform) + audio_bytes_io = audio_ndarray_to_bytesio(audio_data_np, sample_rate, container_format, codec_name) + audio_bytes = audio_bytes_io.getvalue() + return base64.b64encode(audio_bytes).decode("utf-8") + + +def video_to_base64_string( + video: Input.Video, + container_format: VideoContainer = None, + codec: VideoCodec = None +) -> str: + """ + Converts a video input to a base64 string. + + Args: + video: The video input to convert + container_format: Optional container format to use (defaults to video.container if available) + codec: Optional codec to use (defaults to video.codec if available) + """ + video_bytes_io = BytesIO() + + # Use provided format/codec if specified, otherwise use video's own if available + format_to_use = container_format if container_format is not None else getattr(video, 'container', VideoContainer.MP4) + codec_to_use = codec if codec is not None else getattr(video, 'codec', VideoCodec.H264) + + video.save_to(video_bytes_io, format=format_to_use, codec=codec_to_use) + video_bytes_io.seek(0) + return base64.b64encode(video_bytes_io.getvalue()).decode("utf-8") + + +def audio_ndarray_to_bytesio( + audio_data_np: np.ndarray, + sample_rate: int, + container_format: str = "mp4", + codec_name: str = "aac", +) -> BytesIO: + """ + Encodes a numpy array of audio data into a BytesIO object. + """ + audio_bytes_io = BytesIO() + with av.open(audio_bytes_io, mode="w", format=container_format) as output_container: + audio_stream = output_container.add_stream(codec_name, rate=sample_rate) + frame = av.AudioFrame.from_ndarray( + audio_data_np, + format="fltp", + layout="stereo" if audio_data_np.shape[0] > 1 else "mono", + ) + frame.sample_rate = sample_rate + frame.pts = 0 + + for packet in audio_stream.encode(frame): + output_container.mux(packet) + + # Flush stream + for packet in audio_stream.encode(None): + output_container.mux(packet) + + audio_bytes_io.seek(0) + return audio_bytes_io + + +def audio_tensor_to_contiguous_ndarray(waveform: torch.Tensor) -> np.ndarray: + """ + Prepares audio waveform for av library by converting to a contiguous numpy array. + + Args: + waveform: a tensor of shape (1, channels, samples) derived from a Comfy `AUDIO` type. + + Returns: + Contiguous numpy array of the audio waveform. If the audio was batched, + the first item is taken. + """ + if waveform.ndim != 3 or waveform.shape[0] != 1: + raise ValueError("Expected waveform tensor shape (1, channels, samples)") + + # If batch is > 1, take first item + if waveform.shape[0] > 1: + waveform = waveform[0] + + # Prepare for av: remove batch dim, move to CPU, make contiguous, convert to numpy array + audio_data_np = waveform.squeeze(0).cpu().contiguous().numpy() + if audio_data_np.dtype != np.float32: + audio_data_np = audio_data_np.astype(np.float32) + + return audio_data_np + + +def audio_input_to_mp3(audio: Input.Audio) -> BytesIO: + waveform = audio["waveform"].cpu() + + output_buffer = BytesIO() + output_container = av.open(output_buffer, mode="w", format="mp3") + + out_stream = output_container.add_stream("libmp3lame", rate=audio["sample_rate"]) + out_stream.bit_rate = 320000 + + frame = av.AudioFrame.from_ndarray( + waveform.movedim(0, 1).reshape(1, -1).float().numpy(), + format="flt", + layout="mono" if waveform.shape[0] == 1 else "stereo", + ) + frame.sample_rate = audio["sample_rate"] + frame.pts = 0 + output_container.mux(out_stream.encode(frame)) + output_container.mux(out_stream.encode(None)) + output_container.close() + output_buffer.seek(0) + return output_buffer + + +def trim_video(video: Input.Video, duration_sec: float) -> Input.Video: + """ + Returns a new VideoInput object trimmed from the beginning to the specified duration, + using av to avoid loading entire video into memory. + + Args: + video: Input video to trim + duration_sec: Duration in seconds to keep from the beginning + + Returns: + VideoFromFile object that owns the output buffer + """ + output_buffer = BytesIO() + input_container = None + output_container = None + + try: + # Get the stream source - this avoids loading entire video into memory + # when the source is already a file path + input_source = video.get_stream_source() + + # Open containers + input_container = av.open(input_source, mode="r") + output_container = av.open(output_buffer, mode="w", format="mp4") + + # Set up output streams for re-encoding + video_stream = None + audio_stream = None + + for stream in input_container.streams: + logging.info("Found stream: type=%s, class=%s", stream.type, type(stream)) + if isinstance(stream, av.VideoStream): + # Create output video stream with same parameters + video_stream = output_container.add_stream("h264", rate=stream.average_rate) + video_stream.width = stream.width + video_stream.height = stream.height + video_stream.pix_fmt = "yuv420p" + logging.info("Added video stream: %sx%s @ %sfps", stream.width, stream.height, stream.average_rate) + elif isinstance(stream, av.AudioStream): + # Create output audio stream with same parameters + audio_stream = output_container.add_stream("aac", rate=stream.sample_rate) + audio_stream.sample_rate = stream.sample_rate + audio_stream.layout = stream.layout + logging.info("Added audio stream: %sHz, %s channels", stream.sample_rate, stream.channels) + + # Calculate target frame count that's divisible by 16 + fps = input_container.streams.video[0].average_rate + estimated_frames = int(duration_sec * fps) + target_frames = (estimated_frames // 16) * 16 # Round down to nearest multiple of 16 + + if target_frames == 0: + raise ValueError("Video too short: need at least 16 frames for Moonvalley") + + frame_count = 0 + audio_frame_count = 0 + + # Decode and re-encode video frames + if video_stream: + for frame in input_container.decode(video=0): + if frame_count >= target_frames: + break + + # Re-encode frame + for packet in video_stream.encode(frame): + output_container.mux(packet) + frame_count += 1 + + # Flush encoder + for packet in video_stream.encode(): + output_container.mux(packet) + + logging.info("Encoded %s video frames (target: %s)", frame_count, target_frames) + + # Decode and re-encode audio frames + if audio_stream: + input_container.seek(0) # Reset to beginning for audio + for frame in input_container.decode(audio=0): + if frame.time >= duration_sec: + break + + # Re-encode frame + for packet in audio_stream.encode(frame): + output_container.mux(packet) + audio_frame_count += 1 + + # Flush encoder + for packet in audio_stream.encode(): + output_container.mux(packet) + + logging.info("Encoded %s audio frames", audio_frame_count) + + # Close containers + output_container.close() + input_container.close() + + # Return as VideoFromFile using the buffer + output_buffer.seek(0) + return InputImpl.VideoFromFile(output_buffer) + + except Exception as e: + # Clean up on error + if input_container is not None: + input_container.close() + if output_container is not None: + output_container.close() + raise RuntimeError(f"Failed to trim video: {str(e)}") from e + + +def _f32_pcm(wav: torch.Tensor) -> torch.Tensor: + """Convert audio to float 32 bits PCM format. Copy-paste from nodes_audio.py file.""" + if wav.dtype.is_floating_point: + return wav + elif wav.dtype == torch.int16: + return wav.float() / (2**15) + elif wav.dtype == torch.int32: + return wav.float() / (2**31) + raise ValueError(f"Unsupported wav dtype: {wav.dtype}") + + +def audio_bytes_to_audio_input(audio_bytes: bytes) -> dict: + """ + Decode any common audio container from bytes using PyAV and return + a Comfy AUDIO dict: {"waveform": [1, C, T] float32, "sample_rate": int}. + """ + with av.open(BytesIO(audio_bytes)) as af: + if not af.streams.audio: + raise ValueError("No audio stream found in response.") + stream = af.streams.audio[0] + + in_sr = int(stream.codec_context.sample_rate) + out_sr = in_sr + + frames: list[torch.Tensor] = [] + n_channels = stream.channels or 1 + + for frame in af.decode(streams=stream.index): + arr = frame.to_ndarray() # shape can be [C, T] or [T, C] or [T] + buf = torch.from_numpy(arr) + if buf.ndim == 1: + buf = buf.unsqueeze(0) # [T] -> [1, T] + elif buf.shape[0] != n_channels and buf.shape[-1] == n_channels: + buf = buf.transpose(0, 1).contiguous() # [T, C] -> [C, T] + elif buf.shape[0] != n_channels: + buf = buf.reshape(-1, n_channels).t().contiguous() # fallback to [C, T] + frames.append(buf) + + if not frames: + raise ValueError("Decoded zero audio frames.") + + wav = torch.cat(frames, dim=1) # [C, T] + wav = _f32_pcm(wav) + return {"waveform": wav.unsqueeze(0).contiguous(), "sample_rate": out_sr} + + +def resize_mask_to_image( + mask: torch.Tensor, + image: torch.Tensor, + upscale_method="nearest-exact", + crop="disabled", + allow_gradient=True, + add_channel_dim=False, +): + """Resize mask to be the same dimensions as an image, while maintaining proper format for API calls.""" + _, height, width, _ = image.shape + mask = mask.unsqueeze(-1) + mask = mask.movedim(-1, 1) + mask = common_upscale(mask, width=width, height=height, upscale_method=upscale_method, crop=crop) + mask = mask.movedim(1, -1) + if not add_channel_dim: + mask = mask.squeeze(-1) + if not allow_gradient: + mask = (mask > 0.5).float() + return mask + + +def text_filepath_to_base64_string(filepath: str) -> str: + """Converts a text file to a base64 string.""" + with open(filepath, "rb") as f: + file_content = f.read() + return base64.b64encode(file_content).decode("utf-8") + + +def text_filepath_to_data_uri(filepath: str) -> str: + """Converts a text file to a data URI.""" + base64_string = text_filepath_to_base64_string(filepath) + mime_type, _ = mimetypes.guess_type(filepath) + if mime_type is None: + mime_type = "application/octet-stream" + return f"data:{mime_type};base64,{base64_string}" diff --git a/comfy_api_nodes/util/download_helpers.py b/comfy_api_nodes/util/download_helpers.py new file mode 100644 index 000000000..14207dc68 --- /dev/null +++ b/comfy_api_nodes/util/download_helpers.py @@ -0,0 +1,262 @@ +import asyncio +import contextlib +import uuid +from io import BytesIO +from pathlib import Path +from typing import IO, Optional, Union +from urllib.parse import urljoin, urlparse + +import aiohttp +import torch +from aiohttp.client_exceptions import ClientError, ContentTypeError + +from comfy_api.input_impl import VideoFromFile +from comfy_api.latest import IO as COMFY_IO + +from . import request_logger +from ._helpers import ( + default_base_url, + get_auth_header, + is_processing_interrupted, + sleep_with_interrupt, +) +from .client import _diagnose_connectivity +from .common_exceptions import ApiServerError, LocalNetworkError, ProcessingInterrupted +from .conversions import bytesio_to_image_tensor + +_RETRY_STATUS = {408, 429, 500, 502, 503, 504} + + +async def download_url_to_bytesio( + url: str, + dest: Optional[Union[BytesIO, IO[bytes], str, Path]], + *, + timeout: Optional[float] = None, + max_retries: int = 5, + retry_delay: float = 1.0, + retry_backoff: float = 2.0, + cls: type[COMFY_IO.ComfyNode] = None, +) -> None: + """Stream-download a URL to `dest`. + + `dest` must be one of: + - a BytesIO (rewound to 0 after write), + - a file-like object opened in binary write mode (must implement .write()), + - a filesystem path (str | pathlib.Path), which will be opened with 'wb'. + + If `url` starts with `/proxy/`, `cls` must be provided so the URL can be expanded + to an absolute URL and authentication headers can be applied. + + Raises: + ProcessingInterrupted, LocalNetworkError, ApiServerError, Exception (HTTP and other errors) + """ + if not isinstance(dest, (str, Path)) and not hasattr(dest, "write"): + raise ValueError("dest must be a path (str|Path) or a binary-writable object providing .write().") + + attempt = 0 + delay = retry_delay + headers: dict[str, str] = {} + + parsed_url = urlparse(url) + if not parsed_url.scheme and not parsed_url.netloc: # is URL relative? + if cls is None: + raise ValueError("For relative 'cloud' paths, the `cls` parameter is required.") + url = urljoin(default_base_url().rstrip("/") + "/", url.lstrip("/")) + headers = get_auth_header(cls) + + while True: + attempt += 1 + op_id = _generate_operation_id("GET", url, attempt) + timeout_cfg = aiohttp.ClientTimeout(total=timeout) + + is_path_sink = isinstance(dest, (str, Path)) + fhandle = None + session: Optional[aiohttp.ClientSession] = None + stop_evt: Optional[asyncio.Event] = None + monitor_task: Optional[asyncio.Task] = None + req_task: Optional[asyncio.Task] = None + + try: + with contextlib.suppress(Exception): + request_logger.log_request_response(operation_id=op_id, request_method="GET", request_url=url) + + session = aiohttp.ClientSession(timeout=timeout_cfg) + stop_evt = asyncio.Event() + + async def _monitor(): + try: + while not stop_evt.is_set(): + if is_processing_interrupted(): + return + await asyncio.sleep(1.0) + except asyncio.CancelledError: + return + + monitor_task = asyncio.create_task(_monitor()) + + req_task = asyncio.create_task(session.get(url, headers=headers)) + done, pending = await asyncio.wait({req_task, monitor_task}, return_when=asyncio.FIRST_COMPLETED) + + if monitor_task in done and req_task in pending: + req_task.cancel() + with contextlib.suppress(Exception): + await req_task + raise ProcessingInterrupted("Task cancelled") + + try: + resp = await req_task + except asyncio.CancelledError: + raise ProcessingInterrupted("Task cancelled") from None + + async with resp: + if resp.status >= 400: + with contextlib.suppress(Exception): + try: + body = await resp.json() + except (ContentTypeError, ValueError): + text = await resp.text() + body = text if len(text) <= 4096 else f"[text {len(text)} bytes]" + request_logger.log_request_response( + operation_id=op_id, + request_method="GET", + request_url=url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content=body, + error_message=f"HTTP {resp.status}", + ) + + if resp.status in _RETRY_STATUS and attempt <= max_retries: + await sleep_with_interrupt(delay, cls, None, None, None) + delay *= retry_backoff + continue + raise Exception(f"Failed to download (HTTP {resp.status}).") + + if is_path_sink: + p = Path(str(dest)) + with contextlib.suppress(Exception): + p.parent.mkdir(parents=True, exist_ok=True) + fhandle = open(p, "wb") + sink = fhandle + else: + sink = dest # BytesIO or file-like + + written = 0 + while True: + try: + chunk = await asyncio.wait_for(resp.content.read(1024 * 1024), timeout=1.0) + except asyncio.TimeoutError: + chunk = b"" + except asyncio.CancelledError: + raise ProcessingInterrupted("Task cancelled") from None + + if is_processing_interrupted(): + raise ProcessingInterrupted("Task cancelled") + + if not chunk: + if resp.content.at_eof(): + break + continue + + sink.write(chunk) + written += len(chunk) + + if isinstance(dest, BytesIO): + with contextlib.suppress(Exception): + dest.seek(0) + + with contextlib.suppress(Exception): + request_logger.log_request_response( + operation_id=op_id, + request_method="GET", + request_url=url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content=f"[streamed {written} bytes to dest]", + ) + return + except asyncio.CancelledError: + raise ProcessingInterrupted("Task cancelled") from None + except (ClientError, OSError) as e: + if attempt <= max_retries: + with contextlib.suppress(Exception): + request_logger.log_request_response( + operation_id=op_id, + request_method="GET", + request_url=url, + error_message=f"{type(e).__name__}: {str(e)} (will retry)", + ) + await sleep_with_interrupt(delay, cls, None, None, None) + delay *= retry_backoff + continue + + diag = await _diagnose_connectivity() + if not diag["internet_accessible"]: + raise LocalNetworkError( + "Unable to connect to the network. Please check your internet connection and try again." + ) from e + raise ApiServerError("The remote service appears unreachable at this time.") from e + finally: + if stop_evt is not None: + stop_evt.set() + if monitor_task: + monitor_task.cancel() + with contextlib.suppress(Exception): + await monitor_task + if req_task and not req_task.done(): + req_task.cancel() + with contextlib.suppress(Exception): + await req_task + if session: + with contextlib.suppress(Exception): + await session.close() + if fhandle: + with contextlib.suppress(Exception): + fhandle.flush() + fhandle.close() + + +async def download_url_to_image_tensor( + url: str, + *, + timeout: float = None, + cls: type[COMFY_IO.ComfyNode] = None, +) -> torch.Tensor: + """Downloads an image from a URL and returns a [B, H, W, C] tensor.""" + result = BytesIO() + await download_url_to_bytesio(url, result, timeout=timeout, cls=cls) + return bytesio_to_image_tensor(result) + + +async def download_url_to_video_output( + video_url: str, + *, + timeout: float = None, + max_retries: int = 5, + cls: type[COMFY_IO.ComfyNode] = None, +) -> VideoFromFile: + """Downloads a video from a URL and returns a `VIDEO` output.""" + result = BytesIO() + await download_url_to_bytesio(video_url, result, timeout=timeout, max_retries=max_retries, cls=cls) + return VideoFromFile(result) + + +async def download_url_as_bytesio( + url: str, + *, + timeout: float = None, + cls: type[COMFY_IO.ComfyNode] = None, +) -> BytesIO: + """Downloads content from a URL and returns a new BytesIO (rewound to 0).""" + result = BytesIO() + await download_url_to_bytesio(url, result, timeout=timeout, cls=cls) + return result + + +def _generate_operation_id(method: str, url: str, attempt: int) -> str: + try: + parsed = urlparse(url) + slug = (parsed.path.rsplit("/", 1)[-1] or parsed.netloc or "download").strip("/").replace("/", "_") + except Exception: + slug = "download" + return f"{method}_{slug}_try{attempt}_{uuid.uuid4().hex[:8]}" diff --git a/comfy_api_nodes/apis/request_logger.py b/comfy_api_nodes/util/request_logger.py similarity index 62% rename from comfy_api_nodes/apis/request_logger.py rename to comfy_api_nodes/util/request_logger.py index 42901e141..ac52e2eab 100644 --- a/comfy_api_nodes/apis/request_logger.py +++ b/comfy_api_nodes/util/request_logger.py @@ -1,65 +1,102 @@ from __future__ import annotations -import os import datetime +import hashlib import json import logging +import os +import re +from typing import Any + import folder_paths # Get the logger instance logger = logging.getLogger(__name__) + def get_log_directory(): - """ - Ensures the API log directory exists within ComfyUI's temp directory - and returns its path. - """ + """Ensures the API log directory exists within ComfyUI's temp directory and returns its path.""" base_temp_dir = folder_paths.get_temp_directory() log_dir = os.path.join(base_temp_dir, "api_logs") try: os.makedirs(log_dir, exist_ok=True) except Exception as e: - logger.error(f"Error creating API log directory {log_dir}: {e}") + logger.error("Error creating API log directory %s: %s", log_dir, str(e)) # Fallback to base temp directory if sub-directory creation fails return base_temp_dir return log_dir -def _format_data_for_logging(data): + +def _sanitize_filename_component(name: str) -> str: + if not name: + return "log" + sanitized = re.sub(r"[^A-Za-z0-9._-]+", "_", name) # Replace disallowed characters with underscore + sanitized = sanitized.strip(" ._") # Windows: trailing dots or spaces are not allowed + if not sanitized: + sanitized = "log" + return sanitized + + +def _short_hash(*parts: str, length: int = 10) -> str: + return hashlib.sha1(("|".join(parts)).encode("utf-8")).hexdigest()[:length] + + +def _build_log_filepath(log_dir: str, operation_id: str, request_url: str) -> str: + """Build log filepath. We keep it well under common path length limits aiming for <= 240 characters total.""" + timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f") + slug = _sanitize_filename_component(operation_id) # Best-effort human-readable slug from operation_id + h = _short_hash(operation_id or "", request_url or "") # Short hash ties log to the full operation and URL + + # Compute how much room we have for the slug given the directory length + # Keep total path length reasonably below ~260 on Windows. + max_total_path = 240 + prefix = f"{timestamp}_" + suffix = f"_{h}.log" + if not slug: + slug = "op" + max_filename_len = max(60, max_total_path - len(log_dir) - 1) + max_slug_len = max(8, max_filename_len - len(prefix) - len(suffix)) + if len(slug) > max_slug_len: + slug = slug[:max_slug_len].rstrip(" ._-") + return os.path.join(log_dir, f"{prefix}{slug}{suffix}") + + +def _format_data_for_logging(data: Any) -> str: """Helper to format data (dict, str, bytes) for logging.""" if isinstance(data, bytes): try: - return data.decode('utf-8') # Try to decode as text + return data.decode("utf-8") # Try to decode as text except UnicodeDecodeError: return f"[Binary data of length {len(data)} bytes]" elif isinstance(data, (dict, list)): try: return json.dumps(data, indent=2, ensure_ascii=False) except TypeError: - return str(data) # Fallback for non-serializable objects + return str(data) # Fallback for non-serializable objects return str(data) + def log_request_response( operation_id: str, request_method: str, request_url: str, request_headers: dict | None = None, request_params: dict | None = None, - request_data: any = None, + request_data: Any = None, response_status_code: int | None = None, response_headers: dict | None = None, - response_content: any = None, - error_message: str | None = None + response_content: Any = None, + error_message: str | None = None, ): """ Logs API request and response details to a file in the temp/api_logs directory. + Filenames are sanitized and length-limited for cross-platform safety. + If we still fail to write, we fall back to appending into api.log. """ log_dir = get_log_directory() - timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f") - filename = f"{timestamp}_{operation_id.replace('/', '_').replace(':', '_')}.log" - filepath = os.path.join(log_dir, filename) - - log_content = [] + filepath = _build_log_filepath(log_dir, operation_id, request_url) + log_content: list[str] = [] log_content.append(f"Timestamp: {datetime.datetime.now().isoformat()}") log_content.append(f"Operation ID: {operation_id}") log_content.append("-" * 30 + " REQUEST " + "-" * 30) @@ -69,7 +106,7 @@ def log_request_response( log_content.append(f"Headers:\n{_format_data_for_logging(request_headers)}") if request_params: log_content.append(f"Params:\n{_format_data_for_logging(request_params)}") - if request_data: + if request_data is not None: log_content.append(f"Data/Body:\n{_format_data_for_logging(request_data)}") log_content.append("\n" + "-" * 30 + " RESPONSE " + "-" * 30) @@ -77,7 +114,7 @@ def log_request_response( log_content.append(f"Status Code: {response_status_code}") if response_headers: log_content.append(f"Headers:\n{_format_data_for_logging(response_headers)}") - if response_content: + if response_content is not None: log_content.append(f"Content:\n{_format_data_for_logging(response_content)}") if error_message: log_content.append(f"Error:\n{error_message}") @@ -85,9 +122,10 @@ def log_request_response( try: with open(filepath, "w", encoding="utf-8") as f: f.write("\n".join(log_content)) - logger.debug(f"API log saved to: {filepath}") + logger.debug("API log saved to: %s", filepath) except Exception as e: - logger.error(f"Error writing API log to {filepath}: {e}") + logger.error("Error writing API log to %s: %s", filepath, str(e)) + if __name__ == '__main__': # Example usage (for testing the logger directly) diff --git a/comfy_api_nodes/util/upload_helpers.py b/comfy_api_nodes/util/upload_helpers.py new file mode 100644 index 000000000..632450d9b --- /dev/null +++ b/comfy_api_nodes/util/upload_helpers.py @@ -0,0 +1,338 @@ +import asyncio +import contextlib +import logging +import time +import uuid +from io import BytesIO +from typing import Optional, Union +from urllib.parse import urlparse + +import aiohttp +import torch +from pydantic import BaseModel, Field + +from comfy_api.latest import IO, Input +from comfy_api.util import VideoCodec, VideoContainer + +from . import request_logger +from ._helpers import is_processing_interrupted, sleep_with_interrupt +from .client import ( + ApiEndpoint, + _diagnose_connectivity, + _display_time_progress, + sync_op, +) +from .common_exceptions import ApiServerError, LocalNetworkError, ProcessingInterrupted +from .conversions import ( + audio_ndarray_to_bytesio, + audio_tensor_to_contiguous_ndarray, + tensor_to_bytesio, +) + + +class UploadRequest(BaseModel): + file_name: str = Field(..., description="Filename to upload") + content_type: Optional[str] = Field( + None, + description="Mime type of the file. For example: image/png, image/jpeg, video/mp4, etc.", + ) + + +class UploadResponse(BaseModel): + download_url: str = Field(..., description="URL to GET uploaded file") + upload_url: str = Field(..., description="URL to PUT file to upload") + + +async def upload_images_to_comfyapi( + cls: type[IO.ComfyNode], + image: torch.Tensor, + *, + max_images: int = 8, + mime_type: Optional[str] = None, + wait_label: Optional[str] = "Uploading", +) -> list[str]: + """ + Uploads images to ComfyUI API and returns download URLs. + To upload multiple images, stack them in the batch dimension first. + """ + # if batch, try to upload each file if max_images is greater than 0 + download_urls: list[str] = [] + is_batch = len(image.shape) > 3 + batch_len = image.shape[0] if is_batch else 1 + + for idx in range(min(batch_len, max_images)): + tensor = image[idx] if is_batch else image + img_io = tensor_to_bytesio(tensor, mime_type=mime_type) + url = await upload_file_to_comfyapi(cls, img_io, img_io.name, mime_type, wait_label) + download_urls.append(url) + return download_urls + + +async def upload_audio_to_comfyapi( + cls: type[IO.ComfyNode], + audio: Input.Audio, + *, + container_format: str = "mp4", + codec_name: str = "aac", + mime_type: str = "audio/mp4", + filename: str = "uploaded_audio.mp4", +) -> str: + """ + Uploads a single audio input to ComfyUI API and returns its download URL. + Encodes the raw waveform into the specified format before uploading. + """ + sample_rate: int = audio["sample_rate"] + waveform: torch.Tensor = audio["waveform"] + audio_data_np = audio_tensor_to_contiguous_ndarray(waveform) + audio_bytes_io = audio_ndarray_to_bytesio(audio_data_np, sample_rate, container_format, codec_name) + return await upload_file_to_comfyapi(cls, audio_bytes_io, filename, mime_type) + + +async def upload_video_to_comfyapi( + cls: type[IO.ComfyNode], + video: Input.Video, + *, + container: VideoContainer = VideoContainer.MP4, + codec: VideoCodec = VideoCodec.H264, + max_duration: Optional[int] = None, +) -> str: + """ + Uploads a single video to ComfyUI API and returns its download URL. + Uses the specified container and codec for saving the video before upload. + """ + if max_duration is not None: + try: + actual_duration = video.get_duration() + if actual_duration > max_duration: + raise ValueError( + f"Video duration ({actual_duration:.2f}s) exceeds the maximum allowed ({max_duration}s)." + ) + except Exception as e: + logging.error("Error getting video duration: %s", str(e)) + raise ValueError(f"Could not verify video duration from source: {e}") from e + + upload_mime_type = f"video/{container.value.lower()}" + filename = f"uploaded_video.{container.value.lower()}" + + # Convert VideoInput to BytesIO using specified container/codec + video_bytes_io = BytesIO() + video.save_to(video_bytes_io, format=container, codec=codec) + video_bytes_io.seek(0) + + return await upload_file_to_comfyapi(cls, video_bytes_io, filename, upload_mime_type) + + +async def upload_file_to_comfyapi( + cls: type[IO.ComfyNode], + file_bytes_io: BytesIO, + filename: str, + upload_mime_type: Optional[str], + wait_label: Optional[str] = "Uploading", +) -> str: + """Uploads a single file to ComfyUI API and returns its download URL.""" + if upload_mime_type is None: + request_object = UploadRequest(file_name=filename) + else: + request_object = UploadRequest(file_name=filename, content_type=upload_mime_type) + create_resp = await sync_op( + cls, + endpoint=ApiEndpoint(path="/customers/storage", method="POST"), + data=request_object, + response_model=UploadResponse, + final_label_on_success=None, + monitor_progress=False, + ) + await upload_file( + cls, + create_resp.upload_url, + file_bytes_io, + content_type=upload_mime_type, + wait_label=wait_label, + ) + return create_resp.download_url + + +async def upload_file( + cls: type[IO.ComfyNode], + upload_url: str, + file: Union[BytesIO, str], + *, + content_type: Optional[str] = None, + max_retries: int = 3, + retry_delay: float = 1.0, + retry_backoff: float = 2.0, + wait_label: Optional[str] = None, +) -> None: + """ + Upload a file to a signed URL (e.g., S3 pre-signed PUT) with retries, Comfy progress display, and interruption. + + Args: + cls: Node class (provides auth context + UI progress hooks). + upload_url: Pre-signed PUT URL. + file: BytesIO or path string. + content_type: Explicit MIME type. If None, we *suppress* Content-Type. + max_retries: Maximum retry attempts. + retry_delay: Initial delay in seconds. + retry_backoff: Exponential backoff factor. + wait_label: Progress label shown in Comfy UI. + + Raises: + ProcessingInterrupted, LocalNetworkError, ApiServerError, Exception + """ + if isinstance(file, BytesIO): + with contextlib.suppress(Exception): + file.seek(0) + data = file.read() + elif isinstance(file, str): + with open(file, "rb") as f: + data = f.read() + else: + raise ValueError("file must be a BytesIO or a filesystem path string") + + headers: dict[str, str] = {} + skip_auto_headers: set[str] = set() + if content_type: + headers["Content-Type"] = content_type + else: + skip_auto_headers.add("Content-Type") # Don't let aiohttp add Content-Type, it can break the signed request + + attempt = 0 + delay = retry_delay + start_ts = time.monotonic() + op_uuid = uuid.uuid4().hex[:8] + while True: + attempt += 1 + operation_id = _generate_operation_id("PUT", upload_url, attempt, op_uuid) + timeout = aiohttp.ClientTimeout(total=None) + stop_evt = asyncio.Event() + + async def _monitor(): + try: + while not stop_evt.is_set(): + if is_processing_interrupted(): + return + if wait_label: + _display_time_progress(cls, wait_label, int(time.monotonic() - start_ts), None) + await asyncio.sleep(1.0) + except asyncio.CancelledError: + return + + monitor_task = asyncio.create_task(_monitor()) + sess: Optional[aiohttp.ClientSession] = None + try: + try: + request_logger.log_request_response( + operation_id=operation_id, + request_method="PUT", + request_url=upload_url, + request_headers=headers or None, + request_params=None, + request_data=f"[File data {len(data)} bytes]", + ) + except Exception as e: + logging.debug("[DEBUG] upload request logging failed: %s", e) + + sess = aiohttp.ClientSession(timeout=timeout) + req = sess.put(upload_url, data=data, headers=headers, skip_auto_headers=skip_auto_headers) + req_task = asyncio.create_task(req) + + done, pending = await asyncio.wait({req_task, monitor_task}, return_when=asyncio.FIRST_COMPLETED) + + if monitor_task in done and req_task in pending: + req_task.cancel() + raise ProcessingInterrupted("Upload cancelled") + + try: + resp = await req_task + except asyncio.CancelledError: + raise ProcessingInterrupted("Upload cancelled") from None + + async with resp: + if resp.status >= 400: + with contextlib.suppress(Exception): + try: + body = await resp.json() + except Exception: + body = await resp.text() + msg = f"Upload failed with status {resp.status}" + request_logger.log_request_response( + operation_id=operation_id, + request_method="PUT", + request_url=upload_url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content=body, + error_message=msg, + ) + if resp.status in {408, 429, 500, 502, 503, 504} and attempt <= max_retries: + await sleep_with_interrupt( + delay, + cls, + wait_label, + start_ts, + None, + display_callback=_display_time_progress if wait_label else None, + ) + delay *= retry_backoff + continue + raise Exception(f"Failed to upload (HTTP {resp.status}).") + try: + request_logger.log_request_response( + operation_id=operation_id, + request_method="PUT", + request_url=upload_url, + response_status_code=resp.status, + response_headers=dict(resp.headers), + response_content="File uploaded successfully.", + ) + except Exception as e: + logging.debug("[DEBUG] upload response logging failed: %s", e) + return + except asyncio.CancelledError: + raise ProcessingInterrupted("Task cancelled") from None + except (aiohttp.ClientError, OSError) as e: + if attempt <= max_retries: + with contextlib.suppress(Exception): + request_logger.log_request_response( + operation_id=operation_id, + request_method="PUT", + request_url=upload_url, + request_headers=headers or None, + request_data=f"[File data {len(data)} bytes]", + error_message=f"{type(e).__name__}: {str(e)} (will retry)", + ) + await sleep_with_interrupt( + delay, + cls, + wait_label, + start_ts, + None, + display_callback=_display_time_progress if wait_label else None, + ) + delay *= retry_backoff + continue + + diag = await _diagnose_connectivity() + if not diag["internet_accessible"]: + raise LocalNetworkError( + "Unable to connect to the network. Please check your internet connection and try again." + ) from e + raise ApiServerError("The API service appears unreachable at this time.") from e + finally: + stop_evt.set() + if monitor_task: + monitor_task.cancel() + with contextlib.suppress(Exception): + await monitor_task + if sess: + with contextlib.suppress(Exception): + await sess.close() + + +def _generate_operation_id(method: str, url: str, attempt: int, op_uuid: str) -> str: + try: + parsed = urlparse(url) + slug = (parsed.path.rsplit("/", 1)[-1] or parsed.netloc or "upload").strip("/").replace("/", "_") + except Exception: + slug = "upload" + return f"{method}_{slug}_{op_uuid}_try{attempt}" diff --git a/comfy_api_nodes/util/validation_utils.py b/comfy_api_nodes/util/validation_utils.py index ca913e9b3..ec7006aed 100644 --- a/comfy_api_nodes/util/validation_utils.py +++ b/comfy_api_nodes/util/validation_utils.py @@ -2,6 +2,8 @@ import logging from typing import Optional import torch + +from comfy_api.input.video_types import VideoInput from comfy_api.latest import Input @@ -28,76 +30,69 @@ def validate_image_dimensions( if max_width is not None and width > max_width: raise ValueError(f"Image width must be at most {max_width}px, got {width}px") if min_height is not None and height < min_height: - raise ValueError( - f"Image height must be at least {min_height}px, got {height}px" - ) + raise ValueError(f"Image height must be at least {min_height}px, got {height}px") if max_height is not None and height > max_height: raise ValueError(f"Image height must be at most {max_height}px, got {height}px") def validate_image_aspect_ratio( image: torch.Tensor, - min_aspect_ratio: Optional[float] = None, - max_aspect_ratio: Optional[float] = None, -): - width, height = get_image_dimensions(image) - aspect_ratio = width / height - - if min_aspect_ratio is not None and aspect_ratio < min_aspect_ratio: - raise ValueError( - f"Image aspect ratio must be at least {min_aspect_ratio}, got {aspect_ratio}" - ) - if max_aspect_ratio is not None and aspect_ratio > max_aspect_ratio: - raise ValueError( - f"Image aspect ratio must be at most {max_aspect_ratio}, got {aspect_ratio}" - ) - - -def validate_image_aspect_ratio_range( - image: torch.Tensor, - min_ratio: tuple[float, float], # e.g. (1, 4) - max_ratio: tuple[float, float], # e.g. (4, 1) + min_ratio: Optional[tuple[float, float]] = None, # e.g. (1, 4) + max_ratio: Optional[tuple[float, float]] = None, # e.g. (4, 1) *, - strict: bool = True, # True -> (min, max); False -> [min, max] + strict: bool = True, # True -> (min, max); False -> [min, max] ) -> float: - a1, b1 = min_ratio - a2, b2 = max_ratio - if a1 <= 0 or b1 <= 0 or a2 <= 0 or b2 <= 0: - raise ValueError("Ratios must be positive, like (1, 4) or (4, 1).") - lo, hi = (a1 / b1), (a2 / b2) - if lo > hi: - lo, hi = hi, lo - a1, b1, a2, b2 = a2, b2, a1, b1 # swap only for error text + """Validates that image aspect ratio is within min and max. If a bound is None, that side is not checked.""" w, h = get_image_dimensions(image) if w <= 0 or h <= 0: raise ValueError(f"Invalid image dimensions: {w}x{h}") ar = w / h - ok = (lo < ar < hi) if strict else (lo <= ar <= hi) - if not ok: - op = "<" if strict else "≤" - raise ValueError(f"Image aspect ratio {ar:.6g} is outside allowed range: {a1}:{b1} {op} ratio {op} {a2}:{b2}") + _assert_ratio_bounds(ar, min_ratio=min_ratio, max_ratio=max_ratio, strict=strict) return ar -def validate_aspect_ratio_closeness( - start_img, - end_img, - min_rel: float, - max_rel: float, +def validate_images_aspect_ratio_closeness( + first_image: torch.Tensor, + second_image: torch.Tensor, + min_rel: float, # e.g. 0.8 + max_rel: float, # e.g. 1.25 *, - strict: bool = False, # True => exclusive, False => inclusive -) -> None: - w1, h1 = get_image_dimensions(start_img) - w2, h2 = get_image_dimensions(end_img) + strict: bool = False, # True -> (min, max); False -> [min, max] +) -> float: + """ + Validates that the two images' aspect ratios are 'close'. + The closeness factor is C = max(ar1, ar2) / min(ar1, ar2) (C >= 1). + We require C <= limit, where limit = max(max_rel, 1.0 / min_rel). + + Returns the computed closeness factor C. + """ + w1, h1 = get_image_dimensions(first_image) + w2, h2 = get_image_dimensions(second_image) if min(w1, h1, w2, h2) <= 0: raise ValueError("Invalid image dimensions") ar1 = w1 / h1 ar2 = w2 / h2 - # Normalize so it is symmetric (no need to check both ar1/ar2 and ar2/ar1) closeness = max(ar1, ar2) / min(ar1, ar2) - limit = max(max_rel, 1.0 / min_rel) # for 0.8..1.25 this is 1.25 + limit = max(max_rel, 1.0 / min_rel) if (closeness >= limit) if strict else (closeness > limit): - raise ValueError(f"Aspect ratios must be close: start/end={ar1/ar2:.4f}, allowed range {min_rel}–{max_rel}.") + raise ValueError( + f"Aspect ratios must be close: ar1/ar2={ar1/ar2:.2g}, " + f"allowed range {min_rel}–{max_rel} (limit {limit:.2g})." + ) + return closeness + + +def validate_aspect_ratio_string( + aspect_ratio: str, + min_ratio: Optional[tuple[float, float]] = None, # e.g. (1, 4) + max_ratio: Optional[tuple[float, float]] = None, # e.g. (4, 1) + *, + strict: bool = False, # True -> (min, max); False -> [min, max] +) -> float: + """Parses 'X:Y' and validates it against optional bounds. Returns the numeric ratio.""" + ar = _parse_aspect_ratio_string(aspect_ratio) + _assert_ratio_bounds(ar, min_ratio=min_ratio, max_ratio=max_ratio, strict=strict) + return ar def validate_video_dimensions( @@ -118,9 +113,7 @@ def validate_video_dimensions( if max_width is not None and width > max_width: raise ValueError(f"Video width must be at most {max_width}px, got {width}px") if min_height is not None and height < min_height: - raise ValueError( - f"Video height must be at least {min_height}px, got {height}px" - ) + raise ValueError(f"Video height must be at least {min_height}px, got {height}px") if max_height is not None and height > max_height: raise ValueError(f"Video height must be at most {max_height}px, got {height}px") @@ -138,13 +131,9 @@ def validate_video_duration( epsilon = 0.0001 if min_duration is not None and min_duration - epsilon > duration: - raise ValueError( - f"Video duration must be at least {min_duration}s, got {duration}s" - ) + raise ValueError(f"Video duration must be at least {min_duration}s, got {duration}s") if max_duration is not None and duration > max_duration + epsilon: - raise ValueError( - f"Video duration must be at most {max_duration}s, got {duration}s" - ) + raise ValueError(f"Video duration must be at most {max_duration}s, got {duration}s") def get_number_of_images(images): @@ -165,3 +154,77 @@ def validate_audio_duration( raise ValueError(f"Audio duration must be at least {min_duration}s, got {dur + eps:.2f}s") if max_duration is not None and dur - eps > max_duration: raise ValueError(f"Audio duration must be at most {max_duration}s, got {dur - eps:.2f}s") + + +def validate_string( + string: str, + strip_whitespace=True, + field_name="prompt", + min_length=None, + max_length=None, +): + if string is None: + raise Exception(f"Field '{field_name}' cannot be empty.") + if strip_whitespace: + string = string.strip() + if min_length and len(string) < min_length: + raise Exception( + f"Field '{field_name}' cannot be shorter than {min_length} characters; was {len(string)} characters long." + ) + if max_length and len(string) > max_length: + raise Exception( + f" Field '{field_name} cannot be longer than {max_length} characters; was {len(string)} characters long." + ) + + +def validate_container_format_is_mp4(video: VideoInput) -> None: + """Validates video container format is MP4.""" + container_format = video.get_container_format() + if container_format not in ["mp4", "mov,mp4,m4a,3gp,3g2,mj2"]: + raise ValueError(f"Only MP4 container format supported. Got: {container_format}") + + +def _ratio_from_tuple(r: tuple[float, float]) -> float: + a, b = r + if a <= 0 or b <= 0: + raise ValueError(f"Ratios must be positive, got {a}:{b}.") + return a / b + + +def _assert_ratio_bounds( + ar: float, + *, + min_ratio: Optional[tuple[float, float]] = None, + max_ratio: Optional[tuple[float, float]] = None, + strict: bool = True, +) -> None: + """Validate a numeric aspect ratio against optional min/max ratio bounds.""" + lo = _ratio_from_tuple(min_ratio) if min_ratio is not None else None + hi = _ratio_from_tuple(max_ratio) if max_ratio is not None else None + + if lo is not None and hi is not None and lo > hi: + lo, hi = hi, lo # normalize order if caller swapped them + + if lo is not None: + if (ar <= lo) if strict else (ar < lo): + op = "<" if strict else "≤" + raise ValueError(f"Aspect ratio `{ar:.2g}` must be {op} {lo:.2g}.") + if hi is not None: + if (ar >= hi) if strict else (ar > hi): + op = "<" if strict else "≤" + raise ValueError(f"Aspect ratio `{ar:.2g}` must be {op} {hi:.2g}.") + + +def _parse_aspect_ratio_string(ar_str: str) -> float: + """Parse 'X:Y' with integer parts into a positive float ratio X/Y.""" + parts = ar_str.split(":") + if len(parts) != 2: + raise ValueError(f"Aspect ratio must be 'X:Y' (e.g., 16:9), got '{ar_str}'.") + try: + a = int(parts[0].strip()) + b = int(parts[1].strip()) + except ValueError as exc: + raise ValueError(f"Aspect ratio must contain integers separated by ':', got '{ar_str}'.") from exc + if a <= 0 or b <= 0: + raise ValueError(f"Aspect ratio parts must be positive integers, got {a}:{b}.") + return a / b diff --git a/comfy_execution/caching.py b/comfy_execution/caching.py index 41224ce3b..326a279fc 100644 --- a/comfy_execution/caching.py +++ b/comfy_execution/caching.py @@ -1,4 +1,9 @@ +import bisect +import gc import itertools +import psutil +import time +import torch from typing import Sequence, Mapping, Dict from comfy_execution.graph import DynamicPrompt from abc import ABC, abstractmethod @@ -48,7 +53,7 @@ class Unhashable: def to_hashable(obj): # So that we don't infinitely recurse since frozenset and tuples # are Sequences. - if isinstance(obj, (int, float, str, bool, type(None))): + if isinstance(obj, (int, float, str, bool, bytes, type(None))): return obj elif isinstance(obj, Mapping): return frozenset([(to_hashable(k), to_hashable(v)) for k, v in sorted(obj.items())]) @@ -188,6 +193,9 @@ class BasicCache: self._clean_cache() self._clean_subcaches() + def poll(self, **kwargs): + pass + def _set_immediate(self, node_id, value): assert self.initialized cache_key = self.cache_key_set.get_data_key(node_id) @@ -265,6 +273,29 @@ class HierarchicalCache(BasicCache): assert cache is not None return await cache._ensure_subcache(node_id, children_ids) +class NullCache: + + async def set_prompt(self, dynprompt, node_ids, is_changed_cache): + pass + + def all_node_ids(self): + return [] + + def clean_unused(self): + pass + + def poll(self, **kwargs): + pass + + def get(self, node_id): + return None + + def set(self, node_id, value): + pass + + async def ensure_subcache_for(self, node_id, children_ids): + return self + class LRUCache(BasicCache): def __init__(self, key_class, max_size=100): super().__init__(key_class) @@ -318,155 +349,75 @@ class LRUCache(BasicCache): return self -class DependencyAwareCache(BasicCache): - """ - A cache implementation that tracks dependencies between nodes and manages - their execution and caching accordingly. It extends the BasicCache class. - Nodes are removed from this cache once all of their descendants have been - executed. - """ +#Iterating the cache for usage analysis might be expensive, so if we trigger make sure +#to take a chunk out to give breathing space on high-node / low-ram-per-node flows. + +RAM_CACHE_HYSTERESIS = 1.1 + +#This is kinda in GB but not really. It needs to be non-zero for the below heuristic +#and as long as Multi GB models dwarf this it will approximate OOM scoring OK + +RAM_CACHE_DEFAULT_RAM_USAGE = 0.1 + +#Exponential bias towards evicting older workflows so garbage will be taken out +#in constantly changing setups. + +RAM_CACHE_OLD_WORKFLOW_OOM_MULTIPLIER = 1.3 + +class RAMPressureCache(LRUCache): def __init__(self, key_class): - """ - Initialize the DependencyAwareCache. - - Args: - key_class: The class used for generating cache keys. - """ - super().__init__(key_class) - self.descendants = {} # Maps node_id -> set of descendant node_ids - self.ancestors = {} # Maps node_id -> set of ancestor node_ids - self.executed_nodes = set() # Tracks nodes that have been executed - - async def set_prompt(self, dynprompt, node_ids, is_changed_cache): - """ - Clear the entire cache and rebuild the dependency graph. - - Args: - dynprompt: The dynamic prompt object containing node information. - node_ids: List of node IDs to initialize the cache for. - is_changed_cache: Flag indicating if the cache has changed. - """ - # Clear all existing cache data - self.cache.clear() - self.subcaches.clear() - self.descendants.clear() - self.ancestors.clear() - self.executed_nodes.clear() - - # Call the parent method to initialize the cache with the new prompt - await super().set_prompt(dynprompt, node_ids, is_changed_cache) - - # Rebuild the dependency graph - self._build_dependency_graph(dynprompt, node_ids) - - def _build_dependency_graph(self, dynprompt, node_ids): - """ - Build the dependency graph for all nodes. - - Args: - dynprompt: The dynamic prompt object containing node information. - node_ids: List of node IDs to build the graph for. - """ - self.descendants.clear() - self.ancestors.clear() - for node_id in node_ids: - self.descendants[node_id] = set() - self.ancestors[node_id] = set() - - for node_id in node_ids: - inputs = dynprompt.get_node(node_id)["inputs"] - for input_data in inputs.values(): - if is_link(input_data): # Check if the input is a link to another node - ancestor_id = input_data[0] - self.descendants[ancestor_id].add(node_id) - self.ancestors[node_id].add(ancestor_id) - - def set(self, node_id, value): - """ - Mark a node as executed and store its value in the cache. - - Args: - node_id: The ID of the node to store. - value: The value to store for the node. - """ - self._set_immediate(node_id, value) - self.executed_nodes.add(node_id) - self._cleanup_ancestors(node_id) - - def get(self, node_id): - """ - Retrieve the cached value for a node. - - Args: - node_id: The ID of the node to retrieve. - - Returns: - The cached value for the node. - """ - return self._get_immediate(node_id) - - async def ensure_subcache_for(self, node_id, children_ids): - """ - Ensure a subcache exists for a node and update dependencies. - - Args: - node_id: The ID of the parent node. - children_ids: List of child node IDs to associate with the parent node. - - Returns: - The subcache object for the node. - """ - subcache = await super()._ensure_subcache(node_id, children_ids) - for child_id in children_ids: - self.descendants[node_id].add(child_id) - self.ancestors[child_id].add(node_id) - return subcache - - def _cleanup_ancestors(self, node_id): - """ - Check if ancestors of a node can be removed from the cache. - - Args: - node_id: The ID of the node whose ancestors are to be checked. - """ - for ancestor_id in self.ancestors.get(node_id, []): - if ancestor_id in self.executed_nodes: - # Remove ancestor if all its descendants have been executed - if all(descendant in self.executed_nodes for descendant in self.descendants[ancestor_id]): - self._remove_node(ancestor_id) - - def _remove_node(self, node_id): - """ - Remove a node from the cache. - - Args: - node_id: The ID of the node to remove. - """ - cache_key = self.cache_key_set.get_data_key(node_id) - if cache_key in self.cache: - del self.cache[cache_key] - subcache_key = self.cache_key_set.get_subcache_key(node_id) - if subcache_key in self.subcaches: - del self.subcaches[subcache_key] + super().__init__(key_class, 0) + self.timestamps = {} def clean_unused(self): - """ - Clean up unused nodes. This is a no-op for this cache implementation. - """ - pass + self._clean_subcaches() - def recursive_debug_dump(self): - """ - Dump the cache and dependency graph for debugging. + def set(self, node_id, value): + self.timestamps[self.cache_key_set.get_data_key(node_id)] = time.time() + super().set(node_id, value) - Returns: - A list containing the cache state and dependency graph. - """ - result = super().recursive_debug_dump() - result.append({ - "descendants": self.descendants, - "ancestors": self.ancestors, - "executed_nodes": list(self.executed_nodes), - }) - return result + def get(self, node_id): + self.timestamps[self.cache_key_set.get_data_key(node_id)] = time.time() + return super().get(node_id) + + def poll(self, ram_headroom): + def _ram_gb(): + return psutil.virtual_memory().available / (1024**3) + + if _ram_gb() > ram_headroom: + return + gc.collect() + if _ram_gb() > ram_headroom: + return + + clean_list = [] + + for key, (outputs, _), in self.cache.items(): + oom_score = RAM_CACHE_OLD_WORKFLOW_OOM_MULTIPLIER ** (self.generation - self.used_generation[key]) + + ram_usage = RAM_CACHE_DEFAULT_RAM_USAGE + def scan_list_for_ram_usage(outputs): + nonlocal ram_usage + if outputs is None: + return + for output in outputs: + if isinstance(output, list): + scan_list_for_ram_usage(output) + elif isinstance(output, torch.Tensor) and output.device.type == 'cpu': + #score Tensors at a 50% discount for RAM usage as they are likely to + #be high value intermediates + ram_usage += (output.numel() * output.element_size()) * 0.5 + elif hasattr(output, "get_ram_usage"): + ram_usage += output.get_ram_usage() + scan_list_for_ram_usage(outputs) + + oom_score *= ram_usage + #In the case where we have no information on the node ram usage at all, + #break OOM score ties on the last touch timestamp (pure LRU) + bisect.insort(clean_list, (oom_score, self.timestamps[key], key)) + + while _ram_gb() < ram_headroom * RAM_CACHE_HYSTERESIS and clean_list: + _, _, key = clean_list.pop() + del self.cache[key] + gc.collect() diff --git a/comfy_execution/graph.py b/comfy_execution/graph.py index f4b427265..0d811e354 100644 --- a/comfy_execution/graph.py +++ b/comfy_execution/graph.py @@ -153,8 +153,9 @@ class TopologicalSort: continue _, _, input_info = self.get_input_info(unique_id, input_name) is_lazy = input_info is not None and "lazy" in input_info and input_info["lazy"] - if (include_lazy or not is_lazy) and not self.is_cached(from_node_id): - node_ids.append(from_node_id) + if (include_lazy or not is_lazy): + if not self.is_cached(from_node_id): + node_ids.append(from_node_id) links.append((from_node_id, from_socket, unique_id)) for link in links: @@ -194,10 +195,40 @@ class ExecutionList(TopologicalSort): super().__init__(dynprompt) self.output_cache = output_cache self.staged_node_id = None + self.execution_cache = {} + self.execution_cache_listeners = {} def is_cached(self, node_id): return self.output_cache.get(node_id) is not None + def cache_link(self, from_node_id, to_node_id): + if not to_node_id in self.execution_cache: + self.execution_cache[to_node_id] = {} + self.execution_cache[to_node_id][from_node_id] = self.output_cache.get(from_node_id) + if not from_node_id in self.execution_cache_listeners: + self.execution_cache_listeners[from_node_id] = set() + self.execution_cache_listeners[from_node_id].add(to_node_id) + + def get_cache(self, from_node_id, to_node_id): + if not to_node_id in self.execution_cache: + return None + value = self.execution_cache[to_node_id].get(from_node_id) + if value is None: + return None + #Write back to the main cache on touch. + self.output_cache.set(from_node_id, value) + return value + + def cache_update(self, node_id, value): + if node_id in self.execution_cache_listeners: + for to_node_id in self.execution_cache_listeners[node_id]: + if to_node_id in self.execution_cache: + self.execution_cache[to_node_id][node_id] = value + + def add_strong_link(self, from_node_id, from_socket, to_node_id): + super().add_strong_link(from_node_id, from_socket, to_node_id) + self.cache_link(from_node_id, to_node_id) + async def stage_node_execution(self): assert self.staged_node_id is None if self.is_empty(): @@ -277,6 +308,8 @@ class ExecutionList(TopologicalSort): def complete_node_execution(self): node_id = self.staged_node_id self.pop_node(node_id) + self.execution_cache.pop(node_id, None) + self.execution_cache_listeners.pop(node_id, None) self.staged_node_id = None def get_nodes_in_cycle(self): diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index 51c8b9dd9..2ed7e0b22 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -142,9 +142,10 @@ def save_audio(self, audio, filename_prefix="ComfyUI", format="flac", prompt=Non for key, value in metadata.items(): output_container.metadata[key] = value + layout = 'mono' if waveform.shape[0] == 1 else 'stereo' # Set up the output stream with appropriate properties if format == "opus": - out_stream = output_container.add_stream("libopus", rate=sample_rate) + out_stream = output_container.add_stream("libopus", rate=sample_rate, layout=layout) if quality == "64k": out_stream.bit_rate = 64000 elif quality == "96k": @@ -156,7 +157,7 @@ def save_audio(self, audio, filename_prefix="ComfyUI", format="flac", prompt=Non elif quality == "320k": out_stream.bit_rate = 320000 elif format == "mp3": - out_stream = output_container.add_stream("libmp3lame", rate=sample_rate) + out_stream = output_container.add_stream("libmp3lame", rate=sample_rate, layout=layout) if quality == "V0": #TODO i would really love to support V3 and V5 but there doesn't seem to be a way to set the qscale level, the property below is a bool out_stream.codec_context.qscale = 1 @@ -165,9 +166,9 @@ def save_audio(self, audio, filename_prefix="ComfyUI", format="flac", prompt=Non elif quality == "320k": out_stream.bit_rate = 320000 else: #format == "flac": - out_stream = output_container.add_stream("flac", rate=sample_rate) + out_stream = output_container.add_stream("flac", rate=sample_rate, layout=layout) - frame = av.AudioFrame.from_ndarray(waveform.movedim(0, 1).reshape(1, -1).float().numpy(), format='flt', layout='mono' if waveform.shape[0] == 1 else 'stereo') + frame = av.AudioFrame.from_ndarray(waveform.movedim(0, 1).reshape(1, -1).float().numpy(), format='flt', layout=layout) frame.sample_rate = sample_rate frame.pts = 0 output_container.mux(out_stream.encode(frame)) @@ -360,7 +361,7 @@ class RecordAudio: def load(self, audio): audio_path = folder_paths.get_annotated_filepath(audio) - waveform, sample_rate = torchaudio.load(audio_path) + waveform, sample_rate = load(audio_path) audio = {"waveform": waveform.unsqueeze(0), "sample_rate": sample_rate} return (audio, ) diff --git a/comfy_extras/nodes_audio_encoder.py b/comfy_extras/nodes_audio_encoder.py index 39a140fef..13aacd41a 100644 --- a/comfy_extras/nodes_audio_encoder.py +++ b/comfy_extras/nodes_audio_encoder.py @@ -1,44 +1,62 @@ import folder_paths import comfy.audio_encoders.audio_encoders import comfy.utils +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io -class AudioEncoderLoader: +class AudioEncoderLoader(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "audio_encoder_name": (folder_paths.get_filename_list("audio_encoders"), ), - }} - RETURN_TYPES = ("AUDIO_ENCODER",) - FUNCTION = "load_model" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="AudioEncoderLoader", + category="loaders", + inputs=[ + io.Combo.Input( + "audio_encoder_name", + options=folder_paths.get_filename_list("audio_encoders"), + ), + ], + outputs=[io.AudioEncoder.Output()], + ) - CATEGORY = "loaders" - - def load_model(self, audio_encoder_name): + @classmethod + def execute(cls, audio_encoder_name) -> io.NodeOutput: audio_encoder_name = folder_paths.get_full_path_or_raise("audio_encoders", audio_encoder_name) sd = comfy.utils.load_torch_file(audio_encoder_name, safe_load=True) audio_encoder = comfy.audio_encoders.audio_encoders.load_audio_encoder_from_sd(sd) if audio_encoder is None: raise RuntimeError("ERROR: audio encoder file is invalid and does not contain a valid model.") - return (audio_encoder,) + return io.NodeOutput(audio_encoder) -class AudioEncoderEncode: +class AudioEncoderEncode(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "audio_encoder": ("AUDIO_ENCODER",), - "audio": ("AUDIO",), - }} - RETURN_TYPES = ("AUDIO_ENCODER_OUTPUT",) - FUNCTION = "encode" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="AudioEncoderEncode", + category="conditioning", + inputs=[ + io.AudioEncoder.Input("audio_encoder"), + io.Audio.Input("audio"), + ], + outputs=[io.AudioEncoderOutput.Output()], + ) - CATEGORY = "conditioning" - - def encode(self, audio_encoder, audio): + @classmethod + def execute(cls, audio_encoder, audio) -> io.NodeOutput: output = audio_encoder.encode_audio(audio["waveform"], audio["sample_rate"]) - return (output,) + return io.NodeOutput(output) -NODE_CLASS_MAPPINGS = { - "AudioEncoderLoader": AudioEncoderLoader, - "AudioEncoderEncode": AudioEncoderEncode, -} +class AudioEncoder(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + AudioEncoderLoader, + AudioEncoderEncode, + ] + + +async def comfy_entrypoint() -> AudioEncoder: + return AudioEncoder() diff --git a/comfy_extras/nodes_compositing.py b/comfy_extras/nodes_compositing.py index 2f994fa11..e4e4e1cbc 100644 --- a/comfy_extras/nodes_compositing.py +++ b/comfy_extras/nodes_compositing.py @@ -1,6 +1,9 @@ import torch import comfy.utils from enum import Enum +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io + def resize_mask(mask, shape): return torch.nn.functional.interpolate(mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])), size=(shape[0], shape[1]), mode="bilinear").squeeze(1) @@ -101,24 +104,28 @@ def porter_duff_composite(src_image: torch.Tensor, src_alpha: torch.Tensor, dst_ return out_image, out_alpha -class PorterDuffImageComposite: +class PorterDuffImageComposite(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "source": ("IMAGE",), - "source_alpha": ("MASK",), - "destination": ("IMAGE",), - "destination_alpha": ("MASK",), - "mode": ([mode.name for mode in PorterDuffMode], {"default": PorterDuffMode.DST.name}), - }, - } + def define_schema(cls): + return io.Schema( + node_id="PorterDuffImageComposite", + display_name="Porter-Duff Image Composite", + category="mask/compositing", + inputs=[ + io.Image.Input("source"), + io.Mask.Input("source_alpha"), + io.Image.Input("destination"), + io.Mask.Input("destination_alpha"), + io.Combo.Input("mode", options=[mode.name for mode in PorterDuffMode], default=PorterDuffMode.DST.name), + ], + outputs=[ + io.Image.Output(), + io.Mask.Output(), + ], + ) - RETURN_TYPES = ("IMAGE", "MASK") - FUNCTION = "composite" - CATEGORY = "mask/compositing" - - def composite(self, source: torch.Tensor, source_alpha: torch.Tensor, destination: torch.Tensor, destination_alpha: torch.Tensor, mode): + @classmethod + def execute(cls, source: torch.Tensor, source_alpha: torch.Tensor, destination: torch.Tensor, destination_alpha: torch.Tensor, mode) -> io.NodeOutput: batch_size = min(len(source), len(source_alpha), len(destination), len(destination_alpha)) out_images = [] out_alphas = [] @@ -150,45 +157,48 @@ class PorterDuffImageComposite: out_images.append(out_image) out_alphas.append(out_alpha.squeeze(2)) - result = (torch.stack(out_images), torch.stack(out_alphas)) - return result + return io.NodeOutput(torch.stack(out_images), torch.stack(out_alphas)) -class SplitImageWithAlpha: +class SplitImageWithAlpha(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": ("IMAGE",), - } - } + def define_schema(cls): + return io.Schema( + node_id="SplitImageWithAlpha", + display_name="Split Image with Alpha", + category="mask/compositing", + inputs=[ + io.Image.Input("image"), + ], + outputs=[ + io.Image.Output(), + io.Mask.Output(), + ], + ) - CATEGORY = "mask/compositing" - RETURN_TYPES = ("IMAGE", "MASK") - FUNCTION = "split_image_with_alpha" - - def split_image_with_alpha(self, image: torch.Tensor): + @classmethod + def execute(cls, image: torch.Tensor) -> io.NodeOutput: out_images = [i[:,:,:3] for i in image] out_alphas = [i[:,:,3] if i.shape[2] > 3 else torch.ones_like(i[:,:,0]) for i in image] - result = (torch.stack(out_images), 1.0 - torch.stack(out_alphas)) - return result + return io.NodeOutput(torch.stack(out_images), 1.0 - torch.stack(out_alphas)) -class JoinImageWithAlpha: +class JoinImageWithAlpha(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": ("IMAGE",), - "alpha": ("MASK",), - } - } + def define_schema(cls): + return io.Schema( + node_id="JoinImageWithAlpha", + display_name="Join Image with Alpha", + category="mask/compositing", + inputs=[ + io.Image.Input("image"), + io.Mask.Input("alpha"), + ], + outputs=[io.Image.Output()], + ) - CATEGORY = "mask/compositing" - RETURN_TYPES = ("IMAGE",) - FUNCTION = "join_image_with_alpha" - - def join_image_with_alpha(self, image: torch.Tensor, alpha: torch.Tensor): + @classmethod + def execute(cls, image: torch.Tensor, alpha: torch.Tensor) -> io.NodeOutput: batch_size = min(len(image), len(alpha)) out_images = [] @@ -196,19 +206,18 @@ class JoinImageWithAlpha: for i in range(batch_size): out_images.append(torch.cat((image[i][:,:,:3], alpha[i].unsqueeze(2)), dim=2)) - result = (torch.stack(out_images),) - return result + return io.NodeOutput(torch.stack(out_images)) -NODE_CLASS_MAPPINGS = { - "PorterDuffImageComposite": PorterDuffImageComposite, - "SplitImageWithAlpha": SplitImageWithAlpha, - "JoinImageWithAlpha": JoinImageWithAlpha, -} +class CompositingExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + PorterDuffImageComposite, + SplitImageWithAlpha, + JoinImageWithAlpha, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "PorterDuffImageComposite": "Porter-Duff Image Composite", - "SplitImageWithAlpha": "Split Image with Alpha", - "JoinImageWithAlpha": "Join Image with Alpha", -} +async def comfy_entrypoint() -> CompositingExtension: + return CompositingExtension() diff --git a/comfy_extras/nodes_controlnet.py b/comfy_extras/nodes_controlnet.py index 2d20e1fed..e835feed7 100644 --- a/comfy_extras/nodes_controlnet.py +++ b/comfy_extras/nodes_controlnet.py @@ -1,20 +1,26 @@ from comfy.cldm.control_types import UNION_CONTROLNET_TYPES import nodes import comfy.utils +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io -class SetUnionControlNetType: +class SetUnionControlNetType(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"control_net": ("CONTROL_NET", ), - "type": (["auto"] + list(UNION_CONTROLNET_TYPES.keys()),) - }} + def define_schema(cls): + return io.Schema( + node_id="SetUnionControlNetType", + category="conditioning/controlnet", + inputs=[ + io.ControlNet.Input("control_net"), + io.Combo.Input("type", options=["auto"] + list(UNION_CONTROLNET_TYPES.keys())), + ], + outputs=[ + io.ControlNet.Output(), + ], + ) - CATEGORY = "conditioning/controlnet" - RETURN_TYPES = ("CONTROL_NET",) - - FUNCTION = "set_controlnet_type" - - def set_controlnet_type(self, control_net, type): + @classmethod + def execute(cls, control_net, type) -> io.NodeOutput: control_net = control_net.copy() type_number = UNION_CONTROLNET_TYPES.get(type, -1) if type_number >= 0: @@ -22,27 +28,36 @@ class SetUnionControlNetType: else: control_net.set_extra_arg("control_type", []) - return (control_net,) + return io.NodeOutput(control_net) -class ControlNetInpaintingAliMamaApply(nodes.ControlNetApplyAdvanced): + set_controlnet_type = execute # TODO: remove + + +class ControlNetInpaintingAliMamaApply(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "control_net": ("CONTROL_NET", ), - "vae": ("VAE", ), - "image": ("IMAGE", ), - "mask": ("MASK", ), - "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}), - "end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.001}) - }} + def define_schema(cls): + return io.Schema( + node_id="ControlNetInpaintingAliMamaApply", + category="conditioning/controlnet", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.ControlNet.Input("control_net"), + io.Vae.Input("vae"), + io.Image.Input("image"), + io.Mask.Input("mask"), + io.Float.Input("strength", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("start_percent", default=0.0, min=0.0, max=1.0, step=0.001), + io.Float.Input("end_percent", default=1.0, min=0.0, max=1.0, step=0.001), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + ], + ) - FUNCTION = "apply_inpaint_controlnet" - - CATEGORY = "conditioning/controlnet" - - def apply_inpaint_controlnet(self, positive, negative, control_net, vae, image, mask, strength, start_percent, end_percent): + @classmethod + def execute(cls, positive, negative, control_net, vae, image, mask, strength, start_percent, end_percent) -> io.NodeOutput: extra_concat = [] if control_net.concat_mask: mask = 1.0 - mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])) @@ -50,11 +65,20 @@ class ControlNetInpaintingAliMamaApply(nodes.ControlNetApplyAdvanced): image = image * mask_apply.movedim(1, -1).repeat(1, 1, 1, image.shape[3]) extra_concat = [mask] - return self.apply_controlnet(positive, negative, control_net, image, strength, start_percent, end_percent, vae=vae, extra_concat=extra_concat) + result = nodes.ControlNetApplyAdvanced().apply_controlnet(positive, negative, control_net, image, strength, start_percent, end_percent, vae=vae, extra_concat=extra_concat) + return io.NodeOutput(result[0], result[1]) + + apply_inpaint_controlnet = execute # TODO: remove +class ControlNetExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + SetUnionControlNetType, + ControlNetInpaintingAliMamaApply, + ] -NODE_CLASS_MAPPINGS = { - "SetUnionControlNetType": SetUnionControlNetType, - "ControlNetInpaintingAliMamaApply": ControlNetInpaintingAliMamaApply, -} + +async def comfy_entrypoint() -> ControlNetExtension: + return ControlNetExtension() diff --git a/comfy_extras/nodes_differential_diffusion.py b/comfy_extras/nodes_differential_diffusion.py index 255ac420d..6dfdf466c 100644 --- a/comfy_extras/nodes_differential_diffusion.py +++ b/comfy_extras/nodes_differential_diffusion.py @@ -1,34 +1,41 @@ # code adapted from https://github.com/exx8/differential-diffusion +from typing_extensions import override + import torch +from comfy_api.latest import ComfyExtension, io -class DifferentialDiffusion(): + +class DifferentialDiffusion(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "model": ("MODEL", ), - }, - "optional": { - "strength": ("FLOAT", { - "default": 1.0, - "min": 0.0, - "max": 1.0, - "step": 0.01, - }), - } - } - RETURN_TYPES = ("MODEL",) - FUNCTION = "apply" - CATEGORY = "_for_testing" - INIT = False + def define_schema(cls): + return io.Schema( + node_id="DifferentialDiffusion", + display_name="Differential Diffusion", + category="_for_testing", + inputs=[ + io.Model.Input("model"), + io.Float.Input( + "strength", + default=1.0, + min=0.0, + max=1.0, + step=0.01, + optional=True, + ), + ], + outputs=[io.Model.Output()], + is_experimental=True, + ) - def apply(self, model, strength=1.0): + @classmethod + def execute(cls, model, strength=1.0) -> io.NodeOutput: model = model.clone() - model.set_model_denoise_mask_function(lambda *args, **kwargs: self.forward(*args, **kwargs, strength=strength)) - return (model, ) + model.set_model_denoise_mask_function(lambda *args, **kwargs: cls.forward(*args, **kwargs, strength=strength)) + return io.NodeOutput(model) - def forward(self, sigma: torch.Tensor, denoise_mask: torch.Tensor, extra_options: dict, strength: float): + @classmethod + def forward(cls, sigma: torch.Tensor, denoise_mask: torch.Tensor, extra_options: dict, strength: float): model = extra_options["model"] step_sigmas = extra_options["sigmas"] sigma_to = model.inner_model.model_sampling.sigma_min @@ -53,9 +60,13 @@ class DifferentialDiffusion(): return binary_mask -NODE_CLASS_MAPPINGS = { - "DifferentialDiffusion": DifferentialDiffusion, -} -NODE_DISPLAY_NAME_MAPPINGS = { - "DifferentialDiffusion": "Differential Diffusion", -} +class DifferentialDiffusionExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + DifferentialDiffusion, + ] + + +async def comfy_entrypoint() -> DifferentialDiffusionExtension: + return DifferentialDiffusionExtension() diff --git a/comfy_extras/nodes_easycache.py b/comfy_extras/nodes_easycache.py index c170e9fd9..11b23ffdb 100644 --- a/comfy_extras/nodes_easycache.py +++ b/comfy_extras/nodes_easycache.py @@ -11,13 +11,13 @@ if TYPE_CHECKING: def easycache_forward_wrapper(executor, *args, **kwargs): # get values from args - x: torch.Tensor = args[0] transformer_options: dict[str] = args[-1] if not isinstance(transformer_options, dict): transformer_options = kwargs.get("transformer_options") if not transformer_options: transformer_options = args[-2] easycache: EasyCacheHolder = transformer_options["easycache"] + x: torch.Tensor = args[0][:, :easycache.output_channels] sigmas = transformer_options["sigmas"] uuids = transformer_options["uuids"] if sigmas is not None and easycache.is_past_end_timestep(sigmas): @@ -82,13 +82,13 @@ def easycache_forward_wrapper(executor, *args, **kwargs): def lazycache_predict_noise_wrapper(executor, *args, **kwargs): # get values from args - x: torch.Tensor = args[0] timestep: float = args[1] model_options: dict[str] = args[2] easycache: LazyCacheHolder = model_options["transformer_options"]["easycache"] if easycache.is_past_end_timestep(timestep): return executor(*args, **kwargs) # prepare next x_prev + x: torch.Tensor = args[0][:, :easycache.output_channels] next_x_prev = x input_change = None do_easycache = easycache.should_do_easycache(timestep) @@ -173,7 +173,7 @@ def easycache_sample_wrapper(executor, *args, **kwargs): class EasyCacheHolder: - def __init__(self, reuse_threshold: float, start_percent: float, end_percent: float, subsample_factor: int, offload_cache_diff: bool, verbose: bool=False): + def __init__(self, reuse_threshold: float, start_percent: float, end_percent: float, subsample_factor: int, offload_cache_diff: bool, verbose: bool=False, output_channels: int=None): self.name = "EasyCache" self.reuse_threshold = reuse_threshold self.start_percent = start_percent @@ -202,6 +202,7 @@ class EasyCacheHolder: self.allow_mismatch = True self.cut_from_start = True self.state_metadata = None + self.output_channels = output_channels def is_past_end_timestep(self, timestep: float) -> bool: return not (timestep[0] > self.end_t).item() @@ -244,6 +245,8 @@ class EasyCacheHolder: self.total_steps_skipped += 1 batch_offset = x.shape[0] // len(uuids) for i, uuid in enumerate(uuids): + # slice out only what is relevant to this cond + batch_slice = [slice(i*batch_offset,(i+1)*batch_offset)] # if cached dims don't match x dims, cut off excess and hope for the best (cosmos world2video) if x.shape[1:] != self.uuid_cache_diffs[uuid].shape[1:]: if not self.allow_mismatch: @@ -261,9 +264,8 @@ class EasyCacheHolder: slicing.append(slice(None, dim_u)) else: slicing.append(slice(None)) - slicing = [slice(i*batch_offset,(i+1)*batch_offset)] + slicing - x = x[slicing] - x += self.uuid_cache_diffs[uuid].to(x.device) + batch_slice = batch_slice + slicing + x[tuple(batch_slice)] += self.uuid_cache_diffs[uuid].to(x.device) return x def update_cache_diff(self, output: torch.Tensor, x: torch.Tensor, uuids: list[UUID]): @@ -282,7 +284,7 @@ class EasyCacheHolder: else: slicing.append(slice(None)) skip_dim = False - x = x[slicing] + x = x[tuple(slicing)] diff = output - x batch_offset = diff.shape[0] // len(uuids) for i, uuid in enumerate(uuids): @@ -322,7 +324,7 @@ class EasyCacheHolder: return self def clone(self): - return EasyCacheHolder(self.reuse_threshold, self.start_percent, self.end_percent, self.subsample_factor, self.offload_cache_diff, self.verbose) + return EasyCacheHolder(self.reuse_threshold, self.start_percent, self.end_percent, self.subsample_factor, self.offload_cache_diff, self.verbose, output_channels=self.output_channels) class EasyCacheNode(io.ComfyNode): @@ -349,7 +351,7 @@ class EasyCacheNode(io.ComfyNode): @classmethod def execute(cls, model: io.Model.Type, reuse_threshold: float, start_percent: float, end_percent: float, verbose: bool) -> io.NodeOutput: model = model.clone() - model.model_options["transformer_options"]["easycache"] = EasyCacheHolder(reuse_threshold, start_percent, end_percent, subsample_factor=8, offload_cache_diff=False, verbose=verbose) + model.model_options["transformer_options"]["easycache"] = EasyCacheHolder(reuse_threshold, start_percent, end_percent, subsample_factor=8, offload_cache_diff=False, verbose=verbose, output_channels=model.model.latent_format.latent_channels) model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.OUTER_SAMPLE, "easycache", easycache_sample_wrapper) model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.CALC_COND_BATCH, "easycache", easycache_calc_cond_batch_wrapper) model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, "easycache", easycache_forward_wrapper) @@ -357,7 +359,7 @@ class EasyCacheNode(io.ComfyNode): class LazyCacheHolder: - def __init__(self, reuse_threshold: float, start_percent: float, end_percent: float, subsample_factor: int, offload_cache_diff: bool, verbose: bool=False): + def __init__(self, reuse_threshold: float, start_percent: float, end_percent: float, subsample_factor: int, offload_cache_diff: bool, verbose: bool=False, output_channels: int=None): self.name = "LazyCache" self.reuse_threshold = reuse_threshold self.start_percent = start_percent @@ -381,6 +383,7 @@ class LazyCacheHolder: self.approx_output_change_rates = [] self.total_steps_skipped = 0 self.state_metadata = None + self.output_channels = output_channels def has_cache_diff(self) -> bool: return self.cache_diff is not None @@ -455,7 +458,7 @@ class LazyCacheHolder: return self def clone(self): - return LazyCacheHolder(self.reuse_threshold, self.start_percent, self.end_percent, self.subsample_factor, self.offload_cache_diff, self.verbose) + return LazyCacheHolder(self.reuse_threshold, self.start_percent, self.end_percent, self.subsample_factor, self.offload_cache_diff, self.verbose, output_channels=self.output_channels) class LazyCacheNode(io.ComfyNode): @classmethod @@ -481,7 +484,7 @@ class LazyCacheNode(io.ComfyNode): @classmethod def execute(cls, model: io.Model.Type, reuse_threshold: float, start_percent: float, end_percent: float, verbose: bool) -> io.NodeOutput: model = model.clone() - model.model_options["transformer_options"]["easycache"] = LazyCacheHolder(reuse_threshold, start_percent, end_percent, subsample_factor=8, offload_cache_diff=False, verbose=verbose) + model.model_options["transformer_options"]["easycache"] = LazyCacheHolder(reuse_threshold, start_percent, end_percent, subsample_factor=8, offload_cache_diff=False, verbose=verbose, output_channels=model.model.latent_format.latent_channels) model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.OUTER_SAMPLE, "lazycache", easycache_sample_wrapper) model.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.PREDICT_NOISE, "lazycache", lazycache_predict_noise_wrapper) return io.NodeOutput(model) diff --git a/comfy_extras/nodes_edit_model.py b/comfy_extras/nodes_edit_model.py index b69f79715..36da66f34 100644 --- a/comfy_extras/nodes_edit_model.py +++ b/comfy_extras/nodes_edit_model.py @@ -1,26 +1,38 @@ import node_helpers +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io -class ReferenceLatent: +class ReferenceLatent(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"conditioning": ("CONDITIONING", ), - }, - "optional": {"latent": ("LATENT", ),} - } + def define_schema(cls): + return io.Schema( + node_id="ReferenceLatent", + category="advanced/conditioning/edit_models", + description="This node sets the guiding latent for an edit model. If the model supports it you can chain multiple to set multiple reference images.", + inputs=[ + io.Conditioning.Input("conditioning"), + io.Latent.Input("latent", optional=True), + ], + outputs=[ + io.Conditioning.Output(), + ] + ) - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "append" - - CATEGORY = "advanced/conditioning/edit_models" - DESCRIPTION = "This node sets the guiding latent for an edit model. If the model supports it you can chain multiple to set multiple reference images." - - def append(self, conditioning, latent=None): + @classmethod + def execute(cls, conditioning, latent=None) -> io.NodeOutput: if latent is not None: conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_latents": [latent["samples"]]}, append=True) - return (conditioning, ) + return io.NodeOutput(conditioning) -NODE_CLASS_MAPPINGS = { - "ReferenceLatent": ReferenceLatent, -} +class EditModelExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + ReferenceLatent, + ] + + +def comfy_entrypoint() -> EditModelExtension: + return EditModelExtension() diff --git a/comfy_extras/nodes_eps.py b/comfy_extras/nodes_eps.py new file mode 100644 index 000000000..4d8061741 --- /dev/null +++ b/comfy_extras/nodes_eps.py @@ -0,0 +1,169 @@ +import torch +from typing_extensions import override + +from comfy.k_diffusion.sampling import sigma_to_half_log_snr +from comfy_api.latest import ComfyExtension, io + + +class EpsilonScaling(io.ComfyNode): + """ + Implements the Epsilon Scaling method from 'Elucidating the Exposure Bias in Diffusion Models' + (https://arxiv.org/abs/2308.15321v6). + + This method mitigates exposure bias by scaling the predicted noise during sampling, + which can significantly improve sample quality. This implementation uses the "uniform schedule" + recommended by the paper for its practicality and effectiveness. + """ + @classmethod + def define_schema(cls): + return io.Schema( + node_id="Epsilon Scaling", + category="model_patches/unet", + inputs=[ + io.Model.Input("model"), + io.Float.Input( + "scaling_factor", + default=1.005, + min=0.5, + max=1.5, + step=0.001, + display_mode=io.NumberDisplay.number, + ), + ], + outputs=[ + io.Model.Output(), + ], + ) + + @classmethod + def execute(cls, model, scaling_factor) -> io.NodeOutput: + # Prevent division by zero, though the UI's min value should prevent this. + if scaling_factor == 0: + scaling_factor = 1e-9 + + def epsilon_scaling_function(args): + """ + This function is applied after the CFG guidance has been calculated. + It recalculates the denoised latent by scaling the predicted noise. + """ + denoised = args["denoised"] + x = args["input"] + + noise_pred = x - denoised + + scaled_noise_pred = noise_pred / scaling_factor + + new_denoised = x - scaled_noise_pred + + return new_denoised + + # Clone the model patcher to avoid modifying the original model in place + model_clone = model.clone() + + model_clone.set_model_sampler_post_cfg_function(epsilon_scaling_function) + + return io.NodeOutput(model_clone) + + +def compute_tsr_rescaling_factor( + snr: torch.Tensor, tsr_k: float, tsr_variance: float +) -> torch.Tensor: + """Compute the rescaling score ratio in Temporal Score Rescaling. + + See equation (6) in https://arxiv.org/pdf/2510.01184v1. + """ + posinf_mask = torch.isposinf(snr) + rescaling_factor = (snr * tsr_variance + 1) / (snr * tsr_variance / tsr_k + 1) + return torch.where(posinf_mask, tsr_k, rescaling_factor) # when snr → inf, r = tsr_k + + +class TemporalScoreRescaling(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="TemporalScoreRescaling", + display_name="TSR - Temporal Score Rescaling", + category="model_patches/unet", + inputs=[ + io.Model.Input("model"), + io.Float.Input( + "tsr_k", + tooltip=( + "Controls the rescaling strength.\n" + "Lower k produces more detailed results; higher k produces smoother results in image generation. Setting k = 1 disables rescaling." + ), + default=0.95, + min=0.01, + max=100.0, + step=0.001, + display_mode=io.NumberDisplay.number, + ), + io.Float.Input( + "tsr_sigma", + tooltip=( + "Controls how early rescaling takes effect.\n" + "Larger values take effect earlier." + ), + default=1.0, + min=0.01, + max=100.0, + step=0.001, + display_mode=io.NumberDisplay.number, + ), + ], + outputs=[ + io.Model.Output( + display_name="patched_model", + ), + ], + description=( + "[Post-CFG Function]\n" + "TSR - Temporal Score Rescaling (2510.01184)\n\n" + "Rescaling the model's score or noise to steer the sampling diversity.\n" + ), + ) + + @classmethod + def execute(cls, model, tsr_k, tsr_sigma) -> io.NodeOutput: + tsr_variance = tsr_sigma**2 + + def temporal_score_rescaling(args): + denoised = args["denoised"] + x = args["input"] + sigma = args["sigma"] + curr_model = args["model"] + + # No rescaling (r = 1) or no noise + if tsr_k == 1 or sigma == 0: + return denoised + + model_sampling = curr_model.current_patcher.get_model_object("model_sampling") + half_log_snr = sigma_to_half_log_snr(sigma, model_sampling) + snr = (2 * half_log_snr).exp() + + # No rescaling needed (r = 1) + if snr == 0: + return denoised + + rescaling_r = compute_tsr_rescaling_factor(snr, tsr_k, tsr_variance) + + # Derived from scaled_denoised = (x - r * sigma * noise) / alpha + alpha = sigma * half_log_snr.exp() + return torch.lerp(x / alpha, denoised, rescaling_r) + + m = model.clone() + m.set_model_sampler_post_cfg_function(temporal_score_rescaling) + return io.NodeOutput(m) + + +class EpsilonScalingExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + EpsilonScaling, + TemporalScoreRescaling, + ] + + +async def comfy_entrypoint() -> EpsilonScalingExtension: + return EpsilonScalingExtension() diff --git a/comfy_extras/nodes_flux.py b/comfy_extras/nodes_flux.py index 25e029ffd..ce1b2e89f 100644 --- a/comfy_extras/nodes_flux.py +++ b/comfy_extras/nodes_flux.py @@ -1,60 +1,80 @@ import node_helpers import comfy.utils +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io -class CLIPTextEncodeFlux: + +class CLIPTextEncodeFlux(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "clip": ("CLIP", ), - "clip_l": ("STRING", {"multiline": True, "dynamicPrompts": True}), - "t5xxl": ("STRING", {"multiline": True, "dynamicPrompts": True}), - "guidance": ("FLOAT", {"default": 3.5, "min": 0.0, "max": 100.0, "step": 0.1}), - }} - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "encode" + def define_schema(cls): + return io.Schema( + node_id="CLIPTextEncodeFlux", + category="advanced/conditioning/flux", + inputs=[ + io.Clip.Input("clip"), + io.String.Input("clip_l", multiline=True, dynamic_prompts=True), + io.String.Input("t5xxl", multiline=True, dynamic_prompts=True), + io.Float.Input("guidance", default=3.5, min=0.0, max=100.0, step=0.1), + ], + outputs=[ + io.Conditioning.Output(), + ], + ) - CATEGORY = "advanced/conditioning/flux" - - def encode(self, clip, clip_l, t5xxl, guidance): + @classmethod + def execute(cls, clip, clip_l, t5xxl, guidance) -> io.NodeOutput: tokens = clip.tokenize(clip_l) tokens["t5xxl"] = clip.tokenize(t5xxl)["t5xxl"] - return (clip.encode_from_tokens_scheduled(tokens, add_dict={"guidance": guidance}), ) + return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens, add_dict={"guidance": guidance})) -class FluxGuidance: + encode = execute # TODO: remove + + +class FluxGuidance(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "conditioning": ("CONDITIONING", ), - "guidance": ("FLOAT", {"default": 3.5, "min": 0.0, "max": 100.0, "step": 0.1}), - }} + def define_schema(cls): + return io.Schema( + node_id="FluxGuidance", + category="advanced/conditioning/flux", + inputs=[ + io.Conditioning.Input("conditioning"), + io.Float.Input("guidance", default=3.5, min=0.0, max=100.0, step=0.1), + ], + outputs=[ + io.Conditioning.Output(), + ], + ) - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "append" - - CATEGORY = "advanced/conditioning/flux" - - def append(self, conditioning, guidance): + @classmethod + def execute(cls, conditioning, guidance) -> io.NodeOutput: c = node_helpers.conditioning_set_values(conditioning, {"guidance": guidance}) - return (c, ) + return io.NodeOutput(c) + + append = execute # TODO: remove -class FluxDisableGuidance: +class FluxDisableGuidance(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "conditioning": ("CONDITIONING", ), - }} + def define_schema(cls): + return io.Schema( + node_id="FluxDisableGuidance", + category="advanced/conditioning/flux", + description="This node completely disables the guidance embed on Flux and Flux like models", + inputs=[ + io.Conditioning.Input("conditioning"), + ], + outputs=[ + io.Conditioning.Output(), + ], + ) - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "append" - - CATEGORY = "advanced/conditioning/flux" - DESCRIPTION = "This node completely disables the guidance embed on Flux and Flux like models" - - def append(self, conditioning): + @classmethod + def execute(cls, conditioning) -> io.NodeOutput: c = node_helpers.conditioning_set_values(conditioning, {"guidance": None}) - return (c, ) + return io.NodeOutput(c) + + append = execute # TODO: remove PREFERED_KONTEXT_RESOLUTIONS = [ @@ -78,52 +98,73 @@ PREFERED_KONTEXT_RESOLUTIONS = [ ] -class FluxKontextImageScale: +class FluxKontextImageScale(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"image": ("IMAGE", ), - }, - } + def define_schema(cls): + return io.Schema( + node_id="FluxKontextImageScale", + category="advanced/conditioning/flux", + description="This node resizes the image to one that is more optimal for flux kontext.", + inputs=[ + io.Image.Input("image"), + ], + outputs=[ + io.Image.Output(), + ], + ) - RETURN_TYPES = ("IMAGE",) - FUNCTION = "scale" - - CATEGORY = "advanced/conditioning/flux" - DESCRIPTION = "This node resizes the image to one that is more optimal for flux kontext." - - def scale(self, image): + @classmethod + def execute(cls, image) -> io.NodeOutput: width = image.shape[2] height = image.shape[1] aspect_ratio = width / height _, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERED_KONTEXT_RESOLUTIONS) image = comfy.utils.common_upscale(image.movedim(-1, 1), width, height, "lanczos", "center").movedim(1, -1) - return (image, ) + return io.NodeOutput(image) + + scale = execute # TODO: remove -class FluxKontextMultiReferenceLatentMethod: +class FluxKontextMultiReferenceLatentMethod(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "conditioning": ("CONDITIONING", ), - "reference_latents_method": (("offset", "index", "uxo/uno"), ), - }} + def define_schema(cls): + return io.Schema( + node_id="FluxKontextMultiReferenceLatentMethod", + category="advanced/conditioning/flux", + inputs=[ + io.Conditioning.Input("conditioning"), + io.Combo.Input( + "reference_latents_method", + options=["offset", "index", "uxo/uno"], + ), + ], + outputs=[ + io.Conditioning.Output(), + ], + is_experimental=True, + ) - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "append" - EXPERIMENTAL = True - - CATEGORY = "advanced/conditioning/flux" - - def append(self, conditioning, reference_latents_method): + @classmethod + def execute(cls, conditioning, reference_latents_method) -> io.NodeOutput: if "uxo" in reference_latents_method or "uso" in reference_latents_method: reference_latents_method = "uxo" c = node_helpers.conditioning_set_values(conditioning, {"reference_latents_method": reference_latents_method}) - return (c, ) + return io.NodeOutput(c) -NODE_CLASS_MAPPINGS = { - "CLIPTextEncodeFlux": CLIPTextEncodeFlux, - "FluxGuidance": FluxGuidance, - "FluxDisableGuidance": FluxDisableGuidance, - "FluxKontextImageScale": FluxKontextImageScale, - "FluxKontextMultiReferenceLatentMethod": FluxKontextMultiReferenceLatentMethod, -} + append = execute # TODO: remove + + +class FluxExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + CLIPTextEncodeFlux, + FluxGuidance, + FluxDisableGuidance, + FluxKontextImageScale, + FluxKontextMultiReferenceLatentMethod, + ] + + +async def comfy_entrypoint() -> FluxExtension: + return FluxExtension() diff --git a/comfy_extras/nodes_gits.py b/comfy_extras/nodes_gits.py index 47b1dd049..25367560a 100644 --- a/comfy_extras/nodes_gits.py +++ b/comfy_extras/nodes_gits.py @@ -1,6 +1,8 @@ # from https://github.com/zju-pi/diff-sampler/tree/main/gits-main import numpy as np import torch +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io def loglinear_interp(t_steps, num_steps): """ @@ -333,25 +335,28 @@ NOISE_LEVELS = { ], } -class GITSScheduler: +class GITSScheduler(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"coeff": ("FLOAT", {"default": 1.20, "min": 0.80, "max": 1.50, "step": 0.05}), - "steps": ("INT", {"default": 10, "min": 2, "max": 1000}), - "denoise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}), - } - } - RETURN_TYPES = ("SIGMAS",) - CATEGORY = "sampling/custom_sampling/schedulers" + def define_schema(cls): + return io.Schema( + node_id="GITSScheduler", + category="sampling/custom_sampling/schedulers", + inputs=[ + io.Float.Input("coeff", default=1.20, min=0.80, max=1.50, step=0.05), + io.Int.Input("steps", default=10, min=2, max=1000), + io.Float.Input("denoise", default=1.0, min=0.0, max=1.0, step=0.01), + ], + outputs=[ + io.Sigmas.Output(), + ], + ) - FUNCTION = "get_sigmas" - - def get_sigmas(self, coeff, steps, denoise): + @classmethod + def execute(cls, coeff, steps, denoise): total_steps = steps if denoise < 1.0: if denoise <= 0.0: - return (torch.FloatTensor([]),) + return io.NodeOutput(torch.FloatTensor([])) total_steps = round(steps * denoise) if steps <= 20: @@ -362,8 +367,16 @@ class GITSScheduler: sigmas = sigmas[-(total_steps + 1):] sigmas[-1] = 0 - return (torch.FloatTensor(sigmas), ) + return io.NodeOutput(torch.FloatTensor(sigmas)) -NODE_CLASS_MAPPINGS = { - "GITSScheduler": GITSScheduler, -} + +class GITSSchedulerExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + GITSScheduler, + ] + + +async def comfy_entrypoint() -> GITSSchedulerExtension: + return GITSSchedulerExtension() diff --git a/comfy_extras/nodes_hunyuan.py b/comfy_extras/nodes_hunyuan.py index db398cdf1..f7c34d059 100644 --- a/comfy_extras/nodes_hunyuan.py +++ b/comfy_extras/nodes_hunyuan.py @@ -2,42 +2,60 @@ import nodes import node_helpers import torch import comfy.model_management +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io -class CLIPTextEncodeHunyuanDiT: +class CLIPTextEncodeHunyuanDiT(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "clip": ("CLIP", ), - "bert": ("STRING", {"multiline": True, "dynamicPrompts": True}), - "mt5xl": ("STRING", {"multiline": True, "dynamicPrompts": True}), - }} - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "encode" + def define_schema(cls): + return io.Schema( + node_id="CLIPTextEncodeHunyuanDiT", + category="advanced/conditioning", + inputs=[ + io.Clip.Input("clip"), + io.String.Input("bert", multiline=True, dynamic_prompts=True), + io.String.Input("mt5xl", multiline=True, dynamic_prompts=True), + ], + outputs=[ + io.Conditioning.Output(), + ], + ) - CATEGORY = "advanced/conditioning" - - def encode(self, clip, bert, mt5xl): + @classmethod + def execute(cls, clip, bert, mt5xl) -> io.NodeOutput: tokens = clip.tokenize(bert) tokens["mt5xl"] = clip.tokenize(mt5xl)["mt5xl"] - return (clip.encode_from_tokens_scheduled(tokens), ) + return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens)) -class EmptyHunyuanLatentVideo: + encode = execute # TODO: remove + + +class EmptyHunyuanLatentVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 25, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}} - RETURN_TYPES = ("LATENT",) - FUNCTION = "generate" + def define_schema(cls): + return io.Schema( + node_id="EmptyHunyuanLatentVideo", + category="latent/video", + inputs=[ + io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=25, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + ], + outputs=[ + io.Latent.Output(), + ], + ) - CATEGORY = "latent/video" - - def generate(self, width, height, length, batch_size=1): + @classmethod + def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput: latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) - return ({"samples":latent}, ) + return io.NodeOutput({"samples":latent}) + + generate = execute # TODO: remove + PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = ( "<|start_header_id|>system<|end_header_id|>\n\n\nDescribe the video by detailing the following aspects according to the reference image: " @@ -50,45 +68,61 @@ PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = ( "<|start_header_id|>assistant<|end_header_id|>\n\n" ) -class TextEncodeHunyuanVideo_ImageToVideo: +class TextEncodeHunyuanVideo_ImageToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "clip": ("CLIP", ), - "clip_vision_output": ("CLIP_VISION_OUTPUT", ), - "prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}), - "image_interleave": ("INT", {"default": 2, "min": 1, "max": 512, "tooltip": "How much the image influences things vs the text prompt. Higher number means more influence from the text prompt."}), - }} - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "encode" + def define_schema(cls): + return io.Schema( + node_id="TextEncodeHunyuanVideo_ImageToVideo", + category="advanced/conditioning", + inputs=[ + io.Clip.Input("clip"), + io.ClipVisionOutput.Input("clip_vision_output"), + io.String.Input("prompt", multiline=True, dynamic_prompts=True), + io.Int.Input( + "image_interleave", + default=2, + min=1, + max=512, + tooltip="How much the image influences things vs the text prompt. Higher number means more influence from the text prompt.", + ), + ], + outputs=[ + io.Conditioning.Output(), + ], + ) - CATEGORY = "advanced/conditioning" - - def encode(self, clip, clip_vision_output, prompt, image_interleave): + @classmethod + def execute(cls, clip, clip_vision_output, prompt, image_interleave) -> io.NodeOutput: tokens = clip.tokenize(prompt, llama_template=PROMPT_TEMPLATE_ENCODE_VIDEO_I2V, image_embeds=clip_vision_output.mm_projected, image_interleave=image_interleave) - return (clip.encode_from_tokens_scheduled(tokens), ) + return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens)) -class HunyuanImageToVideo: + encode = execute # TODO: remove + + +class HunyuanImageToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "vae": ("VAE", ), - "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 53, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - "guidance_type": (["v1 (concat)", "v2 (replace)", "custom"], ) - }, - "optional": {"start_image": ("IMAGE", ), - }} + def define_schema(cls): + return io.Schema( + node_id="HunyuanImageToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Vae.Input("vae"), + io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=53, min=1, max=nodes.MAX_RESOLUTION, step=4), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Combo.Input("guidance_type", options=["v1 (concat)", "v2 (replace)", "custom"]), + io.Image.Input("start_image", optional=True), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "latent") - FUNCTION = "encode" - - CATEGORY = "conditioning/video_models" - - def encode(self, positive, vae, width, height, length, batch_size, guidance_type, start_image=None): + @classmethod + def execute(cls, positive, vae, width, height, length, batch_size, guidance_type, start_image=None) -> io.NodeOutput: latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) out_latent = {} @@ -111,51 +145,76 @@ class HunyuanImageToVideo: positive = node_helpers.conditioning_set_values(positive, cond) out_latent["samples"] = latent - return (positive, out_latent) + return io.NodeOutput(positive, out_latent) -class EmptyHunyuanImageLatent: + encode = execute # TODO: remove + + +class EmptyHunyuanImageLatent(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "width": ("INT", {"default": 2048, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}), - "height": ("INT", {"default": 2048, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}} - RETURN_TYPES = ("LATENT",) - FUNCTION = "generate" + def define_schema(cls): + return io.Schema( + node_id="EmptyHunyuanImageLatent", + category="latent", + inputs=[ + io.Int.Input("width", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32), + io.Int.Input("height", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32), + io.Int.Input("batch_size", default=1, min=1, max=4096), + ], + outputs=[ + io.Latent.Output(), + ], + ) - CATEGORY = "latent" - - def generate(self, width, height, batch_size=1): + @classmethod + def execute(cls, width, height, batch_size=1) -> io.NodeOutput: latent = torch.zeros([batch_size, 64, height // 32, width // 32], device=comfy.model_management.intermediate_device()) - return ({"samples":latent}, ) + return io.NodeOutput({"samples":latent}) -class HunyuanRefinerLatent: + generate = execute # TODO: remove + + +class HunyuanRefinerLatent(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "latent": ("LATENT", ), - "noise_augmentation": ("FLOAT", {"default": 0.10, "min": 0.0, "max": 1.0, "step": 0.01}), - }} + def define_schema(cls): + return io.Schema( + node_id="HunyuanRefinerLatent", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Latent.Input("latent"), + io.Float.Input("noise_augmentation", default=0.10, min=0.0, max=1.0, step=0.01), - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - FUNCTION = "execute" - - def execute(self, positive, negative, latent, noise_augmentation): + @classmethod + def execute(cls, positive, negative, latent, noise_augmentation) -> io.NodeOutput: latent = latent["samples"] positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": latent, "noise_augmentation": noise_augmentation}) negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": latent, "noise_augmentation": noise_augmentation}) out_latent = {} out_latent["samples"] = torch.zeros([latent.shape[0], 32, latent.shape[-3], latent.shape[-2], latent.shape[-1]], device=comfy.model_management.intermediate_device()) - return (positive, negative, out_latent) + return io.NodeOutput(positive, negative, out_latent) -NODE_CLASS_MAPPINGS = { - "CLIPTextEncodeHunyuanDiT": CLIPTextEncodeHunyuanDiT, - "TextEncodeHunyuanVideo_ImageToVideo": TextEncodeHunyuanVideo_ImageToVideo, - "EmptyHunyuanLatentVideo": EmptyHunyuanLatentVideo, - "HunyuanImageToVideo": HunyuanImageToVideo, - "EmptyHunyuanImageLatent": EmptyHunyuanImageLatent, - "HunyuanRefinerLatent": HunyuanRefinerLatent, -} +class HunyuanExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + CLIPTextEncodeHunyuanDiT, + TextEncodeHunyuanVideo_ImageToVideo, + EmptyHunyuanLatentVideo, + HunyuanImageToVideo, + EmptyHunyuanImageLatent, + HunyuanRefinerLatent, + ] + + +async def comfy_entrypoint() -> HunyuanExtension: + return HunyuanExtension() diff --git a/comfy_extras/nodes_hypernetwork.py b/comfy_extras/nodes_hypernetwork.py index 665632292..2a6a87a81 100644 --- a/comfy_extras/nodes_hypernetwork.py +++ b/comfy_extras/nodes_hypernetwork.py @@ -2,6 +2,9 @@ import comfy.utils import folder_paths import torch import logging +from comfy_api.latest import IO, ComfyExtension +from typing_extensions import override + def load_hypernetwork_patch(path, strength): sd = comfy.utils.load_torch_file(path, safe_load=True) @@ -94,27 +97,42 @@ def load_hypernetwork_patch(path, strength): return hypernetwork_patch(out, strength) -class HypernetworkLoader: +class HypernetworkLoader(IO.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "hypernetwork_name": (folder_paths.get_filename_list("hypernetworks"), ), - "strength": ("FLOAT", {"default": 1.0, "min": -10.0, "max": 10.0, "step": 0.01}), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "load_hypernetwork" + def define_schema(cls): + return IO.Schema( + node_id="HypernetworkLoader", + category="loaders", + inputs=[ + IO.Model.Input("model"), + IO.Combo.Input("hypernetwork_name", options=folder_paths.get_filename_list("hypernetworks")), + IO.Float.Input("strength", default=1.0, min=-10.0, max=10.0, step=0.01), + ], + outputs=[ + IO.Model.Output(), + ], + ) - CATEGORY = "loaders" - - def load_hypernetwork(self, model, hypernetwork_name, strength): + @classmethod + def execute(cls, model, hypernetwork_name, strength) -> IO.NodeOutput: hypernetwork_path = folder_paths.get_full_path_or_raise("hypernetworks", hypernetwork_name) model_hypernetwork = model.clone() patch = load_hypernetwork_patch(hypernetwork_path, strength) if patch is not None: model_hypernetwork.set_model_attn1_patch(patch) model_hypernetwork.set_model_attn2_patch(patch) - return (model_hypernetwork,) + return IO.NodeOutput(model_hypernetwork) -NODE_CLASS_MAPPINGS = { - "HypernetworkLoader": HypernetworkLoader -} + load_hypernetwork = execute # TODO: remove + + +class HyperNetworkExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + HypernetworkLoader, + ] + + +async def comfy_entrypoint() -> HyperNetworkExtension: + return HyperNetworkExtension() diff --git a/comfy_extras/nodes_hypertile.py b/comfy_extras/nodes_hypertile.py index b366117c7..0ad5e6773 100644 --- a/comfy_extras/nodes_hypertile.py +++ b/comfy_extras/nodes_hypertile.py @@ -1,9 +1,11 @@ #Taken from: https://github.com/tfernd/HyperTile/ import math +from typing_extensions import override from einops import rearrange # Use torch rng for consistency across generations from torch import randint +from comfy_api.latest import ComfyExtension, io def random_divisor(value: int, min_value: int, /, max_options: int = 1) -> int: min_value = min(min_value, value) @@ -20,25 +22,31 @@ def random_divisor(value: int, min_value: int, /, max_options: int = 1) -> int: return ns[idx] -class HyperTile: +class HyperTile(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "tile_size": ("INT", {"default": 256, "min": 1, "max": 2048}), - "swap_size": ("INT", {"default": 2, "min": 1, "max": 128}), - "max_depth": ("INT", {"default": 0, "min": 0, "max": 10}), - "scale_depth": ("BOOLEAN", {"default": False}), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" + def define_schema(cls): + return io.Schema( + node_id="HyperTile", + category="model_patches/unet", + inputs=[ + io.Model.Input("model"), + io.Int.Input("tile_size", default=256, min=1, max=2048), + io.Int.Input("swap_size", default=2, min=1, max=128), + io.Int.Input("max_depth", default=0, min=0, max=10), + io.Boolean.Input("scale_depth", default=False), + ], + outputs=[ + io.Model.Output(), + ], + ) - CATEGORY = "model_patches/unet" - - def patch(self, model, tile_size, swap_size, max_depth, scale_depth): + @classmethod + def execute(cls, model, tile_size, swap_size, max_depth, scale_depth) -> io.NodeOutput: latent_tile_size = max(32, tile_size) // 8 - self.temp = None + temp = None def hypertile_in(q, k, v, extra_options): + nonlocal temp model_chans = q.shape[-2] orig_shape = extra_options['original_shape'] apply_to = [] @@ -58,14 +66,15 @@ class HyperTile: if nh * nw > 1: q = rearrange(q, "b (nh h nw w) c -> (b nh nw) (h w) c", h=h // nh, w=w // nw, nh=nh, nw=nw) - self.temp = (nh, nw, h, w) + temp = (nh, nw, h, w) return q, k, v return q, k, v def hypertile_out(out, extra_options): - if self.temp is not None: - nh, nw, h, w = self.temp - self.temp = None + nonlocal temp + if temp is not None: + nh, nw, h, w = temp + temp = None out = rearrange(out, "(b nh nw) hw c -> b nh nw hw c", nh=nh, nw=nw) out = rearrange(out, "b nh nw (h w) c -> b (nh h nw w) c", h=h // nh, w=w // nw) return out @@ -76,6 +85,14 @@ class HyperTile: m.set_model_attn1_output_patch(hypertile_out) return (m, ) -NODE_CLASS_MAPPINGS = { - "HyperTile": HyperTile, -} + +class HyperTileExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + HyperTile, + ] + + +async def comfy_entrypoint() -> HyperTileExtension: + return HyperTileExtension() diff --git a/comfy_extras/nodes_ip2p.py b/comfy_extras/nodes_ip2p.py index c2e70a84c..78f29915d 100644 --- a/comfy_extras/nodes_ip2p.py +++ b/comfy_extras/nodes_ip2p.py @@ -1,21 +1,30 @@ import torch -class InstructPixToPixConditioning: +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io + + +class InstructPixToPixConditioning(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE", ), - "pixels": ("IMAGE", ), - }} + def define_schema(cls): + return io.Schema( + node_id="InstructPixToPixConditioning", + category="conditioning/instructpix2pix", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Image.Input("pixels"), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING","CONDITIONING","LATENT") - RETURN_NAMES = ("positive", "negative", "latent") - FUNCTION = "encode" - - CATEGORY = "conditioning/instructpix2pix" - - def encode(self, positive, negative, pixels, vae): + @classmethod + def execute(cls, positive, negative, pixels, vae) -> io.NodeOutput: x = (pixels.shape[1] // 8) * 8 y = (pixels.shape[2] // 8) * 8 @@ -38,8 +47,17 @@ class InstructPixToPixConditioning: n = [t[0], d] c.append(n) out.append(c) - return (out[0], out[1], out_latent) + return io.NodeOutput(out[0], out[1], out_latent) + + +class InstructPix2PixExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + InstructPixToPixConditioning, + ] + + +async def comfy_entrypoint() -> InstructPix2PixExtension: + return InstructPix2PixExtension() -NODE_CLASS_MAPPINGS = { - "InstructPixToPixConditioning": InstructPixToPixConditioning, -} diff --git a/comfy_extras/nodes_latent.py b/comfy_extras/nodes_latent.py index 0f90cf60c..d2df07ff9 100644 --- a/comfy_extras/nodes_latent.py +++ b/comfy_extras/nodes_latent.py @@ -2,6 +2,8 @@ import comfy.utils import comfy_extras.nodes_post_processing import torch import nodes +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io def reshape_latent_to(target_shape, latent, repeat_batch=True): @@ -13,17 +15,23 @@ def reshape_latent_to(target_shape, latent, repeat_batch=True): return latent -class LatentAdd: +class LatentAdd(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "samples1": ("LATENT",), "samples2": ("LATENT",)}} + def define_schema(cls): + return io.Schema( + node_id="LatentAdd", + category="latent/advanced", + inputs=[ + io.Latent.Input("samples1"), + io.Latent.Input("samples2"), + ], + outputs=[ + io.Latent.Output(), + ], + ) - RETURN_TYPES = ("LATENT",) - FUNCTION = "op" - - CATEGORY = "latent/advanced" - - def op(self, samples1, samples2): + @classmethod + def execute(cls, samples1, samples2) -> io.NodeOutput: samples_out = samples1.copy() s1 = samples1["samples"] @@ -31,19 +39,25 @@ class LatentAdd: s2 = reshape_latent_to(s1.shape, s2) samples_out["samples"] = s1 + s2 - return (samples_out,) + return io.NodeOutput(samples_out) -class LatentSubtract: +class LatentSubtract(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "samples1": ("LATENT",), "samples2": ("LATENT",)}} + def define_schema(cls): + return io.Schema( + node_id="LatentSubtract", + category="latent/advanced", + inputs=[ + io.Latent.Input("samples1"), + io.Latent.Input("samples2"), + ], + outputs=[ + io.Latent.Output(), + ], + ) - RETURN_TYPES = ("LATENT",) - FUNCTION = "op" - - CATEGORY = "latent/advanced" - - def op(self, samples1, samples2): + @classmethod + def execute(cls, samples1, samples2) -> io.NodeOutput: samples_out = samples1.copy() s1 = samples1["samples"] @@ -51,41 +65,49 @@ class LatentSubtract: s2 = reshape_latent_to(s1.shape, s2) samples_out["samples"] = s1 - s2 - return (samples_out,) + return io.NodeOutput(samples_out) -class LatentMultiply: +class LatentMultiply(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "samples": ("LATENT",), - "multiplier": ("FLOAT", {"default": 1.0, "min": -10.0, "max": 10.0, "step": 0.01}), - }} + def define_schema(cls): + return io.Schema( + node_id="LatentMultiply", + category="latent/advanced", + inputs=[ + io.Latent.Input("samples"), + io.Float.Input("multiplier", default=1.0, min=-10.0, max=10.0, step=0.01), + ], + outputs=[ + io.Latent.Output(), + ], + ) - RETURN_TYPES = ("LATENT",) - FUNCTION = "op" - - CATEGORY = "latent/advanced" - - def op(self, samples, multiplier): + @classmethod + def execute(cls, samples, multiplier) -> io.NodeOutput: samples_out = samples.copy() s1 = samples["samples"] samples_out["samples"] = s1 * multiplier - return (samples_out,) + return io.NodeOutput(samples_out) -class LatentInterpolate: +class LatentInterpolate(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "samples1": ("LATENT",), - "samples2": ("LATENT",), - "ratio": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}), - }} + def define_schema(cls): + return io.Schema( + node_id="LatentInterpolate", + category="latent/advanced", + inputs=[ + io.Latent.Input("samples1"), + io.Latent.Input("samples2"), + io.Float.Input("ratio", default=1.0, min=0.0, max=1.0, step=0.01), + ], + outputs=[ + io.Latent.Output(), + ], + ) - RETURN_TYPES = ("LATENT",) - FUNCTION = "op" - - CATEGORY = "latent/advanced" - - def op(self, samples1, samples2, ratio): + @classmethod + def execute(cls, samples1, samples2, ratio) -> io.NodeOutput: samples_out = samples1.copy() s1 = samples1["samples"] @@ -104,19 +126,26 @@ class LatentInterpolate: st = torch.nan_to_num(t / mt) samples_out["samples"] = st * (m1 * ratio + m2 * (1.0 - ratio)) - return (samples_out,) + return io.NodeOutput(samples_out) -class LatentConcat: +class LatentConcat(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "samples1": ("LATENT",), "samples2": ("LATENT",), "dim": (["x", "-x", "y", "-y", "t", "-t"], )}} + def define_schema(cls): + return io.Schema( + node_id="LatentConcat", + category="latent/advanced", + inputs=[ + io.Latent.Input("samples1"), + io.Latent.Input("samples2"), + io.Combo.Input("dim", options=["x", "-x", "y", "-y", "t", "-t"]), + ], + outputs=[ + io.Latent.Output(), + ], + ) - RETURN_TYPES = ("LATENT",) - FUNCTION = "op" - - CATEGORY = "latent/advanced" - - def op(self, samples1, samples2, dim): + @classmethod + def execute(cls, samples1, samples2, dim) -> io.NodeOutput: samples_out = samples1.copy() s1 = samples1["samples"] @@ -136,22 +165,27 @@ class LatentConcat: dim = -3 samples_out["samples"] = torch.cat(c, dim=dim) - return (samples_out,) + return io.NodeOutput(samples_out) -class LatentCut: +class LatentCut(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"samples": ("LATENT",), - "dim": (["x", "y", "t"], ), - "index": ("INT", {"default": 0, "min": -nodes.MAX_RESOLUTION, "max": nodes.MAX_RESOLUTION, "step": 1}), - "amount": ("INT", {"default": 1, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 1})}} + def define_schema(cls): + return io.Schema( + node_id="LatentCut", + category="latent/advanced", + inputs=[ + io.Latent.Input("samples"), + io.Combo.Input("dim", options=["x", "y", "t"]), + io.Int.Input("index", default=0, min=-nodes.MAX_RESOLUTION, max=nodes.MAX_RESOLUTION, step=1), + io.Int.Input("amount", default=1, min=1, max=nodes.MAX_RESOLUTION, step=1), + ], + outputs=[ + io.Latent.Output(), + ], + ) - RETURN_TYPES = ("LATENT",) - FUNCTION = "op" - - CATEGORY = "latent/advanced" - - def op(self, samples, dim, index, amount): + @classmethod + def execute(cls, samples, dim, index, amount) -> io.NodeOutput: samples_out = samples.copy() s1 = samples["samples"] @@ -171,19 +205,25 @@ class LatentCut: amount = min(-index, amount) samples_out["samples"] = torch.narrow(s1, dim, index, amount) - return (samples_out,) + return io.NodeOutput(samples_out) -class LatentBatch: +class LatentBatch(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "samples1": ("LATENT",), "samples2": ("LATENT",)}} + def define_schema(cls): + return io.Schema( + node_id="LatentBatch", + category="latent/batch", + inputs=[ + io.Latent.Input("samples1"), + io.Latent.Input("samples2"), + ], + outputs=[ + io.Latent.Output(), + ], + ) - RETURN_TYPES = ("LATENT",) - FUNCTION = "batch" - - CATEGORY = "latent/batch" - - def batch(self, samples1, samples2): + @classmethod + def execute(cls, samples1, samples2) -> io.NodeOutput: samples_out = samples1.copy() s1 = samples1["samples"] s2 = samples2["samples"] @@ -192,20 +232,25 @@ class LatentBatch: s = torch.cat((s1, s2), dim=0) samples_out["samples"] = s samples_out["batch_index"] = samples1.get("batch_index", [x for x in range(0, s1.shape[0])]) + samples2.get("batch_index", [x for x in range(0, s2.shape[0])]) - return (samples_out,) + return io.NodeOutput(samples_out) -class LatentBatchSeedBehavior: +class LatentBatchSeedBehavior(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "samples": ("LATENT",), - "seed_behavior": (["random", "fixed"],{"default": "fixed"}),}} + def define_schema(cls): + return io.Schema( + node_id="LatentBatchSeedBehavior", + category="latent/advanced", + inputs=[ + io.Latent.Input("samples"), + io.Combo.Input("seed_behavior", options=["random", "fixed"], default="fixed"), + ], + outputs=[ + io.Latent.Output(), + ], + ) - RETURN_TYPES = ("LATENT",) - FUNCTION = "op" - - CATEGORY = "latent/advanced" - - def op(self, samples, seed_behavior): + @classmethod + def execute(cls, samples, seed_behavior) -> io.NodeOutput: samples_out = samples.copy() latent = samples["samples"] if seed_behavior == "random": @@ -215,41 +260,50 @@ class LatentBatchSeedBehavior: batch_number = samples_out.get("batch_index", [0])[0] samples_out["batch_index"] = [batch_number] * latent.shape[0] - return (samples_out,) + return io.NodeOutput(samples_out) -class LatentApplyOperation: +class LatentApplyOperation(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "samples": ("LATENT",), - "operation": ("LATENT_OPERATION",), - }} + def define_schema(cls): + return io.Schema( + node_id="LatentApplyOperation", + category="latent/advanced/operations", + is_experimental=True, + inputs=[ + io.Latent.Input("samples"), + io.LatentOperation.Input("operation"), + ], + outputs=[ + io.Latent.Output(), + ], + ) - RETURN_TYPES = ("LATENT",) - FUNCTION = "op" - - CATEGORY = "latent/advanced/operations" - EXPERIMENTAL = True - - def op(self, samples, operation): + @classmethod + def execute(cls, samples, operation) -> io.NodeOutput: samples_out = samples.copy() s1 = samples["samples"] samples_out["samples"] = operation(latent=s1) - return (samples_out,) + return io.NodeOutput(samples_out) -class LatentApplyOperationCFG: +class LatentApplyOperationCFG(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "operation": ("LATENT_OPERATION",), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" + def define_schema(cls): + return io.Schema( + node_id="LatentApplyOperationCFG", + category="latent/advanced/operations", + is_experimental=True, + inputs=[ + io.Model.Input("model"), + io.LatentOperation.Input("operation"), + ], + outputs=[ + io.Model.Output(), + ], + ) - CATEGORY = "latent/advanced/operations" - EXPERIMENTAL = True - - def patch(self, model, operation): + @classmethod + def execute(cls, model, operation) -> io.NodeOutput: m = model.clone() def pre_cfg_function(args): @@ -261,21 +315,25 @@ class LatentApplyOperationCFG: return conds_out m.set_model_sampler_pre_cfg_function(pre_cfg_function) - return (m, ) + return io.NodeOutput(m) -class LatentOperationTonemapReinhard: +class LatentOperationTonemapReinhard(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "multiplier": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.01}), - }} + def define_schema(cls): + return io.Schema( + node_id="LatentOperationTonemapReinhard", + category="latent/advanced/operations", + is_experimental=True, + inputs=[ + io.Float.Input("multiplier", default=1.0, min=0.0, max=100.0, step=0.01), + ], + outputs=[ + io.LatentOperation.Output(), + ], + ) - RETURN_TYPES = ("LATENT_OPERATION",) - FUNCTION = "op" - - CATEGORY = "latent/advanced/operations" - EXPERIMENTAL = True - - def op(self, multiplier): + @classmethod + def execute(cls, multiplier) -> io.NodeOutput: def tonemap_reinhard(latent, **kwargs): latent_vector_magnitude = (torch.linalg.vector_norm(latent, dim=(1)) + 0.0000000001)[:,None] normalized_latent = latent / latent_vector_magnitude @@ -291,39 +349,27 @@ class LatentOperationTonemapReinhard: new_magnitude *= top return normalized_latent * new_magnitude - return (tonemap_reinhard,) + return io.NodeOutput(tonemap_reinhard) -class LatentOperationSharpen: +class LatentOperationSharpen(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "sharpen_radius": ("INT", { - "default": 9, - "min": 1, - "max": 31, - "step": 1 - }), - "sigma": ("FLOAT", { - "default": 1.0, - "min": 0.1, - "max": 10.0, - "step": 0.1 - }), - "alpha": ("FLOAT", { - "default": 0.1, - "min": 0.0, - "max": 5.0, - "step": 0.01 - }), - }} + def define_schema(cls): + return io.Schema( + node_id="LatentOperationSharpen", + category="latent/advanced/operations", + is_experimental=True, + inputs=[ + io.Int.Input("sharpen_radius", default=9, min=1, max=31, step=1), + io.Float.Input("sigma", default=1.0, min=0.1, max=10.0, step=0.1), + io.Float.Input("alpha", default=0.1, min=0.0, max=5.0, step=0.01), + ], + outputs=[ + io.LatentOperation.Output(), + ], + ) - RETURN_TYPES = ("LATENT_OPERATION",) - FUNCTION = "op" - - CATEGORY = "latent/advanced/operations" - EXPERIMENTAL = True - - def op(self, sharpen_radius, sigma, alpha): + @classmethod + def execute(cls, sharpen_radius, sigma, alpha) -> io.NodeOutput: def sharpen(latent, **kwargs): luminance = (torch.linalg.vector_norm(latent, dim=(1)) + 1e-6)[:,None] normalized_latent = latent / luminance @@ -340,19 +386,27 @@ class LatentOperationSharpen: sharpened = torch.nn.functional.conv2d(padded_image, kernel.repeat(channels, 1, 1).unsqueeze(1), padding=kernel_size // 2, groups=channels)[:,:,sharpen_radius:-sharpen_radius, sharpen_radius:-sharpen_radius] return luminance * sharpened - return (sharpen,) + return io.NodeOutput(sharpen) -NODE_CLASS_MAPPINGS = { - "LatentAdd": LatentAdd, - "LatentSubtract": LatentSubtract, - "LatentMultiply": LatentMultiply, - "LatentInterpolate": LatentInterpolate, - "LatentConcat": LatentConcat, - "LatentCut": LatentCut, - "LatentBatch": LatentBatch, - "LatentBatchSeedBehavior": LatentBatchSeedBehavior, - "LatentApplyOperation": LatentApplyOperation, - "LatentApplyOperationCFG": LatentApplyOperationCFG, - "LatentOperationTonemapReinhard": LatentOperationTonemapReinhard, - "LatentOperationSharpen": LatentOperationSharpen, -} + +class LatentExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + LatentAdd, + LatentSubtract, + LatentMultiply, + LatentInterpolate, + LatentConcat, + LatentCut, + LatentBatch, + LatentBatchSeedBehavior, + LatentApplyOperation, + LatentApplyOperationCFG, + LatentOperationTonemapReinhard, + LatentOperationSharpen, + ] + + +async def comfy_entrypoint() -> LatentExtension: + return LatentExtension() diff --git a/comfy_extras/nodes_lora_extract.py b/comfy_extras/nodes_lora_extract.py index dfd4fe9f4..a2375cba7 100644 --- a/comfy_extras/nodes_lora_extract.py +++ b/comfy_extras/nodes_lora_extract.py @@ -5,6 +5,8 @@ import folder_paths import os import logging from enum import Enum +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io CLAMP_QUANTILE = 0.99 @@ -71,32 +73,40 @@ def calc_lora_model(model_diff, rank, prefix_model, prefix_lora, output_sd, lora output_sd["{}{}.diff_b".format(prefix_lora, k[len(prefix_model):-5])] = sd[k].contiguous().half().cpu() return output_sd -class LoraSave: - def __init__(self): - self.output_dir = folder_paths.get_output_directory() +class LoraSave(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="LoraSave", + display_name="Extract and Save Lora", + category="_for_testing", + inputs=[ + io.String.Input("filename_prefix", default="loras/ComfyUI_extracted_lora"), + io.Int.Input("rank", default=8, min=1, max=4096, step=1), + io.Combo.Input("lora_type", options=tuple(LORA_TYPES.keys())), + io.Boolean.Input("bias_diff", default=True), + io.Model.Input( + "model_diff", + tooltip="The ModelSubtract output to be converted to a lora.", + optional=True, + ), + io.Clip.Input( + "text_encoder_diff", + tooltip="The CLIPSubtract output to be converted to a lora.", + optional=True, + ), + ], + is_experimental=True, + is_output_node=True, + ) @classmethod - def INPUT_TYPES(s): - return {"required": {"filename_prefix": ("STRING", {"default": "loras/ComfyUI_extracted_lora"}), - "rank": ("INT", {"default": 8, "min": 1, "max": 4096, "step": 1}), - "lora_type": (tuple(LORA_TYPES.keys()),), - "bias_diff": ("BOOLEAN", {"default": True}), - }, - "optional": {"model_diff": ("MODEL", {"tooltip": "The ModelSubtract output to be converted to a lora."}), - "text_encoder_diff": ("CLIP", {"tooltip": "The CLIPSubtract output to be converted to a lora."})}, - } - RETURN_TYPES = () - FUNCTION = "save" - OUTPUT_NODE = True - - CATEGORY = "_for_testing" - - def save(self, filename_prefix, rank, lora_type, bias_diff, model_diff=None, text_encoder_diff=None): + def execute(cls, filename_prefix, rank, lora_type, bias_diff, model_diff=None, text_encoder_diff=None) -> io.NodeOutput: if model_diff is None and text_encoder_diff is None: - return {} + return io.NodeOutput() lora_type = LORA_TYPES.get(lora_type) - full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, self.output_dir) + full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, folder_paths.get_output_directory()) output_sd = {} if model_diff is not None: @@ -108,12 +118,16 @@ class LoraSave: output_checkpoint = os.path.join(full_output_folder, output_checkpoint) comfy.utils.save_torch_file(output_sd, output_checkpoint, metadata=None) - return {} + return io.NodeOutput() -NODE_CLASS_MAPPINGS = { - "LoraSave": LoraSave -} -NODE_DISPLAY_NAME_MAPPINGS = { - "LoraSave": "Extract and Save Lora" -} +class LoraSaveExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + LoraSave, + ] + + +async def comfy_entrypoint() -> LoraSaveExtension: + return LoraSaveExtension() diff --git a/comfy_extras/nodes_lotus.py b/comfy_extras/nodes_lotus.py index 739dbdd3d..9f62ba2bf 100644 --- a/comfy_extras/nodes_lotus.py +++ b/comfy_extras/nodes_lotus.py @@ -1,20 +1,22 @@ +from typing_extensions import override + import torch import comfy.model_management as mm +from comfy_api.latest import ComfyExtension, io -class LotusConditioning: + +class LotusConditioning(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - }, - } + def define_schema(cls): + return io.Schema( + node_id="LotusConditioning", + category="conditioning/lotus", + inputs=[], + outputs=[io.Conditioning.Output(display_name="conditioning")], + ) - RETURN_TYPES = ("CONDITIONING",) - RETURN_NAMES = ("conditioning",) - FUNCTION = "conditioning" - CATEGORY = "conditioning/lotus" - - def conditioning(self): + @classmethod + def execute(cls) -> io.NodeOutput: device = mm.get_torch_device() #lotus uses a frozen encoder and null conditioning, i'm just inlining the results of that operation since it doesn't change #and getting parity with the reference implementation would otherwise require inference and 800mb of tensors @@ -22,8 +24,16 @@ class LotusConditioning: cond = [[prompt_embeds, {}]] - return (cond,) + return io.NodeOutput(cond) -NODE_CLASS_MAPPINGS = { - "LotusConditioning" : LotusConditioning, -} + +class LotusExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + LotusConditioning, + ] + + +async def comfy_entrypoint() -> LotusExtension: + return LotusExtension() diff --git a/comfy_extras/nodes_lt.py b/comfy_extras/nodes_lt.py index f82337a67..50da5f4eb 100644 --- a/comfy_extras/nodes_lt.py +++ b/comfy_extras/nodes_lt.py @@ -1,4 +1,3 @@ -import io import nodes import node_helpers import torch @@ -8,46 +7,61 @@ import comfy.utils import math import numpy as np import av +from io import BytesIO +from typing_extensions import override from comfy.ldm.lightricks.symmetric_patchifier import SymmetricPatchifier, latent_to_pixel_coords +from comfy_api.latest import ComfyExtension, io -class EmptyLTXVLatentVideo: +class EmptyLTXVLatentVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "width": ("INT", {"default": 768, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}), - "height": ("INT", {"default": 512, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}), - "length": ("INT", {"default": 97, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 8}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}} - RETURN_TYPES = ("LATENT",) - FUNCTION = "generate" + def define_schema(cls): + return io.Schema( + node_id="EmptyLTXVLatentVideo", + category="latent/video/ltxv", + inputs=[ + io.Int.Input("width", default=768, min=64, max=nodes.MAX_RESOLUTION, step=32), + io.Int.Input("height", default=512, min=64, max=nodes.MAX_RESOLUTION, step=32), + io.Int.Input("length", default=97, min=1, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("batch_size", default=1, min=1, max=4096), + ], + outputs=[ + io.Latent.Output(), + ], + ) - CATEGORY = "latent/video/ltxv" - - def generate(self, width, height, length, batch_size=1): + @classmethod + def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput: latent = torch.zeros([batch_size, 128, ((length - 1) // 8) + 1, height // 32, width // 32], device=comfy.model_management.intermediate_device()) - return ({"samples": latent}, ) + return io.NodeOutput({"samples": latent}) + generate = execute # TODO: remove -class LTXVImgToVideo: +class LTXVImgToVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE",), - "image": ("IMAGE",), - "width": ("INT", {"default": 768, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}), - "height": ("INT", {"default": 512, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}), - "length": ("INT", {"default": 97, "min": 9, "max": nodes.MAX_RESOLUTION, "step": 8}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0}), - }} + def define_schema(cls): + return io.Schema( + node_id="LTXVImgToVideo", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Image.Input("image"), + io.Int.Input("width", default=768, min=64, max=nodes.MAX_RESOLUTION, step=32), + io.Int.Input("height", default=512, min=64, max=nodes.MAX_RESOLUTION, step=32), + io.Int.Input("length", default=97, min=9, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Float.Input("strength", default=1.0, min=0.0, max=1.0), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") - - CATEGORY = "conditioning/video_models" - FUNCTION = "generate" - - def generate(self, positive, negative, image, vae, width, height, length, batch_size, strength): + @classmethod + def execute(cls, positive, negative, image, vae, width, height, length, batch_size, strength) -> io.NodeOutput: pixels = comfy.utils.common_upscale(image.movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1) encode_pixels = pixels[:, :, :, :3] t = vae.encode(encode_pixels) @@ -62,7 +76,9 @@ class LTXVImgToVideo: ) conditioning_latent_frames_mask[:, :, :t.shape[2]] = 1.0 - strength - return (positive, negative, {"samples": latent, "noise_mask": conditioning_latent_frames_mask}, ) + return io.NodeOutput(positive, negative, {"samples": latent, "noise_mask": conditioning_latent_frames_mask}) + + generate = execute # TODO: remove def conditioning_get_any_value(conditioning, key, default=None): @@ -93,35 +109,46 @@ def get_keyframe_idxs(cond): num_keyframes = torch.unique(keyframe_idxs[:, 0]).shape[0] return keyframe_idxs, num_keyframes -class LTXVAddGuide: +class LTXVAddGuide(io.ComfyNode): + NUM_PREFIX_FRAMES = 2 + PATCHIFIER = SymmetricPatchifier(1) + @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "vae": ("VAE",), - "latent": ("LATENT",), - "image": ("IMAGE", {"tooltip": "Image or video to condition the latent video on. Must be 8*n + 1 frames." - "If the video is not 8*n + 1 frames, it will be cropped to the nearest 8*n + 1 frames."}), - "frame_idx": ("INT", {"default": 0, "min": -9999, "max": 9999, - "tooltip": "Frame index to start the conditioning at. For single-frame images or " - "videos with 1-8 frames, any frame_idx value is acceptable. For videos with 9+ " - "frames, frame_idx must be divisible by 8, otherwise it will be rounded down to " - "the nearest multiple of 8. Negative values are counted from the end of the video."}), - "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}), - } - } + def define_schema(cls): + return io.Schema( + node_id="LTXVAddGuide", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Latent.Input("latent"), + io.Image.Input( + "image", + tooltip="Image or video to condition the latent video on. Must be 8*n + 1 frames. " + "If the video is not 8*n + 1 frames, it will be cropped to the nearest 8*n + 1 frames.", + ), + io.Int.Input( + "frame_idx", + default=0, + min=-9999, + max=9999, + tooltip="Frame index to start the conditioning at. " + "For single-frame images or videos with 1-8 frames, any frame_idx value is acceptable. " + "For videos with 9+ frames, frame_idx must be divisible by 8, otherwise it will be rounded " + "down to the nearest multiple of 8. Negative values are counted from the end of the video.", + ), + io.Float.Input("strength", default=1.0, min=0.0, max=1.0, step=0.01), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") - - CATEGORY = "conditioning/video_models" - FUNCTION = "generate" - - def __init__(self): - self._num_prefix_frames = 2 - self._patchifier = SymmetricPatchifier(1) - - def encode(self, vae, latent_width, latent_height, images, scale_factors): + @classmethod + def encode(cls, vae, latent_width, latent_height, images, scale_factors): time_scale_factor, width_scale_factor, height_scale_factor = scale_factors images = images[:(images.shape[0] - 1) // time_scale_factor * time_scale_factor + 1] pixels = comfy.utils.common_upscale(images.movedim(-1, 1), latent_width * width_scale_factor, latent_height * height_scale_factor, "bilinear", crop="disabled").movedim(1, -1) @@ -129,7 +156,8 @@ class LTXVAddGuide: t = vae.encode(encode_pixels) return encode_pixels, t - def get_latent_index(self, cond, latent_length, guide_length, frame_idx, scale_factors): + @classmethod + def get_latent_index(cls, cond, latent_length, guide_length, frame_idx, scale_factors): time_scale_factor, _, _ = scale_factors _, num_keyframes = get_keyframe_idxs(cond) latent_count = latent_length - num_keyframes @@ -141,9 +169,10 @@ class LTXVAddGuide: return frame_idx, latent_idx - def add_keyframe_index(self, cond, frame_idx, guiding_latent, scale_factors): + @classmethod + def add_keyframe_index(cls, cond, frame_idx, guiding_latent, scale_factors): keyframe_idxs, _ = get_keyframe_idxs(cond) - _, latent_coords = self._patchifier.patchify(guiding_latent) + _, latent_coords = cls.PATCHIFIER.patchify(guiding_latent) pixel_coords = latent_to_pixel_coords(latent_coords, scale_factors, causal_fix=frame_idx == 0) # we need the causal fix only if we're placing the new latents at index 0 pixel_coords[:, 0] += frame_idx if keyframe_idxs is None: @@ -152,8 +181,9 @@ class LTXVAddGuide: keyframe_idxs = torch.cat([keyframe_idxs, pixel_coords], dim=2) return node_helpers.conditioning_set_values(cond, {"keyframe_idxs": keyframe_idxs}) - def append_keyframe(self, positive, negative, frame_idx, latent_image, noise_mask, guiding_latent, strength, scale_factors): - _, latent_idx = self.get_latent_index( + @classmethod + def append_keyframe(cls, positive, negative, frame_idx, latent_image, noise_mask, guiding_latent, strength, scale_factors): + _, latent_idx = cls.get_latent_index( cond=positive, latent_length=latent_image.shape[2], guide_length=guiding_latent.shape[2], @@ -162,8 +192,8 @@ class LTXVAddGuide: ) noise_mask[:, :, latent_idx:latent_idx + guiding_latent.shape[2]] = 1.0 - positive = self.add_keyframe_index(positive, frame_idx, guiding_latent, scale_factors) - negative = self.add_keyframe_index(negative, frame_idx, guiding_latent, scale_factors) + positive = cls.add_keyframe_index(positive, frame_idx, guiding_latent, scale_factors) + negative = cls.add_keyframe_index(negative, frame_idx, guiding_latent, scale_factors) mask = torch.full( (noise_mask.shape[0], 1, guiding_latent.shape[2], noise_mask.shape[3], noise_mask.shape[4]), @@ -176,7 +206,8 @@ class LTXVAddGuide: noise_mask = torch.cat([noise_mask, mask], dim=2) return positive, negative, latent_image, noise_mask - def replace_latent_frames(self, latent_image, noise_mask, guiding_latent, latent_idx, strength): + @classmethod + def replace_latent_frames(cls, latent_image, noise_mask, guiding_latent, latent_idx, strength): cond_length = guiding_latent.shape[2] assert latent_image.shape[2] >= latent_idx + cond_length, "Conditioning frames exceed the length of the latent sequence." @@ -195,20 +226,21 @@ class LTXVAddGuide: return latent_image, noise_mask - def generate(self, positive, negative, vae, latent, image, frame_idx, strength): + @classmethod + def execute(cls, positive, negative, vae, latent, image, frame_idx, strength) -> io.NodeOutput: scale_factors = vae.downscale_index_formula latent_image = latent["samples"] noise_mask = get_noise_mask(latent) _, _, latent_length, latent_height, latent_width = latent_image.shape - image, t = self.encode(vae, latent_width, latent_height, image, scale_factors) + image, t = cls.encode(vae, latent_width, latent_height, image, scale_factors) - frame_idx, latent_idx = self.get_latent_index(positive, latent_length, len(image), frame_idx, scale_factors) + frame_idx, latent_idx = cls.get_latent_index(positive, latent_length, len(image), frame_idx, scale_factors) assert latent_idx + t.shape[2] <= latent_length, "Conditioning frames exceed the length of the latent sequence." - num_prefix_frames = min(self._num_prefix_frames, t.shape[2]) + num_prefix_frames = min(cls.NUM_PREFIX_FRAMES, t.shape[2]) - positive, negative, latent_image, noise_mask = self.append_keyframe( + positive, negative, latent_image, noise_mask = cls.append_keyframe( positive, negative, frame_idx, @@ -223,9 +255,9 @@ class LTXVAddGuide: t = t[:, :, num_prefix_frames:] if t.shape[2] == 0: - return (positive, negative, {"samples": latent_image, "noise_mask": noise_mask},) + return io.NodeOutput(positive, negative, {"samples": latent_image, "noise_mask": noise_mask}) - latent_image, noise_mask = self.replace_latent_frames( + latent_image, noise_mask = cls.replace_latent_frames( latent_image, noise_mask, t, @@ -233,34 +265,37 @@ class LTXVAddGuide: strength, ) - return (positive, negative, {"samples": latent_image, "noise_mask": noise_mask},) + return io.NodeOutput(positive, negative, {"samples": latent_image, "noise_mask": noise_mask}) + + generate = execute # TODO: remove -class LTXVCropGuides: +class LTXVCropGuides(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "latent": ("LATENT",), - } - } + def define_schema(cls): + return io.Schema( + node_id="LTXVCropGuides", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Latent.Input("latent"), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") - - CATEGORY = "conditioning/video_models" - FUNCTION = "crop" - - def __init__(self): - self._patchifier = SymmetricPatchifier(1) - - def crop(self, positive, negative, latent): + @classmethod + def execute(cls, positive, negative, latent) -> io.NodeOutput: latent_image = latent["samples"].clone() noise_mask = get_noise_mask(latent) _, num_keyframes = get_keyframe_idxs(positive) if num_keyframes == 0: - return (positive, negative, {"samples": latent_image, "noise_mask": noise_mask},) + return io.NodeOutput(positive, negative, {"samples": latent_image, "noise_mask": noise_mask},) latent_image = latent_image[:, :, :-num_keyframes] noise_mask = noise_mask[:, :, :-num_keyframes] @@ -268,44 +303,54 @@ class LTXVCropGuides: positive = node_helpers.conditioning_set_values(positive, {"keyframe_idxs": None}) negative = node_helpers.conditioning_set_values(negative, {"keyframe_idxs": None}) - return (positive, negative, {"samples": latent_image, "noise_mask": noise_mask},) + return io.NodeOutput(positive, negative, {"samples": latent_image, "noise_mask": noise_mask}) + + crop = execute # TODO: remove -class LTXVConditioning: +class LTXVConditioning(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "frame_rate": ("FLOAT", {"default": 25.0, "min": 0.0, "max": 1000.0, "step": 0.01}), - }} - RETURN_TYPES = ("CONDITIONING", "CONDITIONING") - RETURN_NAMES = ("positive", "negative") - FUNCTION = "append" + def define_schema(cls): + return io.Schema( + node_id="LTXVConditioning", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Float.Input("frame_rate", default=25.0, min=0.0, max=1000.0, step=0.01), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + ], + ) - CATEGORY = "conditioning/video_models" - - def append(self, positive, negative, frame_rate): + @classmethod + def execute(cls, positive, negative, frame_rate) -> io.NodeOutput: positive = node_helpers.conditioning_set_values(positive, {"frame_rate": frame_rate}) negative = node_helpers.conditioning_set_values(negative, {"frame_rate": frame_rate}) - return (positive, negative) + return io.NodeOutput(positive, negative) -class ModelSamplingLTXV: +class ModelSamplingLTXV(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "max_shift": ("FLOAT", {"default": 2.05, "min": 0.0, "max": 100.0, "step":0.01}), - "base_shift": ("FLOAT", {"default": 0.95, "min": 0.0, "max": 100.0, "step":0.01}), - }, - "optional": {"latent": ("LATENT",), } - } + def define_schema(cls): + return io.Schema( + node_id="ModelSamplingLTXV", + category="advanced/model", + inputs=[ + io.Model.Input("model"), + io.Float.Input("max_shift", default=2.05, min=0.0, max=100.0, step=0.01), + io.Float.Input("base_shift", default=0.95, min=0.0, max=100.0, step=0.01), + io.Latent.Input("latent", optional=True), + ], + outputs=[ + io.Model.Output(), + ], + ) - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" - - CATEGORY = "advanced/model" - - def patch(self, model, max_shift, base_shift, latent=None): + @classmethod + def execute(cls, model, max_shift, base_shift, latent=None) -> io.NodeOutput: m = model.clone() if latent is None: @@ -329,37 +374,41 @@ class ModelSamplingLTXV: model_sampling.set_parameters(shift=shift) m.add_object_patch("model_sampling", model_sampling) - return (m, ) + return io.NodeOutput(m) -class LTXVScheduler: +class LTXVScheduler(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"steps": ("INT", {"default": 20, "min": 1, "max": 10000}), - "max_shift": ("FLOAT", {"default": 2.05, "min": 0.0, "max": 100.0, "step":0.01}), - "base_shift": ("FLOAT", {"default": 0.95, "min": 0.0, "max": 100.0, "step":0.01}), - "stretch": ("BOOLEAN", { - "default": True, - "tooltip": "Stretch the sigmas to be in the range [terminal, 1]." - }), - "terminal": ( - "FLOAT", - { - "default": 0.1, "min": 0.0, "max": 0.99, "step": 0.01, - "tooltip": "The terminal value of the sigmas after stretching." - }, - ), - }, - "optional": {"latent": ("LATENT",), } - } + def define_schema(cls): + return io.Schema( + node_id="LTXVScheduler", + category="sampling/custom_sampling/schedulers", + inputs=[ + io.Int.Input("steps", default=20, min=1, max=10000), + io.Float.Input("max_shift", default=2.05, min=0.0, max=100.0, step=0.01), + io.Float.Input("base_shift", default=0.95, min=0.0, max=100.0, step=0.01), + io.Boolean.Input( + id="stretch", + default=True, + tooltip="Stretch the sigmas to be in the range [terminal, 1].", + ), + io.Float.Input( + id="terminal", + default=0.1, + min=0.0, + max=0.99, + step=0.01, + tooltip="The terminal value of the sigmas after stretching.", + ), + io.Latent.Input("latent", optional=True), + ], + outputs=[ + io.Sigmas.Output(), + ], + ) - RETURN_TYPES = ("SIGMAS",) - CATEGORY = "sampling/custom_sampling/schedulers" - - FUNCTION = "get_sigmas" - - def get_sigmas(self, steps, max_shift, base_shift, stretch, terminal, latent=None): + @classmethod + def execute(cls, steps, max_shift, base_shift, stretch, terminal, latent=None) -> io.NodeOutput: if latent is None: tokens = 4096 else: @@ -389,7 +438,7 @@ class LTXVScheduler: stretched = 1.0 - (one_minus_z / scale_factor) sigmas[non_zero_mask] = stretched - return (sigmas,) + return io.NodeOutput(sigmas) def encode_single_frame(output_file, image_array: np.ndarray, crf): container = av.open(output_file, "w", format="mp4") @@ -423,52 +472,55 @@ def preprocess(image: torch.Tensor, crf=29): return image image_array = (image[:(image.shape[0] // 2) * 2, :(image.shape[1] // 2) * 2] * 255.0).byte().cpu().numpy() - with io.BytesIO() as output_file: + with BytesIO() as output_file: encode_single_frame(output_file, image_array, crf) video_bytes = output_file.getvalue() - with io.BytesIO(video_bytes) as video_file: + with BytesIO(video_bytes) as video_file: image_array = decode_single_frame(video_file) tensor = torch.tensor(image_array, dtype=image.dtype, device=image.device) / 255.0 return tensor -class LTXVPreprocess: +class LTXVPreprocess(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "image": ("IMAGE",), - "img_compression": ( - "INT", - { - "default": 35, - "min": 0, - "max": 100, - "tooltip": "Amount of compression to apply on image.", - }, + def define_schema(cls): + return io.Schema( + node_id="LTXVPreprocess", + category="image", + inputs=[ + io.Image.Input("image"), + io.Int.Input( + id="img_compression", default=35, min=0, max=100, tooltip="Amount of compression to apply on image." ), - } - } + ], + outputs=[ + io.Image.Output(display_name="output_image"), + ], + ) - FUNCTION = "preprocess" - RETURN_TYPES = ("IMAGE",) - RETURN_NAMES = ("output_image",) - CATEGORY = "image" - - def preprocess(self, image, img_compression): + @classmethod + def execute(cls, image, img_compression) -> io.NodeOutput: output_images = [] for i in range(image.shape[0]): output_images.append(preprocess(image[i], img_compression)) - return (torch.stack(output_images),) + return io.NodeOutput(torch.stack(output_images)) + + preprocess = execute # TODO: remove + +class LtxvExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + EmptyLTXVLatentVideo, + LTXVImgToVideo, + ModelSamplingLTXV, + LTXVConditioning, + LTXVScheduler, + LTXVAddGuide, + LTXVPreprocess, + LTXVCropGuides, + ] -NODE_CLASS_MAPPINGS = { - "EmptyLTXVLatentVideo": EmptyLTXVLatentVideo, - "LTXVImgToVideo": LTXVImgToVideo, - "ModelSamplingLTXV": ModelSamplingLTXV, - "LTXVConditioning": LTXVConditioning, - "LTXVScheduler": LTXVScheduler, - "LTXVAddGuide": LTXVAddGuide, - "LTXVPreprocess": LTXVPreprocess, - "LTXVCropGuides": LTXVCropGuides, -} +async def comfy_entrypoint() -> LtxvExtension: + return LtxvExtension() diff --git a/comfy_extras/nodes_lumina2.py b/comfy_extras/nodes_lumina2.py index 275189785..89ff2397a 100644 --- a/comfy_extras/nodes_lumina2.py +++ b/comfy_extras/nodes_lumina2.py @@ -1,20 +1,27 @@ -from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict +from typing_extensions import override import torch +from comfy_api.latest import ComfyExtension, io -class RenormCFG: + +class RenormCFG(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "cfg_trunc": ("FLOAT", {"default": 100, "min": 0.0, "max": 100.0, "step": 0.01}), - "renorm_cfg": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.01}), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" + def define_schema(cls): + return io.Schema( + node_id="RenormCFG", + category="advanced/model", + inputs=[ + io.Model.Input("model"), + io.Float.Input("cfg_trunc", default=100, min=0.0, max=100.0, step=0.01), + io.Float.Input("renorm_cfg", default=1.0, min=0.0, max=100.0, step=0.01), + ], + outputs=[ + io.Model.Output(), + ], + ) - CATEGORY = "advanced/model" - - def patch(self, model, cfg_trunc, renorm_cfg): + @classmethod + def execute(cls, model, cfg_trunc, renorm_cfg) -> io.NodeOutput: def renorm_cfg_func(args): cond_denoised = args["cond_denoised"] uncond_denoised = args["uncond_denoised"] @@ -53,10 +60,10 @@ class RenormCFG: m = model.clone() m.set_model_sampler_cfg_function(renorm_cfg_func) - return (m, ) + return io.NodeOutput(m) -class CLIPTextEncodeLumina2(ComfyNodeABC): +class CLIPTextEncodeLumina2(io.ComfyNode): SYSTEM_PROMPT = { "superior": "You are an assistant designed to generate superior images with the superior "\ "degree of image-text alignment based on textual prompts or user prompts.", @@ -69,36 +76,52 @@ class CLIPTextEncodeLumina2(ComfyNodeABC): "Alignment: You are an assistant designed to generate high-quality images with the highest "\ "degree of image-text alignment based on textual prompts." @classmethod - def INPUT_TYPES(s) -> InputTypeDict: - return { - "required": { - "system_prompt": (list(CLIPTextEncodeLumina2.SYSTEM_PROMPT.keys()), {"tooltip": CLIPTextEncodeLumina2.SYSTEM_PROMPT_TIP}), - "user_prompt": (IO.STRING, {"multiline": True, "dynamicPrompts": True, "tooltip": "The text to be encoded."}), - "clip": (IO.CLIP, {"tooltip": "The CLIP model used for encoding the text."}) - } - } - RETURN_TYPES = (IO.CONDITIONING,) - OUTPUT_TOOLTIPS = ("A conditioning containing the embedded text used to guide the diffusion model.",) - FUNCTION = "encode" + def define_schema(cls): + return io.Schema( + node_id="CLIPTextEncodeLumina2", + display_name="CLIP Text Encode for Lumina2", + category="conditioning", + description="Encodes a system prompt and a user prompt using a CLIP model into an embedding " + "that can be used to guide the diffusion model towards generating specific images.", + inputs=[ + io.Combo.Input( + "system_prompt", + options=list(cls.SYSTEM_PROMPT.keys()), + tooltip=cls.SYSTEM_PROMPT_TIP, + ), + io.String.Input( + "user_prompt", + multiline=True, + dynamic_prompts=True, + tooltip="The text to be encoded.", + ), + io.Clip.Input("clip", tooltip="The CLIP model used for encoding the text."), + ], + outputs=[ + io.Conditioning.Output( + tooltip="A conditioning containing the embedded text used to guide the diffusion model.", + ), + ], + ) - CATEGORY = "conditioning" - DESCRIPTION = "Encodes a system prompt and a user prompt using a CLIP model into an embedding that can be used to guide the diffusion model towards generating specific images." - - def encode(self, clip, user_prompt, system_prompt): + @classmethod + def execute(cls, clip, user_prompt, system_prompt) -> io.NodeOutput: if clip is None: raise RuntimeError("ERROR: clip input is invalid: None\n\nIf the clip is from a checkpoint loader node your checkpoint does not contain a valid clip or text encoder model.") - system_prompt = CLIPTextEncodeLumina2.SYSTEM_PROMPT[system_prompt] + system_prompt = cls.SYSTEM_PROMPT[system_prompt] prompt = f'{system_prompt} {user_prompt}' tokens = clip.tokenize(prompt) - return (clip.encode_from_tokens_scheduled(tokens), ) + return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens)) -NODE_CLASS_MAPPINGS = { - "CLIPTextEncodeLumina2": CLIPTextEncodeLumina2, - "RenormCFG": RenormCFG -} +class Lumina2Extension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + CLIPTextEncodeLumina2, + RenormCFG, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - "CLIPTextEncodeLumina2": "CLIP Text Encode for Lumina2", -} +async def comfy_entrypoint() -> Lumina2Extension: + return Lumina2Extension() diff --git a/comfy_extras/nodes_mahiro.py b/comfy_extras/nodes_mahiro.py index 8fcdfba75..07b3353f4 100644 --- a/comfy_extras/nodes_mahiro.py +++ b/comfy_extras/nodes_mahiro.py @@ -1,17 +1,29 @@ +from typing_extensions import override import torch import torch.nn.functional as F -class Mahiro: +from comfy_api.latest import ComfyExtension, io + + +class Mahiro(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"model": ("MODEL",), - }} - RETURN_TYPES = ("MODEL",) - RETURN_NAMES = ("patched_model",) - FUNCTION = "patch" - CATEGORY = "_for_testing" - DESCRIPTION = "Modify the guidance to scale more on the 'direction' of the positive prompt rather than the difference between the negative prompt." - def patch(self, model): + def define_schema(cls): + return io.Schema( + node_id="Mahiro", + display_name="Mahiro is so cute that she deserves a better guidance function!! (。・ω・。)", + category="_for_testing", + description="Modify the guidance to scale more on the 'direction' of the positive prompt rather than the difference between the negative prompt.", + inputs=[ + io.Model.Input("model"), + ], + outputs=[ + io.Model.Output(display_name="patched_model"), + ], + is_experimental=True, + ) + + @classmethod + def execute(cls, model) -> io.NodeOutput: m = model.clone() def mahiro_normd(args): scale: float = args['cond_scale'] @@ -30,12 +42,16 @@ class Mahiro: wm = (simsc*cfg + (4-simsc)*leap) / 4 return wm m.set_model_sampler_post_cfg_function(mahiro_normd) - return (m, ) + return io.NodeOutput(m) -NODE_CLASS_MAPPINGS = { - "Mahiro": Mahiro -} -NODE_DISPLAY_NAME_MAPPINGS = { - "Mahiro": "Mahiro is so cute that she deserves a better guidance function!! (。・ω・。)", -} +class MahiroExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + Mahiro, + ] + + +async def comfy_entrypoint() -> MahiroExtension: + return MahiroExtension() diff --git a/comfy_extras/nodes_mochi.py b/comfy_extras/nodes_mochi.py index 1c474faa9..d750194fc 100644 --- a/comfy_extras/nodes_mochi.py +++ b/comfy_extras/nodes_mochi.py @@ -1,23 +1,40 @@ -import nodes +from typing_extensions import override import torch import comfy.model_management +import nodes +from comfy_api.latest import ComfyExtension, io -class EmptyMochiLatentVideo: + +class EmptyMochiLatentVideo(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "length": ("INT", {"default": 25, "min": 7, "max": nodes.MAX_RESOLUTION, "step": 6}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}} - RETURN_TYPES = ("LATENT",) - FUNCTION = "generate" + def define_schema(cls): + return io.Schema( + node_id="EmptyMochiLatentVideo", + category="latent/video", + inputs=[ + io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("length", default=25, min=7, max=nodes.MAX_RESOLUTION, step=6), + io.Int.Input("batch_size", default=1, min=1, max=4096), + ], + outputs=[ + io.Latent.Output(), + ], + ) - CATEGORY = "latent/video" - - def generate(self, width, height, length, batch_size=1): + @classmethod + def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput: latent = torch.zeros([batch_size, 12, ((length - 1) // 6) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device()) - return ({"samples":latent}, ) + return io.NodeOutput({"samples": latent}) -NODE_CLASS_MAPPINGS = { - "EmptyMochiLatentVideo": EmptyMochiLatentVideo, -} + +class MochiExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + EmptyMochiLatentVideo, + ] + + +async def comfy_entrypoint() -> MochiExtension: + return MochiExtension() diff --git a/comfy_extras/nodes_model_downscale.py b/comfy_extras/nodes_model_downscale.py index 49420dee9..f7ca9699d 100644 --- a/comfy_extras/nodes_model_downscale.py +++ b/comfy_extras/nodes_model_downscale.py @@ -1,24 +1,33 @@ +from typing_extensions import override import comfy.utils +from comfy_api.latest import ComfyExtension, io -class PatchModelAddDownscale: - upscale_methods = ["bicubic", "nearest-exact", "bilinear", "area", "bislerp"] + +class PatchModelAddDownscale(io.ComfyNode): + UPSCALE_METHODS = ["bicubic", "nearest-exact", "bilinear", "area", "bislerp"] @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "block_number": ("INT", {"default": 3, "min": 1, "max": 32, "step": 1}), - "downscale_factor": ("FLOAT", {"default": 2.0, "min": 0.1, "max": 9.0, "step": 0.001}), - "start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}), - "end_percent": ("FLOAT", {"default": 0.35, "min": 0.0, "max": 1.0, "step": 0.001}), - "downscale_after_skip": ("BOOLEAN", {"default": True}), - "downscale_method": (s.upscale_methods,), - "upscale_method": (s.upscale_methods,), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" + def define_schema(cls): + return io.Schema( + node_id="PatchModelAddDownscale", + display_name="PatchModelAddDownscale (Kohya Deep Shrink)", + category="model_patches/unet", + inputs=[ + io.Model.Input("model"), + io.Int.Input("block_number", default=3, min=1, max=32, step=1), + io.Float.Input("downscale_factor", default=2.0, min=0.1, max=9.0, step=0.001), + io.Float.Input("start_percent", default=0.0, min=0.0, max=1.0, step=0.001), + io.Float.Input("end_percent", default=0.35, min=0.0, max=1.0, step=0.001), + io.Boolean.Input("downscale_after_skip", default=True), + io.Combo.Input("downscale_method", options=cls.UPSCALE_METHODS), + io.Combo.Input("upscale_method", options=cls.UPSCALE_METHODS), + ], + outputs=[ + io.Model.Output(), + ], + ) - CATEGORY = "model_patches/unet" - - def patch(self, model, block_number, downscale_factor, start_percent, end_percent, downscale_after_skip, downscale_method, upscale_method): + @classmethod + def execute(cls, model, block_number, downscale_factor, start_percent, end_percent, downscale_after_skip, downscale_method, upscale_method) -> io.NodeOutput: model_sampling = model.get_model_object("model_sampling") sigma_start = model_sampling.percent_to_sigma(start_percent) sigma_end = model_sampling.percent_to_sigma(end_percent) @@ -41,13 +50,21 @@ class PatchModelAddDownscale: else: m.set_model_input_block_patch(input_block_patch) m.set_model_output_block_patch(output_block_patch) - return (m, ) + return io.NodeOutput(m) -NODE_CLASS_MAPPINGS = { - "PatchModelAddDownscale": PatchModelAddDownscale, -} NODE_DISPLAY_NAME_MAPPINGS = { # Sampling - "PatchModelAddDownscale": "PatchModelAddDownscale (Kohya Deep Shrink)", + "PatchModelAddDownscale": "", } + +class ModelDownscaleExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + PatchModelAddDownscale, + ] + + +async def comfy_entrypoint() -> ModelDownscaleExtension: + return ModelDownscaleExtension() diff --git a/comfy_extras/nodes_morphology.py b/comfy_extras/nodes_morphology.py index 075b26c40..67377e1bc 100644 --- a/comfy_extras/nodes_morphology.py +++ b/comfy_extras/nodes_morphology.py @@ -1,24 +1,34 @@ import torch import comfy.model_management +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io from kornia.morphology import dilation, erosion, opening, closing, gradient, top_hat, bottom_hat import kornia.color -class Morphology: +class Morphology(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"image": ("IMAGE",), - "operation": (["erode", "dilate", "open", "close", "gradient", "bottom_hat", "top_hat"],), - "kernel_size": ("INT", {"default": 3, "min": 3, "max": 999, "step": 1}), - }} + def define_schema(cls): + return io.Schema( + node_id="Morphology", + display_name="ImageMorphology", + category="image/postprocessing", + inputs=[ + io.Image.Input("image"), + io.Combo.Input( + "operation", + options=["erode", "dilate", "open", "close", "gradient", "bottom_hat", "top_hat"], + ), + io.Int.Input("kernel_size", default=3, min=3, max=999, step=1), + ], + outputs=[ + io.Image.Output(), + ], + ) - RETURN_TYPES = ("IMAGE",) - FUNCTION = "process" - - CATEGORY = "image/postprocessing" - - def process(self, image, operation, kernel_size): + @classmethod + def execute(cls, image, operation, kernel_size) -> io.NodeOutput: device = comfy.model_management.get_torch_device() kernel = torch.ones(kernel_size, kernel_size, device=device) image_k = image.to(device).movedim(-1, 1) @@ -39,49 +49,63 @@ class Morphology: else: raise ValueError(f"Invalid operation {operation} for morphology. Must be one of 'erode', 'dilate', 'open', 'close', 'gradient', 'tophat', 'bottomhat'") img_out = output.to(comfy.model_management.intermediate_device()).movedim(1, -1) - return (img_out,) + return io.NodeOutput(img_out) -class ImageRGBToYUV: +class ImageRGBToYUV(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "image": ("IMAGE",), - }} + def define_schema(cls): + return io.Schema( + node_id="ImageRGBToYUV", + category="image/batch", + inputs=[ + io.Image.Input("image"), + ], + outputs=[ + io.Image.Output(display_name="Y"), + io.Image.Output(display_name="U"), + io.Image.Output(display_name="V"), + ], + ) - RETURN_TYPES = ("IMAGE", "IMAGE", "IMAGE") - RETURN_NAMES = ("Y", "U", "V") - FUNCTION = "execute" - - CATEGORY = "image/batch" - - def execute(self, image): + @classmethod + def execute(cls, image) -> io.NodeOutput: out = kornia.color.rgb_to_ycbcr(image.movedim(-1, 1)).movedim(1, -1) - return (out[..., 0:1].expand_as(image), out[..., 1:2].expand_as(image), out[..., 2:3].expand_as(image)) + return io.NodeOutput(out[..., 0:1].expand_as(image), out[..., 1:2].expand_as(image), out[..., 2:3].expand_as(image)) -class ImageYUVToRGB: +class ImageYUVToRGB(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"Y": ("IMAGE",), - "U": ("IMAGE",), - "V": ("IMAGE",), - }} + def define_schema(cls): + return io.Schema( + node_id="ImageYUVToRGB", + category="image/batch", + inputs=[ + io.Image.Input("Y"), + io.Image.Input("U"), + io.Image.Input("V"), + ], + outputs=[ + io.Image.Output(), + ], + ) - RETURN_TYPES = ("IMAGE",) - FUNCTION = "execute" - - CATEGORY = "image/batch" - - def execute(self, Y, U, V): + @classmethod + def execute(cls, Y, U, V) -> io.NodeOutput: image = torch.cat([torch.mean(Y, dim=-1, keepdim=True), torch.mean(U, dim=-1, keepdim=True), torch.mean(V, dim=-1, keepdim=True)], dim=-1) out = kornia.color.ycbcr_to_rgb(image.movedim(-1, 1)).movedim(1, -1) - return (out,) + return io.NodeOutput(out) -NODE_CLASS_MAPPINGS = { - "Morphology": Morphology, - "ImageRGBToYUV": ImageRGBToYUV, - "ImageYUVToRGB": ImageYUVToRGB, -} -NODE_DISPLAY_NAME_MAPPINGS = { - "Morphology": "ImageMorphology", -} +class MorphologyExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + Morphology, + ImageRGBToYUV, + ImageYUVToRGB, + ] + + +async def comfy_entrypoint() -> MorphologyExtension: + return MorphologyExtension() + diff --git a/comfy_extras/nodes_nop.py b/comfy_extras/nodes_nop.py new file mode 100644 index 000000000..953061bcb --- /dev/null +++ b/comfy_extras/nodes_nop.py @@ -0,0 +1,39 @@ +from comfy_api.latest import ComfyExtension, io +from typing_extensions import override +# If you write a node that is so useless that it breaks ComfyUI it will be featured in this exclusive list + +# "native" block swap nodes are placebo at best and break the ComfyUI memory management system. +# They are also considered harmful because instead of users reporting issues with the built in +# memory management they install these stupid nodes and complain even harder. Now it completely +# breaks with some of the new ComfyUI memory optimizations so I have made the decision to NOP it +# out of all workflows. +class wanBlockSwap(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="wanBlockSwap", + category="", + description="NOP", + inputs=[ + io.Model.Input("model"), + ], + outputs=[ + io.Model.Output(), + ], + is_deprecated=True, + ) + + @classmethod + def execute(cls, model) -> io.NodeOutput: + return io.NodeOutput(model) + + +class NopExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + wanBlockSwap + ] + +async def comfy_entrypoint() -> NopExtension: + return NopExtension() diff --git a/comfy_extras/nodes_optimalsteps.py b/comfy_extras/nodes_optimalsteps.py index e7c851ca2..73f0104d8 100644 --- a/comfy_extras/nodes_optimalsteps.py +++ b/comfy_extras/nodes_optimalsteps.py @@ -1,9 +1,12 @@ # from https://github.com/bebebe666/OptimalSteps - import numpy as np import torch +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io + + def loglinear_interp(t_steps, num_steps): """ Performs log-linear interpolation of a given array of decreasing numbers. @@ -23,25 +26,28 @@ NOISE_LEVELS = {"FLUX": [0.9968, 0.9886, 0.9819, 0.975, 0.966, 0.9471, 0.9158, 0 "Chroma": [0.992, 0.99, 0.988, 0.985, 0.982, 0.978, 0.973, 0.968, 0.961, 0.953, 0.943, 0.931, 0.917, 0.9, 0.881, 0.858, 0.832, 0.802, 0.769, 0.731, 0.69, 0.646, 0.599, 0.55, 0.501, 0.451, 0.402, 0.355, 0.311, 0.27, 0.232, 0.199, 0.169, 0.143, 0.12, 0.101, 0.084, 0.07, 0.058, 0.048, 0.001], } -class OptimalStepsScheduler: +class OptimalStepsScheduler(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"model_type": (["FLUX", "Wan", "Chroma"], ), - "steps": ("INT", {"default": 20, "min": 3, "max": 1000}), - "denoise": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}), - } - } - RETURN_TYPES = ("SIGMAS",) - CATEGORY = "sampling/custom_sampling/schedulers" + def define_schema(cls): + return io.Schema( + node_id="OptimalStepsScheduler", + category="sampling/custom_sampling/schedulers", + inputs=[ + io.Combo.Input("model_type", options=["FLUX", "Wan", "Chroma"]), + io.Int.Input("steps", default=20, min=3, max=1000), + io.Float.Input("denoise", default=1.0, min=0.0, max=1.0, step=0.01), + ], + outputs=[ + io.Sigmas.Output(), + ], + ) - FUNCTION = "get_sigmas" - - def get_sigmas(self, model_type, steps, denoise): + @classmethod + def execute(cls, model_type, steps, denoise) ->io.NodeOutput: total_steps = steps if denoise < 1.0: if denoise <= 0.0: - return (torch.FloatTensor([]),) + return io.NodeOutput(torch.FloatTensor([])) total_steps = round(steps * denoise) sigmas = NOISE_LEVELS[model_type][:] @@ -50,8 +56,16 @@ class OptimalStepsScheduler: sigmas = sigmas[-(total_steps + 1):] sigmas[-1] = 0 - return (torch.FloatTensor(sigmas), ) + return io.NodeOutput(torch.FloatTensor(sigmas)) -NODE_CLASS_MAPPINGS = { - "OptimalStepsScheduler": OptimalStepsScheduler, -} + +class OptimalStepsExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + OptimalStepsScheduler, + ] + + +async def comfy_entrypoint() -> OptimalStepsExtension: + return OptimalStepsExtension() diff --git a/comfy_extras/nodes_pag.py b/comfy_extras/nodes_pag.py index eb28196f4..79fea5f0c 100644 --- a/comfy_extras/nodes_pag.py +++ b/comfy_extras/nodes_pag.py @@ -3,25 +3,30 @@ #My modified one here is more basic but has less chances of breaking with ComfyUI updates. +from typing_extensions import override + import comfy.model_patcher import comfy.samplers +from comfy_api.latest import ComfyExtension, io -class PerturbedAttentionGuidance: + +class PerturbedAttentionGuidance(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return { - "required": { - "model": ("MODEL",), - "scale": ("FLOAT", {"default": 3.0, "min": 0.0, "max": 100.0, "step": 0.01, "round": 0.01}), - } - } + def define_schema(cls): + return io.Schema( + node_id="PerturbedAttentionGuidance", + category="model_patches/unet", + inputs=[ + io.Model.Input("model"), + io.Float.Input("scale", default=3.0, min=0.0, max=100.0, step=0.01, round=0.01), + ], + outputs=[ + io.Model.Output(), + ], + ) - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" - - CATEGORY = "model_patches/unet" - - def patch(self, model, scale): + @classmethod + def execute(cls, model, scale) -> io.NodeOutput: unet_block = "middle" unet_block_id = 0 m = model.clone() @@ -49,8 +54,16 @@ class PerturbedAttentionGuidance: m.set_model_sampler_post_cfg_function(post_cfg_function) - return (m,) + return io.NodeOutput(m) -NODE_CLASS_MAPPINGS = { - "PerturbedAttentionGuidance": PerturbedAttentionGuidance, -} + +class PAGExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + PerturbedAttentionGuidance, + ] + + +async def comfy_entrypoint() -> PAGExtension: + return PAGExtension() diff --git a/comfy_extras/nodes_perpneg.py b/comfy_extras/nodes_perpneg.py index 89e5eef90..cd068ce9c 100644 --- a/comfy_extras/nodes_perpneg.py +++ b/comfy_extras/nodes_perpneg.py @@ -5,6 +5,9 @@ import comfy.samplers import comfy.utils import node_helpers import math +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io + def perp_neg(x, noise_pred_pos, noise_pred_neg, noise_pred_nocond, neg_scale, cond_scale): pos = noise_pred_pos - noise_pred_nocond @@ -16,20 +19,27 @@ def perp_neg(x, noise_pred_pos, noise_pred_neg, noise_pred_nocond, neg_scale, co return cfg_result #TODO: This node should be removed, it has been replaced with PerpNegGuider -class PerpNeg: +class PerpNeg(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"model": ("MODEL", ), - "empty_conditioning": ("CONDITIONING", ), - "neg_scale": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.01}), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" + def define_schema(cls): + return io.Schema( + node_id="PerpNeg", + display_name="Perp-Neg (DEPRECATED by PerpNegGuider)", + category="_for_testing", + inputs=[ + io.Model.Input("model"), + io.Conditioning.Input("empty_conditioning"), + io.Float.Input("neg_scale", default=1.0, min=0.0, max=100.0, step=0.01), + ], + outputs=[ + io.Model.Output(), + ], + is_experimental=True, + is_deprecated=True, + ) - CATEGORY = "_for_testing" - DEPRECATED = True - - def patch(self, model, empty_conditioning, neg_scale): + @classmethod + def execute(cls, model, empty_conditioning, neg_scale) -> io.NodeOutput: m = model.clone() nocond = comfy.sampler_helpers.convert_cond(empty_conditioning) @@ -50,7 +60,7 @@ class PerpNeg: m.set_model_sampler_cfg_function(cfg_function) - return (m, ) + return io.NodeOutput(m) class Guider_PerpNeg(comfy.samplers.CFGGuider): @@ -112,35 +122,42 @@ class Guider_PerpNeg(comfy.samplers.CFGGuider): return cfg_result -class PerpNegGuider: +class PerpNegGuider(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": - {"model": ("MODEL",), - "positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "empty_conditioning": ("CONDITIONING", ), - "cfg": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 100.0, "step":0.1, "round": 0.01}), - "neg_scale": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 100.0, "step": 0.01}), - } - } + def define_schema(cls): + return io.Schema( + node_id="PerpNegGuider", + category="_for_testing", + inputs=[ + io.Model.Input("model"), + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Conditioning.Input("empty_conditioning"), + io.Float.Input("cfg", default=8.0, min=0.0, max=100.0, step=0.1, round=0.01), + io.Float.Input("neg_scale", default=1.0, min=0.0, max=100.0, step=0.01), + ], + outputs=[ + io.Guider.Output(), + ], + is_experimental=True, + ) - RETURN_TYPES = ("GUIDER",) - - FUNCTION = "get_guider" - CATEGORY = "_for_testing" - - def get_guider(self, model, positive, negative, empty_conditioning, cfg, neg_scale): + @classmethod + def execute(cls, model, positive, negative, empty_conditioning, cfg, neg_scale) -> io.NodeOutput: guider = Guider_PerpNeg(model) guider.set_conds(positive, negative, empty_conditioning) guider.set_cfg(cfg, neg_scale) - return (guider,) + return io.NodeOutput(guider) -NODE_CLASS_MAPPINGS = { - "PerpNeg": PerpNeg, - "PerpNegGuider": PerpNegGuider, -} -NODE_DISPLAY_NAME_MAPPINGS = { - "PerpNeg": "Perp-Neg (DEPRECATED by PerpNegGuider)", -} +class PerpNegExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + PerpNeg, + PerpNegGuider, + ] + + +async def comfy_entrypoint() -> PerpNegExtension: + return PerpNegExtension() diff --git a/comfy_extras/nodes_photomaker.py b/comfy_extras/nodes_photomaker.py index d358ed6d5..228183c07 100644 --- a/comfy_extras/nodes_photomaker.py +++ b/comfy_extras/nodes_photomaker.py @@ -4,6 +4,8 @@ import folder_paths import comfy.clip_model import comfy.clip_vision import comfy.ops +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io # code for model from: https://github.com/TencentARC/PhotoMaker/blob/main/photomaker/model.py under Apache License Version 2.0 VISION_CONFIG_DICT = { @@ -116,41 +118,52 @@ class PhotoMakerIDEncoder(comfy.clip_model.CLIPVisionModelProjection): return updated_prompt_embeds -class PhotoMakerLoader: +class PhotoMakerLoader(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "photomaker_model_name": (folder_paths.get_filename_list("photomaker"), )}} + def define_schema(cls): + return io.Schema( + node_id="PhotoMakerLoader", + category="_for_testing/photomaker", + inputs=[ + io.Combo.Input("photomaker_model_name", options=folder_paths.get_filename_list("photomaker")), + ], + outputs=[ + io.Photomaker.Output(), + ], + is_experimental=True, + ) - RETURN_TYPES = ("PHOTOMAKER",) - FUNCTION = "load_photomaker_model" - - CATEGORY = "_for_testing/photomaker" - - def load_photomaker_model(self, photomaker_model_name): + @classmethod + def execute(cls, photomaker_model_name): photomaker_model_path = folder_paths.get_full_path_or_raise("photomaker", photomaker_model_name) photomaker_model = PhotoMakerIDEncoder() data = comfy.utils.load_torch_file(photomaker_model_path, safe_load=True) if "id_encoder" in data: data = data["id_encoder"] photomaker_model.load_state_dict(data) - return (photomaker_model,) + return io.NodeOutput(photomaker_model) -class PhotoMakerEncode: +class PhotoMakerEncode(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "photomaker": ("PHOTOMAKER",), - "image": ("IMAGE",), - "clip": ("CLIP", ), - "text": ("STRING", {"multiline": True, "dynamicPrompts": True, "default": "photograph of photomaker"}), - }} + def define_schema(cls): + return io.Schema( + node_id="PhotoMakerEncode", + category="_for_testing/photomaker", + inputs=[ + io.Photomaker.Input("photomaker"), + io.Image.Input("image"), + io.Clip.Input("clip"), + io.String.Input("text", multiline=True, dynamic_prompts=True, default="photograph of photomaker"), + ], + outputs=[ + io.Conditioning.Output(), + ], + is_experimental=True, + ) - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "apply_photomaker" - - CATEGORY = "_for_testing/photomaker" - - def apply_photomaker(self, photomaker, image, clip, text): + @classmethod + def execute(cls, photomaker, image, clip, text): special_token = "photomaker" pixel_values = comfy.clip_vision.clip_preprocess(image.to(photomaker.load_device)).float() try: @@ -178,11 +191,16 @@ class PhotoMakerEncode: else: out = cond - return ([[out, {"pooled_output": pooled}]], ) + return io.NodeOutput([[out, {"pooled_output": pooled}]]) -NODE_CLASS_MAPPINGS = { - "PhotoMakerLoader": PhotoMakerLoader, - "PhotoMakerEncode": PhotoMakerEncode, -} +class PhotomakerExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + PhotoMakerLoader, + PhotoMakerEncode, + ] +async def comfy_entrypoint() -> PhotomakerExtension: + return PhotomakerExtension() diff --git a/comfy_extras/nodes_pixart.py b/comfy_extras/nodes_pixart.py index 8d9276afe..a23e87b1f 100644 --- a/comfy_extras/nodes_pixart.py +++ b/comfy_extras/nodes_pixart.py @@ -1,24 +1,38 @@ -from nodes import MAX_RESOLUTION +from typing_extensions import override +import nodes +from comfy_api.latest import ComfyExtension, io -class CLIPTextEncodePixArtAlpha: +class CLIPTextEncodePixArtAlpha(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "width": ("INT", {"default": 1024.0, "min": 0, "max": MAX_RESOLUTION}), - "height": ("INT", {"default": 1024.0, "min": 0, "max": MAX_RESOLUTION}), - # "aspect_ratio": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "text": ("STRING", {"multiline": True, "dynamicPrompts": True}), "clip": ("CLIP", ), - }} + def define_schema(cls): + return io.Schema( + node_id="CLIPTextEncodePixArtAlpha", + category="advanced/conditioning", + description="Encodes text and sets the resolution conditioning for PixArt Alpha. Does not apply to PixArt Sigma.", + inputs=[ + io.Int.Input("width", default=1024, min=0, max=nodes.MAX_RESOLUTION), + io.Int.Input("height", default=1024, min=0, max=nodes.MAX_RESOLUTION), + # "aspect_ratio": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), + io.String.Input("text", multiline=True, dynamic_prompts=True), + io.Clip.Input("clip"), + ], + outputs=[ + io.Conditioning.Output(), + ], + ) - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "encode" - CATEGORY = "advanced/conditioning" - DESCRIPTION = "Encodes text and sets the resolution conditioning for PixArt Alpha. Does not apply to PixArt Sigma." - - def encode(self, clip, width, height, text): + @classmethod + def execute(cls, clip, width, height, text): tokens = clip.tokenize(text) - return (clip.encode_from_tokens_scheduled(tokens, add_dict={"width": width, "height": height}),) + return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens, add_dict={"width": width, "height": height})) -NODE_CLASS_MAPPINGS = { - "CLIPTextEncodePixArtAlpha": CLIPTextEncodePixArtAlpha, -} + +class PixArtExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + CLIPTextEncodePixArtAlpha, + ] + +async def comfy_entrypoint() -> PixArtExtension: + return PixArtExtension() diff --git a/comfy_extras/nodes_preview_any.py b/comfy_extras/nodes_preview_any.py index e6805696f..e749fa6ae 100644 --- a/comfy_extras/nodes_preview_any.py +++ b/comfy_extras/nodes_preview_any.py @@ -25,7 +25,7 @@ class PreviewAny(): value = str(source) elif source is not None: try: - value = json.dumps(source) + value = json.dumps(source, indent=4) except Exception: try: value = str(source) diff --git a/comfy_extras/nodes_qwen.py b/comfy_extras/nodes_qwen.py index 49747dc7a..525239ae5 100644 --- a/comfy_extras/nodes_qwen.py +++ b/comfy_extras/nodes_qwen.py @@ -1,24 +1,29 @@ import node_helpers import comfy.utils import math +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io -class TextEncodeQwenImageEdit: +class TextEncodeQwenImageEdit(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "clip": ("CLIP", ), - "prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}), - }, - "optional": {"vae": ("VAE", ), - "image": ("IMAGE", ),}} + def define_schema(cls): + return io.Schema( + node_id="TextEncodeQwenImageEdit", + category="advanced/conditioning", + inputs=[ + io.Clip.Input("clip"), + io.String.Input("prompt", multiline=True, dynamic_prompts=True), + io.Vae.Input("vae", optional=True), + io.Image.Input("image", optional=True), + ], + outputs=[ + io.Conditioning.Output(), + ], + ) - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "encode" - - CATEGORY = "advanced/conditioning" - - def encode(self, clip, prompt, vae=None, image=None): + @classmethod + def execute(cls, clip, prompt, vae=None, image=None) -> io.NodeOutput: ref_latent = None if image is None: images = [] @@ -40,28 +45,30 @@ class TextEncodeQwenImageEdit: conditioning = clip.encode_from_tokens_scheduled(tokens) if ref_latent is not None: conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_latents": [ref_latent]}, append=True) - return (conditioning, ) + return io.NodeOutput(conditioning) -class TextEncodeQwenImageEditPlus: +class TextEncodeQwenImageEditPlus(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "clip": ("CLIP", ), - "prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}), - }, - "optional": {"vae": ("VAE", ), - "image1": ("IMAGE", ), - "image2": ("IMAGE", ), - "image3": ("IMAGE", ), - }} + def define_schema(cls): + return io.Schema( + node_id="TextEncodeQwenImageEditPlus", + category="advanced/conditioning", + inputs=[ + io.Clip.Input("clip"), + io.String.Input("prompt", multiline=True, dynamic_prompts=True), + io.Vae.Input("vae", optional=True), + io.Image.Input("image1", optional=True), + io.Image.Input("image2", optional=True), + io.Image.Input("image3", optional=True), + ], + outputs=[ + io.Conditioning.Output(), + ], + ) - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "encode" - - CATEGORY = "advanced/conditioning" - - def encode(self, clip, prompt, vae=None, image1=None, image2=None, image3=None): + @classmethod + def execute(cls, clip, prompt, vae=None, image1=None, image2=None, image3=None) -> io.NodeOutput: ref_latents = [] images = [image1, image2, image3] images_vl = [] @@ -94,10 +101,17 @@ class TextEncodeQwenImageEditPlus: conditioning = clip.encode_from_tokens_scheduled(tokens) if len(ref_latents) > 0: conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_latents": ref_latents}, append=True) - return (conditioning, ) + return io.NodeOutput(conditioning) -NODE_CLASS_MAPPINGS = { - "TextEncodeQwenImageEdit": TextEncodeQwenImageEdit, - "TextEncodeQwenImageEditPlus": TextEncodeQwenImageEditPlus, -} +class QwenExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + TextEncodeQwenImageEdit, + TextEncodeQwenImageEditPlus, + ] + + +async def comfy_entrypoint() -> QwenExtension: + return QwenExtension() diff --git a/comfy_extras/nodes_rope.py b/comfy_extras/nodes_rope.py new file mode 100644 index 000000000..d1feb031e --- /dev/null +++ b/comfy_extras/nodes_rope.py @@ -0,0 +1,47 @@ +from comfy_api.latest import ComfyExtension, io +from typing_extensions import override + + +class ScaleROPE(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="ScaleROPE", + category="advanced/model_patches", + description="Scale and shift the ROPE of the model.", + is_experimental=True, + inputs=[ + io.Model.Input("model"), + io.Float.Input("scale_x", default=1.0, min=0.0, max=100.0, step=0.1), + io.Float.Input("shift_x", default=0.0, min=-256.0, max=256.0, step=0.1), + + io.Float.Input("scale_y", default=1.0, min=0.0, max=100.0, step=0.1), + io.Float.Input("shift_y", default=0.0, min=-256.0, max=256.0, step=0.1), + + io.Float.Input("scale_t", default=1.0, min=0.0, max=100.0, step=0.1), + io.Float.Input("shift_t", default=0.0, min=-256.0, max=256.0, step=0.1), + + + ], + outputs=[ + io.Model.Output(), + ], + ) + + @classmethod + def execute(cls, model, scale_x, shift_x, scale_y, shift_y, scale_t, shift_t) -> io.NodeOutput: + m = model.clone() + m.set_model_rope_options(scale_x, shift_x, scale_y, shift_y, scale_t, shift_t) + return io.NodeOutput(m) + + +class RopeExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + ScaleROPE + ] + + +async def comfy_entrypoint() -> RopeExtension: + return RopeExtension() diff --git a/comfy_extras/nodes_sd3.py b/comfy_extras/nodes_sd3.py index d75b29e60..14782cb2b 100644 --- a/comfy_extras/nodes_sd3.py +++ b/comfy_extras/nodes_sd3.py @@ -3,64 +3,83 @@ import comfy.sd import comfy.model_management import nodes import torch -import comfy_extras.nodes_slg +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io +from comfy_extras.nodes_slg import SkipLayerGuidanceDiT -class TripleCLIPLoader: +class TripleCLIPLoader(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "clip_name1": (folder_paths.get_filename_list("text_encoders"), ), "clip_name2": (folder_paths.get_filename_list("text_encoders"), ), "clip_name3": (folder_paths.get_filename_list("text_encoders"), ) - }} - RETURN_TYPES = ("CLIP",) - FUNCTION = "load_clip" + def define_schema(cls): + return io.Schema( + node_id="TripleCLIPLoader", + category="advanced/loaders", + description="[Recipes]\n\nsd3: clip-l, clip-g, t5", + inputs=[ + io.Combo.Input("clip_name1", options=folder_paths.get_filename_list("text_encoders")), + io.Combo.Input("clip_name2", options=folder_paths.get_filename_list("text_encoders")), + io.Combo.Input("clip_name3", options=folder_paths.get_filename_list("text_encoders")), + ], + outputs=[ + io.Clip.Output(), + ], + ) - CATEGORY = "advanced/loaders" - - DESCRIPTION = "[Recipes]\n\nsd3: clip-l, clip-g, t5" - - def load_clip(self, clip_name1, clip_name2, clip_name3): + @classmethod + def execute(cls, clip_name1, clip_name2, clip_name3) -> io.NodeOutput: clip_path1 = folder_paths.get_full_path_or_raise("text_encoders", clip_name1) clip_path2 = folder_paths.get_full_path_or_raise("text_encoders", clip_name2) clip_path3 = folder_paths.get_full_path_or_raise("text_encoders", clip_name3) clip = comfy.sd.load_clip(ckpt_paths=[clip_path1, clip_path2, clip_path3], embedding_directory=folder_paths.get_folder_paths("embeddings")) - return (clip,) + return io.NodeOutput(clip) + + load_clip = execute # TODO: remove -class EmptySD3LatentImage: - def __init__(self): - self.device = comfy.model_management.intermediate_device() +class EmptySD3LatentImage(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="EmptySD3LatentImage", + category="latent/sd3", + inputs=[ + io.Int.Input("width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16), + io.Int.Input("batch_size", default=1, min=1, max=4096), + ], + outputs=[ + io.Latent.Output(), + ], + ) @classmethod - def INPUT_TYPES(s): - return {"required": { "width": ("INT", {"default": 1024, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "height": ("INT", {"default": 1024, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}} - RETURN_TYPES = ("LATENT",) - FUNCTION = "generate" + def execute(cls, width, height, batch_size=1) -> io.NodeOutput: + latent = torch.zeros([batch_size, 16, height // 8, width // 8], device=comfy.model_management.intermediate_device()) + return io.NodeOutput({"samples":latent}) - CATEGORY = "latent/sd3" - - def generate(self, width, height, batch_size=1): - latent = torch.zeros([batch_size, 16, height // 8, width // 8], device=self.device) - return ({"samples":latent}, ) + generate = execute # TODO: remove -class CLIPTextEncodeSD3: +class CLIPTextEncodeSD3(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { - "clip": ("CLIP", ), - "clip_l": ("STRING", {"multiline": True, "dynamicPrompts": True}), - "clip_g": ("STRING", {"multiline": True, "dynamicPrompts": True}), - "t5xxl": ("STRING", {"multiline": True, "dynamicPrompts": True}), - "empty_padding": (["none", "empty_prompt"], ) - }} - RETURN_TYPES = ("CONDITIONING",) - FUNCTION = "encode" + def define_schema(cls): + return io.Schema( + node_id="CLIPTextEncodeSD3", + category="advanced/conditioning", + inputs=[ + io.Clip.Input("clip"), + io.String.Input("clip_l", multiline=True, dynamic_prompts=True), + io.String.Input("clip_g", multiline=True, dynamic_prompts=True), + io.String.Input("t5xxl", multiline=True, dynamic_prompts=True), + io.Combo.Input("empty_padding", options=["none", "empty_prompt"]), + ], + outputs=[ + io.Conditioning.Output(), + ], + ) - CATEGORY = "advanced/conditioning" - - def encode(self, clip, clip_l, clip_g, t5xxl, empty_padding): + @classmethod + def execute(cls, clip, clip_l, clip_g, t5xxl, empty_padding) -> io.NodeOutput: no_padding = empty_padding == "none" tokens = clip.tokenize(clip_g) @@ -82,57 +101,112 @@ class CLIPTextEncodeSD3: tokens["l"] += empty["l"] while len(tokens["l"]) > len(tokens["g"]): tokens["g"] += empty["g"] - return (clip.encode_from_tokens_scheduled(tokens), ) + return io.NodeOutput(clip.encode_from_tokens_scheduled(tokens)) + + encode = execute # TODO: remove -class ControlNetApplySD3(nodes.ControlNetApplyAdvanced): +class ControlNetApplySD3(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "control_net": ("CONTROL_NET", ), - "vae": ("VAE", ), - "image": ("IMAGE", ), - "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}), - "end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.001}) - }} - CATEGORY = "conditioning/controlnet" - DEPRECATED = True + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="ControlNetApplySD3", + display_name="Apply Controlnet with VAE", + category="conditioning/controlnet", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.ControlNet.Input("control_net"), + io.Vae.Input("vae"), + io.Image.Input("image"), + io.Float.Input("strength", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("start_percent", default=0.0, min=0.0, max=1.0, step=0.001), + io.Float.Input("end_percent", default=1.0, min=0.0, max=1.0, step=0.001), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + ], + is_deprecated=True, + ) + + @classmethod + def execute(cls, positive, negative, control_net, image, strength, start_percent, end_percent, vae=None) -> io.NodeOutput: + if strength == 0: + return io.NodeOutput(positive, negative) + + control_hint = image.movedim(-1, 1) + cnets = {} + + out = [] + for conditioning in [positive, negative]: + c = [] + for t in conditioning: + d = t[1].copy() + + prev_cnet = d.get('control', None) + if prev_cnet in cnets: + c_net = cnets[prev_cnet] + else: + c_net = control_net.copy().set_cond_hint(control_hint, strength, (start_percent, end_percent), + vae=vae, extra_concat=[]) + c_net.set_previous_controlnet(prev_cnet) + cnets[prev_cnet] = c_net + + d['control'] = c_net + d['control_apply_to_uncond'] = False + n = [t[0], d] + c.append(n) + out.append(c) + return io.NodeOutput(out[0], out[1]) + + apply_controlnet = execute # TODO: remove -class SkipLayerGuidanceSD3(comfy_extras.nodes_slg.SkipLayerGuidanceDiT): +class SkipLayerGuidanceSD3(io.ComfyNode): ''' Enhance guidance towards detailed dtructure by having another set of CFG negative with skipped layers. Inspired by Perturbed Attention Guidance (https://arxiv.org/abs/2403.17377) Experimental implementation by Dango233@StabilityAI. ''' + @classmethod - def INPUT_TYPES(s): - return {"required": {"model": ("MODEL", ), - "layers": ("STRING", {"default": "7, 8, 9", "multiline": False}), - "scale": ("FLOAT", {"default": 3.0, "min": 0.0, "max": 10.0, "step": 0.1}), - "start_percent": ("FLOAT", {"default": 0.01, "min": 0.0, "max": 1.0, "step": 0.001}), - "end_percent": ("FLOAT", {"default": 0.15, "min": 0.0, "max": 1.0, "step": 0.001}) - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "skip_guidance_sd3" + def define_schema(cls): + return io.Schema( + node_id="SkipLayerGuidanceSD3", + category="advanced/guidance", + description="Generic version of SkipLayerGuidance node that can be used on every DiT model.", + inputs=[ + io.Model.Input("model"), + io.String.Input("layers", default="7, 8, 9", multiline=False), + io.Float.Input("scale", default=3.0, min=0.0, max=10.0, step=0.1), + io.Float.Input("start_percent", default=0.01, min=0.0, max=1.0, step=0.001), + io.Float.Input("end_percent", default=0.15, min=0.0, max=1.0, step=0.001), + ], + outputs=[ + io.Model.Output(), + ], + is_experimental=True, + ) - CATEGORY = "advanced/guidance" + @classmethod + def execute(cls, model, layers, scale, start_percent, end_percent) -> io.NodeOutput: + return SkipLayerGuidanceDiT().execute(model=model, scale=scale, start_percent=start_percent, end_percent=end_percent, double_layers=layers) - def skip_guidance_sd3(self, model, layers, scale, start_percent, end_percent): - return self.skip_guidance(model=model, scale=scale, start_percent=start_percent, end_percent=end_percent, double_layers=layers) + skip_guidance_sd3 = execute # TODO: remove -NODE_CLASS_MAPPINGS = { - "TripleCLIPLoader": TripleCLIPLoader, - "EmptySD3LatentImage": EmptySD3LatentImage, - "CLIPTextEncodeSD3": CLIPTextEncodeSD3, - "ControlNetApplySD3": ControlNetApplySD3, - "SkipLayerGuidanceSD3": SkipLayerGuidanceSD3, -} +class SD3Extension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + TripleCLIPLoader, + EmptySD3LatentImage, + CLIPTextEncodeSD3, + ControlNetApplySD3, + SkipLayerGuidanceSD3, + ] -NODE_DISPLAY_NAME_MAPPINGS = { - # Sampling - "ControlNetApplySD3": "Apply Controlnet with VAE", -} + +async def comfy_entrypoint() -> SD3Extension: + return SD3Extension() diff --git a/comfy_extras/nodes_slg.py b/comfy_extras/nodes_slg.py index 7adff202e..f462faa8f 100644 --- a/comfy_extras/nodes_slg.py +++ b/comfy_extras/nodes_slg.py @@ -1,33 +1,40 @@ import comfy.model_patcher import comfy.samplers import re +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io -class SkipLayerGuidanceDiT: +class SkipLayerGuidanceDiT(io.ComfyNode): ''' Enhance guidance towards detailed dtructure by having another set of CFG negative with skipped layers. Inspired by Perturbed Attention Guidance (https://arxiv.org/abs/2403.17377) Original experimental implementation for SD3 by Dango233@StabilityAI. ''' + @classmethod - def INPUT_TYPES(s): - return {"required": {"model": ("MODEL", ), - "double_layers": ("STRING", {"default": "7, 8, 9", "multiline": False}), - "single_layers": ("STRING", {"default": "7, 8, 9", "multiline": False}), - "scale": ("FLOAT", {"default": 3.0, "min": 0.0, "max": 10.0, "step": 0.1}), - "start_percent": ("FLOAT", {"default": 0.01, "min": 0.0, "max": 1.0, "step": 0.001}), - "end_percent": ("FLOAT", {"default": 0.15, "min": 0.0, "max": 1.0, "step": 0.001}), - "rescaling_scale": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 10.0, "step": 0.01}), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "skip_guidance" - EXPERIMENTAL = True + def define_schema(cls): + return io.Schema( + node_id="SkipLayerGuidanceDiT", + category="advanced/guidance", + description="Generic version of SkipLayerGuidance node that can be used on every DiT model.", + is_experimental=True, + inputs=[ + io.Model.Input("model"), + io.String.Input("double_layers", default="7, 8, 9"), + io.String.Input("single_layers", default="7, 8, 9"), + io.Float.Input("scale", default=3.0, min=0.0, max=10.0, step=0.1), + io.Float.Input("start_percent", default=0.01, min=0.0, max=1.0, step=0.001), + io.Float.Input("end_percent", default=0.15, min=0.0, max=1.0, step=0.001), + io.Float.Input("rescaling_scale", default=0.0, min=0.0, max=10.0, step=0.01), + ], + outputs=[ + io.Model.Output(), + ], + ) - DESCRIPTION = "Generic version of SkipLayerGuidance node that can be used on every DiT model." - - CATEGORY = "advanced/guidance" - - def skip_guidance(self, model, scale, start_percent, end_percent, double_layers="", single_layers="", rescaling_scale=0): + @classmethod + def execute(cls, model, scale, start_percent, end_percent, double_layers="", single_layers="", rescaling_scale=0) -> io.NodeOutput: # check if layer is comma separated integers def skip(args, extra_args): return args @@ -43,7 +50,7 @@ class SkipLayerGuidanceDiT: single_layers = [int(i) for i in single_layers] if len(double_layers) == 0 and len(single_layers) == 0: - return (model, ) + return io.NodeOutput(model) def post_cfg_function(args): model = args["model"] @@ -76,29 +83,36 @@ class SkipLayerGuidanceDiT: m = model.clone() m.set_model_sampler_post_cfg_function(post_cfg_function) - return (m, ) + return io.NodeOutput(m) -class SkipLayerGuidanceDiTSimple: + skip_guidance = execute # TODO: remove + + +class SkipLayerGuidanceDiTSimple(io.ComfyNode): ''' Simple version of the SkipLayerGuidanceDiT node that only modifies the uncond pass. ''' @classmethod - def INPUT_TYPES(s): - return {"required": {"model": ("MODEL", ), - "double_layers": ("STRING", {"default": "7, 8, 9", "multiline": False}), - "single_layers": ("STRING", {"default": "7, 8, 9", "multiline": False}), - "start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}), - "end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.001}), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "skip_guidance" - EXPERIMENTAL = True + def define_schema(cls): + return io.Schema( + node_id="SkipLayerGuidanceDiTSimple", + category="advanced/guidance", + description="Simple version of the SkipLayerGuidanceDiT node that only modifies the uncond pass.", + is_experimental=True, + inputs=[ + io.Model.Input("model"), + io.String.Input("double_layers", default="7, 8, 9"), + io.String.Input("single_layers", default="7, 8, 9"), + io.Float.Input("start_percent", default=0.0, min=0.0, max=1.0, step=0.001), + io.Float.Input("end_percent", default=1.0, min=0.0, max=1.0, step=0.001), + ], + outputs=[ + io.Model.Output(), + ], + ) - DESCRIPTION = "Simple version of the SkipLayerGuidanceDiT node that only modifies the uncond pass." - - CATEGORY = "advanced/guidance" - - def skip_guidance(self, model, start_percent, end_percent, double_layers="", single_layers=""): + @classmethod + def execute(cls, model, start_percent, end_percent, double_layers="", single_layers="") -> io.NodeOutput: def skip(args, extra_args): return args @@ -113,7 +127,7 @@ class SkipLayerGuidanceDiTSimple: single_layers = [int(i) for i in single_layers] if len(double_layers) == 0 and len(single_layers) == 0: - return (model, ) + return io.NodeOutput(model) def calc_cond_batch_function(args): x = args["input"] @@ -144,9 +158,19 @@ class SkipLayerGuidanceDiTSimple: m = model.clone() m.set_model_sampler_calc_cond_batch_function(calc_cond_batch_function) - return (m, ) + return io.NodeOutput(m) -NODE_CLASS_MAPPINGS = { - "SkipLayerGuidanceDiT": SkipLayerGuidanceDiT, - "SkipLayerGuidanceDiTSimple": SkipLayerGuidanceDiTSimple, -} + skip_guidance = execute # TODO: remove + + +class SkipLayerGuidanceExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + SkipLayerGuidanceDiT, + SkipLayerGuidanceDiTSimple, + ] + + +async def comfy_entrypoint() -> SkipLayerGuidanceExtension: + return SkipLayerGuidanceExtension() diff --git a/comfy_extras/nodes_stable3d.py b/comfy_extras/nodes_stable3d.py index be2e34c28..c6d8a683d 100644 --- a/comfy_extras/nodes_stable3d.py +++ b/comfy_extras/nodes_stable3d.py @@ -1,6 +1,8 @@ import torch import nodes import comfy.utils +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io def camera_embeddings(elevation, azimuth): elevation = torch.as_tensor([elevation]) @@ -20,26 +22,31 @@ def camera_embeddings(elevation, azimuth): return embeddings -class StableZero123_Conditioning: +class StableZero123_Conditioning(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "clip_vision": ("CLIP_VISION",), - "init_image": ("IMAGE",), - "vae": ("VAE",), - "width": ("INT", {"default": 256, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}), - "height": ("INT", {"default": 256, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - "elevation": ("FLOAT", {"default": 0.0, "min": -180.0, "max": 180.0, "step": 0.1, "round": False}), - "azimuth": ("FLOAT", {"default": 0.0, "min": -180.0, "max": 180.0, "step": 0.1, "round": False}), - }} - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") + def define_schema(cls): + return io.Schema( + node_id="StableZero123_Conditioning", + category="conditioning/3d_models", + inputs=[ + io.ClipVision.Input("clip_vision"), + io.Image.Input("init_image"), + io.Vae.Input("vae"), + io.Int.Input("width", default=256, min=16, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("height", default=256, min=16, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Float.Input("elevation", default=0.0, min=-180.0, max=180.0, step=0.1, round=False), + io.Float.Input("azimuth", default=0.0, min=-180.0, max=180.0, step=0.1, round=False) + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent") + ] + ) - FUNCTION = "encode" - - CATEGORY = "conditioning/3d_models" - - def encode(self, clip_vision, init_image, vae, width, height, batch_size, elevation, azimuth): + @classmethod + def execute(cls, clip_vision, init_image, vae, width, height, batch_size, elevation, azimuth) -> io.NodeOutput: output = clip_vision.encode_image(init_image) pooled = output.image_embeds.unsqueeze(0) pixels = comfy.utils.common_upscale(init_image.movedim(-1,1), width, height, "bilinear", "center").movedim(1,-1) @@ -51,30 +58,35 @@ class StableZero123_Conditioning: positive = [[cond, {"concat_latent_image": t}]] negative = [[torch.zeros_like(pooled), {"concat_latent_image": torch.zeros_like(t)}]] latent = torch.zeros([batch_size, 4, height // 8, width // 8]) - return (positive, negative, {"samples":latent}) + return io.NodeOutput(positive, negative, {"samples":latent}) -class StableZero123_Conditioning_Batched: +class StableZero123_Conditioning_Batched(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "clip_vision": ("CLIP_VISION",), - "init_image": ("IMAGE",), - "vae": ("VAE",), - "width": ("INT", {"default": 256, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}), - "height": ("INT", {"default": 256, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}), - "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}), - "elevation": ("FLOAT", {"default": 0.0, "min": -180.0, "max": 180.0, "step": 0.1, "round": False}), - "azimuth": ("FLOAT", {"default": 0.0, "min": -180.0, "max": 180.0, "step": 0.1, "round": False}), - "elevation_batch_increment": ("FLOAT", {"default": 0.0, "min": -180.0, "max": 180.0, "step": 0.1, "round": False}), - "azimuth_batch_increment": ("FLOAT", {"default": 0.0, "min": -180.0, "max": 180.0, "step": 0.1, "round": False}), - }} - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") + def define_schema(cls): + return io.Schema( + node_id="StableZero123_Conditioning_Batched", + category="conditioning/3d_models", + inputs=[ + io.ClipVision.Input("clip_vision"), + io.Image.Input("init_image"), + io.Vae.Input("vae"), + io.Int.Input("width", default=256, min=16, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("height", default=256, min=16, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("batch_size", default=1, min=1, max=4096), + io.Float.Input("elevation", default=0.0, min=-180.0, max=180.0, step=0.1, round=False), + io.Float.Input("azimuth", default=0.0, min=-180.0, max=180.0, step=0.1, round=False), + io.Float.Input("elevation_batch_increment", default=0.0, min=-180.0, max=180.0, step=0.1, round=False), + io.Float.Input("azimuth_batch_increment", default=0.0, min=-180.0, max=180.0, step=0.1, round=False) + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent") + ] + ) - FUNCTION = "encode" - - CATEGORY = "conditioning/3d_models" - - def encode(self, clip_vision, init_image, vae, width, height, batch_size, elevation, azimuth, elevation_batch_increment, azimuth_batch_increment): + @classmethod + def execute(cls, clip_vision, init_image, vae, width, height, batch_size, elevation, azimuth, elevation_batch_increment, azimuth_batch_increment) -> io.NodeOutput: output = clip_vision.encode_image(init_image) pooled = output.image_embeds.unsqueeze(0) pixels = comfy.utils.common_upscale(init_image.movedim(-1,1), width, height, "bilinear", "center").movedim(1,-1) @@ -93,27 +105,32 @@ class StableZero123_Conditioning_Batched: positive = [[cond, {"concat_latent_image": t}]] negative = [[torch.zeros_like(pooled), {"concat_latent_image": torch.zeros_like(t)}]] latent = torch.zeros([batch_size, 4, height // 8, width // 8]) - return (positive, negative, {"samples":latent, "batch_index": [0] * batch_size}) + return io.NodeOutput(positive, negative, {"samples":latent, "batch_index": [0] * batch_size}) -class SV3D_Conditioning: +class SV3D_Conditioning(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "clip_vision": ("CLIP_VISION",), - "init_image": ("IMAGE",), - "vae": ("VAE",), - "width": ("INT", {"default": 576, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}), - "height": ("INT", {"default": 576, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 8}), - "video_frames": ("INT", {"default": 21, "min": 1, "max": 4096}), - "elevation": ("FLOAT", {"default": 0.0, "min": -90.0, "max": 90.0, "step": 0.1, "round": False}), - }} - RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT") - RETURN_NAMES = ("positive", "negative", "latent") + def define_schema(cls): + return io.Schema( + node_id="SV3D_Conditioning", + category="conditioning/3d_models", + inputs=[ + io.ClipVision.Input("clip_vision"), + io.Image.Input("init_image"), + io.Vae.Input("vae"), + io.Int.Input("width", default=576, min=16, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("height", default=576, min=16, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("video_frames", default=21, min=1, max=4096), + io.Float.Input("elevation", default=0.0, min=-90.0, max=90.0, step=0.1, round=False) + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent") + ] + ) - FUNCTION = "encode" - - CATEGORY = "conditioning/3d_models" - - def encode(self, clip_vision, init_image, vae, width, height, video_frames, elevation): + @classmethod + def execute(cls, clip_vision, init_image, vae, width, height, video_frames, elevation) -> io.NodeOutput: output = clip_vision.encode_image(init_image) pooled = output.image_embeds.unsqueeze(0) pixels = comfy.utils.common_upscale(init_image.movedim(-1,1), width, height, "bilinear", "center").movedim(1,-1) @@ -133,11 +150,17 @@ class SV3D_Conditioning: positive = [[pooled, {"concat_latent_image": t, "elevation": elevations, "azimuth": azimuths}]] negative = [[torch.zeros_like(pooled), {"concat_latent_image": torch.zeros_like(t), "elevation": elevations, "azimuth": azimuths}]] latent = torch.zeros([video_frames, 4, height // 8, width // 8]) - return (positive, negative, {"samples":latent}) + return io.NodeOutput(positive, negative, {"samples":latent}) -NODE_CLASS_MAPPINGS = { - "StableZero123_Conditioning": StableZero123_Conditioning, - "StableZero123_Conditioning_Batched": StableZero123_Conditioning_Batched, - "SV3D_Conditioning": SV3D_Conditioning, -} +class Stable3DExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + StableZero123_Conditioning, + StableZero123_Conditioning_Batched, + SV3D_Conditioning, + ] + +async def comfy_entrypoint() -> Stable3DExtension: + return Stable3DExtension() diff --git a/comfy_extras/nodes_tomesd.py b/comfy_extras/nodes_tomesd.py index 9f77c06fc..87bf29b8f 100644 --- a/comfy_extras/nodes_tomesd.py +++ b/comfy_extras/nodes_tomesd.py @@ -1,7 +1,9 @@ #Taken from: https://github.com/dbolya/tomesd import torch -from typing import Tuple, Callable +from typing import Tuple, Callable, Optional +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io import math def do_nothing(x: torch.Tensor, mode:str=None): @@ -144,33 +146,45 @@ def get_functions(x, ratio, original_shape): -class TomePatchModel: +class TomePatchModel(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "ratio": ("FLOAT", {"default": 0.3, "min": 0.0, "max": 1.0, "step": 0.01}), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" + def define_schema(cls): + return io.Schema( + node_id="TomePatchModel", + category="model_patches/unet", + inputs=[ + io.Model.Input("model"), + io.Float.Input("ratio", default=0.3, min=0.0, max=1.0, step=0.01), + ], + outputs=[io.Model.Output()], + ) - CATEGORY = "model_patches/unet" - - def patch(self, model, ratio): - self.u = None + @classmethod + def execute(cls, model, ratio) -> io.NodeOutput: + u: Optional[Callable] = None def tomesd_m(q, k, v, extra_options): + nonlocal u #NOTE: In the reference code get_functions takes x (input of the transformer block) as the argument instead of q #however from my basic testing it seems that using q instead gives better results - m, self.u = get_functions(q, ratio, extra_options["original_shape"]) + m, u = get_functions(q, ratio, extra_options["original_shape"]) return m(q), k, v def tomesd_u(n, extra_options): - return self.u(n) + nonlocal u + return u(n) m = model.clone() m.set_model_attn1_patch(tomesd_m) m.set_model_attn1_output_patch(tomesd_u) - return (m, ) + return io.NodeOutput(m) -NODE_CLASS_MAPPINGS = { - "TomePatchModel": TomePatchModel, -} +class TomePatchModelExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + TomePatchModel, + ] + + +async def comfy_entrypoint() -> TomePatchModelExtension: + return TomePatchModelExtension() diff --git a/comfy_extras/nodes_torch_compile.py b/comfy_extras/nodes_torch_compile.py index 605536678..adbeece2f 100644 --- a/comfy_extras/nodes_torch_compile.py +++ b/comfy_extras/nodes_torch_compile.py @@ -1,23 +1,39 @@ +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io from comfy_api.torch_helpers import set_torch_compile_wrapper -class TorchCompileModel: +class TorchCompileModel(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model": ("MODEL",), - "backend": (["inductor", "cudagraphs"],), - }} - RETURN_TYPES = ("MODEL",) - FUNCTION = "patch" + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="TorchCompileModel", + category="_for_testing", + inputs=[ + io.Model.Input("model"), + io.Combo.Input( + "backend", + options=["inductor", "cudagraphs"], + ), + ], + outputs=[io.Model.Output()], + is_experimental=True, + ) - CATEGORY = "_for_testing" - EXPERIMENTAL = True - - def patch(self, model, backend): + @classmethod + def execute(cls, model, backend) -> io.NodeOutput: m = model.clone() set_torch_compile_wrapper(model=m, backend=backend) - return (m, ) + return io.NodeOutput(m) -NODE_CLASS_MAPPINGS = { - "TorchCompileModel": TorchCompileModel, -} + +class TorchCompileExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + TorchCompileModel, + ] + + +async def comfy_entrypoint() -> TorchCompileExtension: + return TorchCompileExtension() diff --git a/comfy_extras/nodes_upscale_model.py b/comfy_extras/nodes_upscale_model.py index 04c948341..4d62b87be 100644 --- a/comfy_extras/nodes_upscale_model.py +++ b/comfy_extras/nodes_upscale_model.py @@ -4,6 +4,8 @@ from comfy import model_management import torch import comfy.utils import folder_paths +from typing_extensions import override +from comfy_api.latest import ComfyExtension, io try: from spandrel_extra_arches import EXTRA_REGISTRY @@ -13,17 +15,23 @@ try: except: pass -class UpscaleModelLoader: +class UpscaleModelLoader(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "model_name": (folder_paths.get_filename_list("upscale_models"), ), - }} - RETURN_TYPES = ("UPSCALE_MODEL",) - FUNCTION = "load_model" + def define_schema(cls): + return io.Schema( + node_id="UpscaleModelLoader", + display_name="Load Upscale Model", + category="loaders", + inputs=[ + io.Combo.Input("model_name", options=folder_paths.get_filename_list("upscale_models")), + ], + outputs=[ + io.UpscaleModel.Output(), + ], + ) - CATEGORY = "loaders" - - def load_model(self, model_name): + @classmethod + def execute(cls, model_name) -> io.NodeOutput: model_path = folder_paths.get_full_path_or_raise("upscale_models", model_name) sd = comfy.utils.load_torch_file(model_path, safe_load=True) if "module.layers.0.residual_group.blocks.0.norm1.weight" in sd: @@ -33,21 +41,29 @@ class UpscaleModelLoader: if not isinstance(out, ImageModelDescriptor): raise Exception("Upscale model must be a single-image model.") - return (out, ) + return io.NodeOutput(out) + + load_model = execute # TODO: remove -class ImageUpscaleWithModel: +class ImageUpscaleWithModel(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": { "upscale_model": ("UPSCALE_MODEL",), - "image": ("IMAGE",), - }} - RETURN_TYPES = ("IMAGE",) - FUNCTION = "upscale" + def define_schema(cls): + return io.Schema( + node_id="ImageUpscaleWithModel", + display_name="Upscale Image (using Model)", + category="image/upscaling", + inputs=[ + io.UpscaleModel.Input("upscale_model"), + io.Image.Input("image"), + ], + outputs=[ + io.Image.Output(), + ], + ) - CATEGORY = "image/upscaling" - - def upscale(self, upscale_model, image): + @classmethod + def execute(cls, upscale_model, image) -> io.NodeOutput: device = model_management.get_torch_device() memory_required = model_management.module_size(upscale_model.model) @@ -75,9 +91,19 @@ class ImageUpscaleWithModel: upscale_model.to("cpu") s = torch.clamp(s.movedim(-3,-1), min=0, max=1.0) - return (s,) + return io.NodeOutput(s) -NODE_CLASS_MAPPINGS = { - "UpscaleModelLoader": UpscaleModelLoader, - "ImageUpscaleWithModel": ImageUpscaleWithModel -} + upscale = execute # TODO: remove + + +class UpscaleModelExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + UpscaleModelLoader, + ImageUpscaleWithModel, + ] + + +async def comfy_entrypoint() -> UpscaleModelExtension: + return UpscaleModelExtension() diff --git a/comfyui_version.py b/comfyui_version.py index d469a8194..1e554eb9f 100644 --- a/comfyui_version.py +++ b/comfyui_version.py @@ -1,3 +1,3 @@ # This file is automatically generated by the build process when version is # updated in pyproject.toml. -__version__ = "0.3.60" +__version__ = "0.3.69" diff --git a/cuda_malloc.py b/cuda_malloc.py index c1d9ae3ca..6520d5123 100644 --- a/cuda_malloc.py +++ b/cuda_malloc.py @@ -1,6 +1,6 @@ import os import importlib.util -from comfy.cli_args import args +from comfy.cli_args import args, PerformanceFeature import subprocess #Can't use pytorch to get the GPU names because the cuda malloc has to be set before the first import. @@ -75,8 +75,9 @@ if not args.cuda_malloc: spec.loader.exec_module(module) version = module.__version__ - if int(version[0]) >= 2 and "+cu" in version: #enable by default for torch version 2.0 and up only on cuda torch - args.cuda_malloc = cuda_malloc_supported() + if int(version[0]) >= 2 and "+cu" in version: # enable by default for torch version 2.0 and up only on cuda torch + if PerformanceFeature.AutoTune not in args.fast: # Autotune has issues with cuda malloc + args.cuda_malloc = cuda_malloc_supported() except: pass diff --git a/custom_nodes/example_node.py.example b/custom_nodes/example_node.py.example index 29ab2aa72..779c35787 100644 --- a/custom_nodes/example_node.py.example +++ b/custom_nodes/example_node.py.example @@ -1,96 +1,70 @@ -class Example: +from typing_extensions import override + +from comfy_api.latest import ComfyExtension, io + + +class Example(io.ComfyNode): """ - A example node + An example node Class methods ------------- - INPUT_TYPES (dict): - Tell the main program input parameters of nodes. - IS_CHANGED: + define_schema (io.Schema): + Tell the main program the metadata, input, output parameters of nodes. + fingerprint_inputs: optional method to control when the node is re executed. + check_lazy_status: + optional method to control list of input names that need to be evaluated. - Attributes - ---------- - RETURN_TYPES (`tuple`): - The type of each element in the output tuple. - RETURN_NAMES (`tuple`): - Optional: The name of each output in the output tuple. - FUNCTION (`str`): - The name of the entry-point method. For example, if `FUNCTION = "execute"` then it will run Example().execute() - OUTPUT_NODE ([`bool`]): - If this node is an output node that outputs a result/image from the graph. The SaveImage node is an example. - The backend iterates on these output nodes and tries to execute all their parents if their parent graph is properly connected. - Assumed to be False if not present. - CATEGORY (`str`): - The category the node should appear in the UI. - DEPRECATED (`bool`): - Indicates whether the node is deprecated. Deprecated nodes are hidden by default in the UI, but remain - functional in existing workflows that use them. - EXPERIMENTAL (`bool`): - Indicates whether the node is experimental. Experimental nodes are marked as such in the UI and may be subject to - significant changes or removal in future versions. Use with caution in production workflows. - execute(s) -> tuple || None: - The entry point method. The name of this method must be the same as the value of property `FUNCTION`. - For example, if `FUNCTION = "execute"` then this method's name must be `execute`, if `FUNCTION = "foo"` then it must be `foo`. """ - def __init__(self): - pass @classmethod - def INPUT_TYPES(s): + def define_schema(cls) -> io.Schema: """ - Return a dictionary which contains config for all input fields. - Some types (string): "MODEL", "VAE", "CLIP", "CONDITIONING", "LATENT", "IMAGE", "INT", "STRING", "FLOAT". - Input types "INT", "STRING" or "FLOAT" are special values for fields on the node. - The type can be a list for selection. - - Returns: `dict`: - - Key input_fields_group (`string`): Can be either required, hidden or optional. A node class must have property `required` - - Value input_fields (`dict`): Contains input fields config: - * Key field_name (`string`): Name of a entry-point method's argument - * Value field_config (`tuple`): - + First value is a string indicate the type of field or a list for selection. - + Second value is a config for type "INT", "STRING" or "FLOAT". + Return a schema which contains all information about the node. + Some types: "Model", "Vae", "Clip", "Conditioning", "Latent", "Image", "Int", "String", "Float", "Combo". + For outputs the "io.Model.Output" should be used, for inputs the "io.Model.Input" can be used. + The type can be a "Combo" - this will be a list for selection. """ - return { - "required": { - "image": ("IMAGE",), - "int_field": ("INT", { - "default": 0, - "min": 0, #Minimum value - "max": 4096, #Maximum value - "step": 64, #Slider's step - "display": "number", # Cosmetic only: display as "number" or "slider" - "lazy": True # Will only be evaluated if check_lazy_status requires it - }), - "float_field": ("FLOAT", { - "default": 1.0, - "min": 0.0, - "max": 10.0, - "step": 0.01, - "round": 0.001, #The value representing the precision to round to, will be set to the step value by default. Can be set to False to disable rounding. - "display": "number", - "lazy": True - }), - "print_to_screen": (["enable", "disable"],), - "string_field": ("STRING", { - "multiline": False, #True if you want the field to look like the one on the ClipTextEncode node - "default": "Hello World!", - "lazy": True - }), - }, - } + return io.Schema( + node_id="Example", + display_name="Example Node", + category="Example", + inputs=[ + io.Image.Input("image"), + io.Int.Input( + "int_field", + min=0, + max=4096, + step=64, # Slider's step + display_mode=io.NumberDisplay.number, # Cosmetic only: display as "number" or "slider" + lazy=True, # Will only be evaluated if check_lazy_status requires it + ), + io.Float.Input( + "float_field", + default=1.0, + min=0.0, + max=10.0, + step=0.01, + round=0.001, #The value representing the precision to round to, will be set to the step value by default. Can be set to False to disable rounding. + display_mode=io.NumberDisplay.number, + lazy=True, + ), + io.Combo.Input("print_to_screen", options=["enable", "disable"]), + io.String.Input( + "string_field", + multiline=False, # True if you want the field to look like the one on the ClipTextEncode node + default="Hello world!", + lazy=True, + ) + ], + outputs=[ + io.Image.Output(), + ], + ) - RETURN_TYPES = ("IMAGE",) - #RETURN_NAMES = ("image_output_name",) - - FUNCTION = "test" - - #OUTPUT_NODE = False - - CATEGORY = "Example" - - def check_lazy_status(self, image, string_field, int_field, float_field, print_to_screen): + @classmethod + def check_lazy_status(cls, image, string_field, int_field, float_field, print_to_screen): """ Return a list of input names that need to be evaluated. @@ -107,7 +81,8 @@ class Example: else: return [] - def test(self, image, string_field, int_field, float_field, print_to_screen): + @classmethod + def execute(cls, image, string_field, int_field, float_field, print_to_screen) -> io.NodeOutput: if print_to_screen == "enable": print(f"""Your input contains: string_field aka input text: {string_field} @@ -116,7 +91,7 @@ class Example: """) #do some processing on the image, in this example I just invert it image = 1.0 - image - return (image,) + return io.NodeOutput(image) """ The node will always be re executed if any of the inputs change but @@ -127,7 +102,7 @@ class Example: changes between executions the LoadImage node is executed again. """ #@classmethod - #def IS_CHANGED(s, image, string_field, int_field, float_field, print_to_screen): + #def fingerprint_inputs(s, image, string_field, int_field, float_field, print_to_screen): # return "" # Set the web directory, any .js file in that directory will be loaded by the frontend as a frontend extension @@ -143,13 +118,13 @@ async def get_hello(request): return web.json_response("hello") -# A dictionary that contains all nodes you want to export with their names -# NOTE: names should be globally unique -NODE_CLASS_MAPPINGS = { - "Example": Example -} +class ExampleExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + Example, + ] -# A dictionary that contains the friendly/humanly readable titles for the nodes -NODE_DISPLAY_NAME_MAPPINGS = { - "Example": "Example Node" -} + +async def comfy_entrypoint() -> ExampleExtension: # ComfyUI calls this to load your extension and its nodes. + return ExampleExtension() diff --git a/execution.py b/execution.py index 1dc35738b..17c77beab 100644 --- a/execution.py +++ b/execution.py @@ -18,9 +18,10 @@ from comfy_execution.caching import ( BasicCache, CacheKeySetID, CacheKeySetInputSignature, - DependencyAwareCache, + NullCache, HierarchicalCache, LRUCache, + RAMPressureCache, ) from comfy_execution.graph import ( DynamicPrompt, @@ -88,54 +89,62 @@ class IsChangedCache: return self.is_changed[node_id] +class CacheEntry(NamedTuple): + ui: dict + outputs: list + + class CacheType(Enum): CLASSIC = 0 LRU = 1 - DEPENDENCY_AWARE = 2 + NONE = 2 + RAM_PRESSURE = 3 class CacheSet: - def __init__(self, cache_type=None, cache_size=None): - if cache_type == CacheType.DEPENDENCY_AWARE: - self.init_dependency_aware_cache() + def __init__(self, cache_type=None, cache_args={}): + if cache_type == CacheType.NONE: + self.init_null_cache() logging.info("Disabling intermediate node cache.") + elif cache_type == CacheType.RAM_PRESSURE: + cache_ram = cache_args.get("ram", 16.0) + self.init_ram_cache(cache_ram) + logging.info("Using RAM pressure cache.") elif cache_type == CacheType.LRU: - if cache_size is None: - cache_size = 0 + cache_size = cache_args.get("lru", 0) self.init_lru_cache(cache_size) logging.info("Using LRU cache") else: self.init_classic_cache() - self.all = [self.outputs, self.ui, self.objects] + self.all = [self.outputs, self.objects] # Performs like the old cache -- dump data ASAP def init_classic_cache(self): self.outputs = HierarchicalCache(CacheKeySetInputSignature) - self.ui = HierarchicalCache(CacheKeySetInputSignature) self.objects = HierarchicalCache(CacheKeySetID) def init_lru_cache(self, cache_size): self.outputs = LRUCache(CacheKeySetInputSignature, max_size=cache_size) - self.ui = LRUCache(CacheKeySetInputSignature, max_size=cache_size) self.objects = HierarchicalCache(CacheKeySetID) - # only hold cached items while the decendents have not executed - def init_dependency_aware_cache(self): - self.outputs = DependencyAwareCache(CacheKeySetInputSignature) - self.ui = DependencyAwareCache(CacheKeySetInputSignature) - self.objects = DependencyAwareCache(CacheKeySetID) + def init_ram_cache(self, min_headroom): + self.outputs = RAMPressureCache(CacheKeySetInputSignature) + self.objects = HierarchicalCache(CacheKeySetID) + + def init_null_cache(self): + self.outputs = NullCache() + self.objects = NullCache() def recursive_debug_dump(self): result = { "outputs": self.outputs.recursive_debug_dump(), - "ui": self.ui.recursive_debug_dump(), } return result SENSITIVE_EXTRA_DATA_KEYS = ("auth_token_comfy_org", "api_key_comfy_org") -def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, extra_data={}): +def get_input_data(inputs, class_def, unique_id, execution_list=None, dynprompt=None, extra_data={}): is_v3 = issubclass(class_def, _ComfyNodeInternal) if is_v3: valid_inputs, schema = class_def.INPUT_TYPES(include_hidden=False, return_schema=True) @@ -153,17 +162,17 @@ def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, e if is_link(input_data) and (not input_info or not input_info.get("rawLink", False)): input_unique_id = input_data[0] output_index = input_data[1] - if outputs is None: + if execution_list is None: mark_missing() continue # This might be a lazily-evaluated input - cached_output = outputs.get(input_unique_id) - if cached_output is None: + cached = execution_list.get_cache(input_unique_id, unique_id) + if cached is None or cached.outputs is None: mark_missing() continue - if output_index >= len(cached_output): + if output_index >= len(cached.outputs): mark_missing() continue - obj = cached_output[output_index] + obj = cached.outputs[output_index] input_data_all[x] = obj elif input_category is not None: input_data_all[x] = [input_data] @@ -392,7 +401,7 @@ def format_value(x): else: return str(x) -async def execute(server, dynprompt, caches, current_item, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes): +async def execute(server, dynprompt, caches, current_item, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes, ui_outputs): unique_id = current_item real_node_id = dynprompt.get_real_node_id(unique_id) display_node_id = dynprompt.get_display_node_id(unique_id) @@ -400,11 +409,15 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, inputs = dynprompt.get_node(unique_id)['inputs'] class_type = dynprompt.get_node(unique_id)['class_type'] class_def = nodes.NODE_CLASS_MAPPINGS[class_type] - if caches.outputs.get(unique_id) is not None: + cached = caches.outputs.get(unique_id) + if cached is not None: if server.client_id is not None: - cached_output = caches.ui.get(unique_id) or {} - server.send_sync("executed", { "node": unique_id, "display_node": display_node_id, "output": cached_output.get("output",None), "prompt_id": prompt_id }, server.client_id) + cached_ui = cached.ui or {} + server.send_sync("executed", { "node": unique_id, "display_node": display_node_id, "output": cached_ui.get("output",None), "prompt_id": prompt_id }, server.client_id) + if cached.ui is not None: + ui_outputs[unique_id] = cached.ui get_progress_state().finish_progress(unique_id) + execution_list.cache_update(unique_id, cached) return (ExecutionResult.SUCCESS, None, None) input_data_all = None @@ -434,8 +447,8 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, for r in result: if is_link(r): source_node, source_output = r[0], r[1] - node_output = caches.outputs.get(source_node)[source_output] - for o in node_output: + node_cached = execution_list.get_cache(source_node, unique_id) + for o in node_cached.outputs[source_output]: resolved_output.append(o) else: @@ -443,10 +456,11 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, resolved_outputs.append(tuple(resolved_output)) output_data = merge_result_data(resolved_outputs, class_def) output_ui = [] + del pending_subgraph_results[unique_id] has_subgraph = False else: get_progress_state().start_progress(unique_id) - input_data_all, missing_keys, hidden_inputs = get_input_data(inputs, class_def, unique_id, caches.outputs, dynprompt, extra_data) + input_data_all, missing_keys, hidden_inputs = get_input_data(inputs, class_def, unique_id, execution_list, dynprompt, extra_data) if server.client_id is not None: server.last_node_id = display_node_id server.send_sync("executing", { "node": unique_id, "display_node": display_node_id, "prompt_id": prompt_id }, server.client_id) @@ -504,7 +518,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, asyncio.create_task(await_completion()) return (ExecutionResult.PENDING, None, None) if len(output_ui) > 0: - caches.ui.set(unique_id, { + ui_outputs[unique_id] = { "meta": { "node_id": unique_id, "display_node": display_node_id, @@ -512,7 +526,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, "real_node_id": real_node_id, }, "output": output_ui - }) + } if server.client_id is not None: server.send_sync("executed", { "node": unique_id, "display_node": display_node_id, "output": output_ui, "prompt_id": prompt_id }, server.client_id) if has_subgraph: @@ -525,10 +539,6 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, if new_graph is None: cached_outputs.append((False, node_outputs)) else: - # Check for conflicts - for node_id in new_graph.keys(): - if dynprompt.has_node(node_id): - raise DuplicateNodeError(f"Attempt to add duplicate node {node_id}. Ensure node ids are unique and deterministic or use graph_utils.GraphBuilder.") for node_id, node_info in new_graph.items(): new_node_ids.append(node_id) display_id = node_info.get("override_display_id", unique_id) @@ -549,11 +559,16 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, subcache.clean_unused() for node_id in new_output_ids: execution_list.add_node(node_id) + execution_list.cache_link(node_id, unique_id) for link in new_output_links: execution_list.add_strong_link(link[0], link[1], unique_id) pending_subgraph_results[unique_id] = cached_outputs return (ExecutionResult.PENDING, None, None) - caches.outputs.set(unique_id, output_data) + + cache_entry = CacheEntry(ui=ui_outputs.get(unique_id), outputs=output_data) + execution_list.cache_update(unique_id, cache_entry) + caches.outputs.set(unique_id, cache_entry) + except comfy.model_management.InterruptProcessingException as iex: logging.info("Processing interrupted") @@ -597,14 +612,14 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, return (ExecutionResult.SUCCESS, None, None) class PromptExecutor: - def __init__(self, server, cache_type=False, cache_size=None): - self.cache_size = cache_size + def __init__(self, server, cache_type=False, cache_args=None): + self.cache_args = cache_args self.cache_type = cache_type self.server = server self.reset() def reset(self): - self.caches = CacheSet(cache_type=self.cache_type, cache_size=self.cache_size) + self.caches = CacheSet(cache_type=self.cache_type, cache_args=self.cache_args) self.status_messages = [] self.success = True @@ -679,6 +694,7 @@ class PromptExecutor: broadcast=False) pending_subgraph_results = {} pending_async_nodes = {} # TODO - Unify this with pending_subgraph_results + ui_node_outputs = {} executed = set() execution_list = ExecutionList(dynamic_prompt, self.caches.outputs) current_outputs = self.caches.outputs.all_node_ids() @@ -692,7 +708,7 @@ class PromptExecutor: break assert node_id is not None, "Node ID should not be None at this point" - result, error, ex = await execute(self.server, dynamic_prompt, self.caches, node_id, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes) + result, error, ex = await execute(self.server, dynamic_prompt, self.caches, node_id, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes, ui_node_outputs) self.success = result != ExecutionResult.FAILURE if result == ExecutionResult.FAILURE: self.handle_execution_error(prompt_id, dynamic_prompt.original_prompt, current_outputs, executed, error, ex) @@ -701,18 +717,16 @@ class PromptExecutor: execution_list.unstage_node_execution() else: # result == ExecutionResult.SUCCESS: execution_list.complete_node_execution() + self.caches.outputs.poll(ram_headroom=self.cache_args["ram"]) else: # Only execute when the while-loop ends without break self.add_message("execution_success", { "prompt_id": prompt_id }, broadcast=False) ui_outputs = {} meta_outputs = {} - all_node_ids = self.caches.ui.all_node_ids() - for node_id in all_node_ids: - ui_info = self.caches.ui.get(node_id) - if ui_info is not None: - ui_outputs[node_id] = ui_info["output"] - meta_outputs[node_id] = ui_info["meta"] + for node_id, ui_info in ui_node_outputs.items(): + ui_outputs[node_id] = ui_info["output"] + meta_outputs[node_id] = ui_info["meta"] self.history_result = { "outputs": ui_outputs, "meta": meta_outputs, @@ -1110,7 +1124,7 @@ class PromptQueue: messages: List[str] def task_done(self, item_id, history_result, - status: Optional['PromptQueue.ExecutionStatus']): + status: Optional['PromptQueue.ExecutionStatus'], process_item=None): with self.mutex: prompt = self.currently_running.pop(item_id) if len(self.history) > MAXIMUM_HISTORY_SIZE: @@ -1120,10 +1134,8 @@ class PromptQueue: if status is not None: status_dict = copy.deepcopy(status._asdict()) - # Remove sensitive data from extra_data before storing in history - for sensitive_val in SENSITIVE_EXTRA_DATA_KEYS: - if sensitive_val in prompt[3]: - prompt[3].pop(sensitive_val) + if process_item is not None: + prompt = process_item(prompt) self.history[prompt[1]] = { "prompt": prompt, diff --git a/extra_model_paths.yaml.example b/extra_model_paths.yaml.example index b55913a5a..34df01681 100644 --- a/extra_model_paths.yaml.example +++ b/extra_model_paths.yaml.example @@ -1,25 +1,5 @@ #Rename this to extra_model_paths.yaml and ComfyUI will load it - -#config for a1111 ui -#all you have to do is change the base_path to where yours is installed -a111: - base_path: path/to/stable-diffusion-webui/ - - checkpoints: models/Stable-diffusion - configs: models/Stable-diffusion - vae: models/VAE - loras: | - models/Lora - models/LyCORIS - upscale_models: | - models/ESRGAN - models/RealESRGAN - models/SwinIR - embeddings: embeddings - hypernetworks: models/hypernetworks - controlnet: models/ControlNet - #config for comfyui #your base path should be either an existing comfy install or a central folder where you store all of your models, loras, etc. @@ -28,7 +8,9 @@ a111: # # You can use is_default to mark that these folders should be listed first, and used as the default dirs for eg downloads # #is_default: true # checkpoints: models/checkpoints/ -# clip: models/clip/ +# text_encoders: | +# models/text_encoders/ +# models/clip/ # legacy location still supported # clip_vision: models/clip_vision/ # configs: models/configs/ # controlnet: models/controlnet/ @@ -39,6 +21,32 @@ a111: # loras: models/loras/ # upscale_models: models/upscale_models/ # vae: models/vae/ +# audio_encoders: models/audio_encoders/ +# model_patches: models/model_patches/ + + +#config for a1111 ui +#all you have to do is uncomment this (remove the #) and change the base_path to where yours is installed + +#a111: +# base_path: path/to/stable-diffusion-webui/ +# checkpoints: models/Stable-diffusion +# configs: models/Stable-diffusion +# vae: models/VAE +# loras: | +# models/Lora +# models/LyCORIS +# upscale_models: | +# models/ESRGAN +# models/RealESRGAN +# models/SwinIR +# embeddings: embeddings +# hypernetworks: models/hypernetworks +# controlnet: models/ControlNet + + +# For a full list of supported keys (style_models, vae_approx, hypernetworks, photomaker, +# model_patches, audio_encoders, classifiers, etc.) see folder_paths.py. #other_ui: # base_path: path/to/ui diff --git a/main.py b/main.py index c33f0e17b..e1b0f1620 100644 --- a/main.py +++ b/main.py @@ -115,6 +115,7 @@ if os.name == "nt": os.environ['MIMALLOC_PURGE_DELAY'] = '0' if __name__ == "__main__": + os.environ['TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL'] = '1' if args.default_device is not None: default_dev = args.default_device devices = list(range(32)) @@ -127,6 +128,7 @@ if __name__ == "__main__": if args.cuda_device is not None: os.environ['CUDA_VISIBLE_DEVICES'] = str(args.cuda_device) os.environ['HIP_VISIBLE_DEVICES'] = str(args.cuda_device) + os.environ["ASCEND_RT_VISIBLE_DEVICES"] = str(args.cuda_device) logging.info("Set cuda device to: {}".format(args.cuda_device)) if args.oneapi_device_selector is not None: @@ -170,10 +172,12 @@ def prompt_worker(q, server_instance): cache_type = execution.CacheType.CLASSIC if args.cache_lru > 0: cache_type = execution.CacheType.LRU + elif args.cache_ram > 0: + cache_type = execution.CacheType.RAM_PRESSURE elif args.cache_none: - cache_type = execution.CacheType.DEPENDENCY_AWARE + cache_type = execution.CacheType.NONE - e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_size=args.cache_lru) + e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_args={ "lru" : args.cache_lru, "ram" : args.cache_ram } ) last_gc_collect = 0 need_gc = False gc_collect_interval = 10.0 @@ -190,14 +194,21 @@ def prompt_worker(q, server_instance): prompt_id = item[1] server_instance.last_prompt_id = prompt_id - e.execute(item[2], prompt_id, item[3], item[4]) + sensitive = item[5] + extra_data = item[3].copy() + for k in sensitive: + extra_data[k] = sensitive[k] + + e.execute(item[2], prompt_id, extra_data, item[4]) need_gc = True + + remove_sensitive = lambda prompt: prompt[:5] + prompt[6:] q.task_done(item_id, e.history_result, status=execution.PromptQueue.ExecutionStatus( status_str='success' if e.success else 'error', completed=e.success, - messages=e.status_messages)) + messages=e.status_messages), process_item=remove_sensitive) if server_instance.client_id is not None: server_instance.send_sync("executing", {"node": None, "prompt_id": prompt_id}, server_instance.client_id) diff --git a/middleware/cache_middleware.py b/middleware/cache_middleware.py index 374ef7934..f02135369 100644 --- a/middleware/cache_middleware.py +++ b/middleware/cache_middleware.py @@ -26,11 +26,12 @@ async def cache_control( """Cache control middleware that sets appropriate cache headers based on file type and response status""" response: web.Response = await handler(request) - if ( - request.path.endswith(".js") - or request.path.endswith(".css") - or request.path.endswith("index.json") - ): + path_filename = request.path.rsplit("/", 1)[-1] + is_entry_point = path_filename.startswith("index") and path_filename.endswith( + ".json" + ) + + if request.path.endswith(".js") or request.path.endswith(".css") or is_entry_point: response.headers.setdefault("Cache-Control", "no-cache") return response diff --git a/nodes.py b/nodes.py index 1a6784b68..f6aeedc78 100644 --- a/nodes.py +++ b/nodes.py @@ -2027,7 +2027,6 @@ NODE_DISPLAY_NAME_MAPPINGS = { "DiffControlNetLoader": "Load ControlNet Model (diff)", "StyleModelLoader": "Load Style Model", "CLIPVisionLoader": "Load CLIP Vision", - "UpscaleModelLoader": "Load Upscale Model", "UNETLoader": "Load Diffusion Model", # Conditioning "CLIPVisionEncode": "CLIP Vision Encode", @@ -2065,7 +2064,6 @@ NODE_DISPLAY_NAME_MAPPINGS = { "LoadImageOutput": "Load Image (from Outputs)", "ImageScale": "Upscale Image", "ImageScaleBy": "Upscale Image By", - "ImageUpscaleWithModel": "Upscale Image (using Model)", "ImageInvert": "Invert Image", "ImagePadForOutpaint": "Pad Image for Outpainting", "ImageBatch": "Batch Images", @@ -2297,6 +2295,7 @@ async def init_builtin_extra_nodes(): "nodes_gits.py", "nodes_controlnet.py", "nodes_hunyuan.py", + "nodes_eps.py", "nodes_flux.py", "nodes_lora_extract.py", "nodes_torch_compile.py", @@ -2330,6 +2329,8 @@ async def init_builtin_extra_nodes(): "nodes_model_patch.py", "nodes_easycache.py", "nodes_audio_encoder.py", + "nodes_rope.py", + "nodes_nop.py", ] import_failed = [] @@ -2350,12 +2351,14 @@ async def init_builtin_api_nodes(): "nodes_kling.py", "nodes_bfl.py", "nodes_bytedance.py", + "nodes_ltxv.py", "nodes_luma.py", "nodes_recraft.py", "nodes_pixverse.py", "nodes_stability.py", "nodes_pika.py", "nodes_runway.py", + "nodes_sora.py", "nodes_tripo.py", "nodes_moonvalley.py", "nodes_rodin.py", diff --git a/pyproject.toml b/pyproject.toml index 7340c320b..a14b383b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ComfyUI" -version = "0.3.60" +version = "0.3.69" readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.9" @@ -22,3 +22,50 @@ lint.select = [ "F", ] exclude = ["*.ipynb", "**/generated/*.pyi"] + +[tool.pylint] +master.py-version = "3.10" +master.extension-pkg-allow-list = [ + "pydantic", +] +reports.output-format = "colorized" +similarities.ignore-imports = "yes" +messages_control.disable = [ + "missing-module-docstring", + "missing-class-docstring", + "missing-function-docstring", + "line-too-long", + "too-few-public-methods", + "too-many-public-methods", + "too-many-instance-attributes", + "too-many-positional-arguments", + "broad-exception-raised", + "too-many-lines", + "invalid-name", + "unused-argument", + "broad-exception-caught", + "consider-using-with", + "fixme", + "too-many-statements", + "too-many-branches", + "too-many-locals", + "too-many-arguments", + "too-many-return-statements", + "too-many-nested-blocks", + "duplicate-code", + "abstract-method", + "superfluous-parens", + "arguments-differ", + "redefined-builtin", + "unnecessary-lambda", + "dangerous-default-value", + "invalid-overridden-method", + # next warnings should be fixed in future + "bad-classmethod-argument", # Class method should have 'cls' as first argument + "wrong-import-order", # Standard imports should be placed before third party imports + "ungrouped-imports", + "unnecessary-pass", + "unnecessary-lambda-assignment", + "no-else-return", + "unused-variable", +] diff --git a/requirements.txt b/requirements.txt index b3f81e8fa..249c36dee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -comfyui-frontend-package==1.26.13 -comfyui-workflow-templates==0.1.88 -comfyui-embedded-docs==0.2.6 +comfyui-frontend-package==1.28.8 +comfyui-workflow-templates==0.2.11 +comfyui-embedded-docs==0.3.1 torch torchsde torchvision @@ -25,6 +25,5 @@ av>=14.2.0 #non essential dependencies: kornia>=0.7.1 spandrel -soundfile pydantic~=2.0 pydantic-settings~=2.0 diff --git a/server.py b/server.py index 80e9d3fa7..d059d3dc9 100644 --- a/server.py +++ b/server.py @@ -2,6 +2,7 @@ import os import sys import asyncio import traceback +import time import nodes import folder_paths @@ -35,6 +36,7 @@ from comfy_api.internal import _ComfyNodeInternal from app.user_manager import UserManager from app.model_manager import ModelFileManager from app.custom_node_manager import CustomNodeManager +from app.subgraph_manager import SubgraphManager from typing import Optional, Union from api_server.routes.internal.internal_routes import InternalRoutes from protocol import BinaryEventTypes @@ -48,6 +50,28 @@ async def send_socket_catch_exception(function, message): except (aiohttp.ClientError, aiohttp.ClientPayloadError, ConnectionResetError, BrokenPipeError, ConnectionError) as err: logging.warning("send error: {}".format(err)) +# Track deprecated paths that have been warned about to only warn once per file +_deprecated_paths_warned = set() + +@web.middleware +async def deprecation_warning(request: web.Request, handler): + """Middleware to warn about deprecated frontend API paths""" + path = request.path + + if path.startswith("/scripts/ui") or path.startswith("/extensions/core/"): + # Only warn once per unique file path + if path not in _deprecated_paths_warned: + _deprecated_paths_warned.add(path) + logging.warning( + f"[DEPRECATION WARNING] Detected import of deprecated legacy API: {path}. " + f"This is likely caused by a custom node extension using outdated APIs. " + f"Please update your extensions or contact the extension author for an updated version." + ) + + response: web.Response = await handler(request) + return response + + @web.middleware async def compress_body(request: web.Request, handler): accept_encoding = request.headers.get("Accept-Encoding", "") @@ -151,6 +175,7 @@ class PromptServer(): self.user_manager = UserManager() self.model_file_manager = ModelFileManager() self.custom_node_manager = CustomNodeManager() + self.subgraph_manager = SubgraphManager() self.internal_routes = InternalRoutes(self) self.supports = ["custom_nodes_from_web"] self.prompt_queue = execution.PromptQueue(self) @@ -159,7 +184,7 @@ class PromptServer(): self.client_session:Optional[aiohttp.ClientSession] = None self.number = 0 - middlewares = [cache_control] + middlewares = [cache_control, deprecation_warning] if args.enable_compress_response_body: middlewares.append(compress_body) @@ -667,8 +692,9 @@ class PromptServer(): async def get_queue(request): queue_info = {} current_queue = self.prompt_queue.get_current_queue_volatile() - queue_info['queue_running'] = current_queue[0] - queue_info['queue_pending'] = current_queue[1] + remove_sensitive = lambda queue: [x[:5] for x in queue] + queue_info['queue_running'] = remove_sensitive(current_queue[0]) + queue_info['queue_pending'] = remove_sensitive(current_queue[1]) return web.json_response(queue_info) @routes.post("/prompt") @@ -704,7 +730,12 @@ class PromptServer(): extra_data["client_id"] = json_data["client_id"] if valid[0]: outputs_to_execute = valid[2] - self.prompt_queue.put((number, prompt_id, prompt, extra_data, outputs_to_execute)) + sensitive = {} + for sensitive_val in execution.SENSITIVE_EXTRA_DATA_KEYS: + if sensitive_val in extra_data: + sensitive[sensitive_val] = extra_data.pop(sensitive_val) + extra_data["create_time"] = int(time.time() * 1000) # timestamp in milliseconds + self.prompt_queue.put((number, prompt_id, prompt, extra_data, outputs_to_execute, sensitive)) response = {"prompt_id": prompt_id, "number": number, "node_errors": valid[3]} return web.json_response(response) else: @@ -797,6 +828,7 @@ class PromptServer(): self.user_manager.add_routes(self.routes) self.model_file_manager.add_routes(self.routes) self.custom_node_manager.add_routes(self.routes, self.app, nodes.LOADED_MODULE_DIRS.items()) + self.subgraph_manager.add_routes(self.routes, nodes.LOADED_MODULE_DIRS.items()) self.app.add_subapp('/internal', self.internal_routes.get_app()) # Prefix every route with /api for easier matching for delegation. diff --git a/tests-unit/comfy_quant/test_mixed_precision.py b/tests-unit/comfy_quant/test_mixed_precision.py new file mode 100644 index 000000000..f8d1fd04e --- /dev/null +++ b/tests-unit/comfy_quant/test_mixed_precision.py @@ -0,0 +1,232 @@ +import unittest +import torch +import sys +import os + +# Add comfy to path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..")) + +def has_gpu(): + return torch.cuda.is_available() + +from comfy.cli_args import args +if not has_gpu(): + args.cpu = True + +from comfy import ops +from comfy.quant_ops import QuantizedTensor + + +class SimpleModel(torch.nn.Module): + def __init__(self, operations=ops.disable_weight_init): + super().__init__() + self.layer1 = operations.Linear(10, 20, device="cpu", dtype=torch.bfloat16) + self.layer2 = operations.Linear(20, 30, device="cpu", dtype=torch.bfloat16) + self.layer3 = operations.Linear(30, 40, device="cpu", dtype=torch.bfloat16) + + def forward(self, x): + x = self.layer1(x) + x = torch.nn.functional.relu(x) + x = self.layer2(x) + x = torch.nn.functional.relu(x) + x = self.layer3(x) + return x + + +class TestMixedPrecisionOps(unittest.TestCase): + + def test_all_layers_standard(self): + """Test that model with no quantization works normally""" + # Configure no quantization + ops.MixedPrecisionOps._layer_quant_config = {} + + # Create model + model = SimpleModel(operations=ops.MixedPrecisionOps) + + # Initialize weights manually + model.layer1.weight = torch.nn.Parameter(torch.randn(20, 10, dtype=torch.bfloat16)) + model.layer1.bias = torch.nn.Parameter(torch.randn(20, dtype=torch.bfloat16)) + model.layer2.weight = torch.nn.Parameter(torch.randn(30, 20, dtype=torch.bfloat16)) + model.layer2.bias = torch.nn.Parameter(torch.randn(30, dtype=torch.bfloat16)) + model.layer3.weight = torch.nn.Parameter(torch.randn(40, 30, dtype=torch.bfloat16)) + model.layer3.bias = torch.nn.Parameter(torch.randn(40, dtype=torch.bfloat16)) + + # Initialize weight_function and bias_function + for layer in [model.layer1, model.layer2, model.layer3]: + layer.weight_function = [] + layer.bias_function = [] + + # Forward pass + input_tensor = torch.randn(5, 10, dtype=torch.bfloat16) + output = model(input_tensor) + + self.assertEqual(output.shape, (5, 40)) + self.assertEqual(output.dtype, torch.bfloat16) + + def test_mixed_precision_load(self): + """Test loading a mixed precision model from state dict""" + # Configure mixed precision: layer1 is FP8, layer2 and layer3 are standard + layer_quant_config = { + "layer1": { + "format": "float8_e4m3fn", + "params": {} + }, + "layer3": { + "format": "float8_e4m3fn", + "params": {} + } + } + ops.MixedPrecisionOps._layer_quant_config = layer_quant_config + + # Create state dict with mixed precision + fp8_weight1 = torch.randn(20, 10, dtype=torch.float32).to(torch.float8_e4m3fn) + fp8_weight3 = torch.randn(40, 30, dtype=torch.float32).to(torch.float8_e4m3fn) + + state_dict = { + # Layer 1: FP8 E4M3FN + "layer1.weight": fp8_weight1, + "layer1.bias": torch.randn(20, dtype=torch.bfloat16), + "layer1.weight_scale": torch.tensor(2.0, dtype=torch.float32), + + # Layer 2: Standard BF16 + "layer2.weight": torch.randn(30, 20, dtype=torch.bfloat16), + "layer2.bias": torch.randn(30, dtype=torch.bfloat16), + + # Layer 3: FP8 E4M3FN + "layer3.weight": fp8_weight3, + "layer3.bias": torch.randn(40, dtype=torch.bfloat16), + "layer3.weight_scale": torch.tensor(1.5, dtype=torch.float32), + } + + # Create model and load state dict (strict=False because custom loading pops keys) + model = SimpleModel(operations=ops.MixedPrecisionOps) + model.load_state_dict(state_dict, strict=False) + + # Verify weights are wrapped in QuantizedTensor + self.assertIsInstance(model.layer1.weight, QuantizedTensor) + self.assertEqual(model.layer1.weight._layout_type, "TensorCoreFP8Layout") + + # Layer 2 should NOT be quantized + self.assertNotIsInstance(model.layer2.weight, QuantizedTensor) + + # Layer 3 should be quantized + self.assertIsInstance(model.layer3.weight, QuantizedTensor) + self.assertEqual(model.layer3.weight._layout_type, "TensorCoreFP8Layout") + + # Verify scales were loaded + self.assertEqual(model.layer1.weight._layout_params['scale'].item(), 2.0) + self.assertEqual(model.layer3.weight._layout_params['scale'].item(), 1.5) + + # Forward pass + input_tensor = torch.randn(5, 10, dtype=torch.bfloat16) + output = model(input_tensor) + + self.assertEqual(output.shape, (5, 40)) + + def test_state_dict_quantized_preserved(self): + """Test that quantized weights are preserved in state_dict()""" + # Configure mixed precision + layer_quant_config = { + "layer1": { + "format": "float8_e4m3fn", + "params": {} + } + } + ops.MixedPrecisionOps._layer_quant_config = layer_quant_config + + # Create and load model + fp8_weight = torch.randn(20, 10, dtype=torch.float32).to(torch.float8_e4m3fn) + state_dict1 = { + "layer1.weight": fp8_weight, + "layer1.bias": torch.randn(20, dtype=torch.bfloat16), + "layer1.weight_scale": torch.tensor(3.0, dtype=torch.float32), + "layer2.weight": torch.randn(30, 20, dtype=torch.bfloat16), + "layer2.bias": torch.randn(30, dtype=torch.bfloat16), + "layer3.weight": torch.randn(40, 30, dtype=torch.bfloat16), + "layer3.bias": torch.randn(40, dtype=torch.bfloat16), + } + + model = SimpleModel(operations=ops.MixedPrecisionOps) + model.load_state_dict(state_dict1, strict=False) + + # Save state dict + state_dict2 = model.state_dict() + + # Verify layer1.weight is a QuantizedTensor with scale preserved + self.assertIsInstance(state_dict2["layer1.weight"], QuantizedTensor) + self.assertEqual(state_dict2["layer1.weight"]._layout_params['scale'].item(), 3.0) + self.assertEqual(state_dict2["layer1.weight"]._layout_type, "TensorCoreFP8Layout") + + # Verify non-quantized layers are standard tensors + self.assertNotIsInstance(state_dict2["layer2.weight"], QuantizedTensor) + self.assertNotIsInstance(state_dict2["layer3.weight"], QuantizedTensor) + + def test_weight_function_compatibility(self): + """Test that weight_function (LoRA) works with quantized layers""" + # Configure FP8 quantization + layer_quant_config = { + "layer1": { + "format": "float8_e4m3fn", + "params": {} + } + } + ops.MixedPrecisionOps._layer_quant_config = layer_quant_config + + # Create and load model + fp8_weight = torch.randn(20, 10, dtype=torch.float32).to(torch.float8_e4m3fn) + state_dict = { + "layer1.weight": fp8_weight, + "layer1.bias": torch.randn(20, dtype=torch.bfloat16), + "layer1.weight_scale": torch.tensor(2.0, dtype=torch.float32), + "layer2.weight": torch.randn(30, 20, dtype=torch.bfloat16), + "layer2.bias": torch.randn(30, dtype=torch.bfloat16), + "layer3.weight": torch.randn(40, 30, dtype=torch.bfloat16), + "layer3.bias": torch.randn(40, dtype=torch.bfloat16), + } + + model = SimpleModel(operations=ops.MixedPrecisionOps) + model.load_state_dict(state_dict, strict=False) + + # Add a weight function (simulating LoRA) + # This should trigger dequantization during forward pass + def apply_lora(weight): + lora_delta = torch.randn_like(weight) * 0.01 + return weight + lora_delta + + model.layer1.weight_function.append(apply_lora) + + # Forward pass should work with LoRA (triggers weight_function path) + input_tensor = torch.randn(5, 10, dtype=torch.bfloat16) + output = model(input_tensor) + + self.assertEqual(output.shape, (5, 40)) + + def test_error_handling_unknown_format(self): + """Test that unknown formats raise error""" + # Configure with unknown format + layer_quant_config = { + "layer1": { + "format": "unknown_format_xyz", + "params": {} + } + } + ops.MixedPrecisionOps._layer_quant_config = layer_quant_config + + # Create state dict + state_dict = { + "layer1.weight": torch.randn(20, 10, dtype=torch.bfloat16), + "layer1.bias": torch.randn(20, dtype=torch.bfloat16), + "layer2.weight": torch.randn(30, 20, dtype=torch.bfloat16), + "layer2.bias": torch.randn(30, dtype=torch.bfloat16), + "layer3.weight": torch.randn(40, 30, dtype=torch.bfloat16), + "layer3.bias": torch.randn(40, dtype=torch.bfloat16), + } + + # Load should raise KeyError for unknown format in QUANT_FORMAT_MIXINS + model = SimpleModel(operations=ops.MixedPrecisionOps) + with self.assertRaises(KeyError): + model.load_state_dict(state_dict, strict=False) + +if __name__ == "__main__": + unittest.main() + diff --git a/tests-unit/comfy_quant/test_quant_registry.py b/tests-unit/comfy_quant/test_quant_registry.py new file mode 100644 index 000000000..9cb54ede8 --- /dev/null +++ b/tests-unit/comfy_quant/test_quant_registry.py @@ -0,0 +1,190 @@ +import unittest +import torch +import sys +import os + +# Add comfy to path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..")) + +def has_gpu(): + return torch.cuda.is_available() + +from comfy.cli_args import args +if not has_gpu(): + args.cpu = True + +from comfy.quant_ops import QuantizedTensor, TensorCoreFP8Layout + + +class TestQuantizedTensor(unittest.TestCase): + """Test the QuantizedTensor subclass with FP8 layout""" + + def test_creation(self): + """Test creating a QuantizedTensor with TensorCoreFP8Layout""" + fp8_data = torch.randn(256, 128, dtype=torch.float32).to(torch.float8_e4m3fn) + scale = torch.tensor(2.0) + layout_params = {'scale': scale, 'orig_dtype': torch.bfloat16} + + qt = QuantizedTensor(fp8_data, "TensorCoreFP8Layout", layout_params) + + self.assertIsInstance(qt, QuantizedTensor) + self.assertEqual(qt.shape, (256, 128)) + self.assertEqual(qt.dtype, torch.float8_e4m3fn) + self.assertEqual(qt._layout_params['scale'], scale) + self.assertEqual(qt._layout_params['orig_dtype'], torch.bfloat16) + self.assertEqual(qt._layout_type, "TensorCoreFP8Layout") + + def test_dequantize(self): + """Test explicit dequantization""" + + fp8_data = torch.ones(10, 20, dtype=torch.float32).to(torch.float8_e4m3fn) + scale = torch.tensor(3.0) + layout_params = {'scale': scale, 'orig_dtype': torch.float32} + + qt = QuantizedTensor(fp8_data, "TensorCoreFP8Layout", layout_params) + dequantized = qt.dequantize() + + self.assertEqual(dequantized.dtype, torch.float32) + self.assertTrue(torch.allclose(dequantized, torch.ones(10, 20) * 3.0, rtol=0.1)) + + def test_from_float(self): + """Test creating QuantizedTensor from float tensor""" + float_tensor = torch.randn(64, 32, dtype=torch.float32) + scale = torch.tensor(1.5) + + qt = QuantizedTensor.from_float( + float_tensor, + "TensorCoreFP8Layout", + scale=scale, + dtype=torch.float8_e4m3fn + ) + + self.assertIsInstance(qt, QuantizedTensor) + self.assertEqual(qt.dtype, torch.float8_e4m3fn) + self.assertEqual(qt.shape, (64, 32)) + + # Verify dequantization gives approximately original values + dequantized = qt.dequantize() + mean_rel_error = ((dequantized - float_tensor).abs() / (float_tensor.abs() + 1e-6)).mean() + self.assertLess(mean_rel_error, 0.1) + + +class TestGenericUtilities(unittest.TestCase): + """Test generic utility operations""" + + def test_detach(self): + """Test detach operation on quantized tensor""" + fp8_data = torch.randn(10, 20, dtype=torch.float32).to(torch.float8_e4m3fn) + scale = torch.tensor(1.5) + layout_params = {'scale': scale, 'orig_dtype': torch.float32} + qt = QuantizedTensor(fp8_data, "TensorCoreFP8Layout", layout_params) + + # Detach should return a new QuantizedTensor + qt_detached = qt.detach() + + self.assertIsInstance(qt_detached, QuantizedTensor) + self.assertEqual(qt_detached.shape, qt.shape) + self.assertEqual(qt_detached._layout_type, "TensorCoreFP8Layout") + + def test_clone(self): + """Test clone operation on quantized tensor""" + fp8_data = torch.randn(10, 20, dtype=torch.float32).to(torch.float8_e4m3fn) + scale = torch.tensor(1.5) + layout_params = {'scale': scale, 'orig_dtype': torch.float32} + qt = QuantizedTensor(fp8_data, "TensorCoreFP8Layout", layout_params) + + # Clone should return a new QuantizedTensor + qt_cloned = qt.clone() + + self.assertIsInstance(qt_cloned, QuantizedTensor) + self.assertEqual(qt_cloned.shape, qt.shape) + self.assertEqual(qt_cloned._layout_type, "TensorCoreFP8Layout") + + # Verify it's a deep copy + self.assertIsNot(qt_cloned._qdata, qt._qdata) + + @unittest.skipUnless(has_gpu(), "GPU not available") + def test_to_device(self): + """Test device transfer""" + fp8_data = torch.randn(10, 20, dtype=torch.float32).to(torch.float8_e4m3fn) + scale = torch.tensor(1.5) + layout_params = {'scale': scale, 'orig_dtype': torch.float32} + qt = QuantizedTensor(fp8_data, "TensorCoreFP8Layout", layout_params) + + # Moving to same device should work (CPU to CPU) + qt_cpu = qt.to('cpu') + + self.assertIsInstance(qt_cpu, QuantizedTensor) + self.assertEqual(qt_cpu.device.type, 'cpu') + self.assertEqual(qt_cpu._layout_params['scale'].device.type, 'cpu') + + +class TestTensorCoreFP8Layout(unittest.TestCase): + """Test the TensorCoreFP8Layout implementation""" + + def test_quantize(self): + """Test quantization method""" + float_tensor = torch.randn(32, 64, dtype=torch.float32) + scale = torch.tensor(1.5) + + qdata, layout_params = TensorCoreFP8Layout.quantize( + float_tensor, + scale=scale, + dtype=torch.float8_e4m3fn + ) + + self.assertEqual(qdata.dtype, torch.float8_e4m3fn) + self.assertEqual(qdata.shape, float_tensor.shape) + self.assertIn('scale', layout_params) + self.assertIn('orig_dtype', layout_params) + self.assertEqual(layout_params['orig_dtype'], torch.float32) + + def test_dequantize(self): + """Test dequantization method""" + float_tensor = torch.ones(10, 20, dtype=torch.float32) * 3.0 + scale = torch.tensor(1.0) + + qdata, layout_params = TensorCoreFP8Layout.quantize( + float_tensor, + scale=scale, + dtype=torch.float8_e4m3fn + ) + + dequantized = TensorCoreFP8Layout.dequantize(qdata, **layout_params) + + # Should approximately match original + self.assertTrue(torch.allclose(dequantized, float_tensor, rtol=0.1, atol=0.1)) + + +class TestFallbackMechanism(unittest.TestCase): + """Test fallback for unsupported operations""" + + def test_unsupported_op_dequantizes(self): + """Test that unsupported operations fall back to dequantization""" + # Set seed for reproducibility + torch.manual_seed(42) + + # Create quantized tensor + a_fp32 = torch.randn(10, 20, dtype=torch.float32) + scale = torch.tensor(1.0) + a_q = QuantizedTensor.from_float( + a_fp32, + "TensorCoreFP8Layout", + scale=scale, + dtype=torch.float8_e4m3fn + ) + + # Call an operation that doesn't have a registered handler + # For example, torch.abs + result = torch.abs(a_q) + + # Should work via fallback (dequantize → abs → return) + self.assertNotIsInstance(result, QuantizedTensor) + expected = torch.abs(a_fp32) + # FP8 introduces quantization error, so use loose tolerance + mean_error = (result - expected).abs().mean() + self.assertLess(mean_error, 0.05, f"Mean error {mean_error:.4f} is too large") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests-unit/server_test/test_cache_control.py b/tests-unit/server_test/test_cache_control.py index 8de59125a..fa68d9408 100644 --- a/tests-unit/server_test/test_cache_control.py +++ b/tests-unit/server_test/test_cache_control.py @@ -48,6 +48,13 @@ CACHE_SCENARIOS = [ "expected_cache": "no-cache", "should_have_header": True, }, + { + "name": "localized_index_json_no_cache", + "path": "/templates/index.zh.json", + "status": 200, + "expected_cache": "no-cache", + "should_have_header": True, + }, # Non-matching files { "name": "html_no_header", diff --git a/tests/execution/test_execution.py b/tests/execution/test_execution.py index ef73ad9fd..ace0d2279 100644 --- a/tests/execution/test_execution.py +++ b/tests/execution/test_execution.py @@ -152,12 +152,12 @@ class TestExecution: # Initialize server and client # @fixture(scope="class", autouse=True, params=[ - # (use_lru, lru_size) - (False, 0), - (True, 0), - (True, 100), + { "extra_args" : [], "should_cache_results" : True }, + { "extra_args" : ["--cache-lru", 0], "should_cache_results" : True }, + { "extra_args" : ["--cache-lru", 100], "should_cache_results" : True }, + { "extra_args" : ["--cache-none"], "should_cache_results" : False }, ]) - def _server(self, args_pytest, request): + def server(self, args_pytest, request): # Start server pargs = [ 'python','main.py', @@ -167,12 +167,10 @@ class TestExecution: '--extra-model-paths-config', 'tests/execution/extra_model_paths.yaml', '--cpu', ] - use_lru, lru_size = request.param - if use_lru: - pargs += ['--cache-lru', str(lru_size)] + pargs += [ str(param) for param in request.param["extra_args"] ] print("Running server with args:", pargs) # noqa: T201 p = subprocess.Popen(pargs) - yield + yield request.param p.kill() torch.cuda.empty_cache() @@ -193,7 +191,7 @@ class TestExecution: return comfy_client @fixture(scope="class", autouse=True) - def shared_client(self, args_pytest, _server): + def shared_client(self, args_pytest, server): client = self.start_client(args_pytest["listen"], args_pytest["port"]) yield client del client @@ -225,7 +223,7 @@ class TestExecution: assert result.did_run(mask) assert result.did_run(lazy_mix) - def test_full_cache(self, client: ComfyClient, builder: GraphBuilder): + def test_full_cache(self, client: ComfyClient, builder: GraphBuilder, server): g = builder input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) input2 = g.node("StubImage", content="NOISE", height=512, width=512, batch_size=1) @@ -237,9 +235,12 @@ class TestExecution: client.run(g) result2 = client.run(g) for node_id, node in g.nodes.items(): - assert not result2.did_run(node), f"Node {node_id} ran, but should have been cached" + if server["should_cache_results"]: + assert not result2.did_run(node), f"Node {node_id} ran, but should have been cached" + else: + assert result2.did_run(node), f"Node {node_id} was cached, but should have been run" - def test_partial_cache(self, client: ComfyClient, builder: GraphBuilder): + def test_partial_cache(self, client: ComfyClient, builder: GraphBuilder, server): g = builder input1 = g.node("StubImage", content="BLACK", height=512, width=512, batch_size=1) input2 = g.node("StubImage", content="NOISE", height=512, width=512, batch_size=1) @@ -251,8 +252,12 @@ class TestExecution: client.run(g) mask.inputs['value'] = 0.4 result2 = client.run(g) - assert not result2.did_run(input1), "Input1 should have been cached" - assert not result2.did_run(input2), "Input2 should have been cached" + if server["should_cache_results"]: + assert not result2.did_run(input1), "Input1 should have been cached" + assert not result2.did_run(input2), "Input2 should have been cached" + else: + assert result2.did_run(input1), "Input1 should have been rerun" + assert result2.did_run(input2), "Input2 should have been rerun" def test_error(self, client: ComfyClient, builder: GraphBuilder): g = builder @@ -411,7 +416,7 @@ class TestExecution: input2 = g.node("StubImage", id="removeme", content="WHITE", height=512, width=512, batch_size=1) client.run(g) - def test_custom_is_changed(self, client: ComfyClient, builder: GraphBuilder): + def test_custom_is_changed(self, client: ComfyClient, builder: GraphBuilder, server): g = builder # Creating the nodes in this specific order previously caused a bug save = g.node("SaveImage") @@ -427,7 +432,10 @@ class TestExecution: result3 = client.run(g) result4 = client.run(g) assert result1.did_run(is_changed), "is_changed should have been run" - assert not result2.did_run(is_changed), "is_changed should have been cached" + if server["should_cache_results"]: + assert not result2.did_run(is_changed), "is_changed should have been cached" + else: + assert result2.did_run(is_changed), "is_changed should have been re-run" assert result3.did_run(is_changed), "is_changed should have been re-run" assert result4.did_run(is_changed), "is_changed should not have been cached" @@ -514,7 +522,7 @@ class TestExecution: assert len(images2) == 1, "Should have 1 image" # This tests that only constant outputs are used in the call to `IS_CHANGED` - def test_is_changed_with_outputs(self, client: ComfyClient, builder: GraphBuilder): + def test_is_changed_with_outputs(self, client: ComfyClient, builder: GraphBuilder, server): g = builder input1 = g.node("StubConstantImage", value=0.5, height=512, width=512, batch_size=1) test_node = g.node("TestIsChangedWithConstants", image=input1.out(0), value=0.5) @@ -530,7 +538,11 @@ class TestExecution: images = result.get_images(output) assert len(images) == 1, "Should have 1 image" assert numpy.array(images[0]).min() == 63 and numpy.array(images[0]).max() == 63, "Image should have value 0.25" - assert not result.did_run(test_node), "The execution should have been cached" + if server["should_cache_results"]: + assert not result.did_run(test_node), "The execution should have been cached" + else: + assert result.did_run(test_node), "The execution should have been re-run" + def test_parallel_sleep_nodes(self, client: ComfyClient, builder: GraphBuilder, skip_timing_checks): # Warmup execution to ensure server is fully initialized