From ac85d7887f23372b3ed4fb3f2234546685af0e0b Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Fri, 24 Apr 2026 16:24:28 -0700 Subject: [PATCH 01/27] Add benchmark for model serving --- benchmarks/benchmarking_model_serving.py | 444 +++++++++++++++++++++++ 1 file changed, 444 insertions(+) create mode 100644 benchmarks/benchmarking_model_serving.py diff --git a/benchmarks/benchmarking_model_serving.py b/benchmarks/benchmarking_model_serving.py new file mode 100644 index 000000000..493d0a574 --- /dev/null +++ b/benchmarks/benchmarking_model_serving.py @@ -0,0 +1,444 @@ +from __future__ import annotations + +import argparse +import asyncio +import dataclasses +import json +import math +import os +import random +import statistics +import sys +import time +from pathlib import Path +from typing import Any, Dict, List, Optional + +import torch + +# Make the repo root importable when running directly from the benchmarks/ dir. +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import comfy.model_management +import comfy.sd + + +# ----------------------------- +# Data models +# ----------------------------- + +@dataclasses.dataclass +class RequestSpec: + profile_name: str + batch_size: int + width: int + height: int + num_frames: int + steps: int + cfg_scale: float + seed: int + timeout_s: float = 180.0 + extra: Dict[str, Any] = dataclasses.field(default_factory=dict) + + +@dataclasses.dataclass +class RequestResult: + request_id: int + profile_name: str + ok: bool + error: Optional[str] + latency_ms: float + queue_wait_ms: float + step_latencies_ms: List[float] + ttfs_ms: float # time to first (denoising) step + peak_vram_mb: float + est_mem_mb: Optional[float] + started_at: float + ended_at: float + + +@dataclasses.dataclass +class RunSummary: + total_requests: int + success: int + failed: int + throughput_req_s: float + p50_ms: float + p90_ms: float + p95_ms: float + p99_ms: float + mean_ms: float + ttfs_p50_ms: float + ttfs_p99_ms: float + step_mean_ms: float + step_p99_ms: float + max_vram_mb: float + + +# ----------------------------- +# Helpers +# ----------------------------- + +def percentile(values: List[float], p: float) -> float: + if not values: + return float("nan") + values = sorted(values) + k = (len(values) - 1) * (p / 100.0) + f = math.floor(k) + c = math.ceil(k) + if f == c: + return values[int(k)] + return values[f] * (c - k) + values[c] * (k - f) + + +def now() -> float: + return time.perf_counter() + + +def gpu_peak_mb() -> float: + if not torch.cuda.is_available(): + return 0.0 + return torch.cuda.max_memory_allocated() / (1024 ** 2) + + +def reset_gpu_peak() -> None: + if torch.cuda.is_available(): + torch.cuda.reset_peak_memory_stats() + + +def sync_cuda() -> None: + if torch.cuda.is_available(): + torch.cuda.synchronize() + + +def build_request_stream( + num_requests: int, + base_seed: int, + profiles: List[RequestSpec], + weighted: Optional[List[float]] = None, +) -> List[RequestSpec]: + rnd = random.Random(base_seed) + out: List[RequestSpec] = [] + for i in range(num_requests): + p = rnd.choices(profiles, weights=weighted, k=1)[0] + out.append(dataclasses.replace(p, seed=base_seed + i)) + return out + + +# ----------------------------- +# Model adapter +# ----------------------------- + +class WanRunner: + """ + Thin adapter around ComfyUI model loading + the BaseModel.apply_model call path. 
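+
+    A minimal usage sketch (hypothetical checkpoint path and profile values,
+    shown only to make the call pattern concrete):
+
+        runner = WanRunner("wan2.1_t2v_14B.safetensors", device="cuda", dtype_str="fp16")
+        req = RequestSpec("demo", batch_size=1, width=1280, height=720,
+                          num_frames=16, steps=30, cfg_scale=6.0, seed=0)
+        result = runner.run_one(req)
+        print(result.latency_ms, result.peak_vram_mb)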
+ + Only the DiT denoiser is timed — no VAE encode/decode, no CLIP, no scheduler + overhead — so measurements reflect true model inference cost. + + Latent shape convention (WAN): [B, 16, T, H//8, W//8] + Text conditioning shape (UMT5): [B, text_seq_len, text_dim] (zeros for benchmarking) + Sigma schedule (flow-matching): linspace(1.0 → 1/steps, steps) + """ + + def __init__( + self, + checkpoint: str, + device: str, + dtype_str: str, + text_seq_len: int = 512, + text_dim: int = 4096, + ): + self.checkpoint = checkpoint + self.device_str = device + self.dtype_str = dtype_str + self.text_seq_len = text_seq_len + self.text_dim = text_dim + self.patcher, self.model = self._load_model() + + # ------------------------------------------------------------------ + # Internals + # ------------------------------------------------------------------ + + def _load_model(self): + dtype_map = { + "fp16": torch.float16, + "bf16": torch.bfloat16, + "fp32": torch.float32, + } + dtype = dtype_map.get(self.dtype_str) + model_opts = {"dtype": dtype} if dtype is not None else {} + + patcher = comfy.sd.load_diffusion_model(self.checkpoint, model_options=model_opts) + # force_full_load=True keeps the whole model resident on GPU rather than + # streaming weights on demand (important for latency benchmarking). + comfy.model_management.load_models_gpu([patcher], force_full_load=True) + return patcher, patcher.model + + def _estimate_mem_mb(self, latent_shape: tuple, text_seq_len: int) -> Optional[float]: + cond_shapes = { + "c_crossattn": [(latent_shape[0], text_seq_len, self.text_dim)], + } + try: + return self.model.memory_required(latent_shape, cond_shapes) / (1024 ** 2) + except Exception: + return None + + # ------------------------------------------------------------------ + # Single-request execution + # ------------------------------------------------------------------ + + @torch.inference_mode() + def run_one(self, req: RequestSpec) -> RequestResult: + start = now() + reset_gpu_peak() + + step_latencies: List[float] = [] + ttfs_ms = float("nan") + est_mem_mb: Optional[float] = None + ok = True + err = None + + try: + device = comfy.model_management.get_torch_device() + dtype = self.model.get_dtype_inference() + + # Latent noise tensor: [B, 16 channels, T frames, H/8, W/8] + latent_shape = ( + req.batch_size, 16, + req.num_frames, + req.height // 8, + req.width // 8, + ) + x = torch.randn(latent_shape, dtype=dtype, device=device) + est_mem_mb = self._estimate_mem_mb(latent_shape, self.text_seq_len) + + # Fake text conditioning — zeros have the right shape, non-zero + # values are not needed for throughput/latency benchmarking. 
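+            # (Dense attention/MLP cost depends on tensor shapes, not values,
+            # so zero-valued embeddings time the same as real ones on GPU.)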
+ cross_attn = torch.zeros( + req.batch_size, self.text_seq_len, self.text_dim, + dtype=dtype, device=device, + ) + + # Linear sigma schedule: 1.0 → 1/steps (flow-matching, noise→clean) + sigmas = torch.linspace(1.0, 1.0 / req.steps, req.steps, device=device) + + for step_i, sigma_val in enumerate(sigmas): + sigma_t = sigma_val.expand(req.batch_size) + t0 = now() + x = self.model.apply_model(x, sigma_t, c_crossattn=cross_attn) + sync_cuda() + elapsed_ms = (now() - t0) * 1000.0 + step_latencies.append(elapsed_ms) + if step_i == 0: + ttfs_ms = elapsed_ms + + except Exception as e: + ok = False + err = repr(e) + + end = now() + return RequestResult( + request_id=-1, + profile_name=req.profile_name, + ok=ok, + error=err, + latency_ms=(end - start) * 1000.0, + queue_wait_ms=0.0, # filled in by the scheduler + step_latencies_ms=step_latencies, + ttfs_ms=ttfs_ms, + peak_vram_mb=gpu_peak_mb(), + est_mem_mb=est_mem_mb, + started_at=start, + ended_at=end, + ) + + +# ----------------------------- +# Serving-style scheduler +# ----------------------------- + +async def run_closed_loop( + runner: WanRunner, + requests: List[RequestSpec], + concurrency: int, + request_rate: float = float("inf"), +) -> List[RequestResult]: + """ + Closed-loop scheduler (default) or Poisson open-loop when request_rate is finite. + + Each request is dispatched to a thread so the asyncio event loop stays + free to issue the next request while the GPU is busy. + """ + sem = asyncio.Semaphore(concurrency) + results: List[Optional[RequestResult]] = [None] * len(requests) + + async def worker(i: int, req: RequestSpec) -> None: + async with sem: + t_enq = now() + res = await asyncio.to_thread(runner.run_one, req) + res.request_id = i + res.queue_wait_ms = max(0.0, (res.started_at - t_enq) * 1000.0) + results[i] = res + + if request_rate == float("inf") or request_rate <= 0: + await asyncio.gather(*(worker(i, r) for i, r in enumerate(requests))) + else: + tasks: List[asyncio.Task] = [] + for i, req in enumerate(requests): + if i > 0: + await asyncio.sleep(random.expovariate(request_rate)) + tasks.append(asyncio.create_task(worker(i, req))) + await asyncio.gather(*tasks) + + return [r for r in results if r is not None] + + +def summarize(results: List[RequestResult], wall_s: float) -> RunSummary: + lat = [r.latency_ms for r in results if r.ok] + ttfs = [r.ttfs_ms for r in results if r.ok and math.isfinite(r.ttfs_ms)] + all_steps = [s for r in results if r.ok for s in r.step_latencies_ms] + succ = sum(1 for r in results if r.ok) + fail = len(results) - succ + return RunSummary( + total_requests=len(results), + success=succ, + failed=fail, + throughput_req_s=(succ / wall_s) if wall_s > 0 else 0.0, + p50_ms=percentile(lat, 50), + p90_ms=percentile(lat, 90), + p95_ms=percentile(lat, 95), + p99_ms=percentile(lat, 99), + mean_ms=(statistics.mean(lat) if lat else float("nan")), + ttfs_p50_ms=percentile(ttfs, 50), + ttfs_p99_ms=percentile(ttfs, 99), + step_mean_ms=(statistics.mean(all_steps) if all_steps else float("nan")), + step_p99_ms=percentile(all_steps, 99), + max_vram_mb=max((r.peak_vram_mb for r in results), default=0.0), + ) + + +def print_summary( + args: argparse.Namespace, + summ: RunSummary, + total_requests: int, + wall_s: float, +) -> None: + w = 60 + sep = "-" * w + print("\n" + "=" * w) + print("{s:^{n}}".format(s=" WAN Benchmark Result ", n=w)) + print("=" * w) + print("{:<40} {:<}".format("Checkpoint:", Path(args.checkpoint).name)) + print("{:<40} {:<}".format("Device / dtype:", f"{args.device}/{args.dtype}")) + 
print("{:<40} {:<}".format("Concurrency:", args.concurrency)) + rate_str = f"{args.request_rate:.1f} req/s" if args.request_rate != float("inf") else "inf (closed-loop)" + print("{:<40} {:<}".format("Request rate:", rate_str)) + print(sep) + print("{:<40} {:<.2f}".format("Benchmark duration (s):", wall_s)) + print("{:<40} {}/{}".format("Successful requests:", summ.success, total_requests)) + if summ.failed: + print("{:<40} {:<}".format("Failed requests:", summ.failed)) + print(sep) + print("{:<40} {:<.3f}".format("Throughput (req/s):", summ.throughput_req_s)) + print("{:<40} {:<.1f}".format("Latency mean (ms):", summ.mean_ms)) + print("{:<40} {:<.1f}".format("Latency p50 (ms):", summ.p50_ms)) + print("{:<40} {:<.1f}".format("Latency p90 (ms):", summ.p90_ms)) + print("{:<40} {:<.1f}".format("Latency p95 (ms):", summ.p95_ms)) + print("{:<40} {:<.1f}".format("Latency p99 (ms):", summ.p99_ms)) + print(sep) + print("{:<40} {:<.1f}".format("TTFS p50 (ms):", summ.ttfs_p50_ms)) + print("{:<40} {:<.1f}".format("TTFS p99 (ms):", summ.ttfs_p99_ms)) + print("{:<40} {:<.1f}".format("Step latency mean (ms):", summ.step_mean_ms)) + print("{:<40} {:<.1f}".format("Step latency p99 (ms):", summ.step_p99_ms)) + print(sep) + print("{:<40} {:<.1f}".format("Peak VRAM (MB):", summ.max_vram_mb)) + print("=" * w) + + +# ----------------------------- +# CLI +# ----------------------------- + +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser( + description="Benchmark ComfyUI WAN diffusion model denoising throughput and latency." + ) + p.add_argument( + "--checkpoint", required=True, + help="Path to the WAN diffusion-model checkpoint (.safetensors / .pt).", + ) + p.add_argument("--device", default="cuda") + p.add_argument("--dtype", default="fp16", choices=["fp16", "bf16", "fp32"]) + p.add_argument("--num-requests", type=int, default=100) + p.add_argument("--concurrency", type=int, default=4, + help="Max number of in-flight requests (semaphore width).") + p.add_argument( + "--request-rate", type=float, default=float("inf"), + help="Poisson arrival rate in req/s. 
inf = closed-loop (default).", + ) + p.add_argument("--warmup-requests", type=int, default=2, + help="Warmup iterations excluded from metrics.") + p.add_argument("--seed", type=int, default=1234) + p.add_argument("--text-seq-len", type=int, default=512, + help="Cross-attention sequence length (UMT5 default: 512).") + p.add_argument("--text-dim", type=int, default=4096, + help="Text embedding width (UMT5-XXL: 4096).") + p.add_argument("--out-dir", type=Path, default=Path("benchmarks/out")) + p.add_argument("--output-file", type=Path, default=None, + help="Override path for the summary JSON output.") + return p.parse_args() + + +def default_profiles() -> List[RequestSpec]: + return [ + RequestSpec("wan21_t2v_720p_16f_30s", 1, 1280, 720, 16, 30, 6.0, 0), + RequestSpec("wan21_t2v_720p_32f_30s", 1, 1280, 720, 32, 30, 6.0, 0), + RequestSpec("wan21_t2v_480p_32f_20s", 1, 854, 480, 32, 20, 6.0, 0), + ] + + +async def main_async() -> None: + args = parse_args() + args.out_dir.mkdir(parents=True, exist_ok=True) + + runner = WanRunner( + checkpoint=args.checkpoint, + device=args.device, + dtype_str=args.dtype, + text_seq_len=args.text_seq_len, + text_dim=args.text_dim, + ) + + all_reqs = build_request_stream( + args.num_requests + args.warmup_requests, + args.seed, + default_profiles(), + ) + warmup_reqs = all_reqs[: args.warmup_requests] + bench_reqs = all_reqs[args.warmup_requests :] + + if warmup_reqs: + print(f"Running {len(warmup_reqs)} warmup request(s)...") + for req in warmup_reqs: + runner.run_one(req) + print("Warmup complete.") + + print(f"Benchmarking {len(bench_reqs)} requests (concurrency={args.concurrency})...") + t0 = now() + results = await run_closed_loop(runner, bench_reqs, args.concurrency, args.request_rate) + wall_s = now() - t0 + + summ = summarize(results, wall_s) + print_summary(args, summ, len(bench_reqs), wall_s) + + out_file = args.output_file or (args.out_dir / "summary.json") + with open(args.out_dir / "requests.jsonl", "w") as f: + for r in results: + f.write(json.dumps(dataclasses.asdict(r)) + "\n") + with open(out_file, "w") as f: + json.dump(dataclasses.asdict(summ), f, indent=2) + print(f"\nResults written to {args.out_dir}/") + + +if __name__ == "__main__": + asyncio.run(main_async()) From 96363fa74a48612ee855611eeb656be650885daa Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Sun, 26 Apr 2026 16:48:58 -0700 Subject: [PATCH 02/27] Revert "Add benchmark for model serving" This reverts commit ac85d7887f23372b3ed4fb3f2234546685af0e0b. --- benchmarks/benchmarking_model_serving.py | 444 ----------------------- 1 file changed, 444 deletions(-) delete mode 100644 benchmarks/benchmarking_model_serving.py diff --git a/benchmarks/benchmarking_model_serving.py b/benchmarks/benchmarking_model_serving.py deleted file mode 100644 index 493d0a574..000000000 --- a/benchmarks/benchmarking_model_serving.py +++ /dev/null @@ -1,444 +0,0 @@ -from __future__ import annotations - -import argparse -import asyncio -import dataclasses -import json -import math -import os -import random -import statistics -import sys -import time -from pathlib import Path -from typing import Any, Dict, List, Optional - -import torch - -# Make the repo root importable when running directly from the benchmarks/ dir. 
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -import comfy.model_management -import comfy.sd - - -# ----------------------------- -# Data models -# ----------------------------- - -@dataclasses.dataclass -class RequestSpec: - profile_name: str - batch_size: int - width: int - height: int - num_frames: int - steps: int - cfg_scale: float - seed: int - timeout_s: float = 180.0 - extra: Dict[str, Any] = dataclasses.field(default_factory=dict) - - -@dataclasses.dataclass -class RequestResult: - request_id: int - profile_name: str - ok: bool - error: Optional[str] - latency_ms: float - queue_wait_ms: float - step_latencies_ms: List[float] - ttfs_ms: float # time to first (denoising) step - peak_vram_mb: float - est_mem_mb: Optional[float] - started_at: float - ended_at: float - - -@dataclasses.dataclass -class RunSummary: - total_requests: int - success: int - failed: int - throughput_req_s: float - p50_ms: float - p90_ms: float - p95_ms: float - p99_ms: float - mean_ms: float - ttfs_p50_ms: float - ttfs_p99_ms: float - step_mean_ms: float - step_p99_ms: float - max_vram_mb: float - - -# ----------------------------- -# Helpers -# ----------------------------- - -def percentile(values: List[float], p: float) -> float: - if not values: - return float("nan") - values = sorted(values) - k = (len(values) - 1) * (p / 100.0) - f = math.floor(k) - c = math.ceil(k) - if f == c: - return values[int(k)] - return values[f] * (c - k) + values[c] * (k - f) - - -def now() -> float: - return time.perf_counter() - - -def gpu_peak_mb() -> float: - if not torch.cuda.is_available(): - return 0.0 - return torch.cuda.max_memory_allocated() / (1024 ** 2) - - -def reset_gpu_peak() -> None: - if torch.cuda.is_available(): - torch.cuda.reset_peak_memory_stats() - - -def sync_cuda() -> None: - if torch.cuda.is_available(): - torch.cuda.synchronize() - - -def build_request_stream( - num_requests: int, - base_seed: int, - profiles: List[RequestSpec], - weighted: Optional[List[float]] = None, -) -> List[RequestSpec]: - rnd = random.Random(base_seed) - out: List[RequestSpec] = [] - for i in range(num_requests): - p = rnd.choices(profiles, weights=weighted, k=1)[0] - out.append(dataclasses.replace(p, seed=base_seed + i)) - return out - - -# ----------------------------- -# Model adapter -# ----------------------------- - -class WanRunner: - """ - Thin adapter around ComfyUI model loading + the BaseModel.apply_model call path. - - Only the DiT denoiser is timed — no VAE encode/decode, no CLIP, no scheduler - overhead — so measurements reflect true model inference cost. 
- - Latent shape convention (WAN): [B, 16, T, H//8, W//8] - Text conditioning shape (UMT5): [B, text_seq_len, text_dim] (zeros for benchmarking) - Sigma schedule (flow-matching): linspace(1.0 → 1/steps, steps) - """ - - def __init__( - self, - checkpoint: str, - device: str, - dtype_str: str, - text_seq_len: int = 512, - text_dim: int = 4096, - ): - self.checkpoint = checkpoint - self.device_str = device - self.dtype_str = dtype_str - self.text_seq_len = text_seq_len - self.text_dim = text_dim - self.patcher, self.model = self._load_model() - - # ------------------------------------------------------------------ - # Internals - # ------------------------------------------------------------------ - - def _load_model(self): - dtype_map = { - "fp16": torch.float16, - "bf16": torch.bfloat16, - "fp32": torch.float32, - } - dtype = dtype_map.get(self.dtype_str) - model_opts = {"dtype": dtype} if dtype is not None else {} - - patcher = comfy.sd.load_diffusion_model(self.checkpoint, model_options=model_opts) - # force_full_load=True keeps the whole model resident on GPU rather than - # streaming weights on demand (important for latency benchmarking). - comfy.model_management.load_models_gpu([patcher], force_full_load=True) - return patcher, patcher.model - - def _estimate_mem_mb(self, latent_shape: tuple, text_seq_len: int) -> Optional[float]: - cond_shapes = { - "c_crossattn": [(latent_shape[0], text_seq_len, self.text_dim)], - } - try: - return self.model.memory_required(latent_shape, cond_shapes) / (1024 ** 2) - except Exception: - return None - - # ------------------------------------------------------------------ - # Single-request execution - # ------------------------------------------------------------------ - - @torch.inference_mode() - def run_one(self, req: RequestSpec) -> RequestResult: - start = now() - reset_gpu_peak() - - step_latencies: List[float] = [] - ttfs_ms = float("nan") - est_mem_mb: Optional[float] = None - ok = True - err = None - - try: - device = comfy.model_management.get_torch_device() - dtype = self.model.get_dtype_inference() - - # Latent noise tensor: [B, 16 channels, T frames, H/8, W/8] - latent_shape = ( - req.batch_size, 16, - req.num_frames, - req.height // 8, - req.width // 8, - ) - x = torch.randn(latent_shape, dtype=dtype, device=device) - est_mem_mb = self._estimate_mem_mb(latent_shape, self.text_seq_len) - - # Fake text conditioning — zeros have the right shape, non-zero - # values are not needed for throughput/latency benchmarking. 
- cross_attn = torch.zeros( - req.batch_size, self.text_seq_len, self.text_dim, - dtype=dtype, device=device, - ) - - # Linear sigma schedule: 1.0 → 1/steps (flow-matching, noise→clean) - sigmas = torch.linspace(1.0, 1.0 / req.steps, req.steps, device=device) - - for step_i, sigma_val in enumerate(sigmas): - sigma_t = sigma_val.expand(req.batch_size) - t0 = now() - x = self.model.apply_model(x, sigma_t, c_crossattn=cross_attn) - sync_cuda() - elapsed_ms = (now() - t0) * 1000.0 - step_latencies.append(elapsed_ms) - if step_i == 0: - ttfs_ms = elapsed_ms - - except Exception as e: - ok = False - err = repr(e) - - end = now() - return RequestResult( - request_id=-1, - profile_name=req.profile_name, - ok=ok, - error=err, - latency_ms=(end - start) * 1000.0, - queue_wait_ms=0.0, # filled in by the scheduler - step_latencies_ms=step_latencies, - ttfs_ms=ttfs_ms, - peak_vram_mb=gpu_peak_mb(), - est_mem_mb=est_mem_mb, - started_at=start, - ended_at=end, - ) - - -# ----------------------------- -# Serving-style scheduler -# ----------------------------- - -async def run_closed_loop( - runner: WanRunner, - requests: List[RequestSpec], - concurrency: int, - request_rate: float = float("inf"), -) -> List[RequestResult]: - """ - Closed-loop scheduler (default) or Poisson open-loop when request_rate is finite. - - Each request is dispatched to a thread so the asyncio event loop stays - free to issue the next request while the GPU is busy. - """ - sem = asyncio.Semaphore(concurrency) - results: List[Optional[RequestResult]] = [None] * len(requests) - - async def worker(i: int, req: RequestSpec) -> None: - async with sem: - t_enq = now() - res = await asyncio.to_thread(runner.run_one, req) - res.request_id = i - res.queue_wait_ms = max(0.0, (res.started_at - t_enq) * 1000.0) - results[i] = res - - if request_rate == float("inf") or request_rate <= 0: - await asyncio.gather(*(worker(i, r) for i, r in enumerate(requests))) - else: - tasks: List[asyncio.Task] = [] - for i, req in enumerate(requests): - if i > 0: - await asyncio.sleep(random.expovariate(request_rate)) - tasks.append(asyncio.create_task(worker(i, req))) - await asyncio.gather(*tasks) - - return [r for r in results if r is not None] - - -def summarize(results: List[RequestResult], wall_s: float) -> RunSummary: - lat = [r.latency_ms for r in results if r.ok] - ttfs = [r.ttfs_ms for r in results if r.ok and math.isfinite(r.ttfs_ms)] - all_steps = [s for r in results if r.ok for s in r.step_latencies_ms] - succ = sum(1 for r in results if r.ok) - fail = len(results) - succ - return RunSummary( - total_requests=len(results), - success=succ, - failed=fail, - throughput_req_s=(succ / wall_s) if wall_s > 0 else 0.0, - p50_ms=percentile(lat, 50), - p90_ms=percentile(lat, 90), - p95_ms=percentile(lat, 95), - p99_ms=percentile(lat, 99), - mean_ms=(statistics.mean(lat) if lat else float("nan")), - ttfs_p50_ms=percentile(ttfs, 50), - ttfs_p99_ms=percentile(ttfs, 99), - step_mean_ms=(statistics.mean(all_steps) if all_steps else float("nan")), - step_p99_ms=percentile(all_steps, 99), - max_vram_mb=max((r.peak_vram_mb for r in results), default=0.0), - ) - - -def print_summary( - args: argparse.Namespace, - summ: RunSummary, - total_requests: int, - wall_s: float, -) -> None: - w = 60 - sep = "-" * w - print("\n" + "=" * w) - print("{s:^{n}}".format(s=" WAN Benchmark Result ", n=w)) - print("=" * w) - print("{:<40} {:<}".format("Checkpoint:", Path(args.checkpoint).name)) - print("{:<40} {:<}".format("Device / dtype:", f"{args.device}/{args.dtype}")) - 
print("{:<40} {:<}".format("Concurrency:", args.concurrency)) - rate_str = f"{args.request_rate:.1f} req/s" if args.request_rate != float("inf") else "inf (closed-loop)" - print("{:<40} {:<}".format("Request rate:", rate_str)) - print(sep) - print("{:<40} {:<.2f}".format("Benchmark duration (s):", wall_s)) - print("{:<40} {}/{}".format("Successful requests:", summ.success, total_requests)) - if summ.failed: - print("{:<40} {:<}".format("Failed requests:", summ.failed)) - print(sep) - print("{:<40} {:<.3f}".format("Throughput (req/s):", summ.throughput_req_s)) - print("{:<40} {:<.1f}".format("Latency mean (ms):", summ.mean_ms)) - print("{:<40} {:<.1f}".format("Latency p50 (ms):", summ.p50_ms)) - print("{:<40} {:<.1f}".format("Latency p90 (ms):", summ.p90_ms)) - print("{:<40} {:<.1f}".format("Latency p95 (ms):", summ.p95_ms)) - print("{:<40} {:<.1f}".format("Latency p99 (ms):", summ.p99_ms)) - print(sep) - print("{:<40} {:<.1f}".format("TTFS p50 (ms):", summ.ttfs_p50_ms)) - print("{:<40} {:<.1f}".format("TTFS p99 (ms):", summ.ttfs_p99_ms)) - print("{:<40} {:<.1f}".format("Step latency mean (ms):", summ.step_mean_ms)) - print("{:<40} {:<.1f}".format("Step latency p99 (ms):", summ.step_p99_ms)) - print(sep) - print("{:<40} {:<.1f}".format("Peak VRAM (MB):", summ.max_vram_mb)) - print("=" * w) - - -# ----------------------------- -# CLI -# ----------------------------- - -def parse_args() -> argparse.Namespace: - p = argparse.ArgumentParser( - description="Benchmark ComfyUI WAN diffusion model denoising throughput and latency." - ) - p.add_argument( - "--checkpoint", required=True, - help="Path to the WAN diffusion-model checkpoint (.safetensors / .pt).", - ) - p.add_argument("--device", default="cuda") - p.add_argument("--dtype", default="fp16", choices=["fp16", "bf16", "fp32"]) - p.add_argument("--num-requests", type=int, default=100) - p.add_argument("--concurrency", type=int, default=4, - help="Max number of in-flight requests (semaphore width).") - p.add_argument( - "--request-rate", type=float, default=float("inf"), - help="Poisson arrival rate in req/s. 
inf = closed-loop (default).", - ) - p.add_argument("--warmup-requests", type=int, default=2, - help="Warmup iterations excluded from metrics.") - p.add_argument("--seed", type=int, default=1234) - p.add_argument("--text-seq-len", type=int, default=512, - help="Cross-attention sequence length (UMT5 default: 512).") - p.add_argument("--text-dim", type=int, default=4096, - help="Text embedding width (UMT5-XXL: 4096).") - p.add_argument("--out-dir", type=Path, default=Path("benchmarks/out")) - p.add_argument("--output-file", type=Path, default=None, - help="Override path for the summary JSON output.") - return p.parse_args() - - -def default_profiles() -> List[RequestSpec]: - return [ - RequestSpec("wan21_t2v_720p_16f_30s", 1, 1280, 720, 16, 30, 6.0, 0), - RequestSpec("wan21_t2v_720p_32f_30s", 1, 1280, 720, 32, 30, 6.0, 0), - RequestSpec("wan21_t2v_480p_32f_20s", 1, 854, 480, 32, 20, 6.0, 0), - ] - - -async def main_async() -> None: - args = parse_args() - args.out_dir.mkdir(parents=True, exist_ok=True) - - runner = WanRunner( - checkpoint=args.checkpoint, - device=args.device, - dtype_str=args.dtype, - text_seq_len=args.text_seq_len, - text_dim=args.text_dim, - ) - - all_reqs = build_request_stream( - args.num_requests + args.warmup_requests, - args.seed, - default_profiles(), - ) - warmup_reqs = all_reqs[: args.warmup_requests] - bench_reqs = all_reqs[args.warmup_requests :] - - if warmup_reqs: - print(f"Running {len(warmup_reqs)} warmup request(s)...") - for req in warmup_reqs: - runner.run_one(req) - print("Warmup complete.") - - print(f"Benchmarking {len(bench_reqs)} requests (concurrency={args.concurrency})...") - t0 = now() - results = await run_closed_loop(runner, bench_reqs, args.concurrency, args.request_rate) - wall_s = now() - t0 - - summ = summarize(results, wall_s) - print_summary(args, summ, len(bench_reqs), wall_s) - - out_file = args.output_file or (args.out_dir / "summary.json") - with open(args.out_dir / "requests.jsonl", "w") as f: - for r in results: - f.write(json.dumps(dataclasses.asdict(r)) + "\n") - with open(out_file, "w") as f: - json.dump(dataclasses.asdict(summ), f, indent=2) - print(f"\nResults written to {args.out_dir}/") - - -if __name__ == "__main__": - asyncio.run(main_async()) From 00379b4acf8c65822812b4ad5c7dd68ec6d3043d Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Sun, 26 Apr 2026 19:41:55 -0700 Subject: [PATCH 03/27] Move benchmark serving client into benchmarks folder --- benchmarks/benchmark_comfyui_serving.py | 374 ++++++++++++++++++++++++ comfy/cli_args.py | 1 + main.py | 37 ++- server.py | 78 ++++- 4 files changed, 475 insertions(+), 15 deletions(-) create mode 100644 benchmarks/benchmark_comfyui_serving.py diff --git a/benchmarks/benchmark_comfyui_serving.py b/benchmarks/benchmark_comfyui_serving.py new file mode 100644 index 000000000..07d498c21 --- /dev/null +++ b/benchmarks/benchmark_comfyui_serving.py @@ -0,0 +1,374 @@ +#!/usr/bin/env python3 +""" +Simple serving benchmark client for ComfyUI's HTTP API. + +This script is inspired by diffusion serving benchmarks and is designed to: + - submit prompts to ComfyUI (/prompt or /bench/prompt), + - optionally shape request arrivals (fixed rate or Poisson), + - poll completion via /history/{prompt_id}, + - report latency/throughput/error metrics. 
+""" + +from __future__ import annotations + +import argparse +import asyncio +import json +import math +import random +import statistics +import time +import uuid +from dataclasses import dataclass, asdict +from pathlib import Path +from typing import Any + +import aiohttp + + +@dataclass +class RequestResult: + request_index: int + prompt_id: str | None + ok: bool + error: str | None + queued_at: float + started_at: float + finished_at: float + end_to_end_s: float + queue_wait_ms: float | None + execution_ms: float | None + + +def percentile(values: list[float], pct: float) -> float: + if not values: + return float("nan") + if len(values) == 1: + return values[0] + values = sorted(values) + rank = (len(values) - 1) * (pct / 100.0) + lower = math.floor(rank) + upper = math.ceil(rank) + if lower == upper: + return values[lower] + weight = rank - lower + return values[lower] * (1.0 - weight) + values[upper] * weight + + +def patch_seed_in_prompt(prompt: dict[str, Any], seed: int, seed_path: str | None) -> dict[str, Any]: + """ + Patch prompt seed in-place for common sampler nodes. + seed_path format: ".". + """ + if seed_path: + try: + node_id, input_name = seed_path.split(".", 1) + prompt[node_id]["inputs"][input_name] = seed + return prompt + except Exception as exc: + raise ValueError(f"Invalid --seed-path '{seed_path}': {exc}") from exc + + # Best-effort fallback: update any input key named 'seed' or 'noise_seed' + for node in prompt.values(): + if not isinstance(node, dict): + continue + inputs = node.get("inputs") + if not isinstance(inputs, dict): + continue + if "seed" in inputs: + inputs["seed"] = seed + if "noise_seed" in inputs: + inputs["noise_seed"] = seed + return prompt + + +def load_prompt_template(path: Path) -> dict[str, Any]: + data = json.loads(path.read_text()) + if "prompt" in data and isinstance(data["prompt"], dict): + return data + if isinstance(data, dict): + return {"prompt": data} + raise ValueError("Prompt file must be a JSON object (prompt graph or wrapper with 'prompt').") + + +async def submit_prompt( + session: aiohttp.ClientSession, + base_url: str, + endpoint: str, + payload: dict[str, Any], + timeout_s: float, +) -> str: + url = f"{base_url}{endpoint}" + async with session.post(url, json=payload, timeout=timeout_s) as resp: + text = await resp.text() + if resp.status != 200: + raise RuntimeError(f"submit failed [{resp.status}] {text}") + body = json.loads(text) + prompt_id = body.get("prompt_id") + if not prompt_id: + raise RuntimeError(f"missing prompt_id in response: {body}") + return prompt_id + + +async def wait_for_prompt_done( + session: aiohttp.ClientSession, + base_url: str, + prompt_id: str, + poll_interval_s: float, + timeout_s: float, +) -> tuple[float | None, float | None]: + """ + Returns (queue_wait_ms, execution_ms) when available from history status messages. + Falls back to (None, None) if unavailable. 
+ """ + deadline = time.perf_counter() + timeout_s + history_url = f"{base_url}/history/{prompt_id}" + + while time.perf_counter() < deadline: + async with session.get(history_url, timeout=timeout_s) as resp: + if resp.status != 200: + text = await resp.text() + raise RuntimeError(f"history failed [{resp.status}] {text}") + + payload = await resp.json() + if not payload: + await asyncio.sleep(poll_interval_s) + continue + + history_item = payload.get(prompt_id) + if history_item is None: + await asyncio.sleep(poll_interval_s) + continue + + status = history_item.get("status", {}) + status_str = status.get("status_str") + messages = status.get("messages", []) + if status_str not in ("success", "error"): + await asyncio.sleep(poll_interval_s) + continue + + queue_wait_ms = None + execution_ms = None + try: + timestamp_map: dict[str, int] = {} + for event, msg in messages: + if isinstance(msg, dict) and "timestamp" in msg: + timestamp_map[event] = int(msg["timestamp"]) + start_ts = timestamp_map.get("execution_start") + end_ts = timestamp_map.get("execution_success") or timestamp_map.get("execution_error") + if start_ts is not None and end_ts is not None: + execution_ms = max(0.0, end_ts - start_ts) + except Exception: + execution_ms = None + + return queue_wait_ms, execution_ms + + await asyncio.sleep(poll_interval_s) + + raise TimeoutError(f"timed out waiting for prompt_id={prompt_id}") + + +def build_arrival_schedule(num_requests: int, request_rate: float, poisson: bool, seed: int) -> list[float]: + """ + Returns absolute offsets (seconds from benchmark start) for each request. + """ + if request_rate <= 0: + return [0.0] * num_requests + + rnd = random.Random(seed) + offsets: list[float] = [] + t = 0.0 + for _ in range(num_requests): + if poisson: + delta = rnd.expovariate(request_rate) + else: + delta = 1.0 / request_rate + t += delta + offsets.append(t) + return offsets + + +async def run_request( + idx: int, + start_time: float, + scheduled_offset_s: float, + semaphore: asyncio.Semaphore, + session: aiohttp.ClientSession, + args: argparse.Namespace, + prompt_wrapper_template: dict[str, Any], +) -> RequestResult: + await asyncio.sleep(max(0.0, (start_time + scheduled_offset_s) - time.perf_counter())) + queued_at = time.perf_counter() + + async with semaphore: + started_at = time.perf_counter() + prompt_id = None + try: + payload = json.loads(json.dumps(prompt_wrapper_template)) + payload.setdefault("extra_data", {}) + payload["client_id"] = args.client_id + + seed = args.base_seed + idx + payload["prompt"] = patch_seed_in_prompt(payload["prompt"], seed, args.seed_path) + + prompt_id = await submit_prompt( + session=session, + base_url=args.host, + endpoint=args.endpoint, + payload=payload, + timeout_s=args.request_timeout_s, + ) + + queue_wait_ms, execution_ms = await wait_for_prompt_done( + session=session, + base_url=args.host, + prompt_id=prompt_id, + poll_interval_s=args.poll_interval_s, + timeout_s=args.request_timeout_s, + ) + finished_at = time.perf_counter() + return RequestResult( + request_index=idx, + prompt_id=prompt_id, + ok=True, + error=None, + queued_at=queued_at, + started_at=started_at, + finished_at=finished_at, + end_to_end_s=finished_at - queued_at, + queue_wait_ms=queue_wait_ms, + execution_ms=execution_ms, + ) + except Exception as exc: + finished_at = time.perf_counter() + return RequestResult( + request_index=idx, + prompt_id=prompt_id, + ok=False, + error=repr(exc), + queued_at=queued_at, + started_at=started_at, + finished_at=finished_at, + 
end_to_end_s=finished_at - queued_at, + queue_wait_ms=None, + execution_ms=None, + ) + + +def print_summary(results: list[RequestResult], wall_s: float) -> None: + success = [r for r in results if r.ok] + fail = [r for r in results if not r.ok] + lat_s = [r.end_to_end_s for r in success] + queue_wait_ms = [r.queue_wait_ms for r in success if r.queue_wait_ms is not None] + exec_ms = [r.execution_ms for r in success if r.execution_ms is not None] + + throughput = (len(success) / wall_s) if wall_s > 0 else 0.0 + print("\n=== ComfyUI Serving Benchmark Summary ===") + print(f"requests_total: {len(results)}") + print(f"requests_success: {len(success)}") + print(f"requests_failed: {len(fail)}") + print(f"wall_time_s: {wall_s:.3f}") + print(f"throughput_req_s: {throughput:.3f}") + + if lat_s: + print(f"latency_p50_s: {percentile(lat_s, 50):.3f}") + print(f"latency_p90_s: {percentile(lat_s, 90):.3f}") + print(f"latency_p95_s: {percentile(lat_s, 95):.3f}") + print(f"latency_p99_s: {percentile(lat_s, 99):.3f}") + print(f"latency_mean_s: {statistics.mean(lat_s):.3f}") + print(f"latency_max_s: {max(lat_s):.3f}") + + if queue_wait_ms: + print(f"queue_wait_mean_ms: {statistics.mean(queue_wait_ms):.2f}") + print(f"queue_wait_p95_ms: {percentile(queue_wait_ms, 95):.2f}") + + if exec_ms: + print(f"execution_mean_ms: {statistics.mean(exec_ms):.2f}") + print(f"execution_p95_ms: {percentile(exec_ms, 95):.2f}") + + if fail: + print("\nSample failures:") + for r in fail[:5]: + print(f" idx={r.request_index} prompt_id={r.prompt_id} error={r.error}") + + +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser(description="Benchmark ComfyUI request serving.") + p.add_argument("--host", type=str, default="http://127.0.0.1:8188", help="ComfyUI base URL.") + p.add_argument( + "--endpoint", + type=str, + default="/prompt", + choices=("/prompt", "/bench/prompt"), + help="Submission endpoint.", + ) + p.add_argument("--prompt-file", type=Path, required=True, help="Path to prompt JSON.") + p.add_argument("--num-requests", type=int, default=50) + p.add_argument("--max-concurrency", type=int, default=8) + p.add_argument("--request-rate", type=float, default=0.0, help="Requests/sec. 0 = fire immediately.") + p.add_argument("--poisson", action="store_true", help="Use Poisson inter-arrival when request-rate > 0.") + p.add_argument("--base-seed", type=int, default=1234) + p.add_argument( + "--seed-path", + type=str, + default=None, + help="Optional path to seed field in prompt: . (e.g. 
3.seed).", + ) + p.add_argument("--client-id", type=str, default=f"bench-{uuid.uuid4().hex[:12]}") + p.add_argument("--request-timeout-s", type=float, default=600.0) + p.add_argument("--poll-interval-s", type=float, default=0.2) + p.add_argument("--output-json", type=Path, default=None, help="Write detailed result JSON.") + p.add_argument("--seed", type=int, default=0, help="RNG seed for schedule generation.") + return p.parse_args() + + +async def async_main(args: argparse.Namespace) -> None: + prompt_template = load_prompt_template(args.prompt_file) + schedule = build_arrival_schedule( + num_requests=args.num_requests, + request_rate=args.request_rate, + poisson=args.poisson, + seed=args.seed, + ) + semaphore = asyncio.Semaphore(args.max_concurrency) + connector = aiohttp.TCPConnector(limit=max(args.max_concurrency * 2, 32)) + + started = time.perf_counter() + async with aiohttp.ClientSession(connector=connector) as session: + tasks = [ + asyncio.create_task( + run_request( + idx=i, + start_time=started, + scheduled_offset_s=schedule[i], + semaphore=semaphore, + session=session, + args=args, + prompt_wrapper_template=prompt_template, + ) + ) + for i in range(args.num_requests) + ] + results = await asyncio.gather(*tasks) + wall_s = time.perf_counter() - started + + print_summary(results, wall_s) + + if args.output_json is not None: + out = { + "config": vars(args), + "wall_time_s": wall_s, + "results": [asdict(r) for r in sorted(results, key=lambda x: x.request_index)], + } + args.output_json.parent.mkdir(parents=True, exist_ok=True) + args.output_json.write_text(json.dumps(out, indent=2)) + print(f"\nWrote results to: {args.output_json}") + + +def main() -> None: + args = parse_args() + asyncio.run(async_main(args)) + + +if __name__ == "__main__": + main() diff --git a/comfy/cli_args.py b/comfy/cli_args.py index dbaadf723..e9828a5db 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -225,6 +225,7 @@ parser.add_argument( parser.add_argument("--user-directory", type=is_valid_directory, default=None, help="Set the ComfyUI user directory with an absolute path. 
Overrides --base-directory.") parser.add_argument("--enable-compress-response-body", action="store_true", help="Enable compressing response body.") +parser.add_argument("--benchmark-server-only", action="store_true", help="Enable lightweight benchmark routes and worker fast-paths focused on model serving throughput/latency.") parser.add_argument( "--comfy-api-base", diff --git a/main.py b/main.py index dbaf2745c..5013bac42 100644 --- a/main.py +++ b/main.py @@ -293,6 +293,7 @@ def prompt_worker(q, server_instance): gc_collect_interval = 10.0 while True: + benchmark_mode = args.benchmark_server_only timeout = 1000.0 if need_gc: timeout = max(gc_collect_interval - (current_time - last_gc_collect), 0.0) @@ -301,6 +302,7 @@ def prompt_worker(q, server_instance): if queue_item is not None: item, item_id = queue_item execution_start_time = time.perf_counter() + execution_start_wall_ms = int(time.time() * 1000) prompt_id = item[1] server_instance.last_prompt_id = prompt_id @@ -308,15 +310,21 @@ def prompt_worker(q, server_instance): extra_data = item[3].copy() for k in sensitive: extra_data[k] = sensitive[k] + benchmark_mode = args.benchmark_server_only or extra_data.get("benchmark_server_only", False) - asset_seeder.pause() + if not benchmark_mode: + asset_seeder.pause() e.execute(item[2], prompt_id, extra_data, item[4]) need_gc = True remove_sensitive = lambda prompt: prompt[:5] + prompt[6:] + history_result = e.history_result + if benchmark_mode: + history_result = {"outputs": {}, "meta": {}} + q.task_done(item_id, - e.history_result, + history_result, status=execution.PromptQueue.ExecutionStatus( status_str='success' if e.success else 'error', completed=e.success, @@ -325,16 +333,24 @@ def prompt_worker(q, server_instance): server_instance.send_sync("executing", {"node": None, "prompt_id": prompt_id}, server_instance.client_id) current_time = time.perf_counter() - execution_time = current_time - execution_start_time + execution_time_s = current_time - execution_start_time # Log Time in a more readable way after 10 minutes - if execution_time > 600: - execution_time = time.strftime("%H:%M:%S", time.gmtime(execution_time)) - logging.info(f"Prompt executed in {execution_time}") + if execution_time_s > 600: + execution_time_formatted = time.strftime("%H:%M:%S", time.gmtime(execution_time_s)) + logging.info(f"Prompt executed in {execution_time_formatted}") else: - logging.info("Prompt executed in {:.2f} seconds".format(execution_time)) + logging.info("Prompt executed in {:.2f} seconds".format(execution_time_s)) - if not asset_seeder.is_disabled(): + queue_wait_ms = 0.0 + created_at = extra_data.get("create_time") + if isinstance(created_at, int): + queue_wait_ms = max(0.0, execution_start_wall_ms - created_at) + + if benchmark_mode: + server_instance.record_benchmark_result(prompt_id, e.success, execution_time_s * 1000.0, queue_wait_ms) + + if not benchmark_mode and not asset_seeder.is_disabled(): paths = _collect_output_absolute_paths(e.history_result) register_output_files(paths, job_id=prompt_id) @@ -360,9 +376,10 @@ def prompt_worker(q, server_instance): need_gc = False hook_breaker_ac10a0.restore_functions() - if not asset_seeder.is_disabled(): + if not benchmark_mode and not asset_seeder.is_disabled(): asset_seeder.enqueue_enrich(roots=("output",), compute_hashes=True) - asset_seeder.resume() + if not benchmark_mode: + asset_seeder.resume() async def run(server_instance, address='', port=8188, verbose=True, call_on_start=None): diff --git a/server.py b/server.py index 881da8e66..5db448b7f 
100644 --- a/server.py +++ b/server.py @@ -16,6 +16,7 @@ import struct import ssl import socket import ipaddress +import threading from PIL import Image, ImageOps from PIL.PngImagePlugin import PngInfo from io import BytesIO @@ -252,6 +253,17 @@ class PromptServer(): self.client_id = None self.on_prompt_handlers = [] + self._benchmark_lock = threading.Lock() + self._benchmark_stats = { + "requests_total": 0, + "requests_success": 0, + "requests_error": 0, + "latency_ms_total": 0.0, + "latency_ms_max": 0.0, + "queue_wait_ms_total": 0.0, + "queue_wait_ms_max": 0.0, + "last_prompt_id": None, + } @routes.get('/ws') async def websocket_handler(request): @@ -912,12 +924,17 @@ class PromptServer(): queue_info['queue_pending'] = _remove_sensitive_from_queue(current_queue[1]) return web.json_response(queue_info) - @routes.post("/prompt") - async def post_prompt(request): - logging.info("got prompt") - json_data = await request.json() - json_data = self.trigger_on_prompt(json_data) + @routes.get("/bench/stats") + async def get_bench_stats(request): + stats = self.get_benchmark_stats() + return web.json_response(stats) + @routes.post("/bench/reset") + async def reset_bench_stats(request): + self.reset_benchmark_stats() + return web.json_response({"status": "ok"}) + + async def enqueue_prompt(json_data): if "number" in json_data: number = float(json_data['number']) else: @@ -967,6 +984,22 @@ class PromptServer(): } return web.json_response({"error": error, "node_errors": {}}, status=400) + @routes.post("/bench/prompt") + async def post_bench_prompt(request): + json_data = await request.json() + json_data = self.trigger_on_prompt(json_data) + extra_data = json_data.setdefault("extra_data", {}) + extra_data["benchmark_server_only"] = True + extra_data.setdefault("preview_method", "none") + return await enqueue_prompt(json_data) + + @routes.post("/prompt") + async def post_prompt(request): + logging.info("got prompt") + json_data = await request.json() + json_data = self.trigger_on_prompt(json_data) + return await enqueue_prompt(json_data) + @routes.post("/queue") async def post_queue(request): json_data = await request.json() @@ -1111,6 +1144,41 @@ class PromptServer(): prompt_info['exec_info'] = exec_info return prompt_info + def reset_benchmark_stats(self): + with self._benchmark_lock: + self._benchmark_stats = { + "requests_total": 0, + "requests_success": 0, + "requests_error": 0, + "latency_ms_total": 0.0, + "latency_ms_max": 0.0, + "queue_wait_ms_total": 0.0, + "queue_wait_ms_max": 0.0, + "last_prompt_id": None, + } + + def record_benchmark_result(self, prompt_id, success, latency_ms, queue_wait_ms=0.0): + with self._benchmark_lock: + self._benchmark_stats["requests_total"] += 1 + if success: + self._benchmark_stats["requests_success"] += 1 + else: + self._benchmark_stats["requests_error"] += 1 + self._benchmark_stats["latency_ms_total"] += max(0.0, latency_ms) + self._benchmark_stats["queue_wait_ms_total"] += max(0.0, queue_wait_ms) + self._benchmark_stats["latency_ms_max"] = max(self._benchmark_stats["latency_ms_max"], max(0.0, latency_ms)) + self._benchmark_stats["queue_wait_ms_max"] = max(self._benchmark_stats["queue_wait_ms_max"], max(0.0, queue_wait_ms)) + self._benchmark_stats["last_prompt_id"] = prompt_id + + def get_benchmark_stats(self): + with self._benchmark_lock: + stats = dict(self._benchmark_stats) + + total = stats["requests_total"] + stats["latency_ms_avg"] = (stats["latency_ms_total"] / total) if total > 0 else 0.0 + stats["queue_wait_ms_avg"] = (stats["queue_wait_ms_total"] / 
total) if total > 0 else 0.0 + return stats + async def send(self, event, data, sid=None): if event == BinaryEventTypes.UNENCODED_PREVIEW_IMAGE: await self.send_image(data, sid=sid) From c02b5d4c1ece453d57529cb306fd0f0c755d69cd Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Mon, 27 Apr 2026 11:50:09 -0700 Subject: [PATCH 04/27] Generate prompt file automatically. --- benchmarks/benchmark_comfyui_serving.py | 458 +++++++++++++++++++++++- 1 file changed, 457 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_comfyui_serving.py b/benchmarks/benchmark_comfyui_serving.py index 07d498c21..6431be2bc 100644 --- a/benchmarks/benchmark_comfyui_serving.py +++ b/benchmarks/benchmark_comfyui_serving.py @@ -7,6 +7,41 @@ This script is inspired by diffusion serving benchmarks and is designed to: - optionally shape request arrivals (fixed rate or Poisson), - poll completion via /history/{prompt_id}, - report latency/throughput/error metrics. + +Usage — Wan 2.2 I2V benchmark +============================== + +Step 1 — Generate prompt files (downloads images, writes JSONs, then exits): + + # Minimal: uses synthetic images, writes to prompts/wan22_i2v/ + python3 benchmarks/benchmark_comfyui_serving.py \\ + --generate-wan22-prompts \\ + --num-requests 50 + + # With model download (needs ComfyUI root): + python3 benchmarks/benchmark_comfyui_serving.py \\ + --generate-wan22-prompts \\ + --download-models \\ + --comfyui-base-dir /path/to/ComfyUI \\ + --num-requests 50 + + # Custom image/output dirs: + python3 benchmarks/benchmark_comfyui_serving.py \\ + --generate-wan22-prompts \\ + --wan22-input-dir /data/images \\ + --wan22-output-dir /data/prompts/wan22 \\ + --wan22-num-images 30 \\ + --num-requests 50 + +Step 2 — Run the benchmark (point at any one of the generated prompt files): + + python3 benchmarks/benchmark_comfyui_serving.py \\ + --prompt-file prompts/wan22_i2v/wan22_i2v_prompt_0000.json \\ + --num-requests 50 \\ + --max-concurrency 4 \\ + --host http://127.0.0.1:8188 + +The setup step also prints the exact run command at the end, so you can copy it directly. 
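+
+Optional server-side counters: when requests go through /bench/prompt (or the
+server runs with --benchmark-server-only), aggregate stats can be read back and
+reset over plain HTTP, e.g.:
+
+    curl http://127.0.0.1:8188/bench/stats
+    curl -X POST http://127.0.0.1:8188/bench/reset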
""" from __future__ import annotations @@ -17,7 +52,9 @@ import json import math import random import statistics +import subprocess import time +import urllib.request import uuid from dataclasses import dataclass, asdict from pathlib import Path @@ -26,6 +63,374 @@ from typing import Any import aiohttp +# ────────────────────────────────────────────────────────────────────────────── +# Wan 2.2 I2V benchmark setup helpers +# ────────────────────────────────────────────────────────────────────────────── + +_WAN22_MODELS: list[tuple[str, str]] = [ + ( + "models/diffusion_models/wan2.2_i2v_low_noise_14B_fp8_scaled.safetensors", + "https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/resolve/main/split_files/diffusion_models/wan2.2_i2v_low_noise_14B_fp8_scaled.safetensors", + ), + ( + "models/diffusion_models/wan2.2_i2v_high_noise_14B_fp8_scaled.safetensors", + "https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/resolve/main/split_files/diffusion_models/wan2.2_i2v_high_noise_14B_fp8_scaled.safetensors", + ), + ( + "models/loras/wan2.2_i2v_lightx2v_4steps_lora_v1_high_noise.safetensors", + "https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/resolve/main/split_files/loras/wan2.2_i2v_lightx2v_4steps_lora_v1_high_noise.safetensors", + ), + ( + "models/loras/wan2.2_i2v_lightx2v_4steps_lora_v1_low_noise.safetensors", + "https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/resolve/main/split_files/loras/wan2.2_i2v_lightx2v_4steps_lora_v1_low_noise.safetensors", + ), + ( + "models/text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors", + "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors", + ), + ( + "models/vae/wan_2.1_vae.safetensors", + "https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/resolve/main/split_files/vae/wan_2.1_vae.safetensors", + ), +] + +# Placeholder sentinel replaced by generate_prompt_file. 
+_IMAGE_PLACEHOLDER = "__INPUT_IMAGE__" + +_WAN22_I2V_GRAPH: dict[str, Any] = { + "97": { + "inputs": {"image": _IMAGE_PLACEHOLDER}, + "class_type": "LoadImage", + "_meta": {"title": "Start Frame Image"}, + }, + "108": { + "inputs": { + "filename_prefix": "video/Wan2.2_image_to_video", + "format": "auto", + "codec": "auto", + "video-preview": "", + "video": ["130:117", 0], + }, + "class_type": "SaveVideo", + "_meta": {"title": "Save Video"}, + }, + "130:105": { + "inputs": { + "clip_name": "umt5_xxl_fp8_e4m3fn_scaled.safetensors", + "type": "wan", + "device": "default", + }, + "class_type": "CLIPLoader", + "_meta": {"title": "Load CLIP"}, + }, + "130:106": { + "inputs": {"vae_name": "wan_2.1_vae.safetensors"}, + "class_type": "VAELoader", + "_meta": {"title": "Load VAE"}, + }, + "130:107": { + "inputs": { + "text": "A felt-style little eagle cashier greeting, waving, and smiling at the camera.", + "clip": ["130:105", 0], + }, + "class_type": "CLIPTextEncode", + "_meta": {"title": "CLIP Text Encode (Positive Prompt)"}, + }, + "130:109": { + "inputs": {"shift": 5.000000000000001, "model": ["130:126", 0]}, + "class_type": "ModelSamplingSD3", + "_meta": {"title": "ModelSamplingSD3"}, + }, + "130:110": { + "inputs": { + "add_noise": "enable", + "noise_seed": 636787045983965, + "steps": 4, + "cfg": 1, + "sampler_name": "euler", + "scheduler": "simple", + "start_at_step": 0, + "end_at_step": 2, + "return_with_leftover_noise": "enable", + "model": ["130:109", 0], + "positive": ["130:128", 0], + "negative": ["130:128", 1], + "latent_image": ["130:128", 2], + }, + "class_type": "KSamplerAdvanced", + "_meta": {"title": "KSampler (Advanced)"}, + }, + "130:111": { + "inputs": { + "add_noise": "disable", + "noise_seed": 0, + "steps": 4, + "cfg": 1, + "sampler_name": "euler", + "scheduler": "simple", + "start_at_step": 2, + "end_at_step": 4, + "return_with_leftover_noise": "disable", + "model": ["130:124", 0], + "positive": ["130:128", 0], + "negative": ["130:128", 1], + "latent_image": ["130:110", 0], + }, + "class_type": "KSamplerAdvanced", + "_meta": {"title": "KSampler (Advanced)"}, + }, + "130:117": { + "inputs": {"fps": 16, "images": ["130:129", 0]}, + "class_type": "CreateVideo", + "_meta": {"title": "Create Video"}, + }, + "130:122": { + "inputs": { + "unet_name": "wan2.2_i2v_high_noise_14B_fp8_scaled.safetensors", + "weight_dtype": "default", + }, + "class_type": "UNETLoader", + "_meta": {"title": "Load Diffusion Model"}, + }, + "130:123": { + "inputs": { + "unet_name": "wan2.2_i2v_low_noise_14B_fp8_scaled.safetensors", + "weight_dtype": "default", + }, + "class_type": "UNETLoader", + "_meta": {"title": "Load Diffusion Model"}, + }, + "130:124": { + "inputs": {"shift": 5.000000000000001, "model": ["130:127", 0]}, + "class_type": "ModelSamplingSD3", + "_meta": {"title": "ModelSamplingSD3"}, + }, + "130:125": { + "inputs": { + "text": ( + "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量," + "JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的," + "形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" + ), + "clip": ["130:105", 0], + }, + "class_type": "CLIPTextEncode", + "_meta": {"title": "CLIP Text Encode (Negative Prompt)"}, + }, + "130:126": { + "inputs": { + "lora_name": "wan2.2_i2v_lightx2v_4steps_lora_v1_high_noise.safetensors", + "strength_model": 1.0000000000000002, + "model": ["130:122", 0], + }, + "class_type": "LoraLoaderModelOnly", + "_meta": {"title": "Load LoRA"}, + }, + "130:127": { + "inputs": { + "lora_name": "wan2.2_i2v_lightx2v_4steps_lora_v1_low_noise.safetensors", + "strength_model": 
1.0000000000000002, + "model": ["130:123", 0], + }, + "class_type": "LoraLoaderModelOnly", + "_meta": {"title": "Load LoRA"}, + }, + "130:128": { + "inputs": { + "width": 720, + "height": 720, + "length": 81, + "batch_size": 1, + "positive": ["130:107", 0], + "negative": ["130:125", 0], + "vae": ["130:106", 0], + "start_image": ["97", 0], + }, + "class_type": "WanImageToVideo", + "_meta": {"title": "WanImageToVideo"}, + }, + "130:129": { + "inputs": {"samples": ["130:111", 0], "vae": ["130:106", 0]}, + "class_type": "VAEDecode", + "_meta": {"title": "VAE Decode"}, + }, +} + +_VBENCH_I2V_JSON_URL = ( + "https://raw.githubusercontent.com/Vchitect/VBench/master/vbench2_beta_i2v/i2v-bench-info.json" +) + + +def download_wan22_models(base_dir: Path) -> None: + """Download Wan 2.2 I2V model files into *base_dir* using wget.""" + for rel_path, url in _WAN22_MODELS: + dest = base_dir / rel_path + if dest.exists(): + print(f"[setup] already exists, skipping: {dest}") + continue + dest.parent.mkdir(parents=True, exist_ok=True) + print(f"[setup] downloading {dest.name} ...") + subprocess.run(["wget", "-O", str(dest), url], check=True) + + +def _try_download_vbench_i2v(input_dir: Path) -> list[str]: + """ + Attempt to fetch VBench I2V images via huggingface_hub. + Returns image basenames placed in *input_dir*, or [] on failure. + """ + try: + from huggingface_hub import snapshot_download # type: ignore + except ImportError: + print("[setup] huggingface_hub not available; skipping VBench download.") + return [] + + try: + print("[setup] downloading Vchitect/VBench_I2V dataset from HuggingFace ...") + cache_dir = input_dir / "_vbench_cache" + local = snapshot_download( + repo_id="Vchitect/VBench_I2V", + repo_type="dataset", + local_dir=str(cache_dir), + ) + except Exception as exc: + print(f"[setup] VBench I2V download failed: {exc}") + return [] + + image_exts = {".png", ".jpg", ".jpeg", ".webp"} + found = sorted(p for p in Path(local).rglob("*") if p.suffix.lower() in image_exts) + if not found: + return [] + + import shutil + + filenames: list[str] = [] + for src in found: + dest = input_dir / src.name + if not dest.exists(): + shutil.copy2(str(src), str(dest)) + filenames.append(src.name) + + print(f"[setup] prepared {len(filenames)} VBench I2V images in {input_dir}") + return filenames + + +def _generate_synthetic_images(input_dir: Path, num_images: int) -> list[str]: + """Generate synthetic 720×720 white PNG placeholders; returns filenames.""" + try: + from PIL import Image as PILImage # type: ignore + except ImportError: + raise RuntimeError( + "Pillow is required for synthetic image generation. " + "Install it with: pip install Pillow" + ) + + filenames: list[str] = [] + for i in range(num_images): + fname = f"benchmark_input_{i:04d}.png" + dest = input_dir / fname + if not dest.exists(): + PILImage.new("RGB", (720, 720), color=(255, 255, 255)).save(str(dest)) + filenames.append(fname) + return filenames + + +def prepare_input_images(input_dir: Path, num_images: int = 20) -> list[str]: + """ + Prepare benchmark input images in *input_dir*. + + Priority: + 1. Reuse any images already present in the directory. + 2. Download Vchitect/VBench_I2V dataset via huggingface_hub. + 3. Generate synthetic 720×720 white PNG placeholders with Pillow. + + Returns a list of image basenames (not full paths). 
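+
+    Illustrative call (directory name and count are arbitrary):
+
+        names = prepare_input_images(Path("inputs"), num_images=8)
+        # e.g. ["benchmark_input_0000.png", ...] on the synthetic fallback path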
+ """ + input_dir.mkdir(parents=True, exist_ok=True) + image_exts = {".png", ".jpg", ".jpeg", ".webp"} + + existing = sorted( + p.name for p in input_dir.iterdir() if p.suffix.lower() in image_exts + ) + if existing: + print(f"[setup] found {len(existing)} existing images in {input_dir}") + return existing + + filenames = _try_download_vbench_i2v(input_dir) + if filenames: + return filenames + + print(f"[setup] generating {num_images} synthetic 720×720 placeholder images ...") + return _generate_synthetic_images(input_dir, num_images) + + +def generate_prompt_file( + output_path: Path, + image_filename: str, + positive_prompt: str | None = None, +) -> None: + """ + Write a single Wan 2.2 I2V ComfyUI prompt JSON to *output_path*. + + *image_filename* is substituted into the LoadImage node (node "97"). + *positive_prompt* overrides the default positive text if provided. + """ + graph: dict[str, Any] = json.loads(json.dumps(_WAN22_I2V_GRAPH)) + graph["97"]["inputs"]["image"] = image_filename + if positive_prompt is not None: + graph["130:107"]["inputs"]["text"] = positive_prompt + + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps({"prompt": graph}, indent=2)) + + +def generate_prompt_files( + output_dir: Path, + input_dir: Path, + num_prompts: int = 50, + num_images: int = 20, + download_models: bool = False, + comfyui_base_dir: Path | None = None, +) -> list[Path]: + """ + Full Wan 2.2 I2V benchmark setup: + + 1. Optionally download model weights into *comfyui_base_dir*. + 2. Prepare input images in *input_dir* (VBench I2V or synthetic). + 3. Generate *num_prompts* prompt JSON files in *output_dir*, cycling + through the available images. + + Returns the list of generated prompt file paths. + """ + if download_models: + if comfyui_base_dir is None: + raise ValueError("--comfyui-base-dir is required when --download-models is set") + download_wan22_models(comfyui_base_dir) + + image_filenames = prepare_input_images(input_dir, num_images=num_images) + if not image_filenames: + raise RuntimeError(f"No input images available in {input_dir}") + + output_dir.mkdir(parents=True, exist_ok=True) + generated: list[Path] = [] + for i in range(num_prompts): + image_name = image_filenames[i % len(image_filenames)] + prompt_path = output_dir / f"wan22_i2v_prompt_{i:04d}.json" + generate_prompt_file(prompt_path, image_name) + generated.append(prompt_path) + + print(f"[setup] generated {len(generated)} prompt files in {output_dir}") + print(f"[setup] example run:") + print( + f" python benchmark_comfyui_serving.py" + f" --prompt-file {generated[0]}" + f" --num-requests {num_prompts}" + ) + return generated + + +# ────────────────────────────────────────────────────────────────────────────── + + @dataclass class RequestResult: request_index: int @@ -302,7 +707,46 @@ def parse_args() -> argparse.Namespace: choices=("/prompt", "/bench/prompt"), help="Submission endpoint.", ) - p.add_argument("--prompt-file", type=Path, required=True, help="Path to prompt JSON.") + p.add_argument( + "--prompt-file", + type=Path, + default=None, + help="Path to prompt JSON. 
Required unless --generate-wan22-prompts is set.", + ) + p.add_argument( + "--generate-wan22-prompts", + action="store_true", + help="Generate Wan 2.2 I2V prompt files (steps: prepare images, write JSONs) then exit.", + ) + p.add_argument( + "--wan22-input-dir", + type=Path, + default=Path("inputs"), + help="Directory for benchmark input images (default: inputs/).", + ) + p.add_argument( + "--wan22-output-dir", + type=Path, + default=Path("prompts/wan22_i2v"), + help="Directory where generated prompt JSON files are written (default: prompts/wan22_i2v/).", + ) + p.add_argument( + "--wan22-num-images", + type=int, + default=20, + help="Number of synthetic images to generate when VBench download is unavailable (default: 20).", + ) + p.add_argument( + "--download-models", + action="store_true", + help="Download Wan 2.2 model weights before generating prompts (requires --comfyui-base-dir).", + ) + p.add_argument( + "--comfyui-base-dir", + type=Path, + default=None, + help="ComfyUI root directory used as the base for model downloads.", + ) p.add_argument("--num-requests", type=int, default=50) p.add_argument("--max-concurrency", type=int, default=8) p.add_argument("--request-rate", type=float, default=0.0, help="Requests/sec. 0 = fire immediately.") @@ -323,6 +767,8 @@ def parse_args() -> argparse.Namespace: async def async_main(args: argparse.Namespace) -> None: + if args.prompt_file is None: + raise SystemExit("error: --prompt-file is required (or use --generate-wan22-prompts to create one)") prompt_template = load_prompt_template(args.prompt_file) schedule = build_arrival_schedule( num_requests=args.num_requests, @@ -367,6 +813,16 @@ async def async_main(args: argparse.Namespace) -> None: def main() -> None: args = parse_args() + if args.generate_wan22_prompts: + generate_prompt_files( + output_dir=args.wan22_output_dir, + input_dir=args.wan22_input_dir, + num_prompts=args.num_requests, + num_images=args.wan22_num_images, + download_models=args.download_models, + comfyui_base_dir=args.comfyui_base_dir, + ) + return asyncio.run(async_main(args)) From 28bbdb00317b004598281913165c598f617c69c8 Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Mon, 27 Apr 2026 11:53:35 -0700 Subject: [PATCH 05/27] Fix vbench download --- benchmarks/benchmark_comfyui_serving.py | 46 +++++++++++-------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/benchmarks/benchmark_comfyui_serving.py b/benchmarks/benchmark_comfyui_serving.py index 6431be2bc..6eb7050ec 100644 --- a/benchmarks/benchmark_comfyui_serving.py +++ b/benchmarks/benchmark_comfyui_serving.py @@ -256,9 +256,8 @@ _WAN22_I2V_GRAPH: dict[str, Any] = { }, } -_VBENCH_I2V_JSON_URL = ( - "https://raw.githubusercontent.com/Vchitect/VBench/master/vbench2_beta_i2v/i2v-bench-info.json" -) +# Google Drive file IDs from VBench's vbench2_beta_i2v/download_data.sh +_VBENCH_ORIGIN_ZIP_GDRIVE_ID = "1qhkLCSBkzll0dkKpwlDTwLL0nxdQ4nrY" def download_wan22_models(base_dir: Path) -> None: @@ -275,41 +274,36 @@ def download_wan22_models(base_dir: Path) -> None: def _try_download_vbench_i2v(input_dir: Path) -> list[str]: """ - Attempt to fetch VBench I2V images via huggingface_hub. + Download VBench I2V origin images from Google Drive via gdown (pip install gdown). Returns image basenames placed in *input_dir*, or [] on failure. 
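+    The zip is assumed to hold the origin images (nesting is fine; the
+    recursive scan below picks up any .png/.jpg/.jpeg/.webp it extracts).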
""" try: - from huggingface_hub import snapshot_download # type: ignore + import gdown # type: ignore except ImportError: - print("[setup] huggingface_hub not available; skipping VBench download.") + print("[setup] gdown not available; skipping VBench download. Install with: pip install gdown") return [] + import zipfile + + zip_path = input_dir / "origin.zip" try: - print("[setup] downloading Vchitect/VBench_I2V dataset from HuggingFace ...") - cache_dir = input_dir / "_vbench_cache" - local = snapshot_download( - repo_id="Vchitect/VBench_I2V", - repo_type="dataset", - local_dir=str(cache_dir), - ) + if not zip_path.exists(): + print("[setup] downloading VBench I2V origin images from Google Drive ...") + gdown.download(id=_VBENCH_ORIGIN_ZIP_GDRIVE_ID, output=str(zip_path), quiet=False) + print("[setup] extracting origin.zip ...") + with zipfile.ZipFile(zip_path, "r") as zf: + zf.extractall(str(input_dir)) + zip_path.unlink() except Exception as exc: print(f"[setup] VBench I2V download failed: {exc}") + if zip_path.exists(): + zip_path.unlink() return [] image_exts = {".png", ".jpg", ".jpeg", ".webp"} - found = sorted(p for p in Path(local).rglob("*") if p.suffix.lower() in image_exts) - if not found: - return [] - - import shutil - - filenames: list[str] = [] - for src in found: - dest = input_dir / src.name - if not dest.exists(): - shutil.copy2(str(src), str(dest)) - filenames.append(src.name) - + filenames = sorted( + p.name for p in input_dir.rglob("*") if p.suffix.lower() in image_exts + ) print(f"[setup] prepared {len(filenames)} VBench I2V images in {input_dir}") return filenames From 8136fbbb4a78fa6b6409309ac21644e42b43007b Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Mon, 27 Apr 2026 12:23:24 -0700 Subject: [PATCH 06/27] Fix input --- benchmarks/benchmark_comfyui_serving.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmark_comfyui_serving.py b/benchmarks/benchmark_comfyui_serving.py index 6eb7050ec..c9dfeae49 100644 --- a/benchmarks/benchmark_comfyui_serving.py +++ b/benchmarks/benchmark_comfyui_serving.py @@ -25,10 +25,10 @@ Step 1 — Generate prompt files (downloads images, writes JSONs, then exits): --comfyui-base-dir /path/to/ComfyUI \\ --num-requests 50 - # Custom image/output dirs: + # Custom image/output dirs (input dir must be ComfyUI's input/ folder): python3 benchmarks/benchmark_comfyui_serving.py \\ --generate-wan22-prompts \\ - --wan22-input-dir /data/images \\ + --wan22-input-dir /home/ubuntu/ComfyUI/input \\ --wan22-output-dir /data/prompts/wan22 \\ --wan22-num-images 30 \\ --num-requests 50 @@ -715,8 +715,8 @@ def parse_args() -> argparse.Namespace: p.add_argument( "--wan22-input-dir", type=Path, - default=Path("inputs"), - help="Directory for benchmark input images (default: inputs/).", + default=Path("input"), + help="Directory for benchmark input images. 
Must be ComfyUI's input/ folder so LoadImage can find them (default: input/).", ) p.add_argument( "--wan22-output-dir", From 978b962300a914afcb2a76a4a2158c5c35e257d0 Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Mon, 27 Apr 2026 13:42:53 -0700 Subject: [PATCH 07/27] fix scripts --- benchmarks/benchmark_comfyui_serving.py | 348 +++++++++--------------- 1 file changed, 130 insertions(+), 218 deletions(-) diff --git a/benchmarks/benchmark_comfyui_serving.py b/benchmarks/benchmark_comfyui_serving.py index c9dfeae49..2f9ca1c6e 100644 --- a/benchmarks/benchmark_comfyui_serving.py +++ b/benchmarks/benchmark_comfyui_serving.py @@ -2,7 +2,7 @@ """ Simple serving benchmark client for ComfyUI's HTTP API. -This script is inspired by diffusion serving benchmarks and is designed to: +This script is designed to: - submit prompts to ComfyUI (/prompt or /bench/prompt), - optionally shape request arrivals (fixed rate or Poisson), - poll completion via /history/{prompt_id}, @@ -15,28 +15,26 @@ Step 1 — Generate prompt files (downloads images, writes JSONs, then exits): # Minimal: uses synthetic images, writes to prompts/wan22_i2v/ python3 benchmarks/benchmark_comfyui_serving.py \\ - --generate-wan22-prompts \\ + --generate-prompts --model wan22 --task i2v \\ --num-requests 50 # With model download (needs ComfyUI root): python3 benchmarks/benchmark_comfyui_serving.py \\ - --generate-wan22-prompts \\ - --download-models \\ - --comfyui-base-dir /path/to/ComfyUI \\ + --generate-prompts --model wan22 --task i2v \\ + --download-models --comfyui-base-dir /path/to/ComfyUI \\ --num-requests 50 - # Custom image/output dirs (input dir must be ComfyUI's input/ folder): + # Custom image/output dirs (input-dir must be ComfyUI's input/ folder): python3 benchmarks/benchmark_comfyui_serving.py \\ - --generate-wan22-prompts \\ - --wan22-input-dir /home/ubuntu/ComfyUI/input \\ - --wan22-output-dir /data/prompts/wan22 \\ - --wan22-num-images 30 \\ - --num-requests 50 + --generate-prompts --model wan22 --task i2v \\ + --input-dir /home/ubuntu/ComfyUI/input \\ + --prompts-dir /home/ubuntu/ComfyUI/benchmarks/prompts/wan22_i2v \\ + --num-images 30 --num-requests 50 Step 2 — Run the benchmark (point at any one of the generated prompt files): python3 benchmarks/benchmark_comfyui_serving.py \\ - --prompt-file prompts/wan22_i2v/wan22_i2v_prompt_0000.json \\ + --prompt-file benchmarks/prompts/wan22_i2v/wan22_i2v_prompt_0000.json \\ --num-requests 50 \\ --max-concurrency 4 \\ --host http://127.0.0.1:8188 @@ -64,10 +62,17 @@ import aiohttp # ────────────────────────────────────────────────────────────────────────────── -# Wan 2.2 I2V benchmark setup helpers +# Benchmark setup helpers # ────────────────────────────────────────────────────────────────────────────── -_WAN22_MODELS: list[tuple[str, str]] = [ +# Workflow JSON files live in benchmarks/workflows/_.json. +_WORKFLOWS_DIR = Path(__file__).parent / "workflows" + +# Placeholder in workflow JSON files that is replaced with the actual image filename. +_IMAGE_PLACEHOLDER = "__INPUT_IMAGE__" + +# Model weight downloads for wan22/i2v. +_WAN22_I2V_MODELS: list[tuple[str, str]] = [ ( "models/diffusion_models/wan2.2_i2v_low_noise_14B_fp8_scaled.safetensors", "https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/resolve/main/split_files/diffusion_models/wan2.2_i2v_low_noise_14B_fp8_scaled.safetensors", @@ -94,175 +99,46 @@ _WAN22_MODELS: list[tuple[str, str]] = [ ), ] -# Placeholder sentinel replaced by generate_prompt_file. 
-_IMAGE_PLACEHOLDER = "__INPUT_IMAGE__" - -_WAN22_I2V_GRAPH: dict[str, Any] = { - "97": { - "inputs": {"image": _IMAGE_PLACEHOLDER}, - "class_type": "LoadImage", - "_meta": {"title": "Start Frame Image"}, - }, - "108": { - "inputs": { - "filename_prefix": "video/Wan2.2_image_to_video", - "format": "auto", - "codec": "auto", - "video-preview": "", - "video": ["130:117", 0], - }, - "class_type": "SaveVideo", - "_meta": {"title": "Save Video"}, - }, - "130:105": { - "inputs": { - "clip_name": "umt5_xxl_fp8_e4m3fn_scaled.safetensors", - "type": "wan", - "device": "default", - }, - "class_type": "CLIPLoader", - "_meta": {"title": "Load CLIP"}, - }, - "130:106": { - "inputs": {"vae_name": "wan_2.1_vae.safetensors"}, - "class_type": "VAELoader", - "_meta": {"title": "Load VAE"}, - }, - "130:107": { - "inputs": { - "text": "A felt-style little eagle cashier greeting, waving, and smiling at the camera.", - "clip": ["130:105", 0], - }, - "class_type": "CLIPTextEncode", - "_meta": {"title": "CLIP Text Encode (Positive Prompt)"}, - }, - "130:109": { - "inputs": {"shift": 5.000000000000001, "model": ["130:126", 0]}, - "class_type": "ModelSamplingSD3", - "_meta": {"title": "ModelSamplingSD3"}, - }, - "130:110": { - "inputs": { - "add_noise": "enable", - "noise_seed": 636787045983965, - "steps": 4, - "cfg": 1, - "sampler_name": "euler", - "scheduler": "simple", - "start_at_step": 0, - "end_at_step": 2, - "return_with_leftover_noise": "enable", - "model": ["130:109", 0], - "positive": ["130:128", 0], - "negative": ["130:128", 1], - "latent_image": ["130:128", 2], - }, - "class_type": "KSamplerAdvanced", - "_meta": {"title": "KSampler (Advanced)"}, - }, - "130:111": { - "inputs": { - "add_noise": "disable", - "noise_seed": 0, - "steps": 4, - "cfg": 1, - "sampler_name": "euler", - "scheduler": "simple", - "start_at_step": 2, - "end_at_step": 4, - "return_with_leftover_noise": "disable", - "model": ["130:124", 0], - "positive": ["130:128", 0], - "negative": ["130:128", 1], - "latent_image": ["130:110", 0], - }, - "class_type": "KSamplerAdvanced", - "_meta": {"title": "KSampler (Advanced)"}, - }, - "130:117": { - "inputs": {"fps": 16, "images": ["130:129", 0]}, - "class_type": "CreateVideo", - "_meta": {"title": "Create Video"}, - }, - "130:122": { - "inputs": { - "unet_name": "wan2.2_i2v_high_noise_14B_fp8_scaled.safetensors", - "weight_dtype": "default", - }, - "class_type": "UNETLoader", - "_meta": {"title": "Load Diffusion Model"}, - }, - "130:123": { - "inputs": { - "unet_name": "wan2.2_i2v_low_noise_14B_fp8_scaled.safetensors", - "weight_dtype": "default", - }, - "class_type": "UNETLoader", - "_meta": {"title": "Load Diffusion Model"}, - }, - "130:124": { - "inputs": {"shift": 5.000000000000001, "model": ["130:127", 0]}, - "class_type": "ModelSamplingSD3", - "_meta": {"title": "ModelSamplingSD3"}, - }, - "130:125": { - "inputs": { - "text": ( - "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量," - "JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的," - "形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" - ), - "clip": ["130:105", 0], - }, - "class_type": "CLIPTextEncode", - "_meta": {"title": "CLIP Text Encode (Negative Prompt)"}, - }, - "130:126": { - "inputs": { - "lora_name": "wan2.2_i2v_lightx2v_4steps_lora_v1_high_noise.safetensors", - "strength_model": 1.0000000000000002, - "model": ["130:122", 0], - }, - "class_type": "LoraLoaderModelOnly", - "_meta": {"title": "Load LoRA"}, - }, - "130:127": { - "inputs": { - "lora_name": "wan2.2_i2v_lightx2v_4steps_lora_v1_low_noise.safetensors", - "strength_model": 
1.0000000000000002, - "model": ["130:123", 0], - }, - "class_type": "LoraLoaderModelOnly", - "_meta": {"title": "Load LoRA"}, - }, - "130:128": { - "inputs": { - "width": 720, - "height": 720, - "length": 81, - "batch_size": 1, - "positive": ["130:107", 0], - "negative": ["130:125", 0], - "vae": ["130:106", 0], - "start_image": ["97", 0], - }, - "class_type": "WanImageToVideo", - "_meta": {"title": "WanImageToVideo"}, - }, - "130:129": { - "inputs": {"samples": ["130:111", 0], "vae": ["130:106", 0]}, - "class_type": "VAEDecode", - "_meta": {"title": "VAE Decode"}, - }, -} - # Google Drive file IDs from VBench's vbench2_beta_i2v/download_data.sh _VBENCH_ORIGIN_ZIP_GDRIVE_ID = "1qhkLCSBkzll0dkKpwlDTwLL0nxdQ4nrY" +# Registry mapping (model, task) → benchmark configuration. +# To add a new model/task: drop a workflow JSON in benchmarks/workflows/ and +# add an entry here. +_MODEL_REGISTRY: dict[tuple[str, str], dict[str, Any]] = { + ("wan22", "i2v"): { + "workflow_file": "wan22_i2v.json", + "model_files": _WAN22_I2V_MODELS, + "image_source": "vbench_i2v", + }, +} -def download_wan22_models(base_dir: Path) -> None: - """Download Wan 2.2 I2V model files into *base_dir* using wget.""" - for rel_path, url in _WAN22_MODELS: +_VALID_MODELS = sorted({m for m, _ in _MODEL_REGISTRY}) +_VALID_TASKS = sorted({t for _, t in _MODEL_REGISTRY}) + + +def _replace_in_graph(obj: Any, placeholder: str, value: str) -> None: + """Recursively replace every occurrence of *placeholder* with *value* in-place.""" + if isinstance(obj, dict): + for k, v in obj.items(): + if v == placeholder: + obj[k] = value + else: + _replace_in_graph(v, placeholder, value) + elif isinstance(obj, list): + for i, item in enumerate(obj): + if item == placeholder: + obj[i] = value + else: + _replace_in_graph(item, placeholder, value) + + +def download_models(base_dir: Path, model: str, task: str) -> None: + """Download model weights for *model*/*task* into *base_dir* using wget.""" + key = (model, task) + if key not in _MODEL_REGISTRY: + raise ValueError(f"No model files registered for {model}/{task}") + for rel_path, url in _MODEL_REGISTRY[key]["model_files"]: dest = base_dir / rel_path if dest.exists(): print(f"[setup] already exists, skipping: {dest}") @@ -328,13 +204,17 @@ def _generate_synthetic_images(input_dir: Path, num_images: int) -> list[str]: return filenames -def prepare_input_images(input_dir: Path, num_images: int = 20) -> list[str]: +def prepare_input_images( + input_dir: Path, + num_images: int = 20, + image_source: str = "vbench_i2v", +) -> list[str]: """ Prepare benchmark input images in *input_dir*. Priority: 1. Reuse any images already present in the directory. - 2. Download Vchitect/VBench_I2V dataset via huggingface_hub. + 2. Fetch from the source specified by *image_source* (e.g. "vbench_i2v"). 3. Generate synthetic 720×720 white PNG placeholders with Pillow. Returns a list of image basenames (not full paths). 
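+
+    For example, image_source="vbench_i2v" tries the Google Drive download
+    first; any other value falls through to synthetic placeholder generation.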
@@ -349,9 +229,10 @@ def prepare_input_images(input_dir: Path, num_images: int = 20) -> list[str]: print(f"[setup] found {len(existing)} existing images in {input_dir}") return existing - filenames = _try_download_vbench_i2v(input_dir) - if filenames: - return filenames + if image_source == "vbench_i2v": + filenames = _try_download_vbench_i2v(input_dir) + if filenames: + return filenames print(f"[setup] generating {num_images} synthetic 720×720 placeholder images ...") return _generate_synthetic_images(input_dir, num_images) @@ -359,57 +240,71 @@ def prepare_input_images(input_dir: Path, num_images: int = 20) -> list[str]: def generate_prompt_file( output_path: Path, + workflow_path: Path, image_filename: str, - positive_prompt: str | None = None, ) -> None: """ - Write a single Wan 2.2 I2V ComfyUI prompt JSON to *output_path*. + Write a single ComfyUI prompt JSON to *output_path* from *workflow_path*. - *image_filename* is substituted into the LoadImage node (node "97"). - *positive_prompt* overrides the default positive text if provided. + Replaces every occurrence of the sentinel string "__INPUT_IMAGE__" in the + workflow graph with *image_filename*. """ - graph: dict[str, Any] = json.loads(json.dumps(_WAN22_I2V_GRAPH)) - graph["97"]["inputs"]["image"] = image_filename - if positive_prompt is not None: - graph["130:107"]["inputs"]["text"] = positive_prompt - + graph: dict[str, Any] = json.loads(workflow_path.read_text()) + _replace_in_graph(graph, _IMAGE_PLACEHOLDER, image_filename) output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text(json.dumps({"prompt": graph}, indent=2)) def generate_prompt_files( + model: str, + task: str, output_dir: Path, input_dir: Path, num_prompts: int = 50, num_images: int = 20, - download_models: bool = False, + download_model_weights: bool = False, comfyui_base_dir: Path | None = None, ) -> list[Path]: """ - Full Wan 2.2 I2V benchmark setup: + Full benchmark setup for a given *model*/*task*: 1. Optionally download model weights into *comfyui_base_dir*. - 2. Prepare input images in *input_dir* (VBench I2V or synthetic). + 2. Prepare input images in *input_dir*. 3. Generate *num_prompts* prompt JSON files in *output_dir*, cycling through the available images. Returns the list of generated prompt file paths. """ - if download_models: + key = (model, task) + if key not in _MODEL_REGISTRY: + available = ", ".join(f"{m}/{t}" for m, t in _MODEL_REGISTRY) + raise ValueError(f"Unknown --model {model!r} --task {task!r}. 
Available: {available}") + + cfg = _MODEL_REGISTRY[key] + + if download_model_weights: if comfyui_base_dir is None: raise ValueError("--comfyui-base-dir is required when --download-models is set") - download_wan22_models(comfyui_base_dir) + download_models(comfyui_base_dir, model, task) - image_filenames = prepare_input_images(input_dir, num_images=num_images) + image_filenames = prepare_input_images( + input_dir, + num_images=num_images, + image_source=cfg.get("image_source", "synthetic"), + ) if not image_filenames: raise RuntimeError(f"No input images available in {input_dir}") + workflow_path = _WORKFLOWS_DIR / cfg["workflow_file"] + if not workflow_path.exists(): + raise FileNotFoundError(f"Workflow file not found: {workflow_path}") + output_dir.mkdir(parents=True, exist_ok=True) generated: list[Path] = [] for i in range(num_prompts): image_name = image_filenames[i % len(image_filenames)] - prompt_path = output_dir / f"wan22_i2v_prompt_{i:04d}.json" - generate_prompt_file(prompt_path, image_name) + prompt_path = output_dir / f"{model}_{task}_prompt_{i:04d}.json" + generate_prompt_file(prompt_path, workflow_path, image_name) generated.append(prompt_path) print(f"[setup] generated {len(generated)} prompt files in {output_dir}") @@ -705,35 +600,47 @@ def parse_args() -> argparse.Namespace: "--prompt-file", type=Path, default=None, - help="Path to prompt JSON. Required unless --generate-wan22-prompts is set.", + help="Path to prompt JSON. Required unless --generate-prompts is set.", ) p.add_argument( - "--generate-wan22-prompts", + "--generate-prompts", action="store_true", - help="Generate Wan 2.2 I2V prompt files (steps: prepare images, write JSONs) then exit.", + help="Prepare input images and generate prompt JSON files, then exit.", ) p.add_argument( - "--wan22-input-dir", + "--model", + choices=_VALID_MODELS, + default=None, + help=f"Model to benchmark. Required with --generate-prompts. Choices: {_VALID_MODELS}.", + ) + p.add_argument( + "--task", + choices=_VALID_TASKS, + default=None, + help=f"Task type. Required with --generate-prompts. Choices: {_VALID_TASKS}.", + ) + p.add_argument( + "--input-dir", type=Path, default=Path("input"), - help="Directory for benchmark input images. Must be ComfyUI's input/ folder so LoadImage can find them (default: input/).", + help="ComfyUI input image directory (default: input/). 
LoadImage resolves files from this folder.", ) p.add_argument( - "--wan22-output-dir", + "--prompts-dir", type=Path, - default=Path("prompts/wan22_i2v"), - help="Directory where generated prompt JSON files are written (default: prompts/wan22_i2v/).", + default=None, + help="Directory where generated prompt JSON files are written (default: benchmarks/prompts/_/).", ) p.add_argument( - "--wan22-num-images", + "--num-images", type=int, default=20, - help="Number of synthetic images to generate when VBench download is unavailable (default: 20).", + help="Number of synthetic images to generate when dataset download is unavailable (default: 20).", ) p.add_argument( "--download-models", action="store_true", - help="Download Wan 2.2 model weights before generating prompts (requires --comfyui-base-dir).", + help="Download model weights before generating prompts (requires --comfyui-base-dir).", ) p.add_argument( "--comfyui-base-dir", @@ -762,7 +669,7 @@ def parse_args() -> argparse.Namespace: async def async_main(args: argparse.Namespace) -> None: if args.prompt_file is None: - raise SystemExit("error: --prompt-file is required (or use --generate-wan22-prompts to create one)") + raise SystemExit("error: --prompt-file is required (or use --generate-prompts to create one)") prompt_template = load_prompt_template(args.prompt_file) schedule = build_arrival_schedule( num_requests=args.num_requests, @@ -807,13 +714,18 @@ async def async_main(args: argparse.Namespace) -> None: def main() -> None: args = parse_args() - if args.generate_wan22_prompts: + if args.generate_prompts: + if not args.model or not args.task: + raise SystemExit("error: --model and --task are required with --generate-prompts") + prompts_dir = args.prompts_dir or Path("benchmarks/prompts") / f"{args.model}_{args.task}" generate_prompt_files( - output_dir=args.wan22_output_dir, - input_dir=args.wan22_input_dir, + model=args.model, + task=args.task, + output_dir=prompts_dir, + input_dir=args.input_dir, num_prompts=args.num_requests, - num_images=args.wan22_num_images, - download_models=args.download_models, + num_images=args.num_images, + download_model_weights=args.download_models, comfyui_base_dir=args.comfyui_base_dir, ) return From 52da6933b45a231af439c81828449aa3149bd7e1 Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Mon, 27 Apr 2026 14:01:06 -0700 Subject: [PATCH 08/27] Add workflow --- benchmarks/workflows/wan22_i2v.json | 154 ++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 benchmarks/workflows/wan22_i2v.json diff --git a/benchmarks/workflows/wan22_i2v.json b/benchmarks/workflows/wan22_i2v.json new file mode 100644 index 000000000..85a40956f --- /dev/null +++ b/benchmarks/workflows/wan22_i2v.json @@ -0,0 +1,154 @@ +{ + "97": { + "inputs": {"image": "__INPUT_IMAGE__"}, + "class_type": "LoadImage", + "_meta": {"title": "Start Frame Image"} + }, + "108": { + "inputs": { + "filename_prefix": "video/Wan2.2_image_to_video", + "format": "auto", + "codec": "auto", + "video-preview": "", + "video": ["130:117", 0] + }, + "class_type": "SaveVideo", + "_meta": {"title": "Save Video"} + }, + "130:105": { + "inputs": { + "clip_name": "umt5_xxl_fp8_e4m3fn_scaled.safetensors", + "type": "wan", + "device": "default" + }, + "class_type": "CLIPLoader", + "_meta": {"title": "Load CLIP"} + }, + "130:106": { + "inputs": {"vae_name": "wan_2.1_vae.safetensors"}, + "class_type": "VAELoader", + "_meta": {"title": "Load VAE"} + }, + "130:107": { + "inputs": { + "text": "A 
felt-style little eagle cashier greeting, waving, and smiling at the camera.", + "clip": ["130:105", 0] + }, + "class_type": "CLIPTextEncode", + "_meta": {"title": "CLIP Text Encode (Positive Prompt)"} + }, + "130:109": { + "inputs": {"shift": 5.000000000000001, "model": ["130:126", 0]}, + "class_type": "ModelSamplingSD3", + "_meta": {"title": "ModelSamplingSD3"} + }, + "130:110": { + "inputs": { + "add_noise": "enable", + "noise_seed": 636787045983965, + "steps": 4, + "cfg": 1, + "sampler_name": "euler", + "scheduler": "simple", + "start_at_step": 0, + "end_at_step": 2, + "return_with_leftover_noise": "enable", + "model": ["130:109", 0], + "positive": ["130:128", 0], + "negative": ["130:128", 1], + "latent_image": ["130:128", 2] + }, + "class_type": "KSamplerAdvanced", + "_meta": {"title": "KSampler (Advanced)"} + }, + "130:111": { + "inputs": { + "add_noise": "disable", + "noise_seed": 0, + "steps": 4, + "cfg": 1, + "sampler_name": "euler", + "scheduler": "simple", + "start_at_step": 2, + "end_at_step": 4, + "return_with_leftover_noise": "disable", + "model": ["130:124", 0], + "positive": ["130:128", 0], + "negative": ["130:128", 1], + "latent_image": ["130:110", 0] + }, + "class_type": "KSamplerAdvanced", + "_meta": {"title": "KSampler (Advanced)"} + }, + "130:117": { + "inputs": {"fps": 16, "images": ["130:129", 0]}, + "class_type": "CreateVideo", + "_meta": {"title": "Create Video"} + }, + "130:122": { + "inputs": { + "unet_name": "wan2.2_i2v_high_noise_14B_fp8_scaled.safetensors", + "weight_dtype": "default" + }, + "class_type": "UNETLoader", + "_meta": {"title": "Load Diffusion Model"} + }, + "130:123": { + "inputs": { + "unet_name": "wan2.2_i2v_low_noise_14B_fp8_scaled.safetensors", + "weight_dtype": "default" + }, + "class_type": "UNETLoader", + "_meta": {"title": "Load Diffusion Model"} + }, + "130:124": { + "inputs": {"shift": 5.000000000000001, "model": ["130:127", 0]}, + "class_type": "ModelSamplingSD3", + "_meta": {"title": "ModelSamplingSD3"} + }, + "130:125": { + "inputs": { + "text": "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走", + "clip": ["130:105", 0] + }, + "class_type": "CLIPTextEncode", + "_meta": {"title": "CLIP Text Encode (Negative Prompt)"} + }, + "130:126": { + "inputs": { + "lora_name": "wan2.2_i2v_lightx2v_4steps_lora_v1_high_noise.safetensors", + "strength_model": 1.0000000000000002, + "model": ["130:122", 0] + }, + "class_type": "LoraLoaderModelOnly", + "_meta": {"title": "Load LoRA"} + }, + "130:127": { + "inputs": { + "lora_name": "wan2.2_i2v_lightx2v_4steps_lora_v1_low_noise.safetensors", + "strength_model": 1.0000000000000002, + "model": ["130:123", 0] + }, + "class_type": "LoraLoaderModelOnly", + "_meta": {"title": "Load LoRA"} + }, + "130:128": { + "inputs": { + "width": 720, + "height": 720, + "length": 81, + "batch_size": 1, + "positive": ["130:107", 0], + "negative": ["130:125", 0], + "vae": ["130:106", 0], + "start_image": ["97", 0] + }, + "class_type": "WanImageToVideo", + "_meta": {"title": "WanImageToVideo"} + }, + "130:129": { + "inputs": {"samples": ["130:111", 0], "vae": ["130:106", 0]}, + "class_type": "VAEDecode", + "_meta": {"title": "VAE Decode"} + } +} From ff5e379cc27852965f41ea817cb61f326aada5f5 Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Mon, 27 Apr 2026 14:13:01 -0700 Subject: [PATCH 09/27] convert these two steps commands into one command: 1. 
check if downloading image or model is already there, if it is, skip. 2. remove prompt-file arg, when generating a new request, roundrobin the generated prompts --- benchmarks/benchmark_comfyui_serving.py | 108 +++++++++--------------- 1 file changed, 41 insertions(+), 67 deletions(-) diff --git a/benchmarks/benchmark_comfyui_serving.py b/benchmarks/benchmark_comfyui_serving.py index 2f9ca1c6e..1855912da 100644 --- a/benchmarks/benchmark_comfyui_serving.py +++ b/benchmarks/benchmark_comfyui_serving.py @@ -11,35 +11,27 @@ This script is designed to: Usage — Wan 2.2 I2V benchmark ============================== -Step 1 — Generate prompt files (downloads images, writes JSONs, then exits): +Images and prompt files are prepared automatically on first run and reused on +subsequent runs. Just specify --model and --task: - # Minimal: uses synthetic images, writes to prompts/wan22_i2v/ + # Minimal (synthetic images, default paths): python3 benchmarks/benchmark_comfyui_serving.py \\ - --generate-prompts --model wan22 --task i2v \\ - --num-requests 50 + --model wan22 --task i2v \\ + --num-requests 50 --max-concurrency 4 # With model download (needs ComfyUI root): python3 benchmarks/benchmark_comfyui_serving.py \\ - --generate-prompts --model wan22 --task i2v \\ + --model wan22 --task i2v \\ --download-models --comfyui-base-dir /path/to/ComfyUI \\ - --num-requests 50 + --num-requests 50 --max-concurrency 4 - # Custom image/output dirs (input-dir must be ComfyUI's input/ folder): + # Custom paths: python3 benchmarks/benchmark_comfyui_serving.py \\ - --generate-prompts --model wan22 --task i2v \\ + --model wan22 --task i2v \\ --input-dir /home/ubuntu/ComfyUI/input \\ --prompts-dir /home/ubuntu/ComfyUI/benchmarks/prompts/wan22_i2v \\ - --num-images 30 --num-requests 50 - -Step 2 — Run the benchmark (point at any one of the generated prompt files): - - python3 benchmarks/benchmark_comfyui_serving.py \\ - --prompt-file benchmarks/prompts/wan22_i2v/wan22_i2v_prompt_0000.json \\ - --num-requests 50 \\ - --max-concurrency 4 \\ + --num-images 30 --num-requests 50 --max-concurrency 4 \\ --host http://127.0.0.1:8188 - -The setup step also prints the exact run command at the end, so you can copy it directly. """ from __future__ import annotations @@ -260,7 +252,6 @@ def generate_prompt_files( task: str, output_dir: Path, input_dir: Path, - num_prompts: int = 50, num_images: int = 20, download_model_weights: bool = False, comfyui_base_dir: Path | None = None, @@ -269,11 +260,11 @@ def generate_prompt_files( Full benchmark setup for a given *model*/*task*: 1. Optionally download model weights into *comfyui_base_dir*. - 2. Prepare input images in *input_dir*. - 3. Generate *num_prompts* prompt JSON files in *output_dir*, cycling - through the available images. + 2. Prepare input images in *input_dir* (skipped if images already exist). + 3. Generate one prompt JSON per input image in *output_dir* + (skipped if prompt files already exist). - Returns the list of generated prompt file paths. + Returns the list of prompt file paths. """ key = (model, task) if key not in _MODEL_REGISTRY: @@ -287,6 +278,12 @@ def generate_prompt_files( raise ValueError("--comfyui-base-dir is required when --download-models is set") download_models(comfyui_base_dir, model, task) + # Skip prompt generation if files already exist. 
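+    # (Assumption: existing files still match the current workflow JSON;
+    # delete the prompts directory to force regeneration after editing it.)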
+ existing = sorted(output_dir.glob(f"{model}_{task}_prompt_*.json")) + if existing: + print(f"[setup] found {len(existing)} existing prompt files in {output_dir}, skipping generation") + return existing + image_filenames = prepare_input_images( input_dir, num_images=num_images, @@ -301,19 +298,12 @@ def generate_prompt_files( output_dir.mkdir(parents=True, exist_ok=True) generated: list[Path] = [] - for i in range(num_prompts): - image_name = image_filenames[i % len(image_filenames)] + for i, image_name in enumerate(image_filenames): prompt_path = output_dir / f"{model}_{task}_prompt_{i:04d}.json" generate_prompt_file(prompt_path, workflow_path, image_name) generated.append(prompt_path) print(f"[setup] generated {len(generated)} prompt files in {output_dir}") - print(f"[setup] example run:") - print( - f" python benchmark_comfyui_serving.py" - f" --prompt-file {generated[0]}" - f" --num-requests {num_prompts}" - ) return generated @@ -489,7 +479,7 @@ async def run_request( semaphore: asyncio.Semaphore, session: aiohttp.ClientSession, args: argparse.Namespace, - prompt_wrapper_template: dict[str, Any], + prompt_templates: list[dict[str, Any]], ) -> RequestResult: await asyncio.sleep(max(0.0, (start_time + scheduled_offset_s) - time.perf_counter())) queued_at = time.perf_counter() @@ -498,7 +488,7 @@ async def run_request( started_at = time.perf_counter() prompt_id = None try: - payload = json.loads(json.dumps(prompt_wrapper_template)) + payload = json.loads(json.dumps(prompt_templates[idx % len(prompt_templates)])) payload.setdefault("extra_data", {}) payload["client_id"] = args.client_id @@ -596,28 +586,17 @@ def parse_args() -> argparse.Namespace: choices=("/prompt", "/bench/prompt"), help="Submission endpoint.", ) - p.add_argument( - "--prompt-file", - type=Path, - default=None, - help="Path to prompt JSON. Required unless --generate-prompts is set.", - ) - p.add_argument( - "--generate-prompts", - action="store_true", - help="Prepare input images and generate prompt JSON files, then exit.", - ) p.add_argument( "--model", choices=_VALID_MODELS, - default=None, - help=f"Model to benchmark. Required with --generate-prompts. Choices: {_VALID_MODELS}.", + required=True, + help=f"Model to benchmark. Choices: {_VALID_MODELS}.", ) p.add_argument( "--task", choices=_VALID_TASKS, - default=None, - help=f"Task type. Required with --generate-prompts. Choices: {_VALID_TASKS}.", + required=True, + help=f"Task type. 
Choices: {_VALID_TASKS}.", ) p.add_argument( "--input-dir", @@ -668,9 +647,19 @@ def parse_args() -> argparse.Namespace: async def async_main(args: argparse.Namespace) -> None: - if args.prompt_file is None: - raise SystemExit("error: --prompt-file is required (or use --generate-prompts to create one)") - prompt_template = load_prompt_template(args.prompt_file) + prompts_dir = args.prompts_dir or Path("benchmarks/prompts") / f"{args.model}_{args.task}" + prompt_paths = generate_prompt_files( + model=args.model, + task=args.task, + output_dir=prompts_dir, + input_dir=args.input_dir, + num_images=args.num_images, + download_model_weights=args.download_models, + comfyui_base_dir=args.comfyui_base_dir, + ) + prompt_templates = [load_prompt_template(p) for p in prompt_paths] + print(f"[bench] loaded {len(prompt_templates)} prompt templates, round-robining over {args.num_requests} requests") + schedule = build_arrival_schedule( num_requests=args.num_requests, request_rate=args.request_rate, @@ -691,7 +680,7 @@ async def async_main(args: argparse.Namespace) -> None: semaphore=semaphore, session=session, args=args, - prompt_wrapper_template=prompt_template, + prompt_templates=prompt_templates, ) ) for i in range(args.num_requests) @@ -714,21 +703,6 @@ async def async_main(args: argparse.Namespace) -> None: def main() -> None: args = parse_args() - if args.generate_prompts: - if not args.model or not args.task: - raise SystemExit("error: --model and --task are required with --generate-prompts") - prompts_dir = args.prompts_dir or Path("benchmarks/prompts") / f"{args.model}_{args.task}" - generate_prompt_files( - model=args.model, - task=args.task, - output_dir=prompts_dir, - input_dir=args.input_dir, - num_prompts=args.num_requests, - num_images=args.num_images, - download_model_weights=args.download_models, - comfyui_base_dir=args.comfyui_base_dir, - ) - return asyncio.run(async_main(args)) From d407d823500b327d56a8d59c499ae4fb5e9166ab Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Mon, 27 Apr 2026 14:20:32 -0700 Subject: [PATCH 10/27] Fix comments --- benchmarks/benchmark_comfyui_serving.py | 35 +++++++++---------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/benchmarks/benchmark_comfyui_serving.py b/benchmarks/benchmark_comfyui_serving.py index 1855912da..2629833e2 100644 --- a/benchmarks/benchmark_comfyui_serving.py +++ b/benchmarks/benchmark_comfyui_serving.py @@ -1,36 +1,27 @@ #!/usr/bin/env python3 """ -Simple serving benchmark client for ComfyUI's HTTP API. +ComfyUI model serving benchmark. -This script is designed to: - - submit prompts to ComfyUI (/prompt or /bench/prompt), - - optionally shape request arrivals (fixed rate or Poisson), - - poll completion via /history/{prompt_id}, - - report latency/throughput/error metrics. +Submits prompts concurrently to a running ComfyUI server and reports +latency/throughput metrics. Input images and prompt files are prepared +automatically (and cached for reuse) before the benchmark starts. -Usage — Wan 2.2 I2V benchmark -============================== +Supported models / tasks +------------------------ + wan22 / i2v — Wan 2.2 Image-to-Video (LightX2V 4-step, 720×720, 81 frames) -Images and prompt files are prepared automatically on first run and reused on -subsequent runs. 
Just specify --model and --task: - - # Minimal (synthetic images, default paths): +Usage +----- python3 benchmarks/benchmark_comfyui_serving.py \\ --model wan22 --task i2v \\ - --num-requests 50 --max-concurrency 4 + --num-requests 50 --max-concurrency 4 \\ + --host http://127.0.0.1:8188 - # With model download (needs ComfyUI root): + # Also download model weights (run from ComfyUI root): python3 benchmarks/benchmark_comfyui_serving.py \\ --model wan22 --task i2v \\ --download-models --comfyui-base-dir /path/to/ComfyUI \\ - --num-requests 50 --max-concurrency 4 - - # Custom paths: - python3 benchmarks/benchmark_comfyui_serving.py \\ - --model wan22 --task i2v \\ - --input-dir /home/ubuntu/ComfyUI/input \\ - --prompts-dir /home/ubuntu/ComfyUI/benchmarks/prompts/wan22_i2v \\ - --num-images 30 --num-requests 50 --max-concurrency 4 \\ + --num-requests 50 --max-concurrency 4 \\ --host http://127.0.0.1:8188 """ From 125ed0be4b97df39bfcdc7c267baa9ad31f82a27 Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Mon, 27 Apr 2026 14:22:27 -0700 Subject: [PATCH 11/27] Fix comments --- benchmarks/benchmark_comfyui_serving.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/benchmarks/benchmark_comfyui_serving.py b/benchmarks/benchmark_comfyui_serving.py index 2629833e2..2236230bd 100644 --- a/benchmarks/benchmark_comfyui_serving.py +++ b/benchmarks/benchmark_comfyui_serving.py @@ -6,6 +6,15 @@ Submits prompts concurrently to a running ComfyUI server and reports latency/throughput metrics. Input images and prompt files are prepared automatically (and cached for reuse) before the benchmark starts. +On first run the script will: + 1. Download model weights (if --download-models is set). + 2. Download the VBench I2V image dataset (requires: pip install gdown), + or generate synthetic placeholder images as a fallback. + 3. Write one prompt JSON per input image under benchmarks/prompts/_/. + +On subsequent runs all three steps are skipped if the files already exist. +Requests are distributed across prompt files in round-robin order. + Supported models / tasks ------------------------ wan22 / i2v — Wan 2.2 Image-to-Video (LightX2V 4-step, 720×720, 81 frames) From 08411a1d65a6e42db5611984089c71d8c654c83e Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Mon, 27 Apr 2026 14:28:46 -0700 Subject: [PATCH 12/27] Fix input dir --- benchmarks/benchmark_comfyui_serving.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/benchmarks/benchmark_comfyui_serving.py b/benchmarks/benchmark_comfyui_serving.py index 2236230bd..91c81bcf3 100644 --- a/benchmarks/benchmark_comfyui_serving.py +++ b/benchmarks/benchmark_comfyui_serving.py @@ -598,12 +598,6 @@ def parse_args() -> argparse.Namespace: required=True, help=f"Task type. Choices: {_VALID_TASKS}.", ) - p.add_argument( - "--input-dir", - type=Path, - default=Path("input"), - help="ComfyUI input image directory (default: input/). 
LoadImage resolves files from this folder.", - ) p.add_argument( "--prompts-dir", type=Path, @@ -652,7 +646,7 @@ async def async_main(args: argparse.Namespace) -> None: model=args.model, task=args.task, output_dir=prompts_dir, - input_dir=args.input_dir, + input_dir=Path("input"), num_images=args.num_images, download_model_weights=args.download_models, comfyui_base_dir=args.comfyui_base_dir, From 09f03107c2740250181f66ca329e37ecce887c44 Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Mon, 27 Apr 2026 14:34:31 -0700 Subject: [PATCH 13/27] remove checking existing png under input since example.png is always under input folder --- benchmarks/benchmark_comfyui_serving.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/benchmarks/benchmark_comfyui_serving.py b/benchmarks/benchmark_comfyui_serving.py index 91c81bcf3..119527cd1 100644 --- a/benchmarks/benchmark_comfyui_serving.py +++ b/benchmarks/benchmark_comfyui_serving.py @@ -212,14 +212,6 @@ def prepare_input_images( Returns a list of image basenames (not full paths). """ input_dir.mkdir(parents=True, exist_ok=True) - image_exts = {".png", ".jpg", ".jpeg", ".webp"} - - existing = sorted( - p.name for p in input_dir.iterdir() if p.suffix.lower() in image_exts - ) - if existing: - print(f"[setup] found {len(existing)} existing images in {input_dir}") - return existing if image_source == "vbench_i2v": filenames = _try_download_vbench_i2v(input_dir) From 512deb3cd6d6e914d7deb08ee493f35fd1788165 Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Mon, 27 Apr 2026 15:08:49 -0700 Subject: [PATCH 14/27] Fix returned vbench image filenames --- benchmarks/benchmark_comfyui_serving.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_comfyui_serving.py b/benchmarks/benchmark_comfyui_serving.py index 119527cd1..418371ffe 100644 --- a/benchmarks/benchmark_comfyui_serving.py +++ b/benchmarks/benchmark_comfyui_serving.py @@ -170,7 +170,9 @@ def _try_download_vbench_i2v(input_dir: Path) -> list[str]: image_exts = {".png", ".jpg", ".jpeg", ".webp"} filenames = sorted( - p.name for p in input_dir.rglob("*") if p.suffix.lower() in image_exts + p.relative_to(input_dir).as_posix() + for p in input_dir.rglob("*") + if p.suffix.lower() in image_exts ) print(f"[setup] prepared {len(filenames)} VBench I2V images in {input_dir}") return filenames From ca56e224a035d1b4934b9507a31642b201bce201 Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Mon, 27 Apr 2026 16:50:40 -0700 Subject: [PATCH 15/27] Moving collecting summary to benchmark_comfyui_serving --- benchmarks/benchmark_comfyui_serving.py | 23 ++------- main.py | 22 ++++----- server.py | 66 ------------------------- 3 files changed, 16 insertions(+), 95 deletions(-) diff --git a/benchmarks/benchmark_comfyui_serving.py b/benchmarks/benchmark_comfyui_serving.py index 418371ffe..2859d2e96 100644 --- a/benchmarks/benchmark_comfyui_serving.py +++ b/benchmarks/benchmark_comfyui_serving.py @@ -396,7 +396,7 @@ async def wait_for_prompt_done( timeout_s: float, ) -> tuple[float | None, float | None]: """ - Returns (queue_wait_ms, execution_ms) when available from history status messages. + Returns (queue_wait_ms, execution_ms) from history_item["benchmark"] written by the server. Falls back to (None, None) if unavailable. 
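+
+    Expected payload, as written by prompt_worker in main.py:
+        history_item["benchmark"] == {"execution_ms": float, "queue_wait_ms": float}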
""" deadline = time.perf_counter() + timeout_s @@ -419,26 +419,13 @@ async def wait_for_prompt_done( continue status = history_item.get("status", {}) - status_str = status.get("status_str") - messages = status.get("messages", []) - if status_str not in ("success", "error"): + if status.get("status_str") not in ("success", "error"): await asyncio.sleep(poll_interval_s) continue - queue_wait_ms = None - execution_ms = None - try: - timestamp_map: dict[str, int] = {} - for event, msg in messages: - if isinstance(msg, dict) and "timestamp" in msg: - timestamp_map[event] = int(msg["timestamp"]) - start_ts = timestamp_map.get("execution_start") - end_ts = timestamp_map.get("execution_success") or timestamp_map.get("execution_error") - if start_ts is not None and end_ts is not None: - execution_ms = max(0.0, end_ts - start_ts) - except Exception: - execution_ms = None - + benchmark = history_item.get("benchmark", {}) + queue_wait_ms = benchmark.get("queue_wait_ms") + execution_ms = benchmark.get("execution_ms") return queue_wait_ms, execution_ms await asyncio.sleep(poll_interval_s) diff --git a/main.py b/main.py index 5013bac42..77b0058a5 100644 --- a/main.py +++ b/main.py @@ -293,7 +293,6 @@ def prompt_worker(q, server_instance): gc_collect_interval = 10.0 while True: - benchmark_mode = args.benchmark_server_only timeout = 1000.0 if need_gc: timeout = max(gc_collect_interval - (current_time - last_gc_collect), 0.0) @@ -310,18 +309,28 @@ def prompt_worker(q, server_instance): extra_data = item[3].copy() for k in sensitive: extra_data[k] = sensitive[k] - benchmark_mode = args.benchmark_server_only or extra_data.get("benchmark_server_only", False) + benchmark_mode = args.benchmark_server_only if not benchmark_mode: asset_seeder.pause() e.execute(item[2], prompt_id, extra_data, item[4]) + execution_time_s = time.perf_counter() - execution_start_time need_gc = True + queue_wait_ms = 0.0 + created_at = extra_data.get("create_time") + if isinstance(created_at, int): + queue_wait_ms = max(0.0, execution_start_wall_ms - created_at) + remove_sensitive = lambda prompt: prompt[:5] + prompt[6:] history_result = e.history_result if benchmark_mode: history_result = {"outputs": {}, "meta": {}} + history_result["benchmark"] = { + "execution_ms": execution_time_s * 1000.0, + "queue_wait_ms": queue_wait_ms, + } q.task_done(item_id, history_result, @@ -333,7 +342,6 @@ def prompt_worker(q, server_instance): server_instance.send_sync("executing", {"node": None, "prompt_id": prompt_id}, server_instance.client_id) current_time = time.perf_counter() - execution_time_s = current_time - execution_start_time # Log Time in a more readable way after 10 minutes if execution_time_s > 600: @@ -342,14 +350,6 @@ def prompt_worker(q, server_instance): else: logging.info("Prompt executed in {:.2f} seconds".format(execution_time_s)) - queue_wait_ms = 0.0 - created_at = extra_data.get("create_time") - if isinstance(created_at, int): - queue_wait_ms = max(0.0, execution_start_wall_ms - created_at) - - if benchmark_mode: - server_instance.record_benchmark_result(prompt_id, e.success, execution_time_s * 1000.0, queue_wait_ms) - if not benchmark_mode and not asset_seeder.is_disabled(): paths = _collect_output_absolute_paths(e.history_result) register_output_files(paths, job_id=prompt_id) diff --git a/server.py b/server.py index 5db448b7f..ff15904f6 100644 --- a/server.py +++ b/server.py @@ -16,7 +16,6 @@ import struct import ssl import socket import ipaddress -import threading from PIL import Image, ImageOps from PIL.PngImagePlugin 
import PngInfo from io import BytesIO @@ -253,17 +252,6 @@ class PromptServer(): self.client_id = None self.on_prompt_handlers = [] - self._benchmark_lock = threading.Lock() - self._benchmark_stats = { - "requests_total": 0, - "requests_success": 0, - "requests_error": 0, - "latency_ms_total": 0.0, - "latency_ms_max": 0.0, - "queue_wait_ms_total": 0.0, - "queue_wait_ms_max": 0.0, - "last_prompt_id": None, - } @routes.get('/ws') async def websocket_handler(request): @@ -924,16 +912,6 @@ class PromptServer(): queue_info['queue_pending'] = _remove_sensitive_from_queue(current_queue[1]) return web.json_response(queue_info) - @routes.get("/bench/stats") - async def get_bench_stats(request): - stats = self.get_benchmark_stats() - return web.json_response(stats) - - @routes.post("/bench/reset") - async def reset_bench_stats(request): - self.reset_benchmark_stats() - return web.json_response({"status": "ok"}) - async def enqueue_prompt(json_data): if "number" in json_data: number = float(json_data['number']) @@ -984,15 +962,6 @@ class PromptServer(): } return web.json_response({"error": error, "node_errors": {}}, status=400) - @routes.post("/bench/prompt") - async def post_bench_prompt(request): - json_data = await request.json() - json_data = self.trigger_on_prompt(json_data) - extra_data = json_data.setdefault("extra_data", {}) - extra_data["benchmark_server_only"] = True - extra_data.setdefault("preview_method", "none") - return await enqueue_prompt(json_data) - @routes.post("/prompt") async def post_prompt(request): logging.info("got prompt") @@ -1144,41 +1113,6 @@ class PromptServer(): prompt_info['exec_info'] = exec_info return prompt_info - def reset_benchmark_stats(self): - with self._benchmark_lock: - self._benchmark_stats = { - "requests_total": 0, - "requests_success": 0, - "requests_error": 0, - "latency_ms_total": 0.0, - "latency_ms_max": 0.0, - "queue_wait_ms_total": 0.0, - "queue_wait_ms_max": 0.0, - "last_prompt_id": None, - } - - def record_benchmark_result(self, prompt_id, success, latency_ms, queue_wait_ms=0.0): - with self._benchmark_lock: - self._benchmark_stats["requests_total"] += 1 - if success: - self._benchmark_stats["requests_success"] += 1 - else: - self._benchmark_stats["requests_error"] += 1 - self._benchmark_stats["latency_ms_total"] += max(0.0, latency_ms) - self._benchmark_stats["queue_wait_ms_total"] += max(0.0, queue_wait_ms) - self._benchmark_stats["latency_ms_max"] = max(self._benchmark_stats["latency_ms_max"], max(0.0, latency_ms)) - self._benchmark_stats["queue_wait_ms_max"] = max(self._benchmark_stats["queue_wait_ms_max"], max(0.0, queue_wait_ms)) - self._benchmark_stats["last_prompt_id"] = prompt_id - - def get_benchmark_stats(self): - with self._benchmark_lock: - stats = dict(self._benchmark_stats) - - total = stats["requests_total"] - stats["latency_ms_avg"] = (stats["latency_ms_total"] / total) if total > 0 else 0.0 - stats["queue_wait_ms_avg"] = (stats["queue_wait_ms_total"] / total) if total > 0 else 0.0 - return stats - async def send(self, event, data, sid=None): if event == BinaryEventTypes.UNENCODED_PREVIEW_IMAGE: await self.send_image(data, sid=sid) From 059b346966c6905cf3f59c8a606eca41086ded6f Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Mon, 27 Apr 2026 16:57:16 -0700 Subject: [PATCH 16/27] fix server.py --- server.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/server.py b/server.py index ff15904f6..881da8e66 100644 --- a/server.py +++ b/server.py @@ -912,7 
+912,12 @@ class PromptServer(): queue_info['queue_pending'] = _remove_sensitive_from_queue(current_queue[1]) return web.json_response(queue_info) - async def enqueue_prompt(json_data): + @routes.post("/prompt") + async def post_prompt(request): + logging.info("got prompt") + json_data = await request.json() + json_data = self.trigger_on_prompt(json_data) + if "number" in json_data: number = float(json_data['number']) else: @@ -962,13 +967,6 @@ class PromptServer(): } return web.json_response({"error": error, "node_errors": {}}, status=400) - @routes.post("/prompt") - async def post_prompt(request): - logging.info("got prompt") - json_data = await request.json() - json_data = self.trigger_on_prompt(json_data) - return await enqueue_prompt(json_data) - @routes.post("/queue") async def post_queue(request): json_data = await request.json() From 69f6272edcf0aafc34f04f9e84dac9e63ef88ab7 Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Mon, 27 Apr 2026 21:39:57 -0700 Subject: [PATCH 17/27] Add benchmark for each node. --- benchmarks/benchmark_comfyui_serving.py | 32 +++++++++++++++++++------ execution.py | 8 +++++++ main.py | 14 +++++++---- 3 files changed, 42 insertions(+), 12 deletions(-) diff --git a/benchmarks/benchmark_comfyui_serving.py b/benchmarks/benchmark_comfyui_serving.py index 2859d2e96..cccc85df7 100644 --- a/benchmarks/benchmark_comfyui_serving.py +++ b/benchmarks/benchmark_comfyui_serving.py @@ -316,6 +316,7 @@ class RequestResult: end_to_end_s: float queue_wait_ms: float | None execution_ms: float | None + node_timing_ms: dict[str, dict] | None def percentile(values: list[float], pct: float) -> float: @@ -394,10 +395,10 @@ async def wait_for_prompt_done( prompt_id: str, poll_interval_s: float, timeout_s: float, -) -> tuple[float | None, float | None]: +) -> tuple[float | None, float | None, dict | None]: """ - Returns (queue_wait_ms, execution_ms) from history_item["benchmark"] written by the server. - Falls back to (None, None) if unavailable. + Returns (queue_wait_ms, execution_ms, node_timing_ms) from history_item["benchmark"]. + Falls back to (None, None, None) if unavailable. 
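+
+    node_timing_ms maps node_id -> {"class_type": str, "execution_ms": float},
+    as recorded in PromptExecutor.node_timing_ms (see execution.py).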
""" deadline = time.perf_counter() + timeout_s history_url = f"{base_url}/history/{prompt_id}" @@ -424,9 +425,11 @@ async def wait_for_prompt_done( continue benchmark = history_item.get("benchmark", {}) - queue_wait_ms = benchmark.get("queue_wait_ms") - execution_ms = benchmark.get("execution_ms") - return queue_wait_ms, execution_ms + return ( + benchmark.get("queue_wait_ms"), + benchmark.get("execution_ms"), + benchmark.get("nodes"), + ) await asyncio.sleep(poll_interval_s) @@ -484,7 +487,7 @@ async def run_request( timeout_s=args.request_timeout_s, ) - queue_wait_ms, execution_ms = await wait_for_prompt_done( + queue_wait_ms, execution_ms, node_timing_ms = await wait_for_prompt_done( session=session, base_url=args.host, prompt_id=prompt_id, @@ -503,6 +506,7 @@ async def run_request( end_to_end_s=finished_at - queued_at, queue_wait_ms=queue_wait_ms, execution_ms=execution_ms, + node_timing_ms=node_timing_ms, ) except Exception as exc: finished_at = time.perf_counter() @@ -517,6 +521,7 @@ async def run_request( end_to_end_s=finished_at - queued_at, queue_wait_ms=None, execution_ms=None, + node_timing_ms=None, ) @@ -551,6 +556,19 @@ def print_summary(results: list[RequestResult], wall_s: float) -> None: print(f"execution_mean_ms: {statistics.mean(exec_ms):.2f}") print(f"execution_p95_ms: {percentile(exec_ms, 95):.2f}") + # Per-node timing: aggregate execution_ms across all successful results. + node_totals: dict[str, list[float]] = {} + for r in success: + if not r.node_timing_ms: + continue + for node_id, info in r.node_timing_ms.items(): + key = f"{info.get('class_type', 'unknown')} ({node_id})" + node_totals.setdefault(key, []).append(info.get("execution_ms", 0.0)) + if node_totals: + print("\n--- Per-node execution time (mean ms across successful requests) ---") + for key, times in sorted(node_totals.items(), key=lambda x: -statistics.mean(x[1])): + print(f" {key}: mean={statistics.mean(times):.1f} p95={percentile(times, 95):.1f} n={len(times)}") + if fail: print("\nSample failures:") for r in fail[:5]: diff --git a/execution.py b/execution.py index e15eb4bda..777ecda77 100644 --- a/execution.py +++ b/execution.py @@ -721,6 +721,7 @@ class PromptExecutor: self.server.client_id = None self.status_messages = [] + self.node_timing_ms: dict[str, dict] = {} self.add_message("execution_start", { "prompt_id": prompt_id}, broadcast=False) self._notify_prompt_lifecycle("start", prompt_id) @@ -767,6 +768,7 @@ class PromptExecutor: break assert node_id is not None, "Node ID should not be None at this point" + node_start_s = time.perf_counter() if args.benchmark_server_only else None result, error, ex = await execute(self.server, dynamic_prompt, self.caches, node_id, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes, ui_node_outputs) self.success = result != ExecutionResult.FAILURE if result == ExecutionResult.FAILURE: @@ -776,6 +778,12 @@ class PromptExecutor: execution_list.unstage_node_execution() else: # result == ExecutionResult.SUCCESS: execution_list.complete_node_execution() + if node_start_s is not None: + class_type = dynamic_prompt.get_node(node_id).get("class_type", "unknown") + self.node_timing_ms[node_id] = { + "class_type": class_type, + "execution_ms": (time.perf_counter() - node_start_s) * 1000.0, + } if self.cache_type == CacheType.RAM_PRESSURE: comfy.model_management.free_memory(0, None, pins_required=ram_headroom, ram_required=ram_headroom) diff --git a/main.py b/main.py index 77b0058a5..ac15ad30a 100644 --- a/main.py +++ b/main.py 
@@ -326,11 +326,15 @@ def prompt_worker(q, server_instance): remove_sensitive = lambda prompt: prompt[:5] + prompt[6:] history_result = e.history_result if benchmark_mode: - history_result = {"outputs": {}, "meta": {}} - history_result["benchmark"] = { - "execution_ms": execution_time_s * 1000.0, - "queue_wait_ms": queue_wait_ms, - } + history_result = { + "outputs": {}, + "meta": {}, + "benchmark": { + "execution_ms": execution_time_s * 1000.0, + "queue_wait_ms": queue_wait_ms, + "nodes": e.node_timing_ms, + }, + } q.task_done(item_id, history_result, From 6251350cf4b709eb984f0f8e292e11a10f9445f7 Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Mon, 27 Apr 2026 22:03:52 -0700 Subject: [PATCH 18/27] Remove queue_time --- benchmarks/benchmark_comfyui_serving.py | 17 ++++------------- main.py | 7 ------- 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/benchmarks/benchmark_comfyui_serving.py b/benchmarks/benchmark_comfyui_serving.py index cccc85df7..06979cdc5 100644 --- a/benchmarks/benchmark_comfyui_serving.py +++ b/benchmarks/benchmark_comfyui_serving.py @@ -314,7 +314,6 @@ class RequestResult: started_at: float finished_at: float end_to_end_s: float - queue_wait_ms: float | None execution_ms: float | None node_timing_ms: dict[str, dict] | None @@ -395,10 +394,10 @@ async def wait_for_prompt_done( prompt_id: str, poll_interval_s: float, timeout_s: float, -) -> tuple[float | None, float | None, dict | None]: +) -> tuple[float | None, dict | None]: """ - Returns (queue_wait_ms, execution_ms, node_timing_ms) from history_item["benchmark"]. - Falls back to (None, None, None) if unavailable. + Returns (execution_ms, node_timing_ms) from history_item["benchmark"]. + Falls back to (None, None) if unavailable. 
""" deadline = time.perf_counter() + timeout_s history_url = f"{base_url}/history/{prompt_id}" @@ -426,7 +425,6 @@ async def wait_for_prompt_done( benchmark = history_item.get("benchmark", {}) return ( - benchmark.get("queue_wait_ms"), benchmark.get("execution_ms"), benchmark.get("nodes"), ) @@ -487,7 +485,7 @@ async def run_request( timeout_s=args.request_timeout_s, ) - queue_wait_ms, execution_ms, node_timing_ms = await wait_for_prompt_done( + execution_ms, node_timing_ms = await wait_for_prompt_done( session=session, base_url=args.host, prompt_id=prompt_id, @@ -504,7 +502,6 @@ async def run_request( started_at=started_at, finished_at=finished_at, end_to_end_s=finished_at - queued_at, - queue_wait_ms=queue_wait_ms, execution_ms=execution_ms, node_timing_ms=node_timing_ms, ) @@ -519,7 +516,6 @@ async def run_request( started_at=started_at, finished_at=finished_at, end_to_end_s=finished_at - queued_at, - queue_wait_ms=None, execution_ms=None, node_timing_ms=None, ) @@ -529,7 +525,6 @@ def print_summary(results: list[RequestResult], wall_s: float) -> None: success = [r for r in results if r.ok] fail = [r for r in results if not r.ok] lat_s = [r.end_to_end_s for r in success] - queue_wait_ms = [r.queue_wait_ms for r in success if r.queue_wait_ms is not None] exec_ms = [r.execution_ms for r in success if r.execution_ms is not None] throughput = (len(success) / wall_s) if wall_s > 0 else 0.0 @@ -548,10 +543,6 @@ def print_summary(results: list[RequestResult], wall_s: float) -> None: print(f"latency_mean_s: {statistics.mean(lat_s):.3f}") print(f"latency_max_s: {max(lat_s):.3f}") - if queue_wait_ms: - print(f"queue_wait_mean_ms: {statistics.mean(queue_wait_ms):.2f}") - print(f"queue_wait_p95_ms: {percentile(queue_wait_ms, 95):.2f}") - if exec_ms: print(f"execution_mean_ms: {statistics.mean(exec_ms):.2f}") print(f"execution_p95_ms: {percentile(exec_ms, 95):.2f}") diff --git a/main.py b/main.py index ac15ad30a..110dc34bf 100644 --- a/main.py +++ b/main.py @@ -301,7 +301,6 @@ def prompt_worker(q, server_instance): if queue_item is not None: item, item_id = queue_item execution_start_time = time.perf_counter() - execution_start_wall_ms = int(time.time() * 1000) prompt_id = item[1] server_instance.last_prompt_id = prompt_id @@ -318,11 +317,6 @@ def prompt_worker(q, server_instance): need_gc = True - queue_wait_ms = 0.0 - created_at = extra_data.get("create_time") - if isinstance(created_at, int): - queue_wait_ms = max(0.0, execution_start_wall_ms - created_at) - remove_sensitive = lambda prompt: prompt[:5] + prompt[6:] history_result = e.history_result if benchmark_mode: @@ -331,7 +325,6 @@ def prompt_worker(q, server_instance): "meta": {}, "benchmark": { "execution_ms": execution_time_s * 1000.0, - "queue_wait_ms": queue_wait_ms, "nodes": e.node_timing_ms, }, } From 139d4a7e862c786c4af235c6e9291a3bc4ecfb74 Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Mon, 27 Apr 2026 22:07:53 -0700 Subject: [PATCH 19/27] fix format --- main.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/main.py b/main.py index 110dc34bf..a8652ad6b 100644 --- a/main.py +++ b/main.py @@ -313,7 +313,6 @@ def prompt_worker(q, server_instance): if not benchmark_mode: asset_seeder.pause() e.execute(item[2], prompt_id, extra_data, item[4]) - execution_time_s = time.perf_counter() - execution_start_time need_gc = True @@ -323,10 +322,6 @@ def prompt_worker(q, server_instance): history_result = { "outputs": {}, "meta": {}, - "benchmark": { - 
"execution_ms": execution_time_s * 1000.0, - "nodes": e.node_timing_ms, - }, } q.task_done(item_id, @@ -339,13 +334,20 @@ def prompt_worker(q, server_instance): server_instance.send_sync("executing", {"node": None, "prompt_id": prompt_id}, server_instance.client_id) current_time = time.perf_counter() + execution_time = current_time - execution_start_time # Log Time in a more readable way after 10 minutes - if execution_time_s > 600: - execution_time_formatted = time.strftime("%H:%M:%S", time.gmtime(execution_time_s)) + if execution_time > 600: + execution_time_formatted = time.strftime("%H:%M:%S", time.gmtime(execution_time)) logging.info(f"Prompt executed in {execution_time_formatted}") else: - logging.info("Prompt executed in {:.2f} seconds".format(execution_time_s)) + logging.info("Prompt executed in {:.2f} seconds".format(execution_time)) + + if benchmark_mode: + history_result["benchmark"] = { + "execution_ms": execution_time * 1000.0, + "nodes": e.node_timing_ms, + } if not benchmark_mode and not asset_seeder.is_disabled(): paths = _collect_output_absolute_paths(e.history_result) From 79825dbd320d61fa8f19435753f884f0266ae918 Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Mon, 27 Apr 2026 22:11:47 -0700 Subject: [PATCH 20/27] Fix format --- main.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/main.py b/main.py index a8652ad6b..07c0dd359 100644 --- a/main.py +++ b/main.py @@ -317,15 +317,8 @@ def prompt_worker(q, server_instance): need_gc = True remove_sensitive = lambda prompt: prompt[:5] + prompt[6:] - history_result = e.history_result - if benchmark_mode: - history_result = { - "outputs": {}, - "meta": {}, - } - q.task_done(item_id, - history_result, + e.history_result, status=execution.PromptQueue.ExecutionStatus( status_str='success' if e.success else 'error', completed=e.success, @@ -344,7 +337,7 @@ def prompt_worker(q, server_instance): logging.info("Prompt executed in {:.2f} seconds".format(execution_time)) if benchmark_mode: - history_result["benchmark"] = { + e.history_result["benchmark"] = { "execution_ms": execution_time * 1000.0, "nodes": e.node_timing_ms, } From ba978bc0e20a2e66ddc2dc23c09b3a137734a56a Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Mon, 27 Apr 2026 22:13:55 -0700 Subject: [PATCH 21/27] Fix format --- main.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/main.py b/main.py index 07c0dd359..8be3036ab 100644 --- a/main.py +++ b/main.py @@ -310,8 +310,7 @@ def prompt_worker(q, server_instance): extra_data[k] = sensitive[k] benchmark_mode = args.benchmark_server_only - if not benchmark_mode: - asset_seeder.pause() + asset_seeder.pause() e.execute(item[2], prompt_id, extra_data, item[4]) need_gc = True @@ -342,7 +341,7 @@ def prompt_worker(q, server_instance): "nodes": e.node_timing_ms, } - if not benchmark_mode and not asset_seeder.is_disabled(): + if not asset_seeder.is_disabled(): paths = _collect_output_absolute_paths(e.history_result) register_output_files(paths, job_id=prompt_id) @@ -368,10 +367,9 @@ def prompt_worker(q, server_instance): need_gc = False hook_breaker_ac10a0.restore_functions() - if not benchmark_mode and not asset_seeder.is_disabled(): + if not asset_seeder.is_disabled(): asset_seeder.enqueue_enrich(roots=("output",), compute_hashes=True) - if not benchmark_mode: - asset_seeder.resume() + asset_seeder.resume() async def run(server_instance, address='', port=8188, verbose=True, 
call_on_start=None):

From c39f7ea76c551b1890084c6aea3b89b97c10a943 Mon Sep 17 00:00:00 2001
From: Tara Ding <38710454+windtara0619@users.noreply.github.com>
Date: Mon, 27 Apr 2026 22:23:42 -0700
Subject: [PATCH 22/27] Add tqdm progress bar to the benchmark

---
 benchmarks/benchmark_comfyui_serving.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_comfyui_serving.py b/benchmarks/benchmark_comfyui_serving.py
index 06979cdc5..2e961660c 100644
--- a/benchmarks/benchmark_comfyui_serving.py
+++ b/benchmarks/benchmark_comfyui_serving.py
@@ -51,6 +51,7 @@ from pathlib import Path
 from typing import Any
 
 import aiohttp
+from tqdm import tqdm
 
 
 # ──────────────────────────────────────────────────────────────────────────────
@@ -669,7 +670,14 @@ async def async_main(args: argparse.Namespace) -> None:
         )
         for i in range(args.num_requests)
     ]
-    results = await asyncio.gather(*tasks)
+    results = []
+    with tqdm(total=args.num_requests, unit="req", desc="benchmark") as pbar:
+        for coro in asyncio.as_completed(tasks):
+            result = await coro
+            results.append(result)
+            pbar.update(1)
+            if result.ok:
+                pbar.set_postfix(succeeded=sum(r.ok for r in results))
 
     wall_s = time.perf_counter() - started
     print_summary(results, wall_s)

From 54ced2923bfff2f3950ba92f90196ab4b215ab84 Mon Sep 17 00:00:00 2001
From: Tara Ding <38710454+windtara0619@users.noreply.github.com>
Date: Mon, 27 Apr 2026 22:28:42 -0700
Subject: [PATCH 23/27] Don't generate synthetic images when VBench download fails

---
 benchmarks/benchmark_comfyui_serving.py | 27 ++++++++-----------------
 1 file changed, 8 insertions(+), 19 deletions(-)

diff --git a/benchmarks/benchmark_comfyui_serving.py b/benchmarks/benchmark_comfyui_serving.py
index 2e961660c..f68c22054 100644
--- a/benchmarks/benchmark_comfyui_serving.py
+++ b/benchmarks/benchmark_comfyui_serving.py
@@ -144,13 +144,9 @@ def download_models(base_dir: Path, model: str, task: str) -> None:
 def _try_download_vbench_i2v(input_dir: Path) -> list[str]:
     """
     Download VBench I2V origin images from Google Drive via gdown (pip install gdown).
-    Returns image basenames placed in *input_dir*, or [] on failure.
+    Raises on any failure.
     """
-    try:
-        import gdown  # type: ignore
-    except ImportError:
-        print("[setup] gdown not available; skipping VBench download. Install with: pip install gdown")
-        return []
+    import gdown  # type: ignore  # raises ImportError if not installed
 
     import zipfile
 
@@ -163,11 +159,10 @@ def _try_download_vbench_i2v(input_dir: Path) -> list[str]:
         with zipfile.ZipFile(zip_path, "r") as zf:
             zf.extractall(str(input_dir))
         zip_path.unlink()
-    except Exception as exc:
-        print(f"[setup] VBench I2V download failed: {exc}")
+    except Exception:
         if zip_path.exists():
             zip_path.unlink()
-        return []
+        raise
 
     image_exts = {".png", ".jpg", ".jpeg", ".webp"}
     filenames = sorted(
@@ -206,20 +201,14 @@ def prepare_input_images(
 ) -> list[str]:
     """
     Prepare benchmark input images in *input_dir*.
-
-    Priority:
-    1. Reuse any images already present in the directory.
-    2. Fetch from the source specified by *image_source* (e.g. "vbench_i2v").
-    3. Generate synthetic 720×720 white PNG placeholders with Pillow.
-
-    Returns a list of image basenames (not full paths).
+    For "vbench_i2v", downloads from Google Drive and raises on failure.
+    Falls back to synthetic images only when image_source is not "vbench_i2v".
+    Returns a list of image paths relative to *input_dir*.
""" input_dir.mkdir(parents=True, exist_ok=True) if image_source == "vbench_i2v": - filenames = _try_download_vbench_i2v(input_dir) - if filenames: - return filenames + return _try_download_vbench_i2v(input_dir) print(f"[setup] generating {num_images} synthetic 720×720 placeholder images ...") return _generate_synthetic_images(input_dir, num_images) From a2204ec976719aeab1bc6ce5f2978a4ddf1d0eb7 Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Mon, 27 Apr 2026 22:34:38 -0700 Subject: [PATCH 24/27] force to regenerate prompts everytime --- benchmarks/benchmark_comfyui_serving.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/benchmarks/benchmark_comfyui_serving.py b/benchmarks/benchmark_comfyui_serving.py index f68c22054..bd884baca 100644 --- a/benchmarks/benchmark_comfyui_serving.py +++ b/benchmarks/benchmark_comfyui_serving.py @@ -262,12 +262,6 @@ def generate_prompt_files( raise ValueError("--comfyui-base-dir is required when --download-models is set") download_models(comfyui_base_dir, model, task) - # Skip prompt generation if files already exist. - existing = sorted(output_dir.glob(f"{model}_{task}_prompt_*.json")) - if existing: - print(f"[setup] found {len(existing)} existing prompt files in {output_dir}, skipping generation") - return existing - image_filenames = prepare_input_images( input_dir, num_images=num_images, From 9ea25780c66dc91a30d358dbaad3d16965702d87 Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Mon, 27 Apr 2026 22:44:00 -0700 Subject: [PATCH 25/27] fix benchmark --- main.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/main.py b/main.py index 8be3036ab..bab80379a 100644 --- a/main.py +++ b/main.py @@ -315,6 +315,12 @@ def prompt_worker(q, server_instance): need_gc = True + if benchmark_mode: + e.history_result["benchmark"] = { + "execution_ms": (time.perf_counter() - execution_start_time) * 1000.0, + "nodes": e.node_timing_ms, + } + remove_sensitive = lambda prompt: prompt[:5] + prompt[6:] q.task_done(item_id, e.history_result, @@ -335,12 +341,6 @@ def prompt_worker(q, server_instance): else: logging.info("Prompt executed in {:.2f} seconds".format(execution_time)) - if benchmark_mode: - e.history_result["benchmark"] = { - "execution_ms": execution_time * 1000.0, - "nodes": e.node_timing_ms, - } - if not asset_seeder.is_disabled(): paths = _collect_output_absolute_paths(e.history_result) register_output_files(paths, job_id=prompt_id) From 1d64200d2e77901d0db252eeaf454bada3209984 Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Mon, 27 Apr 2026 23:01:37 -0700 Subject: [PATCH 26/27] Add benchmark README --- benchmarks/README.md | 110 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 benchmarks/README.md diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 000000000..8d785fd9d --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,110 @@ +# ComfyUI Serving Benchmarks + +Measures latency and throughput of a running ComfyUI server by submitting +concurrent prompt requests and collecting results from the history API. 
+
+## Dependencies
+
+```bash
+pip install aiohttp tqdm gdown
+```
+
+## Supported models / tasks
+
+| Model | Task | Description |
+|-------|------|-------------|
+| `wan22` | `i2v` | Wan 2.2 Image-to-Video — LightX2V 4-step, 720×720, 81 frames |
+
+To add a new model/task: drop a workflow JSON in `workflows/` (with
+`__INPUT_IMAGE__` as the image placeholder) and add an entry to
+`_MODEL_REGISTRY` in `benchmark_comfyui_serving.py`.
+
+## How it works
+
+On each run the script:
+
+1. Downloads model weights into the ComfyUI `models/` directory (only if
+   `--download-models` is passed).
+2. Downloads the [VBench I2V](https://github.com/Vchitect/VBench) image
+   dataset via `gdown` into ComfyUI's `input/` folder.
+3. Generates one prompt JSON per input image under
+   `benchmarks/prompts/<model>_<task>/`.
+4. Submits `--num-requests` prompts to the server, cycling through the
+   generated prompt files in round-robin order.
+5. Polls `/history/{prompt_id}` for completion and prints a latency /
+   throughput summary.
+
+Per-node execution times are available when the server is started with
+`--benchmark-server-only`.
+
+## Usage
+
+### Start the server
+
+```bash
+python main.py --listen 127.0.0.1 --port 8188 --benchmark-server-only
+```
+
+### Run the benchmark
+
+```bash
+# From the ComfyUI root directory:
+python3 benchmarks/benchmark_comfyui_serving.py \
+    --model wan22 --task i2v \
+    --num-requests 50 --max-concurrency 4 \
+    --host http://127.0.0.1:8188
+```
+
+Include model weight download on first run:
+
+```bash
+python3 benchmarks/benchmark_comfyui_serving.py \
+    --model wan22 --task i2v \
+    --download-models --comfyui-base-dir /path/to/ComfyUI \
+    --num-requests 50 --max-concurrency 4 \
+    --host http://127.0.0.1:8188
+```
+
+### All flags
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--model` | *(required)* | Model name (e.g. `wan22`) |
+| `--task` | *(required)* | Task type (e.g. `i2v`) |
+| `--host` | `http://127.0.0.1:8188` | ComfyUI base URL |
+| `--num-requests` | `50` | Total requests to submit |
+| `--max-concurrency` | `8` | Max in-flight requests |
+| `--request-rate` | `0` | Requests/sec; `0` = fire immediately |
+| `--poisson` | off | Poisson inter-arrival when `--request-rate > 0` |
+| `--num-images` | `20` | Number of synthetic placeholder images (when not using the VBench source) |
+| `--prompts-dir` | `benchmarks/prompts/<model>_<task>/` | Prompt JSON output directory |
+| `--download-models` | off | Download model weights before benchmarking |
+| `--comfyui-base-dir` | — | ComfyUI root (required with `--download-models`) |
+| `--output-json` | — | Write full per-request results to a JSON file |
+
+## Output
+
+```
+benchmark: 100%|████████████| 50/50 [req, succeeded=50]
+
+=== ComfyUI Serving Benchmark Summary ===
+requests_total: 50
+requests_success: 50
+requests_failed: 0
+wall_time_s: 412.341
+throughput_req_s: 0.121
+latency_p50_s: 38.201
+latency_p90_s: 52.110
+latency_p95_s: 55.837
+latency_p99_s: 60.012
+latency_mean_s: 39.445
+latency_max_s: 61.203
+execution_mean_ms: 35210.44
+execution_p95_ms: 51200.11
+
+--- Per-node execution time (mean ms across successful requests) ---
+  KSampler (Advanced) (130:110): mean=18200.1 p95=22100.3 n=50
+  KSampler (Advanced) (130:111): mean=16900.4 p95=20800.7 n=50
+  VAEDecode (130:129): mean=420.2 p95=510.1 n=50
+  ...
+``` From 875bdc4015e50541463e8587c604a663ef6d4e3a Mon Sep 17 00:00:00 2001 From: Tara Ding <38710454+windtara0619@users.noreply.github.com> Date: Mon, 27 Apr 2026 23:05:42 -0700 Subject: [PATCH 27/27] Update README --- benchmarks/README.md | 51 +++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 8d785fd9d..4dbc42a01 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -85,26 +85,43 @@ python3 benchmarks/benchmark_comfyui_serving.py \ ## Output ``` -benchmark: 100%|████████████| 50/50 [req, succeeded=50] +benchmark: 100%|█████████████| 5/5 [02:58<00:00, 35.73s/req, succeeded=5] === ComfyUI Serving Benchmark Summary === -requests_total: 50 -requests_success: 50 +requests_total: 5 +requests_success: 5 requests_failed: 0 -wall_time_s: 412.341 -throughput_req_s: 0.121 -latency_p50_s: 38.201 -latency_p90_s: 52.110 -latency_p95_s: 55.837 -latency_p99_s: 60.012 -latency_mean_s: 39.445 -latency_max_s: 61.203 -execution_mean_ms: 35210.44 -execution_p95_ms: 51200.11 +wall_time_s: 178.652 +throughput_req_s: 0.028 +latency_p50_s: 109.594 +latency_p90_s: 164.840 +latency_p95_s: 171.744 +latency_p99_s: 177.266 +latency_mean_s: 109.781 +latency_max_s: 178.647 +execution_mean_ms: 35465.21 +execution_p95_ms: 39685.06 --- Per-node execution time (mean ms across successful requests) --- - KSampler (Advanced) (130:110): mean=18200.1 p95=22100.3 n=50 - KSampler (Advanced) (130:111): mean=16900.4 p95=20800.7 n=50 - VAEDecode (130:129): mean=420.2 p95=510.1 n=50 - ... + KSamplerAdvanced (130:110): mean=12827.5 p95=14264.0 n=5 + KSamplerAdvanced (130:111): mean=12726.4 p95=13822.2 n=5 + VAEDecode (130:129): mean=3439.0 p95=3467.6 n=5 + SaveVideo (108): mean=2844.7 p95=3280.0 n=5 + WanImageToVideo (130:128): mean=2367.7 p95=2595.9 n=5 + CLIPTextEncode (130:125): mean=1785.0 p95=1785.0 n=1 + CLIPLoader (130:105): mean=700.7 p95=700.7 n=1 + LoadImage (97): mean=518.4 p95=970.0 n=5 + VAELoader (130:106): mean=507.7 p95=507.7 n=1 + CLIPTextEncode (130:107): mean=223.4 p95=223.4 n=1 + UNETLoader (130:122): mean=122.2 p95=122.2 n=1 + LoraLoaderModelOnly (130:126): mean=68.1 p95=68.1 n=1 + UNETLoader (130:123): mean=65.9 p95=65.9 n=1 + LoraLoaderModelOnly (130:127): mean=36.2 p95=36.2 n=1 + ModelSamplingSD3 (130:109): mean=1.0 p95=1.0 n=1 + ModelSamplingSD3 (130:124): mean=0.9 p95=0.9 n=1 + CreateVideo (130:117): mean=0.7 p95=1.1 n=5 ``` + +> **Note:** Nodes with `n=1` (e.g. model loaders) are cached by ComfyUI after +> the first request and skipped in subsequent executions, so they only appear +> once across the benchmark run.
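
As the README notes, adding a new model/task pair only requires a workflow
JSON plus a `_MODEL_REGISTRY` entry. The registry's exact schema does not
appear in this series, so the sketch below is hypothetical: the field names
and checkpoint filename are illustrative assumptions, not the script's
actual API.

```python
# Hypothetical shape of a _MODEL_REGISTRY entry in
# benchmark_comfyui_serving.py; the field names below are illustrative
# assumptions, not the script's actual schema.
_MODEL_REGISTRY: dict[tuple[str, str], dict] = {
    ("wan22", "i2v"): {
        # Workflow template; __INPUT_IMAGE__ is substituted per input
        # image when the per-image prompt JSONs are generated.
        "workflow_file": "workflows/wan22_i2v.json",
        # Weights for download_models() to fetch when --download-models
        # is passed (placeholder filename, not a real checkpoint).
        "model_files": ["diffusion_models/wan22_i2v_placeholder.safetensors"],
    },
}
```

A new pair such as `("wan22", "t2v")` would follow the same pattern, with
its own workflow JSON dropped into `workflows/`.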