From e148a7f1bb6ecd9f756526da0a93ac45dc78234c Mon Sep 17 00:00:00 2001
From: Rattus
Date: Thu, 13 Nov 2025 21:38:04 +1000
Subject: [PATCH] mm: discard async errors from pinning failures

Pretty much every error cudaHostRegister can throw also queues the same
error on the async GPU queue. This was fixed for the repinning error
case, but the bad mmap and plain ENOMEM cases are harder to detect.
Do some dummy GPU work to clear the async error state.
---
 comfy/model_management.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/comfy/model_management.py b/comfy/model_management.py
index a21df54b3..1a7b2361a 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -1098,6 +1098,15 @@ if not args.disable_pinned_memory:
         MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.95
         logging.info("Enabled pinned memory {}".format(MAX_PINNED_MEMORY // (1024 * 1024)))
 
+def discard_cuda_async_error():
+    try:
+        a = torch.tensor([1], dtype=torch.uint8, device=get_torch_device())
+        b = torch.tensor([1], dtype=torch.uint8, device=get_torch_device())
+        _ = a + b
+        torch.cuda.synchronize()
+    except torch.AcceleratorError:
+        # Dump it! We already know about it from the synchronous return
+        pass
 
 def pin_memory(tensor):
     global TOTAL_PINNED_MEMORY
@@ -1128,6 +1137,8 @@ def pin_memory(tensor):
             PINNED_MEMORY[ptr] = size
             TOTAL_PINNED_MEMORY += size
             return True
+        else:
+            discard_cuda_async_error()
 
     return False
 
@@ -1156,6 +1167,8 @@ def unpin_memory(tensor):
            if len(PINNED_MEMORY) == 0:
                TOTAL_PINNED_MEMORY = 0
            return True
+        else:
+            discard_cuda_async_error()
 
     return False
 
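
Note (not part of the diff): the sketch below illustrates the failure mode
this patch works around. It assumes a CUDA build of PyTorch; try_pin() is a
hypothetical stand-in for the real pin path, which goes through
cudaHostRegister rather than Tensor.pin_memory(), and torch.AcceleratorError
is the exception type the patch itself catches.

    # Minimal sketch of "drain the async error after a failed pin".
    import torch

    def try_pin(t):
        try:
            # Synchronous failure (e.g. ENOMEM) surfaces here.
            return t.pin_memory()
        except RuntimeError:
            # The same failure may also be queued as an async GPU error.
            # Run a trivial kernel and synchronize so the stale error is
            # raised (and discarded) now, instead of poisoning whatever
            # unrelated GPU call happens to synchronize next.
            try:
                dummy = torch.ones(1, dtype=torch.uint8, device="cuda")
                _ = dummy + dummy
                torch.cuda.synchronize()
            except torch.AcceleratorError:
                pass  # already reported by the synchronous return
            return None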