Mirror of https://github.com/comfyanonymous/ComfyUI.git
mm: discard async errors from pinning failures (#10738)
Pretty much every error cudaHostRegister can throw also queues the same error on the async GPU queue. This was already fixed for the repinning error case, but the bad-mmap and plain ENOMEM cases are harder to detect. Do some dummy GPU work to clear the error state.
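For context, a minimal sketch of the pattern the fix relies on (hypothetical, not part of the commit; it assumes a CUDA build of PyTorch new enough to define torch.AcceleratorError, and reaches cudaHostRegister through torch.cuda.cudart() with flag 1, cudaHostRegisterPortable):

import torch

def try_pin(t: torch.Tensor) -> bool:
    # cudaHostRegister reports failure synchronously through its return
    # code, but most failure modes also queue the same error on the async
    # GPU error state, where it would re-raise on a later unrelated call.
    status = torch.cuda.cudart().cudaHostRegister(
        t.data_ptr(), t.numel() * t.element_size(), 1)
    if status == 0:
        return True
    # Drain the duplicate async error here, inside a try/except, instead
    # of letting the next innocent GPU operation trip over it.
    try:
        a = torch.tensor([1], dtype=torch.uint8, device="cuda")
        _ = a + a
        torch.cuda.synchronize()
    except torch.AcceleratorError:
        pass  # already reported by the synchronous return code
    return False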
This commit is contained in:
parent 8fd07170f1
commit 9ca7e143af
@@ -1126,6 +1126,16 @@ if not args.disable_pinned_memory:
 
 PINNING_ALLOWED_TYPES = set(["Parameter", "QuantizedTensor"])
 
+def discard_cuda_async_error():
+    try:
+        a = torch.tensor([1], dtype=torch.uint8, device=get_torch_device())
+        b = torch.tensor([1], dtype=torch.uint8, device=get_torch_device())
+        _ = a + b
+        torch.cuda.synchronize()
+    except torch.AcceleratorError:
+        #Dump it! We already know about it from the synchronous return
+        pass
+
 def pin_memory(tensor):
     global TOTAL_PINNED_MEMORY
     if MAX_PINNED_MEMORY <= 0:
@@ -1158,6 +1168,8 @@ def pin_memory(tensor):
         PINNED_MEMORY[ptr] = size
         TOTAL_PINNED_MEMORY += size
         return True
+    else:
+        discard_cuda_async_error()
 
     return False
 
@@ -1186,6 +1198,8 @@ def unpin_memory(tensor):
         if len(PINNED_MEMORY) == 0:
             TOTAL_PINNED_MEMORY = 0
         return True
+    else:
+        discard_cuda_async_error()
 
     return False
 
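A quick usage sketch (hypothetical; it assumes the patched module is comfy/model_management.py, where pin_memory lives): the new helper is safe to call even when no async error is pending, and callers can keep treating a False return from pin_memory as a soft failure:

import torch
from comfy import model_management as mm

if torch.cuda.is_available():
    mm.discard_cuda_async_error()  # no pending error: completes silently
    # pin_memory only pins allowed types ("Parameter", "QuantizedTensor"),
    # so wrap the buffer in a Parameter for this illustration.
    t = torch.nn.Parameter(torch.empty(1 << 20), requires_grad=False)
    ok = mm.pin_memory(t)  # False on failure (or size/type caps), async state drained
    _ = torch.ones(8, device="cuda")  # later GPU work is unaffected either way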