mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-03-11 12:17:45 +08:00
Widen OOM_EXCEPTION to AcceleratorError form (#12835)
Some checks are pending
Python Linting / Run Ruff (push) Waiting to run
Python Linting / Run Pylint (push) Waiting to run
Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.10, [self-hosted Linux], stable) (push) Waiting to run
Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.11, [self-hosted Linux], stable) (push) Waiting to run
Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.12, [self-hosted Linux], stable) (push) Waiting to run
Full Comfy CI Workflow Runs / test-unix-nightly (12.1, , linux, 3.11, [self-hosted Linux], nightly) (push) Waiting to run
Execution Tests / test (macos-latest) (push) Waiting to run
Execution Tests / test (ubuntu-latest) (push) Waiting to run
Execution Tests / test (windows-latest) (push) Waiting to run
Test server launches without errors / test (push) Waiting to run
Unit Tests / test (macos-latest) (push) Waiting to run
Unit Tests / test (ubuntu-latest) (push) Waiting to run
Unit Tests / test (windows-2022) (push) Waiting to run
Some checks are pending
Python Linting / Run Ruff (push) Waiting to run
Python Linting / Run Pylint (push) Waiting to run
Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.10, [self-hosted Linux], stable) (push) Waiting to run
Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.11, [self-hosted Linux], stable) (push) Waiting to run
Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.12, [self-hosted Linux], stable) (push) Waiting to run
Full Comfy CI Workflow Runs / test-unix-nightly (12.1, , linux, 3.11, [self-hosted Linux], nightly) (push) Waiting to run
Execution Tests / test (macos-latest) (push) Waiting to run
Execution Tests / test (ubuntu-latest) (push) Waiting to run
Execution Tests / test (windows-latest) (push) Waiting to run
Test server launches without errors / test (push) Waiting to run
Unit Tests / test (macos-latest) (push) Waiting to run
Unit Tests / test (ubuntu-latest) (push) Waiting to run
Unit Tests / test (windows-2022) (push) Waiting to run
PyTorch only filters for OOMs in its own allocators; however, there are paths that can OOM on allocators created outside the PyTorch allocators. These manifest as an AcceleratorError, as PyTorch does not have universal error translation to its OOM type on exception. Handle it. A log I have for this also shows a double report of the error async, so call the async discarder to clean up and make these OOMs look like OOMs.
This commit is contained in:
parent
a912809c25
commit
535c16ce6e
@ -372,7 +372,8 @@ def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape
|
|||||||
r1[:, i:end] = einsum('b i j, b j d -> b i d', s2, v)
|
r1[:, i:end] = einsum('b i j, b j d -> b i d', s2, v)
|
||||||
del s2
|
del s2
|
||||||
break
|
break
|
||||||
except model_management.OOM_EXCEPTION as e:
|
except Exception as e:
|
||||||
|
model_management.raise_non_oom(e)
|
||||||
if first_op_done == False:
|
if first_op_done == False:
|
||||||
model_management.soft_empty_cache(True)
|
model_management.soft_empty_cache(True)
|
||||||
if cleared_cache == False:
|
if cleared_cache == False:
|
||||||
|
|||||||
@ -258,7 +258,8 @@ def slice_attention(q, k, v):
|
|||||||
r1[:, :, i:end] = torch.bmm(v, s2)
|
r1[:, :, i:end] = torch.bmm(v, s2)
|
||||||
del s2
|
del s2
|
||||||
break
|
break
|
||||||
except model_management.OOM_EXCEPTION as e:
|
except Exception as e:
|
||||||
|
model_management.raise_non_oom(e)
|
||||||
model_management.soft_empty_cache(True)
|
model_management.soft_empty_cache(True)
|
||||||
steps *= 2
|
steps *= 2
|
||||||
if steps > 128:
|
if steps > 128:
|
||||||
@ -314,7 +315,8 @@ def pytorch_attention(q, k, v):
|
|||||||
try:
|
try:
|
||||||
out = comfy.ops.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False)
|
out = comfy.ops.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False)
|
||||||
out = out.transpose(2, 3).reshape(orig_shape)
|
out = out.transpose(2, 3).reshape(orig_shape)
|
||||||
except model_management.OOM_EXCEPTION:
|
except Exception as e:
|
||||||
|
model_management.raise_non_oom(e)
|
||||||
logging.warning("scaled_dot_product_attention OOMed: switched to slice attention")
|
logging.warning("scaled_dot_product_attention OOMed: switched to slice attention")
|
||||||
oom_fallback = True
|
oom_fallback = True
|
||||||
if oom_fallback:
|
if oom_fallback:
|
||||||
|
|||||||
@ -169,7 +169,8 @@ def _get_attention_scores_no_kv_chunking(
|
|||||||
try:
|
try:
|
||||||
attn_probs = attn_scores.softmax(dim=-1)
|
attn_probs = attn_scores.softmax(dim=-1)
|
||||||
del attn_scores
|
del attn_scores
|
||||||
except model_management.OOM_EXCEPTION:
|
except Exception as e:
|
||||||
|
model_management.raise_non_oom(e)
|
||||||
logging.warning("ran out of memory while running softmax in _get_attention_scores_no_kv_chunking, trying slower in place softmax instead")
|
logging.warning("ran out of memory while running softmax in _get_attention_scores_no_kv_chunking, trying slower in place softmax instead")
|
||||||
attn_scores -= attn_scores.max(dim=-1, keepdim=True).values # noqa: F821 attn_scores is not defined
|
attn_scores -= attn_scores.max(dim=-1, keepdim=True).values # noqa: F821 attn_scores is not defined
|
||||||
torch.exp(attn_scores, out=attn_scores)
|
torch.exp(attn_scores, out=attn_scores)
|
||||||
|
|||||||
@ -270,6 +270,18 @@ try:
|
|||||||
except:
|
except:
|
||||||
OOM_EXCEPTION = Exception
|
OOM_EXCEPTION = Exception
|
||||||
|
|
||||||
|
def is_oom(e):
|
||||||
|
if isinstance(e, OOM_EXCEPTION):
|
||||||
|
return True
|
||||||
|
if isinstance(e, torch.AcceleratorError) and getattr(e, 'error_code', None) == 2:
|
||||||
|
discard_cuda_async_error()
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def raise_non_oom(e):
|
||||||
|
if not is_oom(e):
|
||||||
|
raise e
|
||||||
|
|
||||||
XFORMERS_VERSION = ""
|
XFORMERS_VERSION = ""
|
||||||
XFORMERS_ENABLED_VAE = True
|
XFORMERS_ENABLED_VAE = True
|
||||||
if args.disable_xformers:
|
if args.disable_xformers:
|
||||||
|
|||||||
@ -954,7 +954,8 @@ class VAE:
|
|||||||
if pixel_samples is None:
|
if pixel_samples is None:
|
||||||
pixel_samples = torch.empty((samples_in.shape[0],) + tuple(out.shape[1:]), device=self.output_device)
|
pixel_samples = torch.empty((samples_in.shape[0],) + tuple(out.shape[1:]), device=self.output_device)
|
||||||
pixel_samples[x:x+batch_number] = out
|
pixel_samples[x:x+batch_number] = out
|
||||||
except model_management.OOM_EXCEPTION:
|
except Exception as e:
|
||||||
|
model_management.raise_non_oom(e)
|
||||||
logging.warning("Warning: Ran out of memory when regular VAE decoding, retrying with tiled VAE decoding.")
|
logging.warning("Warning: Ran out of memory when regular VAE decoding, retrying with tiled VAE decoding.")
|
||||||
#NOTE: We don't know what tensors were allocated to stack variables at the time of the
|
#NOTE: We don't know what tensors were allocated to stack variables at the time of the
|
||||||
#exception and the exception itself refs them all until we get out of this except block.
|
#exception and the exception itself refs them all until we get out of this except block.
|
||||||
@ -1029,7 +1030,8 @@ class VAE:
|
|||||||
samples = torch.empty((pixel_samples.shape[0],) + tuple(out.shape[1:]), device=self.output_device)
|
samples = torch.empty((pixel_samples.shape[0],) + tuple(out.shape[1:]), device=self.output_device)
|
||||||
samples[x:x + batch_number] = out
|
samples[x:x + batch_number] = out
|
||||||
|
|
||||||
except model_management.OOM_EXCEPTION:
|
except Exception as e:
|
||||||
|
model_management.raise_non_oom(e)
|
||||||
logging.warning("Warning: Ran out of memory when regular VAE encoding, retrying with tiled VAE encoding.")
|
logging.warning("Warning: Ran out of memory when regular VAE encoding, retrying with tiled VAE encoding.")
|
||||||
#NOTE: We don't know what tensors were allocated to stack variables at the time of the
|
#NOTE: We don't know what tensors were allocated to stack variables at the time of the
|
||||||
#exception and the exception itself refs them all until we get out of this except block.
|
#exception and the exception itself refs them all until we get out of this except block.
|
||||||
|
|||||||
@ -86,7 +86,8 @@ class ImageUpscaleWithModel(io.ComfyNode):
|
|||||||
pbar = comfy.utils.ProgressBar(steps)
|
pbar = comfy.utils.ProgressBar(steps)
|
||||||
s = comfy.utils.tiled_scale(in_img, lambda a: upscale_model(a), tile_x=tile, tile_y=tile, overlap=overlap, upscale_amount=upscale_model.scale, pbar=pbar)
|
s = comfy.utils.tiled_scale(in_img, lambda a: upscale_model(a), tile_x=tile, tile_y=tile, overlap=overlap, upscale_amount=upscale_model.scale, pbar=pbar)
|
||||||
oom = False
|
oom = False
|
||||||
except model_management.OOM_EXCEPTION as e:
|
except Exception as e:
|
||||||
|
model_management.raise_non_oom(e)
|
||||||
tile //= 2
|
tile //= 2
|
||||||
if tile < 128:
|
if tile < 128:
|
||||||
raise e
|
raise e
|
||||||
|
|||||||
@ -612,7 +612,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed,
|
|||||||
logging.error(traceback.format_exc())
|
logging.error(traceback.format_exc())
|
||||||
tips = ""
|
tips = ""
|
||||||
|
|
||||||
if isinstance(ex, comfy.model_management.OOM_EXCEPTION):
|
if comfy.model_management.is_oom(ex):
|
||||||
tips = "This error means you ran out of memory on your GPU.\n\nTIPS: If the workflow worked before you might have accidentally set the batch_size to a large number."
|
tips = "This error means you ran out of memory on your GPU.\n\nTIPS: If the workflow worked before you might have accidentally set the batch_size to a large number."
|
||||||
logging.info("Memory summary: {}".format(comfy.model_management.debug_memory_summary()))
|
logging.info("Memory summary: {}".format(comfy.model_management.debug_memory_summary()))
|
||||||
logging.error("Got an OOM, unloading all loaded models.")
|
logging.error("Got an OOM, unloading all loaded models.")
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user