mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-06-23 16:29:25 +08:00
model_management: treat shared-memory-dominant integrated GPUs as SHARED vram
On AMD APUs (and other integrated GPUs) the "VRAM" reported by torch.cuda.mem_get_info() is the GTT/shared aperture carved out of host RAM, not a dedicated board. ComfyUI starts such devices in NORMAL_VRAM and later sums device VRAM plus system RAM when sizing the model-load budget, so on a UMA part the same physical RAM is counted twice and the inflated budget triggers HIGH_VRAM / gpu-only placement that OOMs the shared pool. Detecting integrated GPUs alone is not enough: integrated parts vary widely in how memory is split. Some (large BIOS UMA carveout, e.g. Strix Halo) report most memory as dedicated mem_info_vram_total, where HIGH_VRAM is right; others report a small VRAM carveout with the bulk in GTT, where SHARED is right. Demoting every integrated GPU to SHARED would regress the dedicated-heavy configs. Key the demotion on the amdgpu mem_info_vram_total vs mem_info_gtt_total ratio: only when an integrated GPU's shared (GTT) pool is at least as large as its dedicated VRAM do we switch it to VRAMState.SHARED. Dedicated-heavy integrated parts and discrete GPUs keep NORMAL_VRAM. When the sysfs totals cannot be read (e.g. NVIDIA Tegra, which has no dedicated VRAM) the device is treated as shared-heavy, matching its true unified memory. Fixes #14274 Signed-off-by: liminfei-amd <91481003+liminfei-amd@users.noreply.github.com>
This commit is contained in:
parent
2cdaaf4a25
commit
a2db31582f
@ -432,6 +432,86 @@ def is_amd():
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def is_integrated_gpu(device=None):
|
||||||
|
# AMD APUs / integrated GPUs expose host RAM (GTT/shared) as device memory
|
||||||
|
# via mem_get_info(); torch flags these as integrated. See ComfyUI #14274.
|
||||||
|
if cpu_state != CPUState.GPU:
|
||||||
|
return False
|
||||||
|
if not (is_nvidia() or is_amd()):
|
||||||
|
return False
|
||||||
|
try:
|
||||||
|
if device is None:
|
||||||
|
device = get_torch_device()
|
||||||
|
return bool(getattr(torch.cuda.get_device_properties(device), "is_integrated", 0))
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
|
def _amd_vram_gtt_totals(device=None):
|
||||||
|
# Best-effort (vram_total, gtt_total) in bytes from the amdgpu sysfs nodes
|
||||||
|
# mem_info_vram_total / mem_info_gtt_total, or None when they cannot be read
|
||||||
|
# (e.g. NVIDIA Tegra integrated parts that have no dedicated VRAM). #14274
|
||||||
|
if not is_amd():
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
drm_root = "/sys/class/drm"
|
||||||
|
candidates = []
|
||||||
|
for name in os.listdir(drm_root):
|
||||||
|
if not (name.startswith("card") and name[len("card"):].isdigit()):
|
||||||
|
continue
|
||||||
|
dev_dir = os.path.join(drm_root, name, "device")
|
||||||
|
vram_path = os.path.join(dev_dir, "mem_info_vram_total")
|
||||||
|
gtt_path = os.path.join(dev_dir, "mem_info_gtt_total")
|
||||||
|
if not (os.path.exists(vram_path) and os.path.exists(gtt_path)):
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
with open(os.path.join(dev_dir, "vendor")) as vf:
|
||||||
|
if vf.read().strip().lower() != "0x1002":
|
||||||
|
continue
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
candidates.append((os.path.basename(os.path.realpath(dev_dir)), vram_path, gtt_path))
|
||||||
|
if not candidates:
|
||||||
|
return None
|
||||||
|
chosen = None
|
||||||
|
bus_id = None
|
||||||
|
try:
|
||||||
|
if device is None:
|
||||||
|
device = get_torch_device()
|
||||||
|
bus_id = getattr(torch.cuda.get_device_properties(device), "pci_bus_id", None)
|
||||||
|
except Exception:
|
||||||
|
bus_id = None
|
||||||
|
if bus_id:
|
||||||
|
bus_id = str(bus_id).lower()
|
||||||
|
for pci, vram_path, gtt_path in candidates:
|
||||||
|
if pci.lower().endswith(bus_id) or bus_id.endswith(pci.lower()):
|
||||||
|
chosen = (vram_path, gtt_path)
|
||||||
|
break
|
||||||
|
if chosen is None and len(candidates) == 1:
|
||||||
|
chosen = (candidates[0][1], candidates[0][2])
|
||||||
|
if chosen is None:
|
||||||
|
return None
|
||||||
|
with open(chosen[0]) as f:
|
||||||
|
vram_total = int(f.read().strip())
|
||||||
|
with open(chosen[1]) as f:
|
||||||
|
gtt_total = int(f.read().strip())
|
||||||
|
return (vram_total, gtt_total)
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def integrated_gpu_is_shared_heavy(device=None):
|
||||||
|
# For an integrated GPU, decide whether its memory is dominated by the shared
|
||||||
|
# GTT/host-RAM aperture (treat as UMA -> SHARED) or by a large dedicated VRAM
|
||||||
|
# carveout (keep NORMAL/HIGH_VRAM). Keys on the amdgpu mem_info_vram_total vs
|
||||||
|
# mem_info_gtt_total ratio (ComfyUI #14274). Defaults to True when the totals
|
||||||
|
# are unavailable (e.g. NVIDIA Tegra parts that have no dedicated VRAM).
|
||||||
|
totals = _amd_vram_gtt_totals(device)
|
||||||
|
if totals is None:
|
||||||
|
return True
|
||||||
|
vram_total, gtt_total = totals
|
||||||
|
if not vram_total or vram_total <= 0:
|
||||||
|
return True
|
||||||
|
return gtt_total >= vram_total
|
||||||
|
|
||||||
def amd_min_version(device=None, min_rdna_version=0):
|
def amd_min_version(device=None, min_rdna_version=0):
|
||||||
if not is_amd():
|
if not is_amd():
|
||||||
return False
|
return False
|
||||||
@ -567,6 +647,15 @@ if cpu_state != CPUState.GPU:
|
|||||||
if cpu_state == CPUState.MPS:
|
if cpu_state == CPUState.MPS:
|
||||||
vram_state = VRAMState.SHARED
|
vram_state = VRAMState.SHARED
|
||||||
|
|
||||||
|
if vram_state == VRAMState.NORMAL_VRAM and is_integrated_gpu() and integrated_gpu_is_shared_heavy():
|
||||||
|
# Integrated/UMA GPU whose shared GTT/host-RAM pool dominates the (small)
|
||||||
|
# dedicated VRAM carveout: treat as UMA and use SHARED so the shared pool is
|
||||||
|
# not double-counted as dedicated VRAM (#14274). Dedicated-heavy integrated
|
||||||
|
# parts (large BIOS UMA carveout, e.g. Strix Halo) keep NORMAL_VRAM where
|
||||||
|
# HIGH_VRAM is correct.
|
||||||
|
vram_state = VRAMState.SHARED
|
||||||
|
logging.info("Integrated GPU with shared-memory-dominant pool detected (UMA): using SHARED vram state to avoid double-counting GTT/shared memory as dedicated VRAM.")
|
||||||
|
|
||||||
logging.info(f"Set vram state to: {vram_state.name}")
|
logging.info(f"Set vram state to: {vram_state.name}")
|
||||||
|
|
||||||
DISABLE_SMART_MEMORY = args.disable_smart_memory
|
DISABLE_SMART_MEMORY = args.disable_smart_memory
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user