From a2db31582f65bf50ef87b344ceabe42481c39831 Mon Sep 17 00:00:00 2001
From: liminfei-amd <91481003+liminfei-amd@users.noreply.github.com>
Date: Sun, 7 Jun 2026 12:20:20 +0800
Subject: [PATCH 1/2] model_management: treat shared-memory-dominant integrated
 GPUs as SHARED vram

On AMD APUs (and other integrated GPUs) the "VRAM" reported by
torch.cuda.mem_get_info() is the GTT/shared aperture carved out of host
RAM, not dedicated on-board memory. ComfyUI starts such devices in
NORMAL_VRAM and later sums device VRAM plus system RAM when sizing the
model-load budget, so on a UMA (unified memory architecture) part the same
physical RAM is counted twice and the inflated budget triggers HIGH_VRAM /
gpu-only placement that OOMs the shared pool.

Detecting integrated GPUs alone is not enough: integrated parts vary widely
in how memory is split. Some (large BIOS UMA carveout, e.g. Strix Halo)
report most memory as dedicated mem_info_vram_total, where HIGH_VRAM is
right; others report a small VRAM carveout with the bulk in GTT, where
SHARED is right. Demoting every integrated GPU to SHARED would regress the
dedicated-heavy configs.

Key the demotion on how the amdgpu mem_info_vram_total and
mem_info_gtt_total sysfs totals compare: only when an integrated GPU's
shared (GTT) pool is at least as large as its dedicated VRAM do we switch it
to VRAMState.SHARED. Dedicated-heavy integrated parts and discrete GPUs keep
NORMAL_VRAM. When the sysfs totals cannot be read (e.g. NVIDIA Tegra, which
has no dedicated VRAM, or a host where no amdgpu node can be found or
matched to the device) the integrated device is treated as shared-heavy,
matching the unified memory such parts normally have.

Fixes #14274

Signed-off-by: liminfei-amd <91481003+liminfei-amd@users.noreply.github.com>
---
 comfy/model_management.py | 89 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)

diff --git a/comfy/model_management.py b/comfy/model_management.py
index 8e786c0a5..34cf3f71f 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -432,6 +432,86 @@ def is_amd():
             return True
     return False
 
+def is_integrated_gpu(device=None):
+    # AMD APUs / integrated GPUs expose host RAM (GTT/shared) as device memory
+    # via mem_get_info(); torch flags these as integrated. See ComfyUI #14274.
+    if cpu_state != CPUState.GPU:
+        return False
+    if not (is_nvidia() or is_amd()):
+        return False
+    try:
+        if device is None:
+            device = get_torch_device()
+        return bool(getattr(torch.cuda.get_device_properties(device), "is_integrated", 0))
+    except Exception:
+        return False
+
+def _amd_vram_gtt_totals(device=None):
+    # Best-effort (vram_total, gtt_total) in bytes from the amdgpu sysfs nodes
+    # mem_info_vram_total / mem_info_gtt_total, or None when they cannot be read
+    # (e.g. NVIDIA Tegra integrated parts that have no dedicated VRAM). #14274
+    if not is_amd():
+        return None
+    try:
+        drm_root = "/sys/class/drm"
+        candidates = []
+        for name in os.listdir(drm_root):
+            if not (name.startswith("card") and name[len("card"):].isdigit()):
+                continue
+            dev_dir = os.path.join(drm_root, name, "device")
+            vram_path = os.path.join(dev_dir, "mem_info_vram_total")
+            gtt_path = os.path.join(dev_dir, "mem_info_gtt_total")
+            if not (os.path.exists(vram_path) and os.path.exists(gtt_path)):
+                continue
+            try:
+                with open(os.path.join(dev_dir, "vendor")) as vf:
+                    if vf.read().strip().lower() != "0x1002":
+                        continue
+            except OSError:
+                pass
+            candidates.append((os.path.basename(os.path.realpath(dev_dir)), vram_path, gtt_path))
+        if not candidates:
+            return None
+        chosen = None
+        bus_id = None
+        try:
+            if device is None:
+                device = get_torch_device()
+            bus_id = getattr(torch.cuda.get_device_properties(device), "pci_bus_id", None)
+        except Exception:
+            bus_id = None
+        if bus_id:
+            bus_id = str(bus_id).lower()
+            for pci, vram_path, gtt_path in candidates:
+                if pci.lower().endswith(bus_id) or bus_id.endswith(pci.lower()):
+                    chosen = (vram_path, gtt_path)
+                    break
+        if chosen is None and len(candidates) == 1:
+            chosen = (candidates[0][1], candidates[0][2])
+        if chosen is None:
+            return None
+        with open(chosen[0]) as f:
+            vram_total = int(f.read().strip())
+        with open(chosen[1]) as f:
+            gtt_total = int(f.read().strip())
+        return (vram_total, gtt_total)
+    except Exception:
+        return None
+
+def integrated_gpu_is_shared_heavy(device=None):
+    # For an integrated GPU, decide whether its memory is dominated by the shared
+    # GTT/host-RAM aperture (treat as UMA -> SHARED) or by a large dedicated VRAM
+    # carveout (keep NORMAL/HIGH_VRAM). Keys on the amdgpu mem_info_vram_total vs
+    # mem_info_gtt_total ratio (ComfyUI #14274). Defaults to True when the totals
+    # are unavailable (e.g. NVIDIA Tegra parts that have no dedicated VRAM).
+    totals = _amd_vram_gtt_totals(device)
+    if totals is None:
+        return True
+    vram_total, gtt_total = totals
+    if not vram_total or vram_total <= 0:
+        return True
+    return gtt_total >= vram_total
+
 def amd_min_version(device=None, min_rdna_version=0):
     if not is_amd():
         return False
@@ -567,6 +647,15 @@ if cpu_state != CPUState.GPU:
 if cpu_state == CPUState.MPS:
     vram_state = VRAMState.SHARED
 
+if vram_state == VRAMState.NORMAL_VRAM and is_integrated_gpu() and integrated_gpu_is_shared_heavy():
+    # Integrated/UMA GPU whose shared GTT/host-RAM pool dominates the (small)
+    # dedicated VRAM carveout: treat as UMA and use SHARED so the shared pool is
+    # not double-counted as dedicated VRAM (#14274). Dedicated-heavy integrated
+    # parts (large BIOS UMA carveout, e.g. Strix Halo) keep NORMAL_VRAM where
+    # HIGH_VRAM is correct.
+    vram_state = VRAMState.SHARED
+    logging.info("Integrated GPU with shared-memory-dominant pool detected (UMA): using SHARED vram state to avoid double-counting GTT/shared memory as dedicated VRAM.")
+
 logging.info(f"Set vram state to: {vram_state.name}")
 
 DISABLE_SMART_MEMORY = args.disable_smart_memory

From 50d77af3afc0a69e2958cf75455525dc2b491fbf Mon Sep 17 00:00:00 2001
From: liminfei-amd <91481003+liminfei-amd@users.noreply.github.com>
Date: Mon, 22 Jun 2026 13:41:58 +0800
Subject: [PATCH 2/2] model_management: match AMD GPU by canonical PCI BDF, not
 str(pci_bus_id)

The _amd_vram_gtt_totals() device match compared str(pci_bus_id) against the
sysfs leaf BDF, but torch reports pci_bus_id as a plain integer holding only
the bus number, while the sysfs PCI device node behind each amdgpu card is
named with the full hex "domain:bus:device.function" BDF, so the comparison
never matched. A single-GPU host was rescued by the len(candidates) == 1
fallback; a hybrid / multi-GPU host has no fallback and could fall through to
shared-heavy, demoting a dedicated GPU to SHARED (reported for a GPU sitting
behind a PCIe bridge).

Build the canonical hex BDF from torch's integer pci_domain_id / pci_bus_id /
pci_device_id and compare it against the candidate's realpath leaf BDF (PCI
function stripped). realpath already collapses any bridge chain to the leaf,
so this works for directly-attached, behind-a-bridge, and multi-GPU hosts
alike. The len(candidates) == 1 fallback is kept.

Signed-off-by: liminfei-amd <91481003+liminfei-amd@users.noreply.github.com>

#14274
---
 comfy/model_management.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/comfy/model_management.py b/comfy/model_management.py
index 34cf3f71f..2f93f1211 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -473,17 +473,29 @@ def _amd_vram_gtt_totals(device=None):
         if not candidates:
             return None
         chosen = None
-        bus_id = None
+        target_bdf = None
         try:
             if device is None:
                 device = get_torch_device()
-            bus_id = getattr(torch.cuda.get_device_properties(device), "pci_bus_id", None)
+            props = torch.cuda.get_device_properties(device)
+            # torch reports the PCI location as integers (pci_domain_id / pci_bus_id
+            # / pci_device_id); amdgpu names its sysfs nodes as a hex
+            # "domain:bus:device.function" BDF. Build the canonical hex BDF so the
+            # two are comparable (the old str(pci_bus_id) compared a decimal bus
+            # number against a hex BDF string and could never match). #14274
+            target_bdf = "%04x:%02x:%02x" % (
+                int(getattr(props, "pci_domain_id", 0) or 0),
+                int(getattr(props, "pci_bus_id", 0) or 0),
+                int(getattr(props, "pci_device_id", 0) or 0),
+            )
         except Exception:
-            bus_id = None
-        if bus_id:
-            bus_id = str(bus_id).lower()
+            target_bdf = None
+        if target_bdf:
             for pci, vram_path, gtt_path in candidates:
-                if pci.lower().endswith(bus_id) or bus_id.endswith(pci.lower()):
+                # candidates carry the realpath() leaf BDF (domain:bus:device.function),
+                # so matching the domain:bus:device part works whether the GPU is
+                # attached directly or sits behind a PCIe bridge (nested sysfs path). #14274
+                if pci.lower().rsplit(".", 1)[0] == target_bdf:
                     chosen = (vram_path, gtt_path)
                     break
         if chosen is None and len(candidates) == 1: