# ------------------- Hide ROCm/HIP ------------------- import sys import os os.environ.pop("ROCM_HOME", None) os.environ.pop("HIP_HOME", None) os.environ.pop("ROCM_VERSION", None) paths = os.environ["PATH"].split(";") paths_no_rocm = [p for p in paths if "rocm" not in p.lower()] os.environ["PATH"] = ";".join(paths_no_rocm) # ------------------- End ROCm/HIP Hiding ------------- # Fix for cublasLt errors on newer ZLUDA (if no hipblaslt) os.environ['DISABLE_ADDMM_CUDA_LT'] = '1' # ------------------- main imports ------------------- # main imports import torch import ctypes import shutil import subprocess import importlib.metadata from functools import wraps from typing import Union, List from enum import Enum # ------------------- main imports ------------------- # ------------------- ComfyUI Frontend Version Check ------------------- def get_package_version(package_name): try: from importlib.metadata import version return version(package_name) except ImportError: from importlib_metadata import version return version(package_name) # Check and install/update comfyui-frontend-package required_frontend_version = "1.18.6" frontend_package_name = "comfyui-frontend-package" try: installed_version = get_package_version(frontend_package_name) # print(f"\nInstalled version of {frontend_package_name}: {installed_version}") from packaging import version if version.parse(installed_version) < version.parse(required_frontend_version): import subprocess import sys subprocess.check_call([sys.executable, '-m', 'pip', 'install', f'{frontend_package_name}=={required_frontend_version}', '--quiet', '--upgrade']) print(f"\nComfyUI Frontend Package outdated. Upgraded to {required_frontend_version}.") except Exception: import subprocess import sys subprocess.check_call([sys.executable, '-m', 'pip', 'install', f'{frontend_package_name}=={required_frontend_version}', '--quiet']) print("\nComfyUI Frontend Package was missing. Installed it.") # Check and install/update comfyui-workflow-templates required_templates_version = "0.1.3" templates_package_name = "comfyui-workflow-templates" try: installed_version = get_package_version(templates_package_name) # print(f"Installed version of {templates_package_name}: {installed_version}") from packaging import version if version.parse(installed_version) < version.parse(required_templates_version): import subprocess import sys subprocess.check_call([sys.executable, '-m', 'pip', 'install', f'{templates_package_name}=={required_templates_version}', '--quiet', '--upgrade']) print(f"\nComfyUI Workflow Templates outdated. Upgraded to {required_templates_version}.") except Exception: import subprocess import sys subprocess.check_call([sys.executable, '-m', 'pip', 'install', f'{templates_package_name}=={required_templates_version}', '--quiet']) print("\nComfyUI Workflow Templates was missing. Installed it.") # ------------------- End Frontend Version Check ------------------- # ------------------- Triton Setup ------------------- print("\n :: ------------------------ ZLUDA ----------------------- :: ") try: import triton import triton.language as tl print(" :: Triton core imported successfully") @triton.jit def _zluda_kernel_test(x_ptr, y_ptr, n_elements, BLOCK_SIZE: tl.constexpr): pid = tl.program_id(axis=0) offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) mask = offsets < n_elements x = tl.load(x_ptr + offsets, mask=mask) tl.store(y_ptr + offsets, x + 1, mask=mask) def _verify_triton() -> bool: try: print(" :: Running Triton kernel test...") x = torch.ones(64, device='cuda') y = torch.empty_like(x) _zluda_kernel_test[(1,)](x, y, x.numel(), BLOCK_SIZE=64) if torch.allclose(y, x + 1): print(" :: Triton kernel test passed successfully") return True print(" :: Triton kernel test failed (incorrect output)") return False except Exception as e: print(f" :: Triton test failed: {str(e)}") return False triton_available = _verify_triton() if triton_available: print(" :: Triton initialized successfully") os.environ['FLASH_ATTENTION_TRITON_AMD_AUTOTUNE'] = 'TRUE' else: print(" :: Triton available but failed verification") except ImportError: print(" :: Triton not installed") triton_available = False except Exception as e: print(f" :: Triton initialization failed: {str(e)}") triton_available = False # ------------------- End Triton Verification ------------------- # ------------------- ZLUDA Detection ------------------- zluda_device_name = torch.cuda.get_device_name() if torch.cuda.is_available() else "" is_zluda = zluda_device_name.endswith("[ZLUDA]") # ------------------- End Detection -------------------- # # ------------------- ZLUDA Core Implementation ------------------- MEM_BUS_WIDTH = { "AMD Radeon RX 9070 XT": 256, "AMD Radeon RX 9070": 256, "AMD Radeon RX 9060 XT": 192, "AMD Radeon RX 7900 XTX": 384, "AMD Radeon RX 7900 XT": 320, "AMD Radeon RX 7900 GRE": 256, "AMD Radeon RX 7800 XT": 256, "AMD Radeon RX 7700 XT": 192, "AMD Radeon RX 7700": 192, "AMD Radeon RX 7650 GRE": 128, "AMD Radeon RX 7600 XT": 128, "AMD Radeon RX 7600": 128, "AMD Radeon RX 7500 XT": 96, "AMD Radeon RX 6950 XT": 256, "AMD Radeon RX 6900 XT": 256, "AMD Radeon RX 6800 XT": 256, "AMD Radeon RX 6800": 256, "AMD Radeon RX 6750 XT": 192, "AMD Radeon RX 6700 XT": 192, "AMD Radeon RX 6700": 160, "AMD Radeon RX 6650 XT": 128, "AMD Radeon RX 6600 XT": 128, "AMD Radeon RX 6600": 128, "AMD Radeon RX 6500 XT": 64, "AMD Radeon RX 6400": 64, } # ------------------- Device Properties Implementation ------------------- class DeviceProperties: PROPERTIES_OVERRIDE = {"regs_per_multiprocessor": 65535, "gcnArchName": "UNKNOWN ARCHITECTURE"} internal: torch._C._CudaDeviceProperties def __init__(self, props: torch._C._CudaDeviceProperties): self.internal = props def __getattr__(self, name): if name in DeviceProperties.PROPERTIES_OVERRIDE: return DeviceProperties.PROPERTIES_OVERRIDE[name] return getattr(self.internal, name) # ------------------- Audio Ops Patch ------------------- if is_zluda: _torch_stft = torch.stft _torch_istft = torch.istft def z_stft(input: torch.Tensor, window: torch.Tensor, *args, **kwargs): return _torch_stft(input=input.cpu(), window=window.cpu(), *args, **kwargs).to(input.device) def z_istft(input: torch.Tensor, window: torch.Tensor, *args, **kwargs): return _torch_istft(input=input.cpu(), window=window.cpu(), *args, **kwargs).to(input.device) def z_jit(f, *_, **__): f.graph = torch._C.Graph() return f torch._dynamo.config.suppress_errors = True torch.stft = z_stft torch.istft = z_istft torch.jit.script = z_jit # ------------------- End Audio Patch ------------------- # ------------------- Top-K Fallback Patch ------------------- if is_zluda: _topk = torch.topk def safe_topk(input: torch.Tensor, *args, **kwargs): device = input.device values, indices = _topk(input.cpu(), *args, **kwargs) return torch.return_types.topk((values.to(device), indices.to(device),)) torch.topk = safe_topk # ------------------- End Top-K Patch ------------------- # ------------------- ONNX Runtime Patch ------------------- try: import onnxruntime as ort if is_zluda: print(" :: Patching ONNX Runtime for ZLUDA — disabling CUDA EP.") # Store original get_available_providers original_get_available_providers = ort.get_available_providers def filtered_providers(): return [ep for ep in original_get_available_providers() if ep != "CUDAExecutionProvider"] # Patch ONLY the _pybind_state version (used during session creation) ort.capi._pybind_state.get_available_providers = filtered_providers # Wrap InferenceSession to force CPU provider when CUDA is explicitly requested OriginalSession = ort.InferenceSession class SafeInferenceSession(OriginalSession): def __init__(self, *args, providers=None, **kwargs): if providers and "CUDAExecutionProvider" in providers: print(" :: Forcing ONNX to use CPUExecutionProvider instead of CUDA.") providers = ["CPUExecutionProvider"] super().__init__(*args, providers=providers, **kwargs) ort.InferenceSession = SafeInferenceSession except ImportError: print(" :: ONNX Runtime not installed — skipping patch.") except Exception as e: print(" :: Failed to patch ONNX Runtime:", e) # ------------------- End ONNX Patch ------------------- # ------------------- ZLUDA hijack --------------------- do_nothing = lambda _: None def do_hijack(): if not is_zluda: return print(f" :: Using ZLUDA with device: {zluda_device_name}") print(" :: Applying core ZLUDA patches...") # 2. Triton optimizations if triton_available: print(" :: Initializing Triton optimizations") try: # General Triton config print(" :: Configuring Triton device properties...") _get_props = triton.runtime.driver.active.utils.get_device_properties def patched_props(device): props = _get_props(device) name = torch.cuda.get_device_name()[:-8] # Remove [ZLUDA] props["mem_bus_width"] = MEM_BUS_WIDTH.get(name, 128) if name not in MEM_BUS_WIDTH: print(f' :: Using default mem_bus_width=128 for {name}') return props triton.runtime.driver.active.utils.get_device_properties = patched_props print(" :: Triton device properties configured") # Flash Attention flash_enabled = False try: from comfy.flash_attn_triton_amd import interface_fa print(" :: Flash attention components found") original_sdpa = torch.nn.functional.scaled_dot_product_attention def amd_flash_wrapper(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None): try: if (query.shape[-1] <= 128 and not attn_mask and query.dtype != torch.float32): if scale is None: scale = query.shape[-1] ** -0.5 return interface_fa.fwd( query.transpose(1, 2), key.transpose(1, 2), value.transpose(1, 2), None, None, dropout_p, scale, is_causal, -1, -1, 0.0, False, None )[0].transpose(1, 2) except Exception as e: print(f' :: Flash attention error: {str(e)}') return original_sdpa(query=query, key=key, value=value, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal, scale=scale) torch.nn.functional.scaled_dot_product_attention = amd_flash_wrapper flash_enabled = True print(" :: AMD flash attention enabled successfully") except ImportError: print(" :: Flash attention components not installed") except Exception as e: print(f" :: Flash attention setup failed: {str(e)}") # Other Triton optimizations if not flash_enabled: print(" :: Applying basic Triton optimizations") # Add other Triton optimizations here # ... except Exception as e: print(f" :: Triton optimization failed: {str(e)}") else: print(" :: Triton optimizations skipped (not available)") # 3. Common configurations print(" :: Configuring PyTorch backends...") torch.backends.cuda.enable_mem_efficient_sdp(False) torch.backends.cuda.enable_mem_efficient_sdp = do_nothing torch.backends.cudnn.enabled = True if hasattr(torch.backends.cuda, "enable_flash_sdp"): torch.backends.cuda.enable_flash_sdp(True) print(" :: Disabled CUDA flash attention") if hasattr(torch.backends.cuda, "enable_math_sdp"): torch.backends.cuda.enable_math_sdp(True) print(" :: Enabled math attention fallback") print(" :: ZLUDA initialization complete") print(" :: ------------------------ ZLUDA ----------------------- :: \n") if is_zluda: do_hijack() else: print(f" :: CUDA device detected: {zluda_device_name or 'None'}")