Merge 9dd4d35e80 into 72e3f6081c

Add downscale ratio to empty ltxv latent. (#13999 )
Adding new StringFormat node (#13997 )
2026-06-18 05:49:41 +08:00 · 2026-05-20 00:45:52 -04:00 · 2026-05-19 20:28:06 -07:00 · 2026-05-20 10:25:49 +08:00 · 2026-05-19 16:55:04 -07:00 · 2026-05-19 14:48:47 -07:00
11 changed files with 895 additions and 10 deletions
--- a/README.md
+++ b/README.md
@ -421,6 +421,14 @@ Use `--tls-keyfile key.pem --tls-certfile cert.pem` to enable TLS/SSL, the app w
 > Note: Windows users can use [alexisrolland/docker-openssl](https://github.com/alexisrolland/docker-openssl) or one of the [3rd party binary distributions](https://wiki.openssl.org/index.php/Binaries) to run the command example above.
 <br/><br/>If you use a container, note that the volume mount `-v` can be a relative path so `... -v ".\:/openssl-certs" ...` would create the key & cert files in the current directory of your command prompt or powershell terminal.

+## How to run heavy workflow on mid range GPU (NVIDIA-Linux)?
+
+Use the `--enable-gds` flag to activate NVIDIA [GPUDirect Storage](https://docs.nvidia.com/gpudirect-storage/) (GDS), which allows data to be transferred directly between SSDs and GPUs. This eliminates traditional CPU-mediated data paths, significantly reducing I/O latency and CPU overhead. System RAM will still be utilized for caching to further optimize performance, along with SSD.
+
+This feature is tested on NVIDIA GPUs on Linux based system only.
+
+Requires: `cupy-cuda12x>=12.0.0`, `pynvml>=11.4.1`, `cudf>=23.0.0`, `numba>=0.57.0`, `nvidia-ml-py>=12.0.0`.
+
 ## Support and dev channel

 [Discord](https://comfy.org/discord): Try the #help or #feedback channels.
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@ -159,6 +159,17 @@ parser.add_argument("--default-hashing-function", type=str, choices=['md5', 'sha
 parser.add_argument("--disable-smart-memory", action="store_true", help="Force ComfyUI to agressively offload to regular ram instead of keeping models in vram when it can.")
 parser.add_argument("--deterministic", action="store_true", help="Make pytorch use slower deterministic algorithms when it can. Note that this might not make images deterministic in all cases.")

+# GPUDirect Storage (GDS) arguments
+gds_group = parser.add_argument_group('gds', 'GPUDirect Storage options for direct SSD-to-GPU model loading')
+gds_group.add_argument("--enable-gds", action="store_true", help="Enable GPUDirect Storage for direct SSD-to-GPU model loading (requires CUDA 11.4+, cuFile).")
+gds_group.add_argument("--disable-gds", action="store_true", help="Explicitly disable GPUDirect Storage.")
+gds_group.add_argument("--gds-min-file-size", type=int, default=100, help="Minimum file size in MB to use GDS (default: 100MB).")
+gds_group.add_argument("--gds-chunk-size", type=int, default=64, help="GDS transfer chunk size in MB (default: 64MB).")
+gds_group.add_argument("--gds-streams", type=int, default=4, help="Number of CUDA streams for GDS operations (default: 4).")
+gds_group.add_argument("--gds-prefetch", action="store_true", help="Enable GDS prefetching for better performance.")
+gds_group.add_argument("--gds-no-fallback", action="store_true", help="Disable fallback to CPU loading if GDS fails.")
+gds_group.add_argument("--gds-stats", action="store_true", help="Print GDS statistics on exit.")
+
 class PerformanceFeature(enum.Enum):
    Fp16Accumulation = "fp16_accumulation"
    Fp8MatrixMultiplication = "fp8_matrix_mult"
--- a/comfy/gds_loader.py
+++ b/comfy/gds_loader.py
@ -0,0 +1,494 @@
+# copyright 2025 Maifee Ul Asad @ github.com/maifeeulasad
+# copyright under GNU GENERAL PUBLIC LICENSE, Version 3, 29 June 2007
+
+"""
+GPUDirect Storage (GDS) Integration for ComfyUI
+Direct SSD-to-GPU model loading without RAM/CPU bottlenecks
+Still there will be some CPU/RAM usage, mostly for safetensors parsing and small buffers.
+
+This module provides GPUDirect Storage functionality to load models directly
+from NVMe SSDs to GPU memory, bypassing system RAM and CPU.
+"""
+
+import os
+import logging
+import torch
+import time
+from typing import Optional, Dict, Any, Union
+from pathlib import Path
+import safetensors
+import gc
+import mmap
+from dataclasses import dataclass
+
+try:
+    import cupy
+    import cupy.cuda.runtime as cuda_runtime
+    CUPY_AVAILABLE = True
+except ImportError:
+    CUPY_AVAILABLE = False
+    logging.warning("CuPy not available. GDS will use fallback mode.")
+
+try:
+    import cudf  # RAPIDS for GPU dataframes
+    RAPIDS_AVAILABLE = True
+except ImportError:
+    RAPIDS_AVAILABLE = False
+
+try:
+    import pynvml
+    pynvml.nvmlInit()
+    NVML_AVAILABLE = True
+except ImportError:
+    NVML_AVAILABLE = False
+    logging.warning("NVIDIA-ML-Py not available. GPU monitoring disabled.")
+
+@dataclass
+class GDSConfig:
+    """Configuration for GPUDirect Storage"""
+    enabled: bool = True
+    min_file_size_mb: int = 100  # Only use GDS for files larger than this
+    chunk_size_mb: int = 64      # Size of chunks to transfer
+    use_pinned_memory: bool = True
+    prefetch_enabled: bool = True
+    compression_aware: bool = True
+    max_concurrent_streams: int = 4
+    fallback_to_cpu: bool = True
+    show_stats: bool = False     # Whether to show stats on exit
+
+
+class GDSError(Exception):
+    """GDS-specific errors"""
+    pass
+
+
+class GPUDirectStorage:
+    """
+    GPUDirect Storage implementation for ComfyUI
+    Enables direct SSD-to-GPU transfers for model loading
+    """
+    
+    def __init__(self, config: Optional[GDSConfig] = None):
+        self.config = config or GDSConfig()
+        self.device = torch.cuda.current_device() if torch.cuda.is_available() else None
+        self.cuda_streams = []
+        self.pinned_buffers = {}
+        self.stats = {
+            'gds_loads': 0,
+            'fallback_loads': 0,
+            'total_bytes_gds': 0,
+            'total_time_gds': 0.0,
+            'avg_bandwidth_gbps': 0.0
+        }
+        
+        # Initialize GDS if available
+        self._gds_available = self._check_gds_availability()
+        if self._gds_available:
+            self._init_gds()
+        else:
+            logging.warning("GDS not available, using fallback methods")
+    
+    def _check_gds_availability(self) -> bool:
+        """Check if GDS is available on the system"""
+        if not torch.cuda.is_available():
+            return False
+        
+        if not CUPY_AVAILABLE:
+            return False
+        
+        # Check for GPUDirect Storage support
+        try:
+            # Check CUDA version (GDS requires CUDA 11.4+)
+            cuda_version = torch.version.cuda
+            if cuda_version:
+                major, minor = map(int, cuda_version.split('.')[:2])
+                if major < 11 or (major == 11 and minor < 4):
+                    logging.warning(f"CUDA {cuda_version} detected. GDS requires CUDA 11.4+")
+                    return False
+            
+            # Check if cuFile is available (part of CUDA toolkit)
+            try:
+                import cupy.cuda.cufile as cufile
+                # Try to initialize cuFile
+                cufile.initialize()
+                return True
+            except (ImportError, RuntimeError) as e:
+                logging.warning(f"cuFile not available: {e}")
+                return False
+                
+        except Exception as e:
+            logging.warning(f"GDS availability check failed: {e}")
+            return False
+    
+    def _init_gds(self):
+        """Initialize GDS resources"""
+        try:
+            # Create CUDA streams for async operations
+            for i in range(self.config.max_concurrent_streams):
+                stream = torch.cuda.Stream()
+                self.cuda_streams.append(stream)
+            
+            # Pre-allocate pinned memory buffers
+            if self.config.use_pinned_memory:
+                self._allocate_pinned_buffers()
+            
+            logging.info(f"GDS initialized with {len(self.cuda_streams)} streams")
+            
+        except Exception as e:
+            logging.error(f"Failed to initialize GDS: {e}")
+            self._gds_available = False
+    
+    def _allocate_pinned_buffers(self):
+        """Pre-allocate pinned memory buffers for staging"""
+        try:
+            # Allocate buffers of different sizes
+            buffer_sizes = [16, 32, 64, 128, 256]  # MB
+            
+            for size_mb in buffer_sizes:
+                size_bytes = size_mb * 1024 * 1024
+                # Allocate pinned memory using CuPy
+                if CUPY_AVAILABLE:
+                    buffer = cupy.cuda.alloc_pinned_memory(size_bytes)
+                    self.pinned_buffers[size_mb] = buffer
+                    
+        except Exception as e:
+            logging.warning(f"Failed to allocate pinned buffers: {e}")
+    
+    def _get_file_size(self, file_path: str) -> int:
+        """Get file size in bytes"""
+        return os.path.getsize(file_path)
+    
+    def _should_use_gds(self, file_path: str) -> bool:
+        """Determine if GDS should be used for this file"""
+        if not self._gds_available or not self.config.enabled:
+            return False
+        
+        file_size_mb = self._get_file_size(file_path) / (1024 * 1024)
+        return file_size_mb >= self.config.min_file_size_mb
+    
+    def _load_with_gds(self, file_path: str) -> Dict[str, torch.Tensor]:
+        """Load model using GPUDirect Storage"""
+        start_time = time.time()
+        
+        try:
+            if file_path.lower().endswith(('.safetensors', '.sft')):
+                return self._load_safetensors_gds(file_path)
+            else:
+                return self._load_pytorch_gds(file_path)
+                
+        except Exception as e:
+            logging.error(f"GDS loading failed for {file_path}: {e}")
+            if self.config.fallback_to_cpu:
+                logging.info("Falling back to CPU loading")
+                self.stats['fallback_loads'] += 1
+                return self._load_fallback(file_path)
+            else:
+                raise GDSError(f"GDS loading failed: {e}")
+        finally:
+            load_time = time.time() - start_time
+            self.stats['total_time_gds'] += load_time
+    
+    def _load_safetensors_gds(self, file_path: str) -> Dict[str, torch.Tensor]:
+        """Load safetensors file using GDS"""
+        try:
+            import cupy.cuda.cufile as cufile
+            
+            # Open file with cuFile for direct GPU loading
+            with cufile.CuFileManager() as manager:
+                # Memory-map the file for efficient access
+                with open(file_path, 'rb') as f:
+                    # Use mmap for large files
+                    with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mmapped_file:
+                        
+                        # Parse safetensors header
+                        header_size = int.from_bytes(mmapped_file[:8], 'little')
+                        header_bytes = mmapped_file[8:8+header_size]
+                        
+                        import json
+                        header = json.loads(header_bytes.decode('utf-8'))
+                        
+                        # Load tensors directly to GPU
+                        tensors = {}
+                        data_offset = 8 + header_size
+                        
+                        for name, info in header.items():
+                            if name == "__metadata__":
+                                continue
+                            
+                            dtype_map = {
+                                'F32': torch.float32,
+                                'F16': torch.float16,
+                                'BF16': torch.bfloat16,
+                                'I8': torch.int8,
+                                'I16': torch.int16,
+                                'I32': torch.int32,
+                                'I64': torch.int64,
+                                'U8': torch.uint8,
+                            }
+                            
+                            dtype = dtype_map.get(info['dtype'], torch.float32)
+                            shape = info['shape']
+                            start_offset = data_offset + info['data_offsets'][0]
+                            end_offset = data_offset + info['data_offsets'][1]
+                            
+                            # Direct GPU allocation
+                            tensor = torch.empty(shape, dtype=dtype, device=f'cuda:{self.device}')
+                            
+                            # Use cuFile for direct transfer
+                            tensor_bytes = end_offset - start_offset
+                            
+                            # Get GPU memory pointer
+                            gpu_ptr = tensor.data_ptr()
+                            
+                            # Direct file-to-GPU transfer
+                            cufile.copy_from_file(
+                                gpu_ptr,
+                                mmapped_file[start_offset:end_offset],
+                                tensor_bytes
+                            )
+                            
+                            tensors[name] = tensor
+                        
+                        self.stats['gds_loads'] += 1
+                        self.stats['total_bytes_gds'] += self._get_file_size(file_path)
+                        
+                        return tensors
+                        
+        except Exception as e:
+            logging.error(f"GDS safetensors loading failed: {e}")
+            raise
+    
+    def _load_pytorch_gds(self, file_path: str) -> Dict[str, torch.Tensor]:
+        """Load PyTorch file using GDS with staging"""
+        try:
+            # For PyTorch files, we need to use a staging approach
+            # since torch.load doesn't support direct GPU loading
+            
+            # Load to pinned memory first
+            with open(file_path, 'rb') as f:
+                file_size = self._get_file_size(file_path)
+                
+                # Choose appropriate buffer or allocate new one
+                buffer_size_mb = min(256, max(64, file_size // (1024 * 1024)))
+                
+                if buffer_size_mb in self.pinned_buffers:
+                    pinned_buffer = self.pinned_buffers[buffer_size_mb]
+                else:
+                    # Allocate temporary pinned buffer
+                    pinned_buffer = cupy.cuda.alloc_pinned_memory(file_size)
+                
+                # Read file to pinned memory
+                f.readinto(pinned_buffer)
+                
+                # Use torch.load with map_location to specific GPU
+                # This will be faster due to pinned memory
+                state_dict = torch.load(
+                    f,
+                    map_location=f'cuda:{self.device}',
+                    weights_only=True
+                )
+                
+                self.stats['gds_loads'] += 1
+                self.stats['total_bytes_gds'] += file_size
+                
+                return state_dict
+                
+        except Exception as e:
+            logging.error(f"GDS PyTorch loading failed: {e}")
+            raise
+    
+    def _load_fallback(self, file_path: str) -> Dict[str, torch.Tensor]:
+        """Fallback loading method using standard approaches"""
+        if file_path.lower().endswith(('.safetensors', '.sft')):
+            # Use safetensors with device parameter
+            with safetensors.safe_open(file_path, framework="pt", device=f'cuda:{self.device}') as f:
+                return {k: f.get_tensor(k) for k in f.keys()}
+        else:
+            # Standard PyTorch loading
+            return torch.load(file_path, map_location=f'cuda:{self.device}', weights_only=True)
+    
+    def load_model(self, file_path: str, device: Optional[torch.device] = None) -> Dict[str, torch.Tensor]:
+        """
+        Main entry point for loading models with GDS
+        
+        Args:
+            file_path: Path to the model file
+            device: Target device (if None, uses current CUDA device)
+            
+        Returns:
+            Dictionary of tensors loaded directly to GPU
+        """
+        if device is not None and device.type == 'cuda':
+            self.device = device.index or 0
+        
+        if self._should_use_gds(file_path):
+            logging.info(f"Loading {file_path} with GDS")
+            return self._load_with_gds(file_path)
+        else:
+            logging.info(f"Loading {file_path} with standard method")
+            self.stats['fallback_loads'] += 1
+            return self._load_fallback(file_path)
+    
+    def prefetch_model(self, file_path: str) -> bool:
+        """
+        Prefetch model to GPU memory cache (if supported)
+        
+        Args:
+            file_path: Path to the model file
+            
+        Returns:
+            True if prefetch was successful
+        """
+        if not self.config.prefetch_enabled or not self._gds_available:
+            return False
+        
+        try:
+            # Basic prefetch implementation
+            # This would ideally use NVIDIA's GPUDirect Storage API
+            # to warm up the storage cache
+            
+            file_size = self._get_file_size(file_path)
+            logging.info(f"Prefetching {file_path} ({file_size // (1024*1024)} MB)")
+            
+            # Read file metadata to warm caches
+            with open(file_path, 'rb') as f:
+                # Read first and last chunks to trigger prefetch
+                f.read(1024 * 1024)  # First 1MB
+                f.seek(-min(1024 * 1024, file_size), 2)  # Last 1MB
+                f.read()
+            
+            return True
+            
+        except Exception as e:
+            logging.warning(f"Prefetch failed for {file_path}: {e}")
+            return False
+    
+    def get_stats(self) -> Dict[str, Any]:
+        """Get loading statistics"""
+        total_loads = self.stats['gds_loads'] + self.stats['fallback_loads']
+        
+        if self.stats['total_time_gds'] > 0 and self.stats['total_bytes_gds'] > 0:
+            bandwidth_gbps = (self.stats['total_bytes_gds'] / (1024**3)) / self.stats['total_time_gds']
+            self.stats['avg_bandwidth_gbps'] = bandwidth_gbps
+        
+        return {
+            **self.stats,
+            'total_loads': total_loads,
+            'gds_usage_percent': (self.stats['gds_loads'] / max(1, total_loads)) * 100,
+            'gds_available': self._gds_available,
+            'config': self.config.__dict__
+        }
+    
+    def cleanup(self):
+        """Clean up GDS resources"""
+        try:
+            # Clear CUDA streams
+            for stream in self.cuda_streams:
+                stream.synchronize()
+            self.cuda_streams.clear()
+            
+            # Free pinned buffers
+            for buffer in self.pinned_buffers.values():
+                if CUPY_AVAILABLE:
+                    cupy.cuda.free_pinned_memory(buffer)
+            self.pinned_buffers.clear()
+            
+            # Force garbage collection
+            gc.collect()
+            torch.cuda.empty_cache()
+            
+        except Exception as e:
+            logging.warning(f"GDS cleanup failed: {e}")
+    
+    def __del__(self):
+        """Destructor to ensure cleanup"""
+        self.cleanup()
+
+
+# Global GDS instance
+_gds_instance: Optional[GPUDirectStorage] = None
+
+
+def get_gds_instance(config: Optional[GDSConfig] = None) -> GPUDirectStorage:
+    """Get or create the global GDS instance"""
+    global _gds_instance
+    
+    if _gds_instance is None:
+        _gds_instance = GPUDirectStorage(config)
+    
+    return _gds_instance
+
+
+def load_torch_file_gds(ckpt: str, safe_load: bool = False, device: Optional[torch.device] = None) -> Dict[str, torch.Tensor]:
+    """
+    GDS-enabled replacement for comfy.utils.load_torch_file
+    
+    Args:
+        ckpt: Path to checkpoint file
+        safe_load: Whether to use safe loading (for compatibility)
+        device: Target device
+        
+    Returns:
+        Dictionary of loaded tensors
+    """
+    gds = get_gds_instance()
+    
+    try:
+        # Load with GDS
+        return gds.load_model(ckpt, device)
+        
+    except Exception as e:
+        logging.error(f"GDS loading failed, falling back to standard method: {e}")
+        # Fallback to original method
+        import comfy.utils
+        return comfy.utils.load_torch_file(ckpt, safe_load=safe_load, device=device)
+
+
+def prefetch_model_gds(file_path: str) -> bool:
+    """Prefetch model for faster loading"""
+    gds = get_gds_instance()
+    return gds.prefetch_model(file_path)
+
+
+def get_gds_stats() -> Dict[str, Any]:
+    """Get GDS statistics"""
+    gds = get_gds_instance()
+    return gds.get_stats()
+
+
+def configure_gds(config: GDSConfig):
+    """Configure GDS settings"""
+    global _gds_instance
+    _gds_instance = GPUDirectStorage(config)
+
+
+def init_gds(config: GDSConfig):
+    """
+    Initialize GPUDirect Storage with the provided configuration
+    
+    Args:
+        config: GDSConfig object with initialization parameters
+    """
+    try:
+        # Configure GDS
+        configure_gds(config)
+        logging.info(f"GDS initialized: enabled={config.enabled}, min_size={config.min_file_size_mb}MB, streams={config.max_concurrent_streams}")
+        
+        # Set up exit handler for stats if requested
+        if hasattr(config, 'show_stats') and config.show_stats:
+            import atexit
+            def print_gds_stats():
+                stats = get_gds_stats()
+                logging.info("=== GDS Statistics ===")
+                logging.info(f"Total loads: {stats['total_loads']}")
+                logging.info(f"GDS loads: {stats['gds_loads']} ({stats['gds_usage_percent']:.1f}%)")
+                logging.info(f"Fallback loads: {stats['fallback_loads']}")
+                logging.info(f"Total bytes via GDS: {stats['total_bytes_gds'] / (1024**3):.2f} GB")
+                logging.info(f"Average bandwidth: {stats['avg_bandwidth_gbps']:.2f} GB/s")
+                logging.info("===================")
+            atexit.register(print_gds_stats)
+            
+    except ImportError as e:
+        logging.warning(f"GDS initialization failed - missing dependencies: {e}")
+    except Exception as e:
+        logging.error(f"GDS initialization failed: {e}")
--- a/comfy/ops.py
+++ b/comfy/ops.py
@ -260,7 +260,7 @@ def resolve_cast_module_with_vbar(s, dtype, device, bias_dtype, compute_dtype, w


 def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, offloadable=False, compute_dtype=None, want_requant=False):
-    # NOTE: offloadable=False is a a legacy and if you are a custom node author reading this please pass
+    # NOTE: offloadable=False is a legacy mode and if you are a custom node author reading this please pass
    # offloadable=True and call uncast_bias_weight() after your last usage of the weight/bias. This
    # will add async-offload support to your cast and improve performance.
    if input is not None:
--- a/comfy/utils.py
+++ b/comfy/utils.py
@ -120,6 +120,18 @@ def load_safetensors(ckpt):


 def load_torch_file(ckpt, safe_load=False, device=None, return_metadata=False):
+    # Try GDS loading first if available and device is GPU
+    if device is not None and device.type == 'cuda':
+        try:
+            from . import gds_loader
+            gds_result = gds_loader.load_torch_file_gds(ckpt, safe_load=safe_load, device=device)
+            if return_metadata:
+                # For GDS, we return empty metadata for now (can be enhanced)
+                return (gds_result, {})
+            return gds_result
+        except Exception as e:
+            logging.debug(f"GDS loading failed, using fallback: {e}")
+    
    if device is None:
        device = torch.device("cpu")
    metadata = None
--- a/comfy_extras/nodes_gds.py
+++ b/comfy_extras/nodes_gds.py
@ -0,0 +1,293 @@
+# copyright 2025 Maifee Ul Asad @ github.com/maifeeulasad
+# copyright under GNU GENERAL PUBLIC LICENSE, Version 3, 29 June 2007
+
+"""
+Enhanced model loading nodes with GPUDirect Storage support
+"""
+
+import logging
+import time
+import asyncio
+from typing import Optional, Dict, Any
+
+import torch
+import folder_paths
+import comfy.sd
+import comfy.utils
+from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict
+
+
+class CheckpointLoaderGDS(ComfyNodeABC):
+    """
+    Enhanced checkpoint loader with GPUDirect Storage support
+    Provides direct SSD-to-GPU loading and prefetching capabilities
+    """
+    
+    @classmethod
+    def INPUT_TYPES(s) -> InputTypeDict:
+        return {
+            "required": {
+                "ckpt_name": (folder_paths.get_filename_list("checkpoints"), {
+                    "tooltip": "The name of the checkpoint (model) to load with GDS optimization."
+                }),
+            },
+            "optional": {
+                "prefetch": ("BOOLEAN", {
+                    "default": False, 
+                    "tooltip": "Prefetch model to GPU cache for faster loading."
+                }),
+                "use_gds": ("BOOLEAN", {
+                    "default": True,
+                    "tooltip": "Use GPUDirect Storage if available."
+                }),
+                "target_device": (["auto", "cuda:0", "cuda:1", "cuda:2", "cuda:3", "cpu"], {
+                    "default": "auto",
+                    "tooltip": "Target device for model loading."
+                })
+            }
+        }
+    
+    RETURN_TYPES = ("MODEL", "CLIP", "VAE", "STRING")
+    RETURN_NAMES = ("model", "clip", "vae", "load_info")
+    OUTPUT_TOOLTIPS = (
+        "The model used for denoising latents.",
+        "The CLIP model used for encoding text prompts.",
+        "The VAE model used for encoding and decoding images to and from latent space.",
+        "Loading information and statistics."
+    )
+    FUNCTION = "load_checkpoint_gds"
+    CATEGORY = "loaders/advanced"
+    DESCRIPTION = "Enhanced checkpoint loader with GPUDirect Storage support for direct SSD-to-GPU loading."
+    EXPERIMENTAL = True
+
+    def load_checkpoint_gds(self, ckpt_name: str, prefetch: bool = False, use_gds: bool = True, target_device: str = "auto"):
+        start_time = time.time()
+        
+        ckpt_path = folder_paths.get_full_path_or_raise("checkpoints", ckpt_name)
+        
+        # Determine target device
+        if target_device == "auto":
+            device = None  # Let the system decide
+        elif target_device == "cpu":
+            device = torch.device("cpu")
+        else:
+            device = torch.device(target_device)
+        
+        load_info = {
+            "file": ckpt_name,
+            "path": ckpt_path,
+            "target_device": str(device) if device else "auto",
+            "gds_enabled": use_gds,
+            "prefetch_used": prefetch
+        }
+        
+        try:
+            # Prefetch if requested
+            if prefetch and use_gds:
+                try:
+                    from comfy.gds_loader import prefetch_model_gds
+                    prefetch_success = prefetch_model_gds(ckpt_path)
+                    load_info["prefetch_success"] = prefetch_success
+                    if prefetch_success:
+                        logging.info(f"Prefetched {ckpt_name} to GPU cache")
+                except Exception as e:
+                    logging.warning(f"Prefetch failed for {ckpt_name}: {e}")
+                    load_info["prefetch_error"] = str(e)
+            
+            # Load checkpoint with potential GDS optimization
+            if use_gds and device and device.type == 'cuda':
+                try:
+                    from comfy.gds_loader import get_gds_instance
+                    gds = get_gds_instance()
+                    
+                    # Check if GDS should be used for this file
+                    if gds._should_use_gds(ckpt_path):
+                        load_info["loader_used"] = "GDS"
+                        logging.info(f"Loading {ckpt_name} with GDS")
+                    else:
+                        load_info["loader_used"] = "Standard"
+                        logging.info(f"Loading {ckpt_name} with standard method (file too small for GDS)")
+                        
+                except Exception as e:
+                    logging.warning(f"GDS check failed, using standard loading: {e}")
+                    load_info["loader_used"] = "Standard (GDS failed)"
+            else:
+                load_info["loader_used"] = "Standard"
+            
+            # Load the actual checkpoint
+            out = comfy.sd.load_checkpoint_guess_config(
+                ckpt_path, 
+                output_vae=True, 
+                output_clip=True, 
+                embedding_directory=folder_paths.get_folder_paths("embeddings")
+            )
+            
+            load_time = time.time() - start_time
+            load_info["load_time_seconds"] = round(load_time, 3)
+            load_info["load_success"] = True
+            
+            # Format load info as string
+            info_str = f"Loaded: {ckpt_name}\n"
+            info_str += f"Method: {load_info['loader_used']}\n"
+            info_str += f"Time: {load_info['load_time_seconds']}s\n"
+            info_str += f"Device: {load_info['target_device']}"
+            
+            if "prefetch_success" in load_info:
+                info_str += f"\nPrefetch: {'✓' if load_info['prefetch_success'] else '✗'}"
+            
+            logging.info(f"Checkpoint loaded: {ckpt_name} in {load_time:.3f}s using {load_info['loader_used']}")
+            
+            return (*out[:3], info_str)
+            
+        except Exception as e:
+            load_info["load_success"] = False
+            load_info["error"] = str(e)
+            error_str = f"Failed to load: {ckpt_name}\nError: {str(e)}"
+            logging.error(f"Checkpoint loading failed: {e}")
+            raise RuntimeError(error_str)
+
+
+class ModelPrefetcher(ComfyNodeABC):
+    """
+    Node for prefetching models to GPU cache
+    """
+    
+    @classmethod
+    def INPUT_TYPES(s) -> InputTypeDict:
+        return {
+            "required": {
+                "checkpoint_names": ("STRING", {
+                    "multiline": True,
+                    "default": "",
+                    "tooltip": "List of checkpoint names to prefetch (one per line)."
+                }),
+                "prefetch_enabled": ("BOOLEAN", {
+                    "default": True,
+                    "tooltip": "Enable/disable prefetching."
+                })
+            }
+        }
+    
+    RETURN_TYPES = ("STRING",)
+    RETURN_NAMES = ("prefetch_report",)
+    OUTPUT_TOOLTIPS = ("Report of prefetch operations.",)
+    FUNCTION = "prefetch_models"
+    CATEGORY = "loaders/advanced"
+    DESCRIPTION = "Prefetch multiple models to GPU cache for faster loading."
+    OUTPUT_NODE = True
+
+    def prefetch_models(self, checkpoint_names: str, prefetch_enabled: bool = True):
+        if not prefetch_enabled:
+            return ("Prefetching disabled",)
+        
+        # Parse checkpoint names
+        names = [name.strip() for name in checkpoint_names.split('\n') if name.strip()]
+        
+        if not names:
+            return ("No checkpoints specified for prefetching",)
+        
+        try:
+            from comfy.gds_loader import prefetch_model_gds
+        except ImportError:
+            return ("GDS not available for prefetching",)
+        
+        results = []
+        successful_prefetches = 0
+        
+        for name in names:
+            try:
+                ckpt_path = folder_paths.get_full_path_or_raise("checkpoints", name)
+                success = prefetch_model_gds(ckpt_path)
+                
+                if success:
+                    results.append(f"✓ {name}")
+                    successful_prefetches += 1
+                else:
+                    results.append(f"✗ {name} (prefetch failed)")
+                    
+            except Exception as e:
+                results.append(f"✗ {name} (error: {str(e)[:50]})")
+        
+        report = f"Prefetch Report ({successful_prefetches}/{len(names)} successful):\n"
+        report += "\n".join(results)
+        
+        return (report,)
+
+
+class GDSStats(ComfyNodeABC):
+    """
+    Node for displaying GDS statistics
+    """
+    
+    @classmethod
+    def INPUT_TYPES(s) -> InputTypeDict:
+        return {
+            "required": {
+                "refresh": ("BOOLEAN", {
+                    "default": False,
+                    "tooltip": "Refresh statistics."
+                })
+            }
+        }
+    
+    RETURN_TYPES = ("STRING",)
+    RETURN_NAMES = ("stats_report",)
+    OUTPUT_TOOLTIPS = ("GDS statistics and performance report.",)
+    FUNCTION = "get_stats"
+    CATEGORY = "utils/advanced"
+    DESCRIPTION = "Display GPUDirect Storage statistics and performance metrics."
+    OUTPUT_NODE = True
+
+    def get_stats(self, refresh: bool = False):
+        try:
+            from comfy.gds_loader import get_gds_stats
+            stats = get_gds_stats()
+            
+            report = "=== GPUDirect Storage Statistics ===\n\n"
+            
+            # Availability
+            report += f"GDS Available: {'✓' if stats['gds_available'] else '✗'}\n"
+            
+            # Usage statistics
+            report += f"Total Loads: {stats['total_loads']}\n"
+            report += f"GDS Loads: {stats['gds_loads']} ({stats['gds_usage_percent']:.1f}%)\n"
+            report += f"Fallback Loads: {stats['fallback_loads']}\n\n"
+            
+            # Performance metrics
+            if stats['total_bytes_gds'] > 0:
+                gb_transferred = stats['total_bytes_gds'] / (1024**3)
+                report += f"Data Transferred: {gb_transferred:.2f} GB\n"
+                report += f"Average Bandwidth: {stats['avg_bandwidth_gbps']:.2f} GB/s\n"
+                report += f"Total GDS Time: {stats['total_time_gds']:.2f}s\n\n"
+            
+            # Configuration
+            config = stats.get('config', {})
+            if config:
+                report += "Configuration:\n"
+                report += f"- Enabled: {config.get('enabled', 'Unknown')}\n"
+                report += f"- Min File Size: {config.get('min_file_size_mb', 'Unknown')} MB\n"
+                report += f"- Chunk Size: {config.get('chunk_size_mb', 'Unknown')} MB\n"
+                report += f"- Max Streams: {config.get('max_concurrent_streams', 'Unknown')}\n"
+                report += f"- Prefetch: {config.get('prefetch_enabled', 'Unknown')}\n"
+                report += f"- Fallback: {config.get('fallback_to_cpu', 'Unknown')}\n"
+            
+            return (report,)
+            
+        except ImportError:
+            return ("GDS module not available",)
+        except Exception as e:
+            return (f"Error retrieving GDS stats: {str(e)}",)
+
+
+# Node mappings
+NODE_CLASS_MAPPINGS = {
+    "CheckpointLoaderGDS": CheckpointLoaderGDS,
+    "ModelPrefetcher": ModelPrefetcher,
+    "GDSStats": GDSStats,
+}
+
+NODE_DISPLAY_NAME_MAPPINGS = {
+    "CheckpointLoaderGDS": "Load Checkpoint (GDS)",
+    "ModelPrefetcher": "Model Prefetcher",
+    "GDSStats": "GDS Statistics",
+}
--- a/comfy_extras/nodes_lt.py
+++ b/comfy_extras/nodes_lt.py
@ -77,7 +77,7 @@ class EmptyLTXVLatentVideo(io.ComfyNode):
    @classmethod
    def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput:
        latent = torch.zeros([batch_size, 128, ((length - 1) // 8) + 1, height // 32, width // 32], device=comfy.model_management.intermediate_device())
-        return io.NodeOutput({"samples": latent})
+        return io.NodeOutput({"samples": latent, "downscale_ratio_spacial": 32})

    generate = execute  # TODO: remove

--- a/comfy_extras/nodes_string.py
+++ b/comfy_extras/nodes_string.py
@ -1,10 +1,41 @@
 import re
 import json
+import string
 from typing_extensions import override

 from comfy_api.latest import ComfyExtension, io


+class StringFormat(io.ComfyNode):
+    @classmethod
+    def define_schema(cls) -> io.Schema:
+        autogrow = io.Autogrow.TemplateNames(
+            input=io.AnyType.Input("value"),
+            names=list(string.ascii_lowercase),
+            min=0,
+        )
+        return io.Schema(
+            node_id="StringFormat",
+            display_name="Format Text",
+            category="text",
+            search_aliases=["string", "format"],
+            description="Same as Python's string format method. Supports all of Python's format options and features.",
+            inputs=[
+                io.Autogrow.Input("values", template=autogrow),
+                io.String.Input("f_string", default="{a}", multiline=True),
+            ],
+            outputs=[
+                io.String.Output(),
+            ],
+        )
+
+    @classmethod
+    def execute(
+        cls, values: io.Autogrow.Type, f_string: str
+    ) -> io.NodeOutput:
+        return io.NodeOutput(f_string.format(**values))
+
+
 class StringConcatenate(io.ComfyNode):
    @classmethod
    def define_schema(cls):
@ -413,6 +444,7 @@ class StringExtension(ComfyExtension):
    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
        return [
+            StringFormat,
            StringConcatenate,
            StringSubstring,
            StringLength,
--- a/main.py
+++ b/main.py
@ -212,6 +212,34 @@ import comfyui_version
 import app.logger
 import hook_breaker_ac10a0

+# Initialize GPUDirect Storage if enabled
+def init_gds():
+    """Initialize GPUDirect Storage based on CLI arguments"""
+    if hasattr(args, 'disable_gds') and args.disable_gds:
+        logging.info("GDS explicitly disabled via --disable-gds")
+        return
+    
+    if not hasattr(args, 'enable_gds') and not hasattr(args, 'gds_prefetch') and not hasattr(args, 'gds_stats'):
+        # GDS not explicitly requested, use auto-detection
+        return
+    
+    if hasattr(args, 'enable_gds') and args.enable_gds:
+        from comfy.gds_loader import GDSConfig, init_gds as gds_init
+        
+        config = GDSConfig(
+            enabled=getattr(args, 'enable_gds', False) or getattr(args, 'gds_prefetch', False),
+            min_file_size_mb=getattr(args, 'gds_min_file_size', 100),
+            chunk_size_mb=getattr(args, 'gds_chunk_size', 64),
+            max_concurrent_streams=getattr(args, 'gds_streams', 4),
+            prefetch_enabled=getattr(args, 'gds_prefetch', True),
+            fallback_to_cpu=not getattr(args, 'gds_no_fallback', False),
+            show_stats=getattr(args, 'gds_stats', False)
+        )
+        
+        gds_init(config)
+
+# Initialize GDS
+init_gds()
 import comfy.memory_management
 import comfy.model_patcher

--- a/nodes.py
+++ b/nodes.py
@ -2414,6 +2414,7 @@ async def init_builtin_extra_nodes():
        "nodes_model_patch.py",
        "nodes_easycache.py",
        "nodes_audio_encoder.py",
+        "nodes_gds.py",
        "nodes_rope.py",
        "nodes_logic.py",
        "nodes_resolution.py",
--- a/openapi.yaml
+++ b/openapi.yaml
@ -4160,6 +4160,10 @@ paths:
                name:
                  type: string
                  description: Display name for the API key
+                description:
+                  type: string
+                  description: User-provided description of the key's purpose
+                  maxLength: 5000
      responses:
        "201":
          description: API key created
@ -6351,14 +6355,6 @@ components:
          type: integer
          format: int64
          description: Size of the asset in bytes
-        width:
-          type: integer
-          nullable: true
-          description: "Original image width in pixels. Null for non-image assets or assets ingested before dimension extraction."
-        height:
-          type: integer
-          nullable: true
-          description: "Original image height in pixels. Null for non-image assets or assets ingested before dimension extraction."
        mime_type:
          type: string
          description: MIME type of the asset
@ -7685,11 +7681,16 @@ components:
      required:
        - id
        - name
+        - description
      properties:
        id:
          type: string
        name:
          type: string
+        description:
+          type: string
+          maxLength: 5000
+          description: User-provided description of the key's purpose. Always present in responses; empty string when no description was supplied on create.
        prefix:
          type: string
          description: First few characters of the key for identification
@ -7710,12 +7711,17 @@ components:
      required:
        - id
        - name
+        - description
        - key
      properties:
        id:
          type: string
        name:
          type: string
+        description:
+          type: string
+          maxLength: 5000
+          description: User-provided description of the key's purpose. Always present in responses; empty string when no description was supplied on create.
        key:
          type: string
          description: Full API key value (only returned on creation)
Author	SHA1	Message	Date
Maifee Ul Asad	b25f4bb527	Merge `9dd4d35e80` into `72e3f6081c`	2026-05-20 00:45:52 -04:00
comfyanonymous	72e3f6081c	Add downscale ratio to empty ltxv latent. (#13999 ) Some checks are pending Python Linting / Run Ruff (push) Waiting to run Details Python Linting / Run Pylint (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.10, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.11, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.12, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-unix-nightly (12.1, , linux, 3.11, [self-hosted Linux], nightly) (push) Waiting to run Details Execution Tests / test (macos-latest) (push) Waiting to run Details Execution Tests / test (ubuntu-latest) (push) Waiting to run Details Execution Tests / test (windows-latest) (push) Waiting to run Details Test server launches without errors / test (push) Waiting to run Details Unit Tests / test (macos-latest) (push) Waiting to run Details Unit Tests / test (ubuntu-latest) (push) Waiting to run Details Unit Tests / test (windows-2022) (push) Waiting to run Details	2026-05-19 20:28:06 -07:00
Pauan	7ec7b6ffe9	Adding new StringFormat node (#13997 )	2026-05-20 10:25:49 +08:00
Matt Miller	6887165a9d	docs(openapi): tighten workspace API key description field (BE-1004) (#13996 ) Aligns the OSS spec with the cloud-side BE-1004 contract: - createWorkspaceApiKey request body: add maxLength: 5000 to the description property (matches cloud's hub_profile.description MaxLen(5000) convention; enforced cloud-side via handler check). - WorkspaceApiKey + WorkspaceApiKeyCreated response schemas: mark description as required (cloud's handler always populates the field, defaulting to empty string when not supplied on create), drop nullable: true, add maxLength: 5000 for symmetry, and clarify the doc string ("Always present in responses; empty string when no description was supplied on create"). Both schemas are tagged x-runtime: [cloud] at the schema level so the tightening is correctly scoped — OSS-only implementations are not required to honor the workspace API keys endpoints at all. Related cloud PR: Comfy-Org/cloud#3747	2026-05-19 16:55:04 -07:00
Matt Miller	cc4d711eb1	feat(openapi): add optional description field to workspace API key schemas (#13993 ) Some checks are pending Python Linting / Run Ruff (push) Waiting to run Details Python Linting / Run Pylint (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.10, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.11, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-stable (12.1, , linux, 3.12, [self-hosted Linux], stable) (push) Waiting to run Details Full Comfy CI Workflow Runs / test-unix-nightly (12.1, , linux, 3.11, [self-hosted Linux], nightly) (push) Waiting to run Details Execution Tests / test (macos-latest) (push) Waiting to run Details Execution Tests / test (ubuntu-latest) (push) Waiting to run Details Execution Tests / test (windows-latest) (push) Waiting to run Details Test server launches without errors / test (push) Waiting to run Details Unit Tests / test (macos-latest) (push) Waiting to run Details Unit Tests / test (ubuntu-latest) (push) Waiting to run Details Unit Tests / test (windows-2022) (push) Waiting to run Details * feat(openapi): add optional description field to workspace API key schemas Add an optional `description` property (type: string) to three workspace API key schemas in openapi.yaml: - Inline request body of createWorkspaceApiKey (POST /api/workspace/api-keys) - WorkspaceApiKey (list/info schema) - WorkspaceApiKeyCreated (creation response schema) The field is not added to any `required` array, making it fully backward-compatible with existing clients. Refs: BE-1005, BE-1004 Co-authored-by: Matt Miller <mattmillerai@users.noreply.github.com> * fix(openapi): mark description nullable in workspace API key response schemas Per CodeRabbit review on PR #13993: the underlying DB column is nullable varchar (default ''), so the response schemas should permit null to match stored data reality. Without nullable: true the OpenAPI contract would require coercion on the handler side or risk a contract violation. Request schema unchanged — clients shouldn't be sending null on create.	2026-05-19 14:48:47 -07:00
yy	626b082838	Fix typo in ops.py (#11925 )	2026-05-20 05:45:04 +08:00
Matt Miller	d0328b442d	docs(openapi): remove top-level width/height fields on Asset schema (#13973 ) These two fields were added recently to the Asset schema as nullable integers, with the intent of exposing original image dimensions for FE consumers (cloud-side thumbnailing makes naturalWidth/Height return the wrong size for an image card's dimension label). The implementation effort that consumes them subsequently converged on a different shape — dimensions nested under the existing free-form `metadata` JSON field as `{kind: "image", width, height}` — to avoid introducing type-specific flat fields on the canonical Asset shape, and to leave room for forward-compatible additions (video duration, fps, etc.) without further schema churn. This removes the now-unused top-level fields so the spec reflects the agreed direction. No other schema definitions reference these fields directly: AssetCreated, AssetUpdated, etc. inherit Asset via allOf and do not redefine them. The runtime ingest implementation that would have populated these fields was not yet shipped, so no clients are relying on the top-level shape. Co-authored-by: Alexis Rolland <alexisrolland@hotmail.com>	2026-05-19 10:00:26 -07:00
Maifee Ul Asad	9dd4d35e80	[cleanup]: removed duplicate deps from merging;	2026-03-29 18:31:41 +06:00
Maifee Ul Asad	e9d082a228	Merge branch 'master' into offloader-maifee	2026-03-28 13:05:32 +06:00
Maifee Ul Asad	7602203696	Merge branch 'Comfy-Org:master' into offloader-maifee	2026-01-11 12:59:53 +06:00
Maifee Ul Asad	ffa7a369ba	Merge branch 'Comfy-Org:master' into offloader-maifee	2026-01-07 21:26:04 +06:00
Maifee Ul Asad	7ec1656735	Merge branch 'comfyanonymous:master' into offloader-maifee	2025-12-07 15:32:22 +06:00
Maifee Ul Asad	cee75f301a	Merge branch 'comfyanonymous:master' into offloader-maifee	2025-11-27 08:47:41 +06:00
Maifee Ul Asad	1a59686ca8	Merge branch 'comfyanonymous:master' into offloader-maifee	2025-11-25 22:09:53 +06:00
Maifee Ul Asad	6d96d26795	Merge branch 'comfyanonymous:master' into offloader-maifee	2025-11-25 22:08:51 +06:00
Maifee Ul Asad	e07a32c9b8	Merge branch 'master' into offloader-maifee	2025-11-09 17:25:46 +06:00
Maifee Ul Asad	a19f0a88e4	refactor(gds): add `show_stats` option; moved GDS initialization to dedicated file;	2025-10-12 00:54:04 +06:00
Maifee Ul Asad	64811809a0	docs: updated GDS environment limitations; - works only on linux and nvidia	2025-10-11 21:51:24 +06:00
Maifee Ul Asad	529109083a	docs: added GDS docs and deps documentation;	2025-10-10 22:23:13 +06:00
Maifee Ul Asad	a7be9f6fc3	review: remove GDS-related dependencies from requirements.txt	2025-10-10 22:08:58 +06:00
Maifee Ul Asad	6075c44ec8	feat(gds): add GDS-related dependencies to requirements.txt	2025-10-08 14:43:55 +06:00
Maifee Ul Asad	154b73835a	feat(gds): implement GPUDirect Storage initialization based on CLI arguments	2025-10-08 14:40:59 +06:00
Maifee Ul Asad	862e7784f4	feat(gds): add nodes_gds.py to built-in extra nodes initialization	2025-10-08 14:40:41 +06:00
Maifee Ul Asad	f6b6636bf3	feat(gds): implement GDS loading fallback in load_torch_file function; - need to work with tensorflow and other formats - afaik, almost all models shared now is in torch format - converting types should not be that big of a deal	2025-10-08 14:40:29 +06:00
Maifee Ul Asad	83b00df3f0	feat(gds): add GPUDirect Storage options for SSD-to-GPU model loading;	2025-10-08 14:39:22 +06:00
Maifee Ul Asad	5f24eb699c	feat(gds): implement GPUDirect Storage for efficient model loading	2025-10-08 14:38:49 +06:00
Maifee Ul Asad	fab0954077	feat(gds): add GPUDirect Storage support for model loading and prefetching - limited to NVIDIA GPUs only	2025-10-08 14:38:38 +06:00