use native mmap

strint 2025-10-21 17:00:56 +08:00
parent fff56de63c
commit 08e094ed81
2 changed files with 355 additions and 3 deletions

comfy/model_patcher.py

@@ -27,7 +27,10 @@ import uuid
 from typing import Callable, Optional
 import torch
-import tensordict
+import os
+import tempfile
+import weakref
+import gc
 import comfy.float
 import comfy.hooks
@@ -39,8 +42,77 @@ from comfy.comfy_types import UnetWrapperFunction
 from comfy.patcher_extension import CallbacksMP, PatcherInjection, WrappersMP
 from comfy.model_management import get_free_memory

-def to_mmap(t: torch.Tensor) -> tensordict.MemoryMappedTensor:
-    return tensordict.MemoryMappedTensor.from_tensor(t)
+def to_mmap(t: torch.Tensor, filename: Optional[str] = None) -> torch.Tensor:
+    """
+    Convert a tensor to a memory-mapped CPU tensor using PyTorch's native mmap support.
+    """
+    # Move to CPU if needed
+    if t.is_cuda:
+        cpu_tensor = t.cpu()
+    else:
+        cpu_tensor = t
+
+    # Create temporary file
+    if filename is None:
+        temp_file = tempfile.mktemp(suffix='.pt', prefix='comfy_mmap_')
+    else:
+        temp_file = filename
+
+    # Save tensor to file
+    torch.save(cpu_tensor, temp_file)
+
+    # If we created a CPU copy from CUDA, delete it to free memory
+    if t.is_cuda:
+        del cpu_tensor
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+    # Load with mmap - this doesn't load all data into RAM
+    mmap_tensor = torch.load(temp_file, map_location='cpu', mmap=True, weights_only=False)
+
+    # Register cleanup callback
+    def _cleanup():
+        try:
+            if os.path.exists(temp_file):
+                os.remove(temp_file)
+                logging.debug(f"Cleaned up mmap file: {temp_file}")
+        except Exception:
+            pass
+    weakref.finalize(mmap_tensor, _cleanup)
+
+    # Save original 'to' method
+    original_to = mmap_tensor.to
+
+    # Create custom 'to' method that cleans up file when moving to CUDA
+    def custom_to(*args, **kwargs):
+        # Determine target device
+        target_device = None
+        if len(args) > 0:
+            if isinstance(args[0], torch.device):
+                target_device = args[0]
+            elif isinstance(args[0], str):
+                target_device = torch.device(args[0])
+        if 'device' in kwargs:
+            target_device = kwargs['device']
+            if isinstance(target_device, str):
+                target_device = torch.device(target_device)
+
+        # Call original 'to' method first to move data
+        result = original_to(*args, **kwargs)
+
+        # If moved to CUDA, cleanup the mmap file after the move
+        if target_device is not None and target_device.type == 'cuda':
+            _cleanup()
+        return result
+
+    # Replace the 'to' method
+    mmap_tensor.to = custom_to
+    return mmap_tensor
+

 def model_to_mmap(model: torch.nn.Module):
     """Convert all parameters and buffers to memory-mapped tensors

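A minimal usage sketch of the two helpers (illustrative only, not part of the commit; assumes a CUDA-capable PyTorch build and that comfy.model_patcher is importable):

# Sketch: offload a CUDA tensor to a disk-backed mmap tensor and bring it back.
import torch
from comfy.model_patcher import to_mmap, model_to_mmap

weight = torch.randn(4096, 4096, device="cuda")  # ~64 MB float32 tensor on the GPU
offloaded = to_mmap(weight)        # saved to a temp .pt file, reloaded with mmap=True
del weight                         # the CUDA copy can now be released
print(offloaded.device)            # cpu (pages are read from disk on demand)

restored = offloaded.to("cuda")    # the patched .to() also removes the temp file

# Whole-module variant: per the test below, model_to_mmap replaces every parameter
# and buffer in place (via Module._apply) and returns the same module object.
module = torch.nn.Linear(1024, 1024, bias=False).cuda()
module = model_to_mmap(module)     # parameters are now memory-mapped CPU tensors
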
New test file

@@ -0,0 +1,280 @@
import pytest
import torch
import torch.nn as nn
import psutil
import os
import gc
import tempfile

from comfy.model_patcher import model_to_mmap, to_mmap

class LargeModel(nn.Module):
    """A simple model with large parameters for testing memory mapping"""

    def __init__(self, size_gb=10):
        super().__init__()
        # Calculate number of float32 elements needed for target size
        # 1 GB = 1024^3 bytes, float32 = 4 bytes
        bytes_per_gb = 1024 * 1024 * 1024
        elements_per_gb = bytes_per_gb // 4  # float32 is 4 bytes
        total_elements = int(size_gb * elements_per_gb)

        # Create a large linear layer
        # Split into multiple layers to avoid single tensor size limits
        self.layers = nn.ModuleList()
        elements_per_layer = 500 * 1024 * 1024  # 500M elements per layer (~2GB)
        num_layers = (total_elements + elements_per_layer - 1) // elements_per_layer

        for i in range(num_layers):
            if i == num_layers - 1:
                # Last layer gets the remaining elements
                remaining = total_elements - (i * elements_per_layer)
                in_features = int(remaining ** 0.5)
                out_features = (remaining + in_features - 1) // in_features
            else:
                in_features = int(elements_per_layer ** 0.5)
                out_features = (elements_per_layer + in_features - 1) // in_features
            # Create layer without bias to control size precisely
            self.layers.append(nn.Linear(in_features, out_features, bias=False))

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x
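
For reference, a worked example of the sizing arithmetic above (not in the original test; the values follow directly from the constants in __init__ with the default size_gb=10):

# Worked example for size_gb=10 (derived from the constants above):
#   elements_per_gb    = 1024**3 // 4              = 268_435_456
#   total_elements     = 10 * 268_435_456          = 2_684_354_560
#   elements_per_layer = 500 * 1024 * 1024         = 524_288_000  (~2 GB of float32)
#   num_layers         = ceil(total / per_layer)   = 6
# Layers 0-4: in_features = int(524_288_000 ** 0.5) = 22_897, out_features = 22_898
# Layer 5:    remaining = 2_684_354_560 - 5 * 524_288_000 = 62_914_560,
#             in_features = 7_931, out_features = 7_933
# The total allocated is slightly above 10 GB, which satisfies the >= 9.5 GB assertion below.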

def get_process_memory_gb():
    """Get current process memory usage in GB"""
    process = psutil.Process(os.getpid())
    mem_info = process.memory_info()
    return mem_info.rss / (1024 ** 3)  # Convert to GB


def get_model_size_gb(model):
    """Calculate model size in GB"""
    total_size = 0
    for param in model.parameters():
        total_size += param.nelement() * param.element_size()
    for buffer in model.buffers():
        total_size += buffer.nelement() * buffer.element_size()
    return total_size / (1024 ** 3)

def test_model_to_mmap_memory_efficiency():
    """Test that model_to_mmap reduces memory usage for a 10GB model to less than 1GB

    The typical use case is:
    1. Load a large model on CUDA
    2. Convert to mmap to offload from GPU to disk-backed memory
    3. This frees GPU memory and reduces CPU RAM usage
    """
    # Check if CUDA is available
    if not torch.cuda.is_available():
        pytest.skip("CUDA is not available, skipping test")

    # Force garbage collection before starting
    gc.collect()
    torch.cuda.empty_cache()

    # Record initial memory
    initial_cpu_memory = get_process_memory_gb()
    initial_gpu_memory = torch.cuda.memory_allocated() / (1024 ** 3)
    print(f"\nInitial CPU memory: {initial_cpu_memory:.2f} GB")
    print(f"Initial GPU memory: {initial_gpu_memory:.2f} GB")

    # Create a 10GB model
    print("Creating 10GB model...")
    model = LargeModel(size_gb=10)

    # Verify model size
    model_size = get_model_size_gb(model)
    print(f"Model size: {model_size:.2f} GB")
    assert model_size >= 9.5, f"Model size {model_size:.2f} GB is less than expected 10 GB"

    # Move model to CUDA
    print("Moving model to CUDA...")
    model = model.cuda()
    torch.cuda.synchronize()

    # Memory after moving to CUDA
    cpu_after_cuda = get_process_memory_gb()
    gpu_after_cuda = torch.cuda.memory_allocated() / (1024 ** 3)
    print(f"CPU memory after moving to CUDA: {cpu_after_cuda:.2f} GB")
    print(f"GPU memory after moving to CUDA: {gpu_after_cuda:.2f} GB")

    # Convert to mmap (this should move model from GPU to disk-backed memory)
    # Note: model_to_mmap modifies the model in-place via _apply()
    # so model and model_mmap will be the same object
    print("Converting model to mmap...")
    model_mmap = model_to_mmap(model)

    # Verify that model and model_mmap are the same object (in-place modification)
    assert model is model_mmap, "model_to_mmap should modify the model in-place"

    # Force garbage collection and clear CUDA cache
    # The original CUDA tensors should be automatically freed when replaced
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

    # Memory after mmap conversion
    cpu_after_mmap = get_process_memory_gb()
    gpu_after_mmap = torch.cuda.memory_allocated() / (1024 ** 3)
    print(f"CPU memory after mmap: {cpu_after_mmap:.2f} GB")
    print(f"GPU memory after mmap: {gpu_after_mmap:.2f} GB")

    # Calculate memory changes from CUDA state (the baseline we're converting from)
    cpu_increase = cpu_after_mmap - cpu_after_cuda
    gpu_decrease = gpu_after_cuda - gpu_after_mmap  # Should be positive (freed)
    print(f"\nCPU memory increase from CUDA: {cpu_increase:.2f} GB")
    print(f"GPU memory freed: {gpu_decrease:.2f} GB")

    # Verify that CPU memory usage increase is less than 1GB
    # The mmap should use disk-backed storage, keeping CPU RAM usage low
    # We use a 1.5 GB threshold to account for overhead
    assert cpu_increase < 1.5, (
        f"CPU memory increase after mmap ({cpu_increase:.2f} GB) should be less than 1.5 GB. "
        f"CUDA state: {cpu_after_cuda:.2f} GB, After mmap: {cpu_after_mmap:.2f} GB"
    )

    # Verify that GPU memory has been freed
    # We expect at least 9 GB to be freed (original 10GB model with some tolerance)
    assert gpu_decrease > 9.0, (
        f"GPU memory should be freed after mmap. "
        f"Freed: {gpu_decrease:.2f} GB (from {gpu_after_cuda:.2f} to {gpu_after_mmap:.2f} GB), expected > 9 GB"
    )

    # Verify the model is still functional (basic sanity check)
    assert model_mmap is not None
    assert len(list(model_mmap.parameters())) > 0

    print(f"\n✓ Test passed!")
    print(f" CPU memory increase: {cpu_increase:.2f} GB < 1.5 GB")
    print(f" GPU memory freed: {gpu_decrease:.2f} GB > 9.0 GB")
    print(f" Model successfully offloaded from GPU to disk-backed memory")

    # Cleanup (model and model_mmap are the same object)
    del model, model_mmap
    gc.collect()
    torch.cuda.empty_cache()

def test_to_mmap_cuda_cycle():
    """Test CUDA -> mmap -> CUDA cycle

    This test verifies:
    1. CUDA tensor can be converted to mmap tensor
    2. CPU memory increase is minimal when using mmap (< 0.1 GB)
    3. GPU memory is freed when converting to mmap
    4. mmap tensor can be moved back to CUDA
    5. Data remains consistent throughout the cycle
    6. mmap file is automatically cleaned up when moved to CUDA
    """
    # Check if CUDA is available
    if not torch.cuda.is_available():
        pytest.skip("CUDA is not available, skipping test")

    # Force garbage collection
    gc.collect()
    torch.cuda.empty_cache()

    print("\nTest: CUDA -> mmap -> CUDA cycle")

    # Record initial CPU memory
    initial_cpu_memory = get_process_memory_gb()
    print(f"Initial CPU memory: {initial_cpu_memory:.2f} GB")

    # Step 1: Create a CUDA tensor
    print("\n1. Creating CUDA tensor...")
    original_data = torch.randn(5000, 5000).cuda()
    original_sum = original_data.sum().item()
    print(f" Shape: {original_data.shape}")
    print(f" Device: {original_data.device}")
    print(f" Sum: {original_sum:.2f}")

    # Record GPU and CPU memory after CUDA allocation
    cpu_after_cuda = get_process_memory_gb()
    gpu_before_mmap = torch.cuda.memory_allocated() / (1024 ** 3)
    print(f" GPU memory: {gpu_before_mmap:.2f} GB")
    print(f" CPU memory: {cpu_after_cuda:.2f} GB")

    # Step 2: Convert to mmap tensor
    print("\n2. Converting to mmap tensor...")
    mmap_tensor = to_mmap(original_data)
    del original_data
    gc.collect()
    torch.cuda.empty_cache()
    print(f" Device: {mmap_tensor.device}")
    print(f" Sum: {mmap_tensor.sum().item():.2f}")

    # Verify GPU memory is freed
    gpu_after_mmap = torch.cuda.memory_allocated() / (1024 ** 3)
    cpu_after_mmap = get_process_memory_gb()
    print(f" GPU memory freed: {gpu_before_mmap - gpu_after_mmap:.2f} GB")
    print(f" CPU memory: {cpu_after_mmap:.2f} GB")
    assert gpu_after_mmap < 0.1, f"GPU memory should be freed, but {gpu_after_mmap:.2f} GB still allocated"

    # Verify CPU memory increase is minimal (should be close to 0 due to mmap)
    cpu_increase = cpu_after_mmap - cpu_after_cuda
    print(f" CPU memory increase: {cpu_increase:.2f} GB")
    assert cpu_increase < 0.1, f"CPU memory should increase minimally, but increased by {cpu_increase:.2f} GB"

    # Get the temp file path (we'll check if it gets cleaned up)
    # The file should exist at this point
    temp_files_before = len([f for f in os.listdir(tempfile.gettempdir()) if f.startswith('comfy_mmap_')])
    print(f" Temp mmap files exist: {temp_files_before}")

    # Step 3: Move back to CUDA
    print("\n3. Moving back to CUDA...")
    cuda_tensor = mmap_tensor.to('cuda')
    torch.cuda.synchronize()
    print(f" Device: {cuda_tensor.device}")
    final_sum = cuda_tensor.sum().item()
    print(f" Sum: {final_sum:.2f}")

    # Verify GPU memory is used again
    gpu_after_cuda = torch.cuda.memory_allocated() / (1024 ** 3)
    print(f" GPU memory: {gpu_after_cuda:.2f} GB")

    # Step 4: Verify data consistency
    print("\n4. Verifying data consistency...")
    sum_diff = abs(original_sum - final_sum)
    print(f" Original sum: {original_sum:.2f}")
    print(f" Final sum: {final_sum:.2f}")
    print(f" Difference: {sum_diff:.6f}")
    assert sum_diff < 0.01, f"Data should be consistent, but difference is {sum_diff:.6f}"

    # Step 5: Verify file cleanup
    print("\n5. Verifying file cleanup...")
    gc.collect()
    import time
    time.sleep(0.1)  # Give OS time to clean up
    temp_files_after = len([f for f in os.listdir(tempfile.gettempdir()) if f.startswith('comfy_mmap_')])
    print(f" Temp mmap files after: {temp_files_after}")
    # File should be cleaned up when moved to CUDA
    assert temp_files_after <= temp_files_before, "mmap file should be cleaned up after moving to CUDA"

    print("\n✓ Test passed!")
    print(" CUDA -> mmap -> CUDA cycle works correctly")
    print(f" CPU memory increase: {cpu_increase:.2f} GB < 0.1 GB (mmap efficiency)")
    print(" Data consistency maintained")
    print(" File cleanup successful")

    # Cleanup
    del mmap_tensor, cuda_tensor
    gc.collect()
    torch.cuda.empty_cache()

if __name__ == "__main__":
    # Run the tests directly
    test_model_to_mmap_memory_efficiency()
    test_to_mmap_cuda_cycle()
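
Both tests skip when CUDA is unavailable, and their progress prints are hidden by pytest's default output capturing; a typical invocation (the file path below is a placeholder, since the test file's name is not shown in this view) would be:

pytest -s path/to/this_test_file.py

Running the module directly, as in the __main__ block above, prints the same progress messages without pytest.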