Mirror of https://github.com/comfyanonymous/ComfyUI.git, synced 2025-12-17 10:02:59 +08:00
use native mmap

commit 08e094ed81
parent fff56de63c
@@ -27,7 +27,10 @@ import uuid
 from typing import Callable, Optional
 
 import torch
-import tensordict
+import os
+import tempfile
+import weakref
+import gc
 
 import comfy.float
 import comfy.hooks
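
For orientation, the import swap above tracks the functional change below: the tensordict dependency is dropped in favor of a plain torch.save / torch.load(..., mmap=True) round trip. A minimal sketch of that round trip (not part of the patch; assumes PyTorch 2.1+ where torch.load accepts mmap=True, and the temp path is purely illustrative):

import tempfile

import torch

# Save a tensor to disk, then map the file back in without reading it all into RAM.
t = torch.randn(1024, 1024)
path = tempfile.mktemp(suffix='.pt', prefix='mmap_sketch_')  # illustrative temp path
torch.save(t, path)
mapped = torch.load(path, map_location='cpu', mmap=True)  # disk-backed, paged in lazily
assert torch.equal(t, mapped)
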
@@ -39,8 +42,77 @@ from comfy.comfy_types import UnetWrapperFunction
 from comfy.patcher_extension import CallbacksMP, PatcherInjection, WrappersMP
 from comfy.model_management import get_free_memory
 
-def to_mmap(t: torch.Tensor) -> tensordict.MemoryMappedTensor:
-    return tensordict.MemoryMappedTensor.from_tensor(t)
+def to_mmap(t: torch.Tensor, filename: Optional[str] = None) -> torch.Tensor:
+    """
+    Convert a tensor to a memory-mapped CPU tensor using PyTorch's native mmap support.
+    """
+    # Move to CPU if needed
+    if t.is_cuda:
+        cpu_tensor = t.cpu()
+    else:
+        cpu_tensor = t
+
+    # Create temporary file
+    if filename is None:
+        temp_file = tempfile.mktemp(suffix='.pt', prefix='comfy_mmap_')
+    else:
+        temp_file = filename
+
+    # Save tensor to file
+    torch.save(cpu_tensor, temp_file)
+
+    # If we created a CPU copy from CUDA, delete it to free memory
+    if t.is_cuda:
+        del cpu_tensor
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+    # Load with mmap - this doesn't load all data into RAM
+    mmap_tensor = torch.load(temp_file, map_location='cpu', mmap=True, weights_only=False)
+
+    # Register cleanup callback
+    def _cleanup():
+        try:
+            if os.path.exists(temp_file):
+                os.remove(temp_file)
+                logging.debug(f"Cleaned up mmap file: {temp_file}")
+        except Exception:
+            pass
+
+    weakref.finalize(mmap_tensor, _cleanup)
+
+    # Save original 'to' method
+    original_to = mmap_tensor.to
+
+    # Create custom 'to' method that cleans up file when moving to CUDA
+    def custom_to(*args, **kwargs):
+        # Determine target device
+        target_device = None
+        if len(args) > 0:
+            if isinstance(args[0], torch.device):
+                target_device = args[0]
+            elif isinstance(args[0], str):
+                target_device = torch.device(args[0])
+        if 'device' in kwargs:
+            target_device = kwargs['device']
+            if isinstance(target_device, str):
+                target_device = torch.device(target_device)
+
+        # Call original 'to' method first to move data
+        result = original_to(*args, **kwargs)
+
+        # If moved to CUDA, cleanup the mmap file after the move
+        if target_device is not None and target_device.type == 'cuda':
+            _cleanup()
+
+        return result
+
+    # Replace the 'to' method
+    mmap_tensor.to = custom_to
+
+    return mmap_tensor
+
 
 def model_to_mmap(model: torch.nn.Module):
     """Convert all parameters and buffers to memory-mapped tensors
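
As a usage note (not part of the diff), here is a hedged sketch of how the new to_mmap helper can be exercised; the import path follows the test file below, and the CUDA-path behavior is as described there:

import torch

from comfy.model_patcher import to_mmap  # import path taken from the test file below

# Offload a tensor to a disk-backed, memory-mapped CPU copy, then restore it on demand.
weights = torch.randn(4096, 4096)
offloaded = to_mmap(weights)            # written to a temp .pt file, reloaded with mmap=True
assert offloaded.device.type == 'cpu'

if torch.cuda.is_available():
    restored = offloaded.to('cuda')     # the patched .to() removes the temp file after the move
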
tests/execution/test_model_mmap.py (new file, 280 lines)
@@ -0,0 +1,280 @@
import pytest
import torch
import torch.nn as nn
import psutil
import os
import gc
import tempfile
from comfy.model_patcher import model_to_mmap, to_mmap


class LargeModel(nn.Module):
    """A simple model with large parameters for testing memory mapping"""

    def __init__(self, size_gb=10):
        super().__init__()
        # Calculate number of float32 elements needed for target size
        # 1 GB = 1024^3 bytes, float32 = 4 bytes
        bytes_per_gb = 1024 * 1024 * 1024
        elements_per_gb = bytes_per_gb // 4  # float32 is 4 bytes
        total_elements = int(size_gb * elements_per_gb)

        # Create a large linear layer
        # Split into multiple layers to avoid single tensor size limits
        self.layers = nn.ModuleList()
        elements_per_layer = 500 * 1024 * 1024  # 500M elements per layer (~2GB)
        num_layers = (total_elements + elements_per_layer - 1) // elements_per_layer

        for i in range(num_layers):
            if i == num_layers - 1:
                # Last layer gets the remaining elements
                remaining = total_elements - (i * elements_per_layer)
                in_features = int(remaining ** 0.5)
                out_features = (remaining + in_features - 1) // in_features
            else:
                in_features = int(elements_per_layer ** 0.5)
                out_features = (elements_per_layer + in_features - 1) // in_features

            # Create layer without bias to control size precisely
            self.layers.append(nn.Linear(in_features, out_features, bias=False))

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x


def get_process_memory_gb():
    """Get current process memory usage in GB"""
    process = psutil.Process(os.getpid())
    mem_info = process.memory_info()
    return mem_info.rss / (1024 ** 3)  # Convert to GB


def get_model_size_gb(model):
    """Calculate model size in GB"""
    total_size = 0
    for param in model.parameters():
        total_size += param.nelement() * param.element_size()
    for buffer in model.buffers():
        total_size += buffer.nelement() * buffer.element_size()
    return total_size / (1024 ** 3)


def test_model_to_mmap_memory_efficiency():
    """Test that model_to_mmap reduces memory usage for a 10GB model to less than 1GB

    The typical use case is:
    1. Load a large model on CUDA
    2. Convert to mmap to offload from GPU to disk-backed memory
    3. This frees GPU memory and reduces CPU RAM usage
    """

    # Check if CUDA is available
    if not torch.cuda.is_available():
        pytest.skip("CUDA is not available, skipping test")

    # Force garbage collection before starting
    gc.collect()
    torch.cuda.empty_cache()

    # Record initial memory
    initial_cpu_memory = get_process_memory_gb()
    initial_gpu_memory = torch.cuda.memory_allocated() / (1024 ** 3)
    print(f"\nInitial CPU memory: {initial_cpu_memory:.2f} GB")
    print(f"Initial GPU memory: {initial_gpu_memory:.2f} GB")

    # Create a 10GB model
    print("Creating 10GB model...")
    model = LargeModel(size_gb=10)

    # Verify model size
    model_size = get_model_size_gb(model)
    print(f"Model size: {model_size:.2f} GB")
    assert model_size >= 9.5, f"Model size {model_size:.2f} GB is less than expected 10 GB"

    # Move model to CUDA
    print("Moving model to CUDA...")
    model = model.cuda()
    torch.cuda.synchronize()

    # Memory after moving to CUDA
    cpu_after_cuda = get_process_memory_gb()
    gpu_after_cuda = torch.cuda.memory_allocated() / (1024 ** 3)
    print(f"CPU memory after moving to CUDA: {cpu_after_cuda:.2f} GB")
    print(f"GPU memory after moving to CUDA: {gpu_after_cuda:.2f} GB")

    # Convert to mmap (this should move model from GPU to disk-backed memory)
    # Note: model_to_mmap modifies the model in-place via _apply()
    # so model and model_mmap will be the same object
    print("Converting model to mmap...")
    model_mmap = model_to_mmap(model)

    # Verify that model and model_mmap are the same object (in-place modification)
    assert model is model_mmap, "model_to_mmap should modify the model in-place"

    # Force garbage collection and clear CUDA cache
    # The original CUDA tensors should be automatically freed when replaced
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

    # Memory after mmap conversion
    cpu_after_mmap = get_process_memory_gb()
    gpu_after_mmap = torch.cuda.memory_allocated() / (1024 ** 3)
    print(f"CPU memory after mmap: {cpu_after_mmap:.2f} GB")
    print(f"GPU memory after mmap: {gpu_after_mmap:.2f} GB")

    # Calculate memory changes from CUDA state (the baseline we're converting from)
    cpu_increase = cpu_after_mmap - cpu_after_cuda
    gpu_decrease = gpu_after_cuda - gpu_after_mmap  # Should be positive (freed)
    print(f"\nCPU memory increase from CUDA: {cpu_increase:.2f} GB")
    print(f"GPU memory freed: {gpu_decrease:.2f} GB")

    # Verify that CPU memory usage increase is less than 1GB
    # The mmap should use disk-backed storage, keeping CPU RAM usage low
    # We use 1.5 GB threshold to account for overhead
    assert cpu_increase < 1.5, (
        f"CPU memory increase after mmap ({cpu_increase:.2f} GB) should be less than 1.5 GB. "
        f"CUDA state: {cpu_after_cuda:.2f} GB, After mmap: {cpu_after_mmap:.2f} GB"
    )

    # Verify that GPU memory has been freed
    # We expect at least 9 GB to be freed (original 10GB model with some tolerance)
    assert gpu_decrease > 9.0, (
        f"GPU memory should be freed after mmap. "
        f"Freed: {gpu_decrease:.2f} GB (from {gpu_after_cuda:.2f} to {gpu_after_mmap:.2f} GB), expected > 9 GB"
    )

    # Verify the model is still functional (basic sanity check)
    assert model_mmap is not None
    assert len(list(model_mmap.parameters())) > 0

    print(f"\n✓ Test passed!")
    print(f"  CPU memory increase: {cpu_increase:.2f} GB < 1.5 GB")
    print(f"  GPU memory freed: {gpu_decrease:.2f} GB > 9.0 GB")
    print(f"  Model successfully offloaded from GPU to disk-backed memory")

    # Cleanup (model and model_mmap are the same object)
    del model, model_mmap
    gc.collect()
    torch.cuda.empty_cache()


def test_to_mmap_cuda_cycle():
    """Test CUDA -> mmap -> CUDA cycle

    This test verifies:
    1. CUDA tensor can be converted to mmap tensor
    2. CPU memory increase is minimal when using mmap (< 0.1 GB)
    3. GPU memory is freed when converting to mmap
    4. mmap tensor can be moved back to CUDA
    5. Data remains consistent throughout the cycle
    6. mmap file is automatically cleaned up when moved to CUDA
    """

    # Check if CUDA is available
    if not torch.cuda.is_available():
        pytest.skip("CUDA is not available, skipping test")

    # Force garbage collection
    gc.collect()
    torch.cuda.empty_cache()

    print("\nTest: CUDA -> mmap -> CUDA cycle")

    # Record initial CPU memory
    initial_cpu_memory = get_process_memory_gb()
    print(f"Initial CPU memory: {initial_cpu_memory:.2f} GB")

    # Step 1: Create a CUDA tensor
    print("\n1. Creating CUDA tensor...")
    original_data = torch.randn(5000, 5000).cuda()
    original_sum = original_data.sum().item()
    print(f"   Shape: {original_data.shape}")
    print(f"   Device: {original_data.device}")
    print(f"   Sum: {original_sum:.2f}")

    # Record GPU and CPU memory after CUDA allocation
    cpu_after_cuda = get_process_memory_gb()
    gpu_before_mmap = torch.cuda.memory_allocated() / (1024 ** 3)
    print(f"   GPU memory: {gpu_before_mmap:.2f} GB")
    print(f"   CPU memory: {cpu_after_cuda:.2f} GB")

    # Step 2: Convert to mmap tensor
    print("\n2. Converting to mmap tensor...")
    mmap_tensor = to_mmap(original_data)
    del original_data
    gc.collect()
    torch.cuda.empty_cache()

    print(f"   Device: {mmap_tensor.device}")
    print(f"   Sum: {mmap_tensor.sum().item():.2f}")

    # Verify GPU memory is freed
    gpu_after_mmap = torch.cuda.memory_allocated() / (1024 ** 3)
    cpu_after_mmap = get_process_memory_gb()
    print(f"   GPU memory freed: {gpu_before_mmap - gpu_after_mmap:.2f} GB")
    print(f"   CPU memory: {cpu_after_mmap:.2f} GB")

    # Verify GPU memory is freed
    assert gpu_after_mmap < 0.1, f"GPU memory should be freed, but {gpu_after_mmap:.2f} GB still allocated"

    # Verify CPU memory increase is minimal (should be close to 0 due to mmap)
    cpu_increase = cpu_after_mmap - cpu_after_cuda
    print(f"   CPU memory increase: {cpu_increase:.2f} GB")
    assert cpu_increase < 0.1, f"CPU memory should increase minimally, but increased by {cpu_increase:.2f} GB"

    # Get the temp file path (we'll check if it gets cleaned up)
    # The file should exist at this point
    temp_files_before = len([f for f in os.listdir(tempfile.gettempdir()) if f.startswith('comfy_mmap_')])
    print(f"   Temp mmap files exist: {temp_files_before}")

    # Step 3: Move back to CUDA
    print("\n3. Moving back to CUDA...")
    cuda_tensor = mmap_tensor.to('cuda')
    torch.cuda.synchronize()

    print(f"   Device: {cuda_tensor.device}")
    final_sum = cuda_tensor.sum().item()
    print(f"   Sum: {final_sum:.2f}")

    # Verify GPU memory is used again
    gpu_after_cuda = torch.cuda.memory_allocated() / (1024 ** 3)
    print(f"   GPU memory: {gpu_after_cuda:.2f} GB")

    # Step 4: Verify data consistency
    print("\n4. Verifying data consistency...")
    sum_diff = abs(original_sum - final_sum)
    print(f"   Original sum: {original_sum:.2f}")
    print(f"   Final sum: {final_sum:.2f}")
    print(f"   Difference: {sum_diff:.6f}")
    assert sum_diff < 0.01, f"Data should be consistent, but difference is {sum_diff:.6f}"

    # Step 5: Verify file cleanup
    print("\n5. Verifying file cleanup...")
    gc.collect()
    import time
    time.sleep(0.1)  # Give OS time to clean up
    temp_files_after = len([f for f in os.listdir(tempfile.gettempdir()) if f.startswith('comfy_mmap_')])
    print(f"   Temp mmap files after: {temp_files_after}")
    # File should be cleaned up when moved to CUDA
    assert temp_files_after <= temp_files_before, "mmap file should be cleaned up after moving to CUDA"

    print("\n✓ Test passed!")
    print("   CUDA -> mmap -> CUDA cycle works correctly")
    print(f"   CPU memory increase: {cpu_increase:.2f} GB < 0.1 GB (mmap efficiency)")
    print("   Data consistency maintained")
    print("   File cleanup successful")

    # Cleanup
    del mmap_tensor, cuda_tensor
    gc.collect()
    torch.cuda.empty_cache()


if __name__ == "__main__":
    # Run the tests directly
    test_model_to_mmap_memory_efficiency()
    test_to_mmap_cuda_cycle()
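
Finally, a hedged sketch of the module-level entry point these tests exercise: per the diff above, model_to_mmap converts every parameter and buffer to a memory-mapped tensor, and per the first test it modifies the module in place. The sample module here is purely illustrative:

import torch.nn as nn

from comfy.model_patcher import model_to_mmap  # import path taken from the tests above

# Offload all parameters/buffers of a module to disk-backed mmap storage.
net = nn.Sequential(nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 10))
offloaded = model_to_mmap(net)

# The first test asserts the conversion happens in place.
assert offloaded is net
print(next(net.parameters()).device)  # expected: cpu, backed by temporary mmap files
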