Update cfz_patcher.py

patientx 2025-06-06 18:07:06 +03:00 committed by GitHub
parent 8beae68f3f
commit e6367f7183
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194


@@ -350,25 +350,35 @@ class UNetQuantizationPatcher:
     CATEGORY = "Model Patching"
     OUTPUT_NODE = False

-    def get_model_memory_usage(self, model):
-        """Calculate actual memory usage of model parameters"""
+    def get_model_memory_usage(self, model, force_calculation=False):
+        """Calculate memory usage of model parameters (CPU + GPU)"""
         total_memory = 0
         param_count = 0
+        gpu_memory = 0

+        # Count all parameters (CPU + GPU)
         for param in model.parameters():
-            if param.data.is_cuda:
-                # Get actual memory usage on GPU
-                memory_bytes = param.data.element_size() * param.data.nelement()
-                total_memory += memory_bytes
-                param_count += param.data.nelement()
+            memory_bytes = param.data.element_size() * param.data.nelement()
+            total_memory += memory_bytes
+            param_count += param.data.nelement()
+            if param.data.is_cuda:
+                gpu_memory += memory_bytes

         # Also check for quantized buffers
         for name, buffer in model.named_buffers():
-            if buffer.is_cuda and ('int8_weight' in name or 'scale' in name or 'zero_point' in name):
+            if 'int8_weight' in name or 'scale' in name or 'zero_point' in name:
                 memory_bytes = buffer.element_size() * buffer.nelement()
                 total_memory += memory_bytes
+                if buffer.is_cuda:
+                    gpu_memory += memory_bytes

-        return total_memory, param_count
+        # If force_calculation is True and nothing on GPU, return total memory as estimate
+        if force_calculation and gpu_memory == 0:
+            return total_memory, param_count, total_memory
+
+        return total_memory, param_count, gpu_memory

     def format_memory_size(self, bytes_size):
         """Format memory size in human readable format"""
@@ -385,10 +395,14 @@ class UNetQuantizationPatcher:
         # Measure original memory usage
         if show_memory_usage:
-            original_memory, original_params = self.get_model_memory_usage(model.model)
+            original_memory, original_params, original_gpu = self.get_model_memory_usage(model.model, force_calculation=True)
             print(f"📊 Original Model Memory Usage:")
             print(f" Parameters: {original_params:,}")
-            print(f" VRAM Usage: {self.format_memory_size(original_memory)}")
+            print(f" Total Size: {self.format_memory_size(original_memory)}")
+            if original_gpu > 0:
+                print(f" GPU Memory: {self.format_memory_size(original_gpu)}")
+            else:
+                print(f" GPU Memory: Not loaded (will use ~{self.format_memory_size(original_memory)} when loaded)")

         quantized_model = copy.deepcopy(model)
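The printed sizes above go through format_memory_size, whose body sits outside this diff. For orientation only, a hedged sketch of the usual shape of such a helper (an assumption about its behaviour, not the code actually shipped in cfz_patcher.py):

def format_memory_size(bytes_size: float) -> str:
    # Walk up the units until the value fits below 1024 of the next one.
    for unit in ("B", "KB", "MB", "GB", "TB"):
        if bytes_size < 1024.0:
            return f"{bytes_size:.2f} {unit}"
        bytes_size /= 1024.0
    return f"{bytes_size:.2f} PB"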
@@ -414,27 +428,26 @@ class UNetQuantizationPatcher:
         # Measure quantized memory usage
         if show_memory_usage:
-            # Force GPU memory allocation by moving model to device if needed
-            if torch.cuda.is_available():
-                device = next(quantized_model.model.parameters()).device
-                quantized_model.model.to(device)
-            quantized_memory, quantized_params = self.get_model_memory_usage(quantized_model.model)
+            quantized_memory, quantized_params, quantized_gpu = self.get_model_memory_usage(quantized_model.model, force_calculation=True)
             memory_saved = original_memory - quantized_memory
             memory_reduction_pct = (memory_saved / original_memory) * 100 if original_memory > 0 else 0

             print(f"📊 Quantized Model Memory Usage:")
             print(f" Parameters: {quantized_params:,}")
-            print(f" VRAM Usage: {self.format_memory_size(quantized_memory)}")
+            print(f" Total Size: {self.format_memory_size(quantized_memory)}")
+            if quantized_gpu > 0:
+                print(f" GPU Memory: {self.format_memory_size(quantized_gpu)}")
+            else:
+                print(f" GPU Memory: Not loaded (will use ~{self.format_memory_size(quantized_memory)} when loaded)")
             print(f" Memory Saved: {self.format_memory_size(memory_saved)} ({memory_reduction_pct:.1f}%)")

             # Show CUDA memory info if available
             if torch.cuda.is_available():
                 allocated = torch.cuda.memory_allocated()
                 reserved = torch.cuda.memory_reserved()
-                print(f"📊 Total GPU Memory:")
-                print(f" Allocated: {self.format_memory_size(allocated)}")
-                print(f" Reserved: {self.format_memory_size(reserved)}")
+                print(f"📊 Total GPU Memory Status:")
+                print(f" Currently Allocated: {self.format_memory_size(allocated)}")
+                print(f" Reserved by PyTorch: {self.format_memory_size(reserved)}")

         return (quantized_model,)
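The tail of this hunk only relabels the CUDA counters; the distinction they report is standard PyTorch behaviour: torch.cuda.memory_allocated() counts bytes held by live tensors, while torch.cuda.memory_reserved() counts the larger pool the caching allocator keeps from the driver. A small standalone sketch (the 1024x1024 tensor is just an example value):

import torch

if torch.cuda.is_available():
    x = torch.empty(1024, 1024, device="cuda")   # ~4 MB of float32
    allocated = torch.cuda.memory_allocated()    # live tensor bytes (includes x)
    reserved = torch.cuda.memory_reserved()      # allocator cache, always >= allocated
    print(f"allocated={allocated / 1024**2:.1f} MiB, reserved={reserved / 1024**2:.1f} MiB")
    del x
    torch.cuda.empty_cache()                     # returns cached blocks, shrinking reserved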
@@ -527,4 +540,4 @@ NODE_DISPLAY_NAME_MAPPINGS = {
     "UNetQuantizationPatcher": "CFZ UNet Quantization Patcher",
 }

 __all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS']