Mirror of https://github.com/comfyanonymous/ComfyUI.git (synced 2026-01-12 23:30:56 +08:00)

Update cfz_patcher.py

commit e6367f7183 (parent 8beae68f3f)
cfz_patcher.py

@@ -350,25 +350,35 @@ class UNetQuantizationPatcher:
     CATEGORY = "Model Patching"
     OUTPUT_NODE = False
 
-    def get_model_memory_usage(self, model):
-        """Calculate actual memory usage of model parameters"""
+    def get_model_memory_usage(self, model, force_calculation=False):
+        """Calculate memory usage of model parameters (CPU + GPU)"""
         total_memory = 0
         param_count = 0
+        gpu_memory = 0
 
+        # Count all parameters (CPU + GPU)
         for param in model.parameters():
+            memory_bytes = param.data.element_size() * param.data.nelement()
+            total_memory += memory_bytes
+            param_count += param.data.nelement()
+
             if param.data.is_cuda:
-                # Get actual memory usage on GPU
-                memory_bytes = param.data.element_size() * param.data.nelement()
-                total_memory += memory_bytes
-                param_count += param.data.nelement()
+                gpu_memory += memory_bytes
 
         # Also check for quantized buffers
         for name, buffer in model.named_buffers():
-            if buffer.is_cuda and ('int8_weight' in name or 'scale' in name or 'zero_point' in name):
+            if 'int8_weight' in name or 'scale' in name or 'zero_point' in name:
                 memory_bytes = buffer.element_size() * buffer.nelement()
                 total_memory += memory_bytes
+
+                if buffer.is_cuda:
+                    gpu_memory += memory_bytes
 
-        return total_memory, param_count
+        # If force_calculation is True and nothing on GPU, return total memory as estimate
+        if force_calculation and gpu_memory == 0:
+            return total_memory, param_count, total_memory
+
+        return total_memory, param_count, gpu_memory
 
     def format_memory_size(self, bytes_size):
         """Format memory size in human readable format"""
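For reference, the new three-value contract can be exercised outside the node graph. A minimal standalone sketch of the same accounting (assuming only torch is available; the function and toy-model names are illustrative, not part of the commit):

import torch

def measure_memory(model, force_calculation=False):
    # Mirrors the patched get_model_memory_usage: returns
    # (total bytes across CPU+GPU, parameter count, bytes resident on GPU).
    total_memory, param_count, gpu_memory = 0, 0, 0

    for param in model.parameters():
        memory_bytes = param.data.element_size() * param.data.nelement()
        total_memory += memory_bytes
        param_count += param.data.nelement()
        if param.data.is_cuda:
            gpu_memory += memory_bytes

    # Quantized buffers (int8 weights, scales, zero points) are counted as well.
    for name, buffer in model.named_buffers():
        if 'int8_weight' in name or 'scale' in name or 'zero_point' in name:
            memory_bytes = buffer.element_size() * buffer.nelement()
            total_memory += memory_bytes
            if buffer.is_cuda:
                gpu_memory += memory_bytes

    # With force_calculation, a CPU-only model reports its full size as the
    # GPU estimate instead of 0, so the caller always gets a usable number.
    if force_calculation and gpu_memory == 0:
        return total_memory, param_count, total_memory
    return total_memory, param_count, gpu_memory

toy = torch.nn.Linear(1024, 1024)                  # CPU-only toy model
total, count, gpu = measure_memory(toy, force_calculation=True)
print(count, total, gpu)                           # gpu falls back to total here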
@@ -385,10 +395,14 @@ class UNetQuantizationPatcher:
 
         # Measure original memory usage
         if show_memory_usage:
-            original_memory, original_params = self.get_model_memory_usage(model.model)
+            original_memory, original_params, original_gpu = self.get_model_memory_usage(model.model, force_calculation=True)
             print(f"📊 Original Model Memory Usage:")
             print(f" Parameters: {original_params:,}")
-            print(f" VRAM Usage: {self.format_memory_size(original_memory)}")
+            print(f" Total Size: {self.format_memory_size(original_memory)}")
+            if original_gpu > 0:
+                print(f" GPU Memory: {self.format_memory_size(original_gpu)}")
+            else:
+                print(f" GPU Memory: Not loaded (will use ~{self.format_memory_size(original_memory)} when loaded)")
 
         quantized_model = copy.deepcopy(model)
 
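format_memory_size is only referenced in this hunk; its body is not part of the diff. A typical sketch of such a helper, for readers following the printed output (illustrative only, not the file's actual implementation):

def format_memory_size(bytes_size):
    # Human-readable size: walk up B -> KB -> MB -> GB -> TB, one decimal place.
    for unit in ("B", "KB", "MB", "GB", "TB"):
        if bytes_size < 1024.0:
            return f"{bytes_size:.1f} {unit}"
        bytes_size /= 1024.0
    return f"{bytes_size:.1f} PB"

print(format_memory_size(6_900_000_000))   # "6.4 GB"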
@@ -414,27 +428,26 @@ class UNetQuantizationPatcher:
 
         # Measure quantized memory usage
         if show_memory_usage:
-            # Force GPU memory allocation by moving model to device if needed
-            if torch.cuda.is_available():
-                device = next(quantized_model.model.parameters()).device
-                quantized_model.model.to(device)
-
-            quantized_memory, quantized_params = self.get_model_memory_usage(quantized_model.model)
+            quantized_memory, quantized_params, quantized_gpu = self.get_model_memory_usage(quantized_model.model, force_calculation=True)
             memory_saved = original_memory - quantized_memory
             memory_reduction_pct = (memory_saved / original_memory) * 100 if original_memory > 0 else 0
 
             print(f"📊 Quantized Model Memory Usage:")
             print(f" Parameters: {quantized_params:,}")
-            print(f" VRAM Usage: {self.format_memory_size(quantized_memory)}")
+            print(f" Total Size: {self.format_memory_size(quantized_memory)}")
+            if quantized_gpu > 0:
+                print(f" GPU Memory: {self.format_memory_size(quantized_gpu)}")
+            else:
+                print(f" GPU Memory: Not loaded (will use ~{self.format_memory_size(quantized_memory)} when loaded)")
             print(f" Memory Saved: {self.format_memory_size(memory_saved)} ({memory_reduction_pct:.1f}%)")
 
             # Show CUDA memory info if available
             if torch.cuda.is_available():
                 allocated = torch.cuda.memory_allocated()
                 reserved = torch.cuda.memory_reserved()
-                print(f"📊 Total GPU Memory:")
-                print(f" Allocated: {self.format_memory_size(allocated)}")
-                print(f" Reserved: {self.format_memory_size(reserved)}")
+                print(f"📊 Total GPU Memory Status:")
+                print(f" Currently Allocated: {self.format_memory_size(allocated)}")
+                print(f" Reserved by PyTorch: {self.format_memory_size(reserved)}")
 
         return (quantized_model,)
 
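The relabelled lines in that last block reflect what torch's CUDA caching allocator actually reports: memory_allocated() is the bytes held by live tensors, while memory_reserved() is the larger pool PyTorch has claimed from the driver. A small sketch of reading those counters (function name is illustrative):

import torch

def report_gpu_memory():
    if not torch.cuda.is_available():
        print("CUDA not available")
        return
    allocated = torch.cuda.memory_allocated()   # bytes occupied by live tensors
    reserved = torch.cuda.memory_reserved()     # bytes held by the caching allocator (>= allocated)
    print(f"Currently Allocated: {allocated / 1024**2:.1f} MB")
    print(f"Reserved by PyTorch: {reserved / 1024**2:.1f} MB")

report_gpu_memory()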
@@ -527,4 +540,4 @@ NODE_DISPLAY_NAME_MAPPINGS = {
     "UNetQuantizationPatcher": "CFZ UNet Quantization Patcher",
 }
 
-__all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS']
+__all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS']
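For context, the exports this final hunk touches follow ComfyUI's standard custom-node registration pattern. A minimal sketch of how the module tail fits together (class body elided; only the display-name mapping is taken from the diff itself):

class UNetQuantizationPatcher:
    ...  # node implementation elided

NODE_CLASS_MAPPINGS = {
    "UNetQuantizationPatcher": UNetQuantizationPatcher,
}

NODE_DISPLAY_NAME_MAPPINGS = {
    "UNetQuantizationPatcher": "CFZ UNet Quantization Patcher",
}

__all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS']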